Skip to content

Commit 10aded9

Browse files
committed
Combine verified and bash-only leaderboards
1 parent f28b73f commit 10aded9

9 files changed

Lines changed: 476 additions & 236 deletions

File tree

css/leaderboard-filters.css

Lines changed: 40 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -345,6 +345,46 @@
345345
}
346346
}
347347

348+
/* =============================================================================
349+
VERIFIED FILTERS (Agent + Models dropdowns)
350+
============================================================================= */
351+
352+
.verified-filters {
353+
display: flex;
354+
align-items: center;
355+
gap: var(--size-sm);
356+
flex-wrap: wrap;
357+
}
358+
359+
.verified-filters .filter-title {
360+
margin-bottom: 0;
361+
}
362+
363+
.verified-filters select {
364+
height: 31px;
365+
font-size: var(--text-sm);
366+
padding: var(--size-xs) var(--size-sm);
367+
padding-right: 2rem;
368+
width: auto;
369+
}
370+
371+
#standard-filters {
372+
display: flex;
373+
align-items: center;
374+
gap: var(--size-sm);
375+
flex-wrap: wrap;
376+
}
377+
378+
#standard-filters .filter-title {
379+
margin-bottom: 0;
380+
}
381+
382+
/* Compare button: allow hover for tooltip even when disabled */
383+
#compare-btn.button-disabled {
384+
pointer-events: auto;
385+
cursor: not-allowed;
386+
}
387+
348388
/* =============================================================================
349389
UTILITY OVERRIDES
350390
============================================================================= */

js/analysis.js

Lines changed: 36 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -30,9 +30,15 @@
3030
// Get full leaderboard data (it's a direct array, not wrapped in a property)
3131
const leaderboardData = getLeaderboardData();
3232

33-
const activeLeaderboard = leaderboardData?.find(lb => {
34-
return active.id === `leaderboard-${lb.name}`;
35-
});
33+
// For the Verified tab, always use bash-only data for comparison
34+
let activeLeaderboard;
35+
if (active.id === 'leaderboard-Verified') {
36+
activeLeaderboard = leaderboardData?.find(lb => lb.name === 'bash-only');
37+
} else {
38+
activeLeaderboard = leaderboardData?.find(lb => {
39+
return active.id === `leaderboard-${lb.name}`;
40+
});
41+
}
3642

3743
return Array.from(checkboxes).map(cb => {
3844
const row = cb.closest('tr');
@@ -469,13 +475,21 @@
469475
// Get current leaderboard
470476
const container = document.getElementById('leaderboard-container');
471477
const active = container ? container.querySelector('.tabcontent.active') : null;
472-
const leaderboardId = active ? active.id.replace('leaderboard-', '') : 'verified';
478+
let leaderboardId = active ? active.id.replace('leaderboard-', '') : 'Verified';
479+
480+
// For Verified tab, include agent mode in the URL
481+
const agentMode = (leaderboardId === 'Verified' && typeof getVerifiedAgentMode === 'function')
482+
? getVerifiedAgentMode()
483+
: null;
473484

474485
// Build URL parameters
475486
const params = new URLSearchParams();
476487
params.set('leaderboard', leaderboardId);
477488
params.set('chart', chartTypeValue);
478489
params.set('models', selected.map(m => m.name).join(','));
490+
if (agentMode) {
491+
params.set('agent', agentMode);
492+
}
479493

480494
// Create full URL
481495
const baseUrl = window.location.origin + window.location.pathname;
@@ -514,13 +528,29 @@
514528
return; // No state to restore
515529
}
516530

517-
const leaderboardName = params.get('leaderboard') || 'verified';
531+
let leaderboardName = params.get('leaderboard') || 'Verified';
518532
const chartType = params.get('chart') || 'bar';
533+
const agentMode = params.get('agent') || null;
519534
const modelNames = params.get('models').split(',').filter(m => m.trim());
520535

521536
if (modelNames.length === 0) {
522537
return;
523538
}
539+
540+
// Backward compat: map old bash-only to Verified with mini-SWE-agent
541+
if (leaderboardName === 'bash-only') {
542+
leaderboardName = 'Verified';
543+
// Set agent dropdown to all-mini to include legacy versions
544+
const agentDropdown = document.getElementById('agent-dropdown');
545+
if (agentDropdown) {
546+
agentDropdown.value = agentMode || 'all-mini';
547+
}
548+
} else if (leaderboardName === 'Verified' && agentMode) {
549+
const agentDropdown = document.getElementById('agent-dropdown');
550+
if (agentDropdown) {
551+
agentDropdown.value = agentMode;
552+
}
553+
}
524554

525555
// Switch to the correct leaderboard tab
526556
const leaderboardTab = document.querySelector(`[data-leaderboard="${leaderboardName}"]`);
@@ -579,7 +609,7 @@
579609
// Open via delegated event to handle dynamic rendering
580610
document.addEventListener('click', (e) => {
581611
const trigger = e.target && typeof e.target.closest === 'function' ? e.target.closest('#compare-btn') : null;
582-
if (trigger) {
612+
if (trigger && !trigger.disabled) {
583613
e.preventDefault();
584614
e.stopPropagation();
585615
openModal();

js/leaderboardFilters.js

Lines changed: 95 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -268,22 +268,47 @@ function updateTagsForLeaderboard(leaderboardName) {
268268
// Make function globally accessible
269269
window.updateTagsForLeaderboard = updateTagsForLeaderboard;
270270

271+
// Returns the current value of the #agent-dropdown element (the Verified tab's agent mode), or 'mini-v2' when that element is not in the DOM
272+
function getVerifiedAgentMode() {
273+
const dropdown = document.getElementById('agent-dropdown');
274+
return dropdown ? dropdown.value : 'mini-v2';
275+
}
276+
277+
// Returns the current value of the #models-dropdown element (the Verified tab's models filter), or 'all' when that element is not in the DOM
278+
function getVerifiedModelsFilter() {
279+
const dropdown = document.getElementById('models-dropdown');
280+
return dropdown ? dropdown.value : 'all';
281+
}
282+
283+
window.getVerifiedAgentMode = getVerifiedAgentMode;
284+
window.getVerifiedModelsFilter = getVerifiedModelsFilter;
285+
271286
// Function to show/hide filter elements based on leaderboard type
272287
function updateFilterVisibility(leaderboardName) {
288+
const verifiedFilters = document.getElementById('verified-filters');
289+
const standardFilters = document.getElementById('standard-filters');
273290
const mainFiltersContainer = document.getElementById('main-filters');
274291
const tagFiltersContainer = document.getElementById('tag-filters');
292+
const legacyVersionFilter = document.getElementById('legacy-version-filter');
275293

276294
const leaderboardNameLower = leaderboardName.toLowerCase();
277-
const isBashOnly = leaderboardNameLower === 'bash-only';
295+
const isVerified = leaderboardNameLower === 'verified';
278296
const isMultilingual = leaderboardNameLower === 'multilingual';
279-
const hideMainFilters = isBashOnly || isMultilingual;
280297

281-
// Hide main filters (open scaffold/weight/checked) for bash-only and multilingual, but keep tag filters visible
282-
if (mainFiltersContainer) mainFiltersContainer.style.display = hideMainFilters ? 'none' : '';
283-
if (tagFiltersContainer) tagFiltersContainer.style.display = '';
298+
if (isVerified) {
299+
// Show Verified-specific dropdowns, hide standard filters
300+
if (verifiedFilters) verifiedFilters.style.display = '';
301+
if (standardFilters) standardFilters.style.display = 'none';
302+
} else {
303+
// Show standard filters, hide Verified dropdowns
304+
if (verifiedFilters) verifiedFilters.style.display = 'none';
305+
if (standardFilters) standardFilters.style.display = '';
284306

285-
const legacyVersionFilter = document.getElementById('legacy-version-filter');
286-
if (legacyVersionFilter) legacyVersionFilter.style.display = isBashOnly ? '' : 'none';
307+
const hideMainFilters = isMultilingual;
308+
if (mainFiltersContainer) mainFiltersContainer.style.display = hideMainFilters ? 'none' : '';
309+
if (tagFiltersContainer) tagFiltersContainer.style.display = '';
310+
if (legacyVersionFilter) legacyVersionFilter.style.display = 'none';
311+
}
287312
}
288313

289314
// Table Update Logic - Optimized for lazy loading
@@ -297,41 +322,58 @@ function updateTable() {
297322

298323
const tableRows = visibleLeaderboard.querySelectorAll('.data-table tbody tr:not(.no-results)');
299324
let visibleRowCount = 0;
325+
326+
// Determine if we're on the Verified tab
327+
const isVerifiedTab = visibleLeaderboard.id === 'leaderboard-Verified';
328+
const modelsFilter = isVerifiedTab ? getVerifiedModelsFilter() : null;
300329

301330
tableRows.forEach(row => {
302331
// Show row by default
303332
let showRow = true;
304333

305-
// Check filters
306-
for (const filter of activeFilters) {
307-
if (row.getAttribute(`data-${filter}`) !== 'true') {
308-
showRow = false;
309-
break;
334+
if (isVerifiedTab) {
335+
// For Verified tab, apply models filter
336+
if (modelsFilter === 'open-source') {
337+
if (row.getAttribute('data-os_model') !== 'true') {
338+
showRow = false;
339+
}
340+
} else if (modelsFilter === 'proprietary') {
341+
if (row.getAttribute('data-os_model') === 'true') {
342+
showRow = false;
343+
}
310344
}
311-
}
312-
313-
// Check legacy version filter
314-
if (showRow) {
315-
const legacyFilterContainer = document.getElementById('legacy-version-filter');
316-
const showLegacyCheckbox = document.getElementById('show-legacy-versions');
317-
if (legacyFilterContainer && legacyFilterContainer.style.display !== 'none' &&
318-
showLegacyCheckbox && !showLegacyCheckbox.checked &&
319-
row.classList.contains('legacy-version-row')) {
320-
showRow = false;
345+
} else {
346+
// For non-Verified tabs, apply standard filters
347+
for (const filter of activeFilters) {
348+
if (row.getAttribute(`data-${filter}`) !== 'true') {
349+
showRow = false;
350+
break;
351+
}
321352
}
322-
}
323-
324-
// Check tag filter
325-
if (showRow && window.tagFiltersDropdown) {
326-
const selectedTags = window.tagFiltersDropdown.getSelectedValues();
327-
const allTagsSelected = window.tagFiltersDropdown.isAllSelected();
328353

329-
if (!allTagsSelected) {
330-
const rowTags = (row.getAttribute('data-tags') || '').split(',').map(t => t.trim()).filter(Boolean);
331-
if (!rowTags.some(tag => selectedTags.includes(tag))) {
354+
// Check legacy version filter
355+
if (showRow) {
356+
const legacyFilterContainer = document.getElementById('legacy-version-filter');
357+
const showLegacyCheckbox = document.getElementById('show-legacy-versions');
358+
if (legacyFilterContainer && legacyFilterContainer.style.display !== 'none' &&
359+
showLegacyCheckbox && !showLegacyCheckbox.checked &&
360+
row.classList.contains('legacy-version-row')) {
332361
showRow = false;
333362
}
334363
}
364+
365+
// Check tag filter
366+
if (showRow && window.tagFiltersDropdown) {
367+
const selectedTags = window.tagFiltersDropdown.getSelectedValues();
368+
const allTagsSelected = window.tagFiltersDropdown.isAllSelected();
369+
370+
if (!allTagsSelected) {
371+
const rowTags = (row.getAttribute('data-tags') || '').split(',').map(t => t.trim()).filter(Boolean);
372+
if (!rowTags.some(tag => selectedTags.includes(tag))) {
373+
showRow = false;
374+
}
375+
}
376+
}
335377
}
336378

337379
// Toggle row visibility
@@ -340,11 +382,10 @@ function updateTable() {
340382
});
341383

342384
const noResultsMessage = visibleLeaderboard.querySelector('.no-results');
343-
// Show/hide no results message
344-
if (visibleRowCount === 0 && (activeFilters.size > 0 || !isAllTagsSelected())) {
345-
noResultsMessage.style.display = 'table-row';
385+
if (visibleRowCount === 0) {
386+
if (noResultsMessage) noResultsMessage.style.display = 'table-row';
346387
} else {
347-
noResultsMessage.style.display = 'none';
388+
if (noResultsMessage) noResultsMessage.style.display = 'none';
348389
}
349390

350391
// Update the select-all checkbox state after filtering
@@ -393,7 +434,7 @@ document.addEventListener('DOMContentLoaded', function() {
393434
}
394435
});
395436

396-
// Initialize with tags for the default leaderboard (bash-only)
437+
// Initialize with tags for the default leaderboard (Verified with bash-only data)
397438
updateTagsForLeaderboard('bash-only');
398439

399440
// Set initial selection for main filters
@@ -411,14 +452,29 @@ document.addEventListener('DOMContentLoaded', function() {
411452
showLegacyCheckbox.addEventListener('change', updateTable);
412453
}
413454

414-
// Check for initial leaderboard visibility (in case landing directly on bash-only)
455+
// Wire up Verified-specific dropdowns
456+
const agentDropdown = document.getElementById('agent-dropdown');
457+
if (agentDropdown) {
458+
agentDropdown.addEventListener('change', () => {
459+
if (typeof openLeaderboard === 'function') {
460+
openLeaderboard('Verified');
461+
}
462+
});
463+
}
464+
465+
const modelsDropdown = document.getElementById('models-dropdown');
466+
if (modelsDropdown) {
467+
modelsDropdown.addEventListener('change', updateTable);
468+
}
469+
470+
// Check for initial leaderboard visibility
415471
setTimeout(() => {
416472
const activeLeaderboard = document.querySelector('.tabcontent.active');
417473
if (activeLeaderboard) {
418474
const leaderboardId = activeLeaderboard.id;
419475
const leaderboardName = leaderboardId.replace('leaderboard-', '');
420476
updateFilterVisibility(leaderboardName);
421-
updateTagsForLeaderboard(leaderboardName); // Update tags for the initial leaderboard
477+
updateTagsForLeaderboard(leaderboardName);
422478
}
423479
}, 100);
424480
});
@@ -429,10 +485,9 @@ function updateLeaderboardDescription(leaderboardName) {
429485
if (!textContainer) return;
430486

431487
const descriptions = {
432-
'bash-only': '<em>Bash Only</em> evaluates all LMs with <a href="https://github.com/SWE-agent/mini-swe-agent">mini-SWE-agent</a> on SWE-bench Verified (<a href="bash-only.html">details</a>).',
488+
'verified': '<em>Verified</em> is a human-filtered subset of 500 instances. We use <a href="https://github.com/SWE-agent/mini-swe-agent">mini-SWE-agent</a> to evaluate all models with the same harness (<a href="verified.html">details</a>).',
433489
'multilingual': '<em>Multilingual</em> features 300 tasks across 9 programming languages (<a href="multilingual-leaderboard.html">details</a>)',
434490
'lite': '<em>Lite</em> is a subset of 300 instances for less costly evaluation (<a href="lite.html">details</a>)',
435-
'verified': '<em>Verified</em> is a human-filtered subset of 500 instances (<a href="https://openai.com/index/introducing-swe-bench-verified/">details</a>)',
436491
'test': '<em>Full</em> is a large benchmark made of 2000 instances (<a href="original.html">details</a>)',
437492
'multimodal': '<em>Multimodal</em> features issues with visual elements (<a href="multimodal.html">details</a>)',
438493
};

0 commit comments

Comments
 (0)