
Commit b43f697

Oleg Silkin (RobotSail) committed: fix mypy errors
Signed-off-by: Oleg Silkin <97077423+RobotSail@users.noreply.github.com>
1 parent e2b41bd commit b43f697

1 file changed: src/instructlab/eval/leaderboard.py (63 additions, 30 deletions)
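For context on the type fixes below: under mypy's default no-implicit-optional behavior, a parameter annotated with a concrete type but given a None default is rejected, which is why the signatures in this diff gain explicit t.Optional[...] wrappers and the result containers gain annotations. A minimal sketch of the pattern (the function names here are illustrative only, not part of the diff):

import typing as t

# Flagged by mypy under its default no-implicit-optional setting:
# a None default does not match the declared List[str] type.
def run_eval_bad(tasks: t.List[str] = None) -> None:
    ...

# Accepted: the Optional wrapper makes the None default explicit.
def run_eval_ok(tasks: t.Optional[t.List[str]] = None) -> None:
    ...
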
@@ -77,7 +77,6 @@ class TaskGrouping(t.TypedDict):
 
 
 def evaluate_with_vllm(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
-    os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = "true"
     results = simple_evaluate(
         tasks=args["tasks"],
         model="vllm",
@@ -93,12 +92,12 @@ def evaluate_with_vllm(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
         apply_chat_template=True,
         fewshot_as_multiturn=True,
         batch_size="auto",
+        confirm_run_unsafe_code=True,
     )
     return results
 
 
 def worker(rank, world_size, args: LeaderboardArgs, result_queue: mp.Queue):
-    os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = "true"
     os.environ["RANK"] = str(rank)
     os.environ["WORLD_SIZE"] = str(world_size)
     os.environ["LOCAL_RANK"] = str(rank)
@@ -122,17 +121,15 @@ def worker(rank, world_size, args: LeaderboardArgs, result_queue: mp.Queue):
         batch_size="auto",
         device=f"cuda:{device.index}",
         cache_requests=True,
+        confirm_run_unsafe_code=True,
     )
 
-    print(f"Rank {rank} got results: {type(results)}, putting them in the bucket")
     result_queue.put((rank, results))
-    print(f"Rank {rank} done putting results in the bucket")
 
     # clear torch memory
     gc.collect()
     torch.cuda.empty_cache()
 
-    print(f"Rank {rank} destroying process group")
     dist.destroy_process_group()
 
 
@@ -159,10 +156,8 @@ def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
 
     results = {}
     for _ in range(num_processes):
-        print(f"[master] getting results from the bucket")
         rank, result = result_queue.get()
         results[rank] = result
-        print(f"[master] got results from rank {rank}")
 
     # Wait for all processes to complete
     for p in processes:
@@ -209,7 +204,7 @@ def parse_multitask_results(
 
     The end result is an unweighted average of all the subtasks, as well as a per-subtask breakdown.
     """
-    parsed_scores = {"score": 0.0, "subtasks": {}}
+    parsed_scores: ParsedScores = {"score": 0.0, "subtasks": {}}
     subtask_scores = {}
     target_subtasks = result_dict["group_subtasks"].get(benchmark)
     if not target_subtasks:
@@ -335,7 +330,7 @@ def parse_math_hard(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     return parsed_scores
 
 
-def get_parser(subtask: str) -> t.Callable[[t.Dict, str, str], ParsedScores]:
+def get_parser(subtask: str) -> t.Callable[[t.Dict[str, t.Any]], ParsedScores]:
     parser_map = {
         LeaderboardV2Tasks.BBH.value: parse_bbh,
         LeaderboardV2Tasks.GPQA.value: parse_gpqa,
@@ -349,13 +344,45 @@ def get_parser(subtask: str) -> t.Callable[[t.Dict, str, str], ParsedScores]:
     ]  # this will either parse and map into the correct section, or error
 
 
+def build_leaderboard_v2_result(
+    parsed_scores: t.Dict[str, ParsedScores],
+) -> LeaderboardV2EvalResult:
+    """
+    Build the leaderboard v2 result from the parsed scores.
+    """
+    # now let's build the overall score
+    leaderboard_result: LeaderboardV2EvalResult = {
+        "overall_score": calculate_overall_leaderboard_score(parsed_scores),
+    }
+
+    # explicitly set the score for each subtask in order to satisfy mypy
+    if "leaderboard_bbh" in parsed_scores:
+        leaderboard_result["leaderboard_bbh"] = parsed_scores["leaderboard_bbh"]
+    if "leaderboard_gpqa" in parsed_scores:
+        leaderboard_result["leaderboard_gpqa"] = parsed_scores["leaderboard_gpqa"]
+    if "leaderboard_ifeval" in parsed_scores:
+        leaderboard_result["leaderboard_ifeval"] = parsed_scores["leaderboard_ifeval"]
+    if "leaderboard_math_hard" in parsed_scores:
+        leaderboard_result["leaderboard_math_hard"] = parsed_scores[
+            "leaderboard_math_hard"
+        ]
+    if "leaderboard_mmlu_pro" in parsed_scores:
+        leaderboard_result["leaderboard_mmlu_pro"] = parsed_scores[
+            "leaderboard_mmlu_pro"
+        ]
+    if "leaderboard_musr" in parsed_scores:
+        leaderboard_result["leaderboard_musr"] = parsed_scores["leaderboard_musr"]
+
+    return leaderboard_result
+
+
 def get_scores_from_result_dicts(
-    *result_dicts: t.List[t.Dict[str, t.Any]],
-) -> t.Dict[str, ParsedScores]:
+    *result_dicts: t.Dict[str, t.Any],
+) -> LeaderboardV2EvalResult:
     """
     Parse out the scores of all the subtasks of leaderboard and return.
     """
-    parsed_scores = {}
+    parsed_scores: t.Dict[str, ParsedScores] = {}
     for result_dict in result_dicts:
         benchmarks_we_got = set(result_dict["results"].keys())
         benchmarks_we_care_about = set(
@@ -375,7 +402,7 @@ def get_scores_from_result_dicts(
             parse_benchmark_fn = get_parser(benchmark)
             parsed_scores[benchmark] = parse_benchmark_fn(result_dict)
 
-    return parsed_scores
+    return build_leaderboard_v2_result(parsed_scores)
 
 
 def validate_output_path(output_file: str) -> None:
@@ -453,9 +480,9 @@ class LeaderboardV2Evaluator(Evaluator):
     def __init__(
         self,
         model_path: str,
-        tasks: t.List[str] = None,
-        num_gpus: int = None,
-        output_file: str = None,
+        tasks: t.Optional[t.List[str]] = None,
+        num_gpus: t.Optional[int] = None,
+        output_file: t.Optional[str] = None,
     ):
         self.model_path = model_path
         if not cuda.is_available():
@@ -469,11 +496,13 @@ def __init__(
 
         # validate output file
         self.output_file = output_file
-        self._results = None
-        self._lm_eval_results = []  # TODO: make it merge everything back into a single result
+        self._results: t.Optional[LeaderboardV2EvalResult] = None
+        self._lm_eval_results: t.List[
+            t.Dict[str, t.Any]
+        ] = []  # TODO: make it merge everything back into a single result
 
     @property
-    def results(self) -> LeaderboardV2EvalResult:
+    def results(self) -> t.Optional[LeaderboardV2EvalResult]:
         """
         Returns the results of the most reccent leaderboard evaluation.
 
@@ -492,7 +521,7 @@ def lm_eval_results(self) -> t.List[t.Dict[str, t.Any]]:
         """
         return self._lm_eval_results
 
-    def save_to_file(self, output_file: str = None):
+    def save_to_file(self, output_file: t.Optional[str] = None) -> None:
         """
         Saves the results to a file.
 
@@ -513,10 +542,10 @@ def save_to_file(self, output_file: str = None):
 
     def run(
         self,
-        model_path: str | None = None,
-        tasks: t.List[str] = None,
-        num_gpus: int = None,
-        output_file: str = None,
+        model_path: t.Optional[str] = None,
+        tasks: t.Optional[t.List[str]] = None,
+        num_gpus: t.Optional[int] = None,
+        output_file: t.Optional[str] = None,
     ) -> LeaderboardV2EvalResult:
         """
         Run the Open LLM Leaderboard v2 evaluation.
@@ -538,6 +567,9 @@ def run(
         num_gpus = self.num_gpus if not num_gpus else num_gpus
        output_file = self.output_file if not output_file else output_file
 
+        if not tasks:
+            tasks = LEADERBOARD_V2_MCQ_TASKS + LEADERBOARD_V2_GENERATIVE_TASKS
+
         # validation logic
         # no need to validate model path -- the inference libraries will either be able to
         # load it, or they won't
@@ -562,25 +594,26 @@ def run(
         self._lm_eval_results = []
         vllm_results, hf_results = None, None
         if vllm_tasks := grouped_tasks["vllm"]:
-            args: LeaderboardArgs = {
+            args_vllm: LeaderboardArgs = {
                 "model_path": model_path,
                 "num_gpus": num_gpus,
                 "tasks": vllm_tasks,
             }
-            vllm_results = evaluate_with_vllm(args)
+            vllm_results = evaluate_with_vllm(args_vllm)
             self._lm_eval_results.append(vllm_results)
         if hf_tasks := grouped_tasks["huggingface"]:
-            args: LeaderboardArgs = {
+            args_hf: LeaderboardArgs = {
                 "model_path": model_path,
                 "num_gpus": num_gpus,
                 "tasks": hf_tasks,
             }
-            hf_results = evaluate_with_hf(args)
+            hf_results = evaluate_with_hf(args_hf)
             self._lm_eval_results.append(hf_results)
 
         # convert the output of lm-eval into something that's already parsed
-        results = get_scores_from_result_dicts(*self._lm_eval_results)
-        results["overall_score"] = calculate_overall_leaderboard_score(results)
+        results: LeaderboardV2EvalResult = get_scores_from_result_dicts(
+            *self._lm_eval_results
+        )
 
         self._results = results
         if output_file:
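
For reference, a minimal usage sketch of the evaluator after this change. The class name, constructor parameters, default-task fallback, and result keys come from the diff above; the import path is inferred from the file location, and the model path, GPU count, and output file are placeholders:

# Hypothetical usage of LeaderboardV2Evaluator with the updated Optional-typed API.
# Assumes the class is importable from this module path and CUDA devices are available.
from instructlab.eval.leaderboard import LeaderboardV2Evaluator

evaluator = LeaderboardV2Evaluator(
    model_path="/path/to/model",   # placeholder model path
    tasks=None,                    # with this commit, run() falls back to MCQ + generative tasks
    num_gpus=8,                    # placeholder GPU count
    output_file="leaderboard_v2_results.json",
)

result = evaluator.run()           # returns a LeaderboardV2EvalResult TypedDict
print(result["overall_score"])
evaluator.save_to_file()           # writes to the output_file given above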
