forked from stanford-crfm/helm
Commit
Merge branch 'main' of https://github.com/stanford-crfm/benchmarking into queriesjsonl
Showing 3 changed files with 133 additions and 1 deletion.
@@ -0,0 +1,122 @@
import argparse
import os
import shutil
import subprocess
from typing import List, Optional

from common.hierarchical_logger import hlog, htrack, htrack_block

"""
Verifies that the Scenario construction and generation of prompts are reproducible:
1. Performs dryrun
2. Performs dryrun again.
3. Compares the requests in the scenario_state.json files of the two dryrun output folders.
Usage:
python3 scripts/verify_reproducibility.py --models-to-run openai/davinci openai/code-cushman-001 together/gpt-neox-20b
"""
DRYRUN_SUITE1: str = "dryrun_results1"
DRYRUN_SUITE2: str = "dryrun_results2"


@htrack("Performing dryrun")
def do_dry_run(
    dryrun_suite: str, conf_path: str, max_eval_instances: int, priority: int, models: Optional[List[str]]
) -> str:
    """Performs dry run. Blocks until the run finishes."""
    output_path: str = f"benchmark_output/runs/{dryrun_suite}"
    shutil.rmtree(output_path, ignore_errors=True)
    hlog(f"Deleted old results at path: {output_path}.")

    command: List[str] = [
        "benchmark-present",
        f"--suite={dryrun_suite}",
        f"--conf-path={conf_path}",
        f"--max-eval-instances={max_eval_instances}",
        "--local",
        "--dry-run",
        f"--priority={priority}",
    ]
    if models:
        command.append("--models-to-run")
        command.extend(models)

    hlog(" ".join(command))
    subprocess.call(command)
    hlog(f"Results are written out to path: {output_path}.")
    return output_path


@htrack("Verifying reproducibility")
def verify_reproducibility(conf_path: str, max_eval_instances: int, priority: int, models: Optional[List[str]]):
    dryrun_path1: str = do_dry_run(DRYRUN_SUITE1, conf_path, max_eval_instances, priority, models)
    dryrun_path2: str = do_dry_run(DRYRUN_SUITE2, conf_path, max_eval_instances, priority, models)

    hlog(f"Comparing results in {dryrun_path1} vs. {dryrun_path2}")
    for run_dir in os.listdir(dryrun_path1):
        run_path1: str = os.path.join(dryrun_path1, run_dir)

        if not os.path.isdir(run_path1):
            continue

        scenario_state_path1: str = os.path.join(run_path1, "scenario_state.json")
        if not os.path.isfile(scenario_state_path1):
            continue

        run_path2: str = os.path.join(dryrun_path2, run_dir)
        scenario_state_path2: str = os.path.join(run_path2, "scenario_state.json")

        with htrack_block(f"Comparing `ScenarioState`s for {run_dir}"):
            with open(scenario_state_path1) as f:
                scenario_state1 = f.readlines()

            with open(scenario_state_path2) as f:
                scenario_state2 = f.readlines()

            same: bool = True
            # Check the difference between two scenario_state.json files
            for i, (line1, line2) in enumerate(zip(scenario_state1, scenario_state2)):
                if line1 != line2:
                    line_number: int = i + 1
                    same = False
                    hlog(
                        "ERROR: Not reproducible - content of "
                        f"{scenario_state_path1} and {scenario_state_path2} are different. "
                        f"Line {line_number}:"
                    )
                    hlog(f"--- scenario_state.json (1): {line1}")
                    hlog(f"+++ scenario_state.json (2): {line2}")
                    break

            if same:
                hlog("Verified reproducible.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--max-eval-instances", type=int, default=1000, help="Maximum number of eval instances.",
    )
    parser.add_argument(
        "-c",
        "--conf-path",
        type=str,
        help="Where to read RunSpecs to run from",
        default="src/benchmark/presentation/run_specs.conf",
    )
    parser.add_argument(
        "--models-to-run",
        nargs="+",
        help="Only RunSpecs with these models specified. If no model is specified, run with all models.",
        default=None,
    )
    parser.add_argument(
        "--priority", type=int, default=2, help="Run RunSpecs with priority less than or equal to this number."
    )
    args = parser.parse_args()

    verify_reproducibility(args.conf_path, args.max_eval_instances, args.priority, args.models_to_run)
    hlog("Done.")