
Commit

…into queriesjsonl
teetone committed Jul 22, 2022
2 parents 6dc0e9b + 75a4495 commit b09cfde
Showing 3 changed files with 133 additions and 1 deletion.
10 changes: 10 additions & 0 deletions README.md
@@ -351,6 +351,16 @@ to estimate the token usage. The tokenizer will be downloaded and cached when ru
1. Run `venv/bin/benchmark-present --output-path src/proxy/static/benchmark_output`.
1. Visit the [benchmarking status page](https://crfm-models.stanford.edu/static/benchmarking.html).

### To verify that the Scenario construction and generation of prompts are reproducible

1. `ssh scdt`.
1. `cd /u/scr/nlp/crfm/benchmarking/benchmarking`.
1. Create a screen session: `screen -S reproducible`.
1. `conda activate crfm_benchmarking`.
1. Run `python3 scripts/verify_reproducibility.py --models-to-run openai/davinci openai/code-cushman-001 together/gpt-neox-20b
--conf-path src/benchmark/presentation/run_specs.conf --max-eval-instances 1000 --priority 2 &> reproducible.log`.
1. Check the result at `reproducible.log`.
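
The flags in the command above map directly onto the parameters of `verify_reproducibility` in `scripts/verify_reproducibility.py` (added in this commit, shown below). As a minimal sketch, the same check can be driven from Python instead of the shell; the bare import below is an assumption and requires both `scripts/` and the project's `src/` directory to be importable:

```python
# Sketch only: the repository runs this file as `python3 scripts/verify_reproducibility.py`;
# importing it directly assumes scripts/ (and src/, for its own imports) are on PYTHONPATH.
from verify_reproducibility import verify_reproducibility

verify_reproducibility(
    conf_path="src/benchmark/presentation/run_specs.conf",
    max_eval_instances=1000,
    priority=2,
    models=["openai/davinci", "openai/code-cushman-001", "together/gpt-neox-20b"],
)
```
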

# Contributing

## One-time setup
122 changes: 122 additions & 0 deletions scripts/verify_reproducibility.py
@@ -0,0 +1,122 @@
import argparse
import os
import shutil
import subprocess
from typing import List, Optional

from common.hierarchical_logger import hlog, htrack, htrack_block

"""
Verifies that the Scenario construction and generation of prompts are reproducible:
1. Performs a dry run.
2. Performs the dry run again.
3. Compares the requests in the scenario_state.json files of the two dryrun output folders.
Usage:
python3 scripts/verify_reproducibility.py --models-to-run openai/davinci openai/code-cushman-001 together/gpt-neox-20b
"""
DRYRUN_SUITE1: str = "dryrun_results1"
DRYRUN_SUITE2: str = "dryrun_results2"


@htrack("Performing dryrun")
def do_dry_run(
dryrun_suite: str, conf_path: str, max_eval_instances: int, priority: int, models: Optional[List[str]]
) -> str:
"""Performs dry run. Blocks until the run finishes."""
output_path: str = f"benchmark_output/runs/{dryrun_suite}"
shutil.rmtree(output_path, ignore_errors=True)
hlog(f"Deleted old results at path: {output_path}.")

command: List[str] = [
"benchmark-present",
f"--suite={dryrun_suite}",
f"--conf-path={conf_path}",
f"--max-eval-instances={max_eval_instances}",
"--local",
"--dry-run",
f"--priority={priority}",
]
if models:
command.append("--models-to-run")
command.extend(models)

hlog(" ".join(command))
subprocess.call(command)
hlog(f"Results are written out to path: {output_path}.")
return output_path


@htrack("Verifying reproducibility")
def verify_reproducibility(conf_path: str, max_eval_instances: int, priority: int, models: Optional[List[str]]):
dryrun_path1: str = do_dry_run(DRYRUN_SUITE1, conf_path, max_eval_instances, priority, models)
dryrun_path2: str = do_dry_run(DRYRUN_SUITE2, conf_path, max_eval_instances, priority, models)

hlog(f"Comparing results in {dryrun_path1} vs. {dryrun_path2}")
for run_dir in os.listdir(dryrun_path1):
run_path1: str = os.path.join(dryrun_path1, run_dir)

if not os.path.isdir(run_path1):
continue

scenario_state_path1: str = os.path.join(run_path1, "scenario_state.json")
if not os.path.isfile(scenario_state_path1):
continue

run_path2: str = os.path.join(dryrun_path2, run_dir)
scenario_state_path2: str = os.path.join(run_path2, "scenario_state.json")

with htrack_block(f"Comparing `ScenarioState`s for {run_dir}"):
with open(scenario_state_path1) as f:
scenario_state1 = f.readlines()

with open(scenario_state_path2) as f:
scenario_state2 = f.readlines()

same: bool = True
# Check the difference between two scenario_state.json files
for i, (line1, line2) in enumerate(zip(scenario_state1, scenario_state2)):
if line1 != line2:
line_number: int = i + 1
same = False
hlog(
"ERROR: Not reproducible - content of "
f"{scenario_state_path1} and {scenario_state_path2} are different. "
f"Line {line_number}:"
)
hlog(f"--- scenario_state.json (1): {line1}")
hlog(f"+++ scenario_state.json (2): {line2}")
break

if same:
hlog("Verified reproducible.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--max-eval-instances", type=int, default=1000, help="Maximum number of eval instances.",
    )
    parser.add_argument(
        "-c",
        "--conf-path",
        type=str,
        help="Where to read RunSpecs to run from",
        default="src/benchmark/presentation/run_specs.conf",
    )
    parser.add_argument(
        "--models-to-run",
        nargs="+",
        help="Only RunSpecs with these models specified. If no model is specified, run with all models.",
        default=None,
    )
    parser.add_argument(
        "--priority", type=int, default=2, help="Run RunSpecs with priority less than or equal to this number."
    )
    args = parser.parse_args()

    verify_reproducibility(args.conf_path, args.max_eval_instances, args.priority, args.models_to_run)
    hlog("Done.")
2 changes: 1 addition & 1 deletion src/benchmark/presentation/present.py
@@ -353,7 +353,7 @@ def main():
     parser.add_argument(
         "--models-to-run",
         nargs="+",
-        help="Only RunSpecs with these models specified. If no model is specified, run everything.",
+        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
         default=None,
     )
     parser.add_argument(
