forked from stanford-crfm/helm
Commit
Merge branch 'main' of https://github.com/stanford-crfm/benchmarking into queriesjsonl
Showing 3 changed files with 133 additions and 1 deletion.
@@ -0,0 +1,122 @@
import argparse
import os
import shutil
import subprocess
from typing import List, Optional

from common.hierarchical_logger import hlog, htrack, htrack_block

"""
Verifies that the Scenario construction and generation of prompts are reproducible:
1. Performs dryrun
2. Performs dryrun again.
3. Compares the requests in the scenario_state.json files of the two dryrun output folders.
Usage:
python3 scripts/verify_reproducibility.py --models-to-run openai/davinci openai/code-cushman-001 together/gpt-neox-20b
"""
DRYRUN_SUITE1: str = "dryrun_results1"
DRYRUN_SUITE2: str = "dryrun_results2"


@htrack("Performing dryrun")
def do_dry_run(
    dryrun_suite: str, conf_path: str, max_eval_instances: int, priority: int, models: Optional[List[str]]
) -> str:
    """Performs dry run. Blocks until the run finishes."""
    output_path: str = f"benchmark_output/runs/{dryrun_suite}"
    shutil.rmtree(output_path, ignore_errors=True)
    hlog(f"Deleted old results at path: {output_path}.")

    command: List[str] = [
        "benchmark-present",
        f"--suite={dryrun_suite}",
        f"--conf-path={conf_path}",
        f"--max-eval-instances={max_eval_instances}",
        "--local",
        "--dry-run",
        f"--priority={priority}",
    ]
    if models:
        command.append("--models-to-run")
        command.extend(models)

    hlog(" ".join(command))
    subprocess.call(command)
    hlog(f"Results are written out to path: {output_path}.")
    return output_path


@htrack("Verifying reproducibility")
def verify_reproducibility(conf_path: str, max_eval_instances: int, priority: int, models: Optional[List[str]]):
    dryrun_path1: str = do_dry_run(DRYRUN_SUITE1, conf_path, max_eval_instances, priority, models)
    dryrun_path2: str = do_dry_run(DRYRUN_SUITE2, conf_path, max_eval_instances, priority, models)

    hlog(f"Comparing results in {dryrun_path1} vs. {dryrun_path2}")
    for run_dir in os.listdir(dryrun_path1):
        run_path1: str = os.path.join(dryrun_path1, run_dir)

        if not os.path.isdir(run_path1):
            continue

        scenario_state_path1: str = os.path.join(run_path1, "scenario_state.json")
        if not os.path.isfile(scenario_state_path1):
            continue

        run_path2: str = os.path.join(dryrun_path2, run_dir)
        scenario_state_path2: str = os.path.join(run_path2, "scenario_state.json")

        with htrack_block(f"Comparing `ScenarioState`s for {run_dir}"):
            with open(scenario_state_path1) as f:
                scenario_state1 = f.readlines()

            with open(scenario_state_path2) as f:
                scenario_state2 = f.readlines()

            same: bool = True
            # Check the difference between two scenario_state.json files
            for i, (line1, line2) in enumerate(zip(scenario_state1, scenario_state2)):
                if line1 != line2:
                    line_number: int = i + 1
                    same = False
                    hlog(
                        "ERROR: Not reproducible - content of "
                        f"{scenario_state_path1} and {scenario_state_path2} are different. "
                        f"Line {line_number}:"
                    )
                    hlog(f"--- scenario_state.json (1): {line1}")
                    hlog(f"+++ scenario_state.json (2): {line2}")
                    break

            if same:
                hlog("Verified reproducible.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--max-eval-instances", type=int, default=1000, help="Maximum number of eval instances.",
    )
    parser.add_argument(
        "-c",
        "--conf-path",
        type=str,
        help="Where to read RunSpecs to run from",
        default="src/benchmark/presentation/run_specs.conf",
    )
    parser.add_argument(
        "--models-to-run",
        nargs="+",
        help="Only RunSpecs with these models specified. If no model is specified, run with all models.",
        default=None,
    )
    parser.add_argument(
        "--priority", type=int, default=2, help="Run RunSpecs with priority less than or equal to this number."
    )
    args = parser.parse_args()

    verify_reproducibility(args.conf_path, args.max_eval_instances, args.priority, args.models_to_run)
    hlog("Done.")