A command to run all models for a benchmark (mlcommons#318)
* implement benchmark run all

* add a util to cleanup a specific path

* finalize benchmark run all

* modify tests of the changed cleanup util

* fix linting error

* change BenchmarkExecution to run multiple models

* adapt other commands to the changes

* create execution class

* modify compatibility test

* modify benchmark execution

* use dataset registered uid for result generated uid

* add config.test_storage

* fix minor bugs

* fix how to read models list file

* fix typer hint types, consistent no-cache name

* minor bugs, add integration tests

* add integration tests for partial result submission

* fix tests

* abort submission for existing registered results

* remove unused config variable

* create tests for benchmarkexecution

* add note, remove redundant cleanup

* add test for execution command

* delete failing compatibilityTest unittests
This is going to be refactored very soon; unit tests will be re-written.

* add test for execution to check cube.run

* fix integration tests error introduced by merging main

* rename ignore_errors to ignore_model_errors

* use `state_variables` instead of `system_inputs`

* fix typos in cli integration tests error messages

* more clear error msg for unassociated models

* fixes: use pydantic mocks, replace uid by id

* additional changes due to pydantic

* additional fixes for pydantic

* use consistently integer type for server IDs

* use uids as integers in all tests

---------

Co-authored-by: Alejandro Aristizábal <[email protected]>
hasan7n and aristizabal95 authored Feb 9, 2023
1 parent c1f01c0 commit 73b5ffc
Showing 45 changed files with 1,199 additions and 955 deletions.
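For orientation before the diffs: the headline change is a new "medperf benchmark run" subcommand that executes all models associated with a benchmark against a prepared dataset, plus the renamed --ignore-model-errors and --no-cache flags. A minimal usage sketch follows; the UIDs are illustrative placeholders, not values taken from this commit.

# Run every outstanding associated model for benchmark 1 on dataset 1;
# failed model cubes are tolerated and a summary is printed at the end.
medperf benchmark run -b 1 -d 1

# Run a single model, ignoring any cached result and tolerating a failing
# model cube so that a partial result can still be submitted.
medperf run -b 1 -d 1 -m 2 -y --no-cache --ignore-model-errors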
69 changes: 58 additions & 11 deletions cli/cli_tests.sh
@@ -52,7 +52,7 @@ fi
 ##########################################################
 ########################## Setup #########################
 ##########################################################
-ASSETS_URL="https://raw.githubusercontent.com/hasan7n/mockcube/ebecacaf22689ec9ea0d20826c805a206e2c63e0"
+ASSETS_URL="https://raw.githubusercontent.com/hasan7n/mockcube/451e95e3bd62a7112ffd3336302d4cddd895d0cb"

 # datasets
 DSET_A_URL="$ASSETS_URL/assets/datasets/dataset_a.tar.gz"
@@ -65,6 +65,7 @@ PREP_MLCUBE="$ASSETS_URL/prep/mlcube/mlcube.yaml"
 PREP_PARAMS="$ASSETS_URL/prep/mlcube/workspace/parameters.yaml"

 # model cubes
+FAILING_MODEL_MLCUBE="$ASSETS_URL/model-bug/mlcube/mlcube.yaml" # doesn't fail with association
 MODEL_MLCUBE="$ASSETS_URL/model-cpu/mlcube/mlcube.yaml"
 MODEL_ADD="$ASSETS_URL/assets/weights/weights1.tar.gz"
 MODEL1_PARAMS="$ASSETS_URL/model-cpu/mlcube/workspace/parameters1.yaml"
@@ -149,9 +150,9 @@ medperf mlcube submit --name model3 -m $MODEL_MLCUBE -p $MODEL3_PARAMS -a $MODEL_ADD
 checkFailed "Model3 submission failed"
 MODEL3_UID=$(medperf mlcube ls | tail -n 1 | tr -s ' ' | cut -d ' ' -f 2)

-medperf mlcube submit --name model4 -m $MODEL_MLCUBE -p $MODEL4_PARAMS -a $MODEL_ADD
-checkFailed "Model4 submission failed"
-MODEL4_UID=$(medperf mlcube ls | tail -n 1 | tr -s ' ' | cut -d ' ' -f 2)
+medperf mlcube submit --name model-fail -m $FAILING_MODEL_MLCUBE -p $MODEL4_PARAMS -a $MODEL_ADD
+checkFailed "failing model submission failed"
+FAILING_MODEL_UID=$(medperf mlcube ls | tail -n 1 | tr -s ' ' | cut -d ' ' -f 2)

 medperf mlcube submit --name metrics -m $METRIC_MLCUBE -p $METRIC_PARAMS
 checkFailed "Metrics submission failed"
@@ -269,10 +270,10 @@ echo "\n"

 ##########################################################
 echo "====================================="
-echo "Running model4 association"
+echo "Running failing model association"
 echo "====================================="
-medperf mlcube associate -m $MODEL4_UID -b $BMK_UID -y
-checkFailed "Model4 association failed"
+medperf mlcube associate -m $FAILING_MODEL_UID -b $BMK_UID -y
+checkFailed "Failing model association failed"
 ##########################################################

 echo "\n"
@@ -282,7 +283,31 @@ echo "====================================="
 echo "Changing priority of model2"
 echo "====================================="
 medperf association set_priority -b $BMK_UID -m $MODEL2_UID -p 77
-checkFailed "Priority set failed"
+checkFailed "Priority set of model2 failed"
 ##########################################################

 echo "\n"
+
+##########################################################
+echo "====================================="
+echo "Login with modelowner"
+echo "====================================="
+medperf login --username=$MODELOWNER --password=test
+checkFailed "modelowner login failed"
+##########################################################
+
+echo "\n"
+
+##########################################################
+echo "====================================="
+echo "Approve model2,3,F, associations"
+echo "====================================="
+medperf association approve -b $BMK_UID -m $MODEL2_UID
+checkFailed "Model2 association approval failed"
+medperf association approve -b $BMK_UID -m $MODEL3_UID
+checkFailed "Model3 association approval failed"
+medperf association approve -b $BMK_UID -m $FAILING_MODEL_UID
+checkFailed "failing model association approval failed"
+##########################################################
+
+echo "\n"
@@ -299,12 +324,34 @@ echo "\n"

 ##########################################################
 echo "====================================="
-echo "Running model1"
+echo "Running model2"
 echo "====================================="
-medperf run -b $BMK_UID -d $DSET_A_UID -m $MODEL1_UID -y
-checkFailed "Model1 run failed"
+medperf run -b $BMK_UID -d $DSET_A_UID -m $MODEL2_UID -y
+checkFailed "Model2 run failed"
 ##########################################################
+
+echo "\n"
+
+##########################################################
+echo "====================================="
+echo "Running outstanding models"
+echo "====================================="
+medperf benchmark run -b $BMK_UID -d $DSET_A_UID
+checkFailed "run all outstanding models failed"
+##########################################################
+
+echo "\n"
+
+##########################################################
+echo "====================================="
+echo "Run failing cube with ignore errors"
+echo "====================================="
+medperf run -b $BMK_UID -d $DSET_A_UID -m $FAILING_MODEL_UID -y --ignore-model-errors
+checkFailed "Failing mlcube run with ignore errors failed"
+##########################################################
+
+echo "\n"

 ##########################################################
 echo "====================================="
 echo "Delete mocktest profile"
40 changes: 28 additions & 12 deletions cli/medperf/__main__.py
@@ -71,25 +71,41 @@ def execute(
     benchmark_uid: int = typer.Option(
         ..., "--benchmark", "-b", help="UID of the desired benchmark"
     ),
-    data_uid: str = typer.Option(
+    data_uid: int = typer.Option(
         ..., "--data_uid", "-d", help="Registered Dataset UID"
     ),
     model_uid: int = typer.Option(
         ..., "--model_uid", "-m", help="UID of model to execute"
     ),
     approval: bool = typer.Option(False, "-y", help="Skip approval step"),
-    ignore_errors: bool = typer.Option(
+    ignore_model_errors: bool = typer.Option(
+        False,
+        "--ignore-model-errors",
+        help="Ignore failing model cubes, allowing for possibly submitting partial results",
+    ),
+    no_cache: bool = typer.Option(
         False,
-        "--ignore-errors",
-        help="Ignore failing cubes, allowing for submitting partial results",
+        "--no-cache",
+        help="Ignore existing results. The experiment then will be rerun",
     ),
 ):
     """Runs the benchmark execution step for a given benchmark, prepared dataset and model
     """
-    result_uid = BenchmarkExecution.run(
-        benchmark_uid, data_uid, model_uid, ignore_errors=ignore_errors
-    )
-    ResultSubmission.run(result_uid, approved=approval)
+    result = BenchmarkExecution.run(
+        benchmark_uid,
+        data_uid,
+        [model_uid],
+        ignore_model_errors=ignore_model_errors,
+        no_cache=no_cache,
+    )[0]
+    if result.id:  # TODO: use result.is_registered once PR #338 is merged
+        config.ui.print(  # TODO: msg should be colored yellow
+            """An existing registered result for the requested execution has been\n
+            found. If you wish to submit a new result for the same execution,\n
+            please run the command again with the --no-cache option.\n"""
+        )
+    else:
+        ResultSubmission.run(result.generated_uid, approved=approval)
     config.ui.print("✅ Done!")


@@ -106,7 +122,7 @@ def test(
         None,
         "--data_uid",
         "-d",
-        help="Registered Dataset UID. Used for dataset testing. Optional. Defaults to benchmark demo dataset.",
+        help="Prepared Dataset UID. Used for dataset testing. Optional. Defaults to benchmark demo dataset.",
     ),
     data_prep: str = typer.Option(
         None,
@@ -126,16 +142,16 @@ def test(
         "-e",
         help="UID or local path to the evaluator mlcube. Optional. Defaults to benchmark evaluator mlcube",
     ),
-    force_test: bool = typer.Option(
-        False, "--force-test", help="Execute the test even if results already exist",
+    no_cache: bool = typer.Option(
+        False, "--no-cache", help="Execute the test even if results already exist",
     ),
 ):
     """
     Executes a compatibility test for a determined benchmark.
     Can test prepared datasets, remote and local models independently.
     """
     CompatibilityTestExecution.run(
-        benchmark_uid, data_uid, data_prep, model, evaluator, force_test=force_test,
+        benchmark_uid, data_uid, data_prep, model, evaluator, no_cache=no_cache,
     )
     config.ui.print("✅ Done!")
     cleanup()
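A note on the execute() change above: when a registered result already exists for the requested benchmark/dataset/model combination, submission is aborted and the user is told to rerun with --no-cache. A sketch of the intended flow, again with placeholder UIDs:

# The first invocation executes the model and submits the result.
medperf run -b 1 -d 1 -m 2 -y

# Repeating the same execution finds the registered result and prints the
# warning shown above instead of resubmitting; --no-cache forces a fresh run.
medperf run -b 1 -d 1 -m 2 -y --no-cache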
12 changes: 6 additions & 6 deletions cli/medperf/commands/association/approval.py
@@ -5,20 +5,20 @@
 class Approval:
     @staticmethod
     def run(
-        benchmark_uid: str,
+        benchmark_uid: int,
         approval_status: str,
-        dataset_uid: str = None,
-        mlcube_uid: str = None,
+        dataset_uid: int = None,
+        mlcube_uid: int = None,
     ):
         """Sets approval status for an association between a benchmark and a dataset or mlcube

         Args:
-            benchmark_uid (str): Benchmark UID.
+            benchmark_uid (int): Benchmark UID.
             approval_status (str): Desired approval status to set for the association.
             comms (Comms): Instance of Comms interface.
             ui (UI): Instance of UI interface.
-            dataset_uid (str, optional): Dataset UID. Defaults to None.
-            mlcube_uid (str, optional): MLCube UID. Defaults to None.
+            dataset_uid (int, optional): Dataset UID. Defaults to None.
+            mlcube_uid (int, optional): MLCube UID. Defaults to None.
         """
         comms = config.comms
         too_many_resources = dataset_uid and mlcube_uid
6 changes: 3 additions & 3 deletions cli/medperf/commands/association/priority.py
@@ -5,13 +5,13 @@
 class AssociationPriority:
     @staticmethod
     def run(
-        benchmark_uid: str, mlcube_uid: str, priority: int,
+        benchmark_uid: int, mlcube_uid: int, priority: int,
     ):
         """Sets priority for an association between a benchmark and an mlcube

         Args:
-            benchmark_uid (str): Benchmark UID.
-            mlcube_uid (str): MLCube UID.
+            benchmark_uid (int): Benchmark UID.
+            mlcube_uid (int): MLCube UID.
             priority (int): priority value
         """
18 changes: 9 additions & 9 deletions cli/medperf/commands/benchmark/associate.py
@@ -7,18 +7,18 @@ class AssociateBenchmark:
     @classmethod
     def run(
         cls,
-        benchmark_uid: str,
-        model_uid: str,
-        data_uid: str,
+        benchmark_uid: int,
+        model_uid: int,
+        data_uid: int,
         approved=False,
-        force_test=False,
+        no_cache=False,
     ):
         """Associates a dataset or model to the given benchmark

         Args:
-            benchmark_uid (str): UID of benchmark to associate entities with
-            model_uid (str): UID of model to associate with benchmark
-            data_uid (str): UID of dataset to associate with benchmark
+            benchmark_uid (int): UID of benchmark to associate entities with
+            model_uid (int): UID of model to associate with benchmark
+            data_uid (int): UID of dataset to associate with benchmark
             comms (Comms): Instance of Communications interface
             ui (UI): Instance of UI interface
             approved (bool): Skip approval step. Defaults to False
@@ -29,10 +29,10 @@ def run(
             raise InvalidArgumentError("Must provide either a dataset or mlcube")
         if model_uid is not None:
             AssociateCube.run(
-                model_uid, benchmark_uid, approved=approved, force_test=force_test
+                model_uid, benchmark_uid, approved=approved, no_cache=no_cache
             )

         if data_uid is not None:
             AssociateDataset.run(
-                data_uid, benchmark_uid, approved=approved, force_test=force_test
+                data_uid, benchmark_uid, approved=approved, no_cache=no_cache
             )
60 changes: 51 additions & 9 deletions cli/medperf/commands/benchmark/benchmark.py
@@ -6,6 +6,7 @@
 from medperf.commands.benchmark.list import BenchmarksList
 from medperf.commands.benchmark.submit import SubmitBenchmark
 from medperf.commands.benchmark.associate import AssociateBenchmark
+from medperf.commands.result.create import BenchmarkExecution

 app = typer.Typer()

@@ -38,13 +39,13 @@ def submit(
     demo_hash: str = typer.Option(
         "", "--demo-hash", help="SHA1 of demonstration dataset tarball file"
     ),
-    data_preparation_mlcube: str = typer.Option(
+    data_preparation_mlcube: int = typer.Option(
         ..., "--data-preparation-mlcube", "-p", help="Data Preparation MLCube UID"
     ),
-    reference_model_mlcube: str = typer.Option(
+    reference_model_mlcube: int = typer.Option(
         ..., "--reference-model-mlcube", "-m", help="Reference Model MLCube UID"
     ),
-    evaluator_mlcube: str = typer.Option(
+    evaluator_mlcube: int = typer.Option(
         ..., "--evaluator-mlcube", "-e", help="Evaluator MLCube UID"
     ),
 ):
@@ -67,23 +68,64 @@ def submit(
 @app.command("associate")
 @clean_except
 def associate(
-    benchmark_uid: str = typer.Option(
+    benchmark_uid: int = typer.Option(
         ..., "--benchmark_uid", "-b", help="UID of benchmark to associate with"
     ),
-    model_uid: str = typer.Option(
+    model_uid: int = typer.Option(
         None, "--model_uid", "-m", help="UID of model MLCube to associate"
     ),
-    dataset_uid: str = typer.Option(
+    dataset_uid: int = typer.Option(
         None, "--data_uid", "-d", help="Server UID of registered dataset to associate"
     ),
     approval: bool = typer.Option(False, "-y", help="Skip approval step"),
-    force_test: bool = typer.Option(
-        False, "--force-test", help="Execute the test even if results already exist",
+    no_cache: bool = typer.Option(
+        False, "--no-cache", help="Execute the test even if results already exist",
     ),
 ):
     """Associates a benchmark with a given mlcube or dataset. Only one option at a time.
     """
     AssociateBenchmark.run(
-        benchmark_uid, model_uid, dataset_uid, approved=approval, force_test=force_test
+        benchmark_uid, model_uid, dataset_uid, approved=approval, no_cache=no_cache
     )
     config.ui.print("✅ Done!")
+
+
+@app.command("run")
+@clean_except
+def run(
+    benchmark_uid: int = typer.Option(
+        ..., "--benchmark", "-b", help="UID of the desired benchmark"
+    ),
+    data_uid: int = typer.Option(
+        ..., "--data_uid", "-d", help="Registered Dataset UID"
+    ),
+    file: str = typer.Option(
+        None,
+        "--models-from-file",
+        "-f",
+        help="""A file containing the model UIDs to be executed.\n
+        The file should contain a single line as a list of\n
+        comma-separated integers corresponding to the model UIDs""",
+    ),
+    ignore_model_errors: bool = typer.Option(
+        False,
+        "--ignore-model-errors",
+        help="Ignore failing model cubes, allowing for possibly submitting partial results",
+    ),
+    no_cache: bool = typer.Option(
+        False, "--no-cache", help="Execute even if results already exist",
+    ),
+):
+    """Runs the benchmark execution step for a given benchmark, prepared dataset and model
+    """
+    BenchmarkExecution.run(
+        benchmark_uid,
+        data_uid,
+        models_uids=None,
+        no_cache=no_cache,
+        models_input_file=file,
+        ignore_model_errors=ignore_model_errors,
+        show_summary=True,
+        ignore_failed_experiments=True,
+    )
+    config.ui.print("✅ Done!")
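The --models-from-file help text above defines the expected file format: a single line of comma-separated integer model UIDs. A sketch with a hypothetical file name and UIDs:

# models.txt holds one line of comma-separated model UIDs, e.g. 2,3,4
echo "2,3,4" > models.txt

# Execute only those models for benchmark 1 on dataset 1.
medperf benchmark run -b 1 -d 1 --models-from-file models.txt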