A command to run all models for a benchmark (mlcommons#318)
* implement benchmark run all

* add a util to cleanup a specific path

* finalize benchmark run all

* modify tests of the changed cleanup util

* fix linting error

* change BenchmarkExecution to run multiple models

* adapt other commands to the changes

* create execution class

* modify compatibility test

* modify benchmark execution

* use dataset registered uid for result generated uid

* add config.test_storage

* fix minor bugs

* fix how to read models list file

* fix typer hint types, consistent no-cache name

* minor bugs, add integration tests

* add integration tests for partial result submission

* fix tests

* abort submission for existing registered results

* remove unused config variable

* create tests for benchmarkexecution

* add note, remove redundant cleanup

* add test for execution command

* delete failing compatibilityTest unittests
This is going to be refactored very soon; unit tests will be re-written.

* add test for execution to check cube.run

* fix integration tests error introduced by merging main

* rename ignore_errors to ignore_model_errors

* use `state_variables` instead of `system_inputs`

* fix typos in cli integration tests error messages

* more clear error msg for unassociated models

* fixes: use pydantic mocks, replace uid by id

* additional changes due to pydantic

* additional fixes for pydantic

* use consistently integer type for server IDs

* use uids as integers in all tests

---------

Co-authored-by: Alejandro Aristizábal <[email protected]>
hasan7n and aristizabal95 authored Feb 9, 2023
1 parent c1f01c0 commit 73b5ffc
Showing 45 changed files with 1,199 additions and 955 deletions.
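For orientation before the diffs: the headline change is a new "medperf benchmark run" subcommand that executes all models associated with a benchmark against a prepared dataset, plus the renamed --ignore-model-errors and --no-cache flags. A minimal usage sketch follows; the UIDs are illustrative placeholders, not values taken from this commit.

# Run every outstanding associated model for benchmark 1 on dataset 1;
# failed model cubes are tolerated and a summary is printed at the end.
medperf benchmark run -b 1 -d 1

# Run a single model, ignoring any cached result and tolerating a failing
# model cube so that a partial result can still be submitted.
medperf run -b 1 -d 1 -m 2 -y --no-cache --ignore-model-errors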
69 changes: 58 additions & 11 deletions cli/cli_tests.sh
@@ -52,7 +52,7 @@ fi
 ##########################################################
 ########################## Setup #########################
 ##########################################################
-ASSETS_URL="https://raw.githubusercontent.com/hasan7n/mockcube/ebecacaf22689ec9ea0d20826c805a206e2c63e0"
+ASSETS_URL="https://raw.githubusercontent.com/hasan7n/mockcube/451e95e3bd62a7112ffd3336302d4cddd895d0cb"

 # datasets
 DSET_A_URL="$ASSETS_URL/assets/datasets/dataset_a.tar.gz"
@@ -65,6 +65,7 @@ PREP_MLCUBE="$ASSETS_URL/prep/mlcube/mlcube.yaml"
 PREP_PARAMS="$ASSETS_URL/prep/mlcube/workspace/parameters.yaml"

 # model cubes
+FAILING_MODEL_MLCUBE="$ASSETS_URL/model-bug/mlcube/mlcube.yaml" # doesn't fail with association
 MODEL_MLCUBE="$ASSETS_URL/model-cpu/mlcube/mlcube.yaml"
 MODEL_ADD="$ASSETS_URL/assets/weights/weights1.tar.gz"
 MODEL1_PARAMS="$ASSETS_URL/model-cpu/mlcube/workspace/parameters1.yaml"
@@ -149,9 +150,9 @@ medperf mlcube submit --name model3 -m $MODEL_MLCUBE -p $MODEL3_PARAMS -a $MODEL_ADD
 checkFailed "Model3 submission failed"
 MODEL3_UID=$(medperf mlcube ls | tail -n 1 | tr -s ' ' | cut -d ' ' -f 2)

-medperf mlcube submit --name model4 -m $MODEL_MLCUBE -p $MODEL4_PARAMS -a $MODEL_ADD
-checkFailed "Model4 submission failed"
-MODEL4_UID=$(medperf mlcube ls | tail -n 1 | tr -s ' ' | cut -d ' ' -f 2)
+medperf mlcube submit --name model-fail -m $FAILING_MODEL_MLCUBE -p $MODEL4_PARAMS -a $MODEL_ADD
+checkFailed "failing model submission failed"
+FAILING_MODEL_UID=$(medperf mlcube ls | tail -n 1 | tr -s ' ' | cut -d ' ' -f 2)

 medperf mlcube submit --name metrics -m $METRIC_MLCUBE -p $METRIC_PARAMS
 checkFailed "Metrics submission failed"
@@ -269,10 +270,10 @@ echo "\n"

 ##########################################################
 echo "====================================="
-echo "Running model4 association"
+echo "Running failing model association"
 echo "====================================="
-medperf mlcube associate -m $MODEL4_UID -b $BMK_UID -y
-checkFailed "Model4 association failed"
+medperf mlcube associate -m $FAILING_MODEL_UID -b $BMK_UID -y
+checkFailed "Failing model association failed"
 ##########################################################

 echo "\n"
@@ -282,7 +283,31 @@ echo "====================================="
 echo "Changing priority of model2"
 echo "====================================="
 medperf association set_priority -b $BMK_UID -m $MODEL2_UID -p 77
-checkFailed "Priority set failed"
+checkFailed "Priority set of model2 failed"
 ##########################################################

 echo "\n"
+
+##########################################################
+echo "====================================="
+echo "Login with modelowner"
+echo "====================================="
+medperf login --username=$MODELOWNER --password=test
+checkFailed "modelowner login failed"
+##########################################################
+
+echo "\n"
+
+##########################################################
+echo "====================================="
+echo "Approve model2,3,F, associations"
+echo "====================================="
+medperf association approve -b $BMK_UID -m $MODEL2_UID
+checkFailed "Model2 association approval failed"
+medperf association approve -b $BMK_UID -m $MODEL3_UID
+checkFailed "Model3 association approval failed"
+medperf association approve -b $BMK_UID -m $FAILING_MODEL_UID
+checkFailed "failing model association approval failed"
+##########################################################
+
+echo "\n"
@@ -299,12 +324,34 @@ echo "\n"

 ##########################################################
 echo "====================================="
-echo "Running model1"
+echo "Running model2"
 echo "====================================="
-medperf run -b $BMK_UID -d $DSET_A_UID -m $MODEL1_UID -y
-checkFailed "Model1 run failed"
+medperf run -b $BMK_UID -d $DSET_A_UID -m $MODEL2_UID -y
+checkFailed "Model2 run failed"
 ##########################################################
+
+echo "\n"
+
+##########################################################
+echo "====================================="
+echo "Running outstanding models"
+echo "====================================="
+medperf benchmark run -b $BMK_UID -d $DSET_A_UID
+checkFailed "run all outstanding models failed"
+##########################################################
+
+echo "\n"
+
+##########################################################
+echo "====================================="
+echo "Run failing cube with ignore errors"
+echo "====================================="
+medperf run -b $BMK_UID -d $DSET_A_UID -m $FAILING_MODEL_UID -y --ignore-model-errors
+checkFailed "Failing mlcube run with ignore errors failed"
+##########################################################
+
+echo "\n"

 ##########################################################
 echo "====================================="
 echo "Delete mocktest profile"
40 changes: 28 additions & 12 deletions cli/medperf/__main__.py
@@ -71,25 +71,41 @@ def execute(
     benchmark_uid: int = typer.Option(
         ..., "--benchmark", "-b", help="UID of the desired benchmark"
     ),
-    data_uid: str = typer.Option(
+    data_uid: int = typer.Option(
         ..., "--data_uid", "-d", help="Registered Dataset UID"
     ),
     model_uid: int = typer.Option(
         ..., "--model_uid", "-m", help="UID of model to execute"
     ),
     approval: bool = typer.Option(False, "-y", help="Skip approval step"),
-    ignore_errors: bool = typer.Option(
+    ignore_model_errors: bool = typer.Option(
+        False,
+        "--ignore-model-errors",
+        help="Ignore failing model cubes, allowing for possibly submitting partial results",
+    ),
+    no_cache: bool = typer.Option(
         False,
-        "--ignore-errors",
-        help="Ignore failing cubes, allowing for submitting partial results",
+        "--no-cache",
+        help="Ignore existing results. The experiment then will be rerun",
     ),
 ):
     """Runs the benchmark execution step for a given benchmark, prepared dataset and model
     """
-    result_uid = BenchmarkExecution.run(
-        benchmark_uid, data_uid, model_uid, ignore_errors=ignore_errors
-    )
-    ResultSubmission.run(result_uid, approved=approval)
+    result = BenchmarkExecution.run(
+        benchmark_uid,
+        data_uid,
+        [model_uid],
+        ignore_model_errors=ignore_model_errors,
+        no_cache=no_cache,
+    )[0]
+    if result.id:  # TODO: use result.is_registered once PR #338 is merged
+        config.ui.print(  # TODO: msg should be colored yellow
+            """An existing registered result for the requested execution has been\n
+            found. If you wish to submit a new result for the same execution,\n
+            please run the command again with the --no-cache option.\n"""
+        )
+    else:
+        ResultSubmission.run(result.generated_uid, approved=approval)
     config.ui.print("✅ Done!")


@@ -106,7 +122,7 @@ def test(
         None,
         "--data_uid",
         "-d",
-        help="Registered Dataset UID. Used for dataset testing. Optional. Defaults to benchmark demo dataset.",
+        help="Prepared Dataset UID. Used for dataset testing. Optional. Defaults to benchmark demo dataset.",
     ),
     data_prep: str = typer.Option(
         None,
@@ -126,16 +142,16 @@ def test(
         "-e",
         help="UID or local path to the evaluator mlcube. Optional. Defaults to benchmark evaluator mlcube",
     ),
-    force_test: bool = typer.Option(
-        False, "--force-test", help="Execute the test even if results already exist",
+    no_cache: bool = typer.Option(
+        False, "--no-cache", help="Execute the test even if results already exist",
     ),
 ):
     """
     Executes a compatibility test for a determined benchmark.
     Can test prepared datasets, remote and local models independently.
     """
     CompatibilityTestExecution.run(
-        benchmark_uid, data_uid, data_prep, model, evaluator, force_test=force_test,
+        benchmark_uid, data_uid, data_prep, model, evaluator, no_cache=no_cache,
     )
     config.ui.print("✅ Done!")
     cleanup()
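A note on the execute() change above: when a registered result already exists for the requested benchmark/dataset/model combination, submission is aborted and the user is told to rerun with --no-cache. A sketch of the intended flow, again with placeholder UIDs:

# The first invocation executes the model and submits the result.
medperf run -b 1 -d 1 -m 2 -y

# Repeating the same execution finds the registered result and prints the
# warning shown above instead of resubmitting; --no-cache forces a fresh run.
medperf run -b 1 -d 1 -m 2 -y --no-cache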
12 changes: 6 additions & 6 deletions cli/medperf/commands/association/approval.py
@@ -5,20 +5,20 @@
 class Approval:
     @staticmethod
     def run(
-        benchmark_uid: str,
+        benchmark_uid: int,
         approval_status: str,
-        dataset_uid: str = None,
-        mlcube_uid: str = None,
+        dataset_uid: int = None,
+        mlcube_uid: int = None,
     ):
         """Sets approval status for an association between a benchmark and a dataset or mlcube

         Args:
-            benchmark_uid (str): Benchmark UID.
+            benchmark_uid (int): Benchmark UID.
             approval_status (str): Desired approval status to set for the association.
             comms (Comms): Instance of Comms interface.
             ui (UI): Instance of UI interface.
-            dataset_uid (str, optional): Dataset UID. Defaults to None.
-            mlcube_uid (str, optional): MLCube UID. Defaults to None.
+            dataset_uid (int, optional): Dataset UID. Defaults to None.
+            mlcube_uid (int, optional): MLCube UID. Defaults to None.
         """
         comms = config.comms
         too_many_resources = dataset_uid and mlcube_uid
6 changes: 3 additions & 3 deletions cli/medperf/commands/association/priority.py
@@ -5,13 +5,13 @@
 class AssociationPriority:
     @staticmethod
     def run(
-        benchmark_uid: str, mlcube_uid: str, priority: int,
+        benchmark_uid: int, mlcube_uid: int, priority: int,
     ):
         """Sets priority for an association between a benchmark and an mlcube

         Args:
-            benchmark_uid (str): Benchmark UID.
-            mlcube_uid (str): MLCube UID.
+            benchmark_uid (int): Benchmark UID.
+            mlcube_uid (int): MLCube UID.
             priority (int): priority value
         """
18 changes: 9 additions & 9 deletions cli/medperf/commands/benchmark/associate.py
@@ -7,18 +7,18 @@ class AssociateBenchmark:
     @classmethod
     def run(
         cls,
-        benchmark_uid: str,
-        model_uid: str,
-        data_uid: str,
+        benchmark_uid: int,
+        model_uid: int,
+        data_uid: int,
         approved=False,
-        force_test=False,
+        no_cache=False,
     ):
         """Associates a dataset or model to the given benchmark

         Args:
-            benchmark_uid (str): UID of benchmark to associate entities with
-            model_uid (str): UID of model to associate with benchmark
-            data_uid (str): UID of dataset to associate with benchmark
+            benchmark_uid (int): UID of benchmark to associate entities with
+            model_uid (int): UID of model to associate with benchmark
+            data_uid (int): UID of dataset to associate with benchmark
             comms (Comms): Instance of Communications interface
             ui (UI): Instance of UI interface
             approved (bool): Skip approval step. Defaults to False
@@ -29,10 +29,10 @@ def run(
             raise InvalidArgumentError("Must provide either a dataset or mlcube")
         if model_uid is not None:
             AssociateCube.run(
-                model_uid, benchmark_uid, approved=approved, force_test=force_test
+                model_uid, benchmark_uid, approved=approved, no_cache=no_cache
             )

         if data_uid is not None:
             AssociateDataset.run(
-                data_uid, benchmark_uid, approved=approved, force_test=force_test
+                data_uid, benchmark_uid, approved=approved, no_cache=no_cache
             )
60 changes: 51 additions & 9 deletions cli/medperf/commands/benchmark/benchmark.py
@@ -6,6 +6,7 @@
 from medperf.commands.benchmark.list import BenchmarksList
 from medperf.commands.benchmark.submit import SubmitBenchmark
 from medperf.commands.benchmark.associate import AssociateBenchmark
+from medperf.commands.result.create import BenchmarkExecution

 app = typer.Typer()

@@ -38,13 +39,13 @@ def submit(
     demo_hash: str = typer.Option(
         "", "--demo-hash", help="SHA1 of demonstration dataset tarball file"
     ),
-    data_preparation_mlcube: str = typer.Option(
+    data_preparation_mlcube: int = typer.Option(
         ..., "--data-preparation-mlcube", "-p", help="Data Preparation MLCube UID"
     ),
-    reference_model_mlcube: str = typer.Option(
+    reference_model_mlcube: int = typer.Option(
         ..., "--reference-model-mlcube", "-m", help="Reference Model MLCube UID"
     ),
-    evaluator_mlcube: str = typer.Option(
+    evaluator_mlcube: int = typer.Option(
         ..., "--evaluator-mlcube", "-e", help="Evaluator MLCube UID"
     ),
 ):
@@ -67,23 +68,64 @@ def submit(
 @app.command("associate")
 @clean_except
 def associate(
-    benchmark_uid: str = typer.Option(
+    benchmark_uid: int = typer.Option(
         ..., "--benchmark_uid", "-b", help="UID of benchmark to associate with"
     ),
-    model_uid: str = typer.Option(
+    model_uid: int = typer.Option(
         None, "--model_uid", "-m", help="UID of model MLCube to associate"
     ),
-    dataset_uid: str = typer.Option(
+    dataset_uid: int = typer.Option(
         None, "--data_uid", "-d", help="Server UID of registered dataset to associate"
     ),
     approval: bool = typer.Option(False, "-y", help="Skip approval step"),
-    force_test: bool = typer.Option(
-        False, "--force-test", help="Execute the test even if results already exist",
+    no_cache: bool = typer.Option(
+        False, "--no-cache", help="Execute the test even if results already exist",
     ),
 ):
     """Associates a benchmark with a given mlcube or dataset. Only one option at a time.
     """
     AssociateBenchmark.run(
-        benchmark_uid, model_uid, dataset_uid, approved=approval, force_test=force_test
+        benchmark_uid, model_uid, dataset_uid, approved=approval, no_cache=no_cache
     )
     config.ui.print("✅ Done!")
+
+
+@app.command("run")
+@clean_except
+def run(
+    benchmark_uid: int = typer.Option(
+        ..., "--benchmark", "-b", help="UID of the desired benchmark"
+    ),
+    data_uid: int = typer.Option(
+        ..., "--data_uid", "-d", help="Registered Dataset UID"
+    ),
+    file: str = typer.Option(
+        None,
+        "--models-from-file",
+        "-f",
+        help="""A file containing the model UIDs to be executed.\n
+        The file should contain a single line as a list of\n
+        comma-separated integers corresponding to the model UIDs""",
+    ),
+    ignore_model_errors: bool = typer.Option(
+        False,
+        "--ignore-model-errors",
+        help="Ignore failing model cubes, allowing for possibly submitting partial results",
+    ),
+    no_cache: bool = typer.Option(
+        False, "--no-cache", help="Execute even if results already exist",
+    ),
+):
+    """Runs the benchmark execution step for a given benchmark, prepared dataset and model
+    """
+    BenchmarkExecution.run(
+        benchmark_uid,
+        data_uid,
+        models_uids=None,
+        no_cache=no_cache,
+        models_input_file=file,
+        ignore_model_errors=ignore_model_errors,
+        show_summary=True,
+        ignore_failed_experiments=True,
+    )
+    config.ui.print("✅ Done!")
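The --models-from-file help text above defines the expected file format: a single line of comma-separated integer model UIDs. A sketch with a hypothetical file name and UIDs:

# models.txt holds one line of comma-separated model UIDs, e.g. 2,3,4
echo "2,3,4" > models.txt

# Execute only those models for benchmark 1 on dataset 1.
medperf benchmark run -b 1 -d 1 --models-from-file models.txt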