From 338123c10788a518adf046e334c291dfde23c01c Mon Sep 17 00:00:00 2001
From: Egor Lazarchuk
Date: Thu, 21 Nov 2024 16:15:04 +0000
Subject: [PATCH 1/3] feat(a/b): save A/B fail results

Save A/B test failures so that they can be analyzed offline.

Signed-off-by: Egor Lazarchuk
---
 tools/ab_test.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/tools/ab_test.py b/tools/ab_test.py
index 2d03d9591a1..a1188fd1e86 100755
--- a/tools/ab_test.py
+++ b/tools/ab_test.py
@@ -277,6 +277,7 @@ def analyze_data(
     )
 
     messages = []
+    fails = []
     for dimension_set, metric, result, unit in failures:
         # Sanity check as described above
         if abs(statistics.mean(relative_changes_by_metric[metric])) <= noise_threshold:
@@ -291,10 +292,19 @@
         old_mean = statistics.mean(processed_emf_a[dimension_set][metric][0])
         new_mean = statistics.mean(processed_emf_b[dimension_set][metric][0])
 
+        change_unit = format_with_reduced_unit(result.statistic, unit)
+        change_p = result.statistic / old_mean
+        old_unit = format_with_reduced_unit(old_mean, unit)
+        new_unit = format_with_reduced_unit(new_mean, unit)
+
+        fail = dict(dimension_set)
+        fail["diff"] = change_p
+        fails.append(fail)
+
         msg = (
             f"\033[0;32m[Firecracker A/B-Test Runner]\033[0m A/B-testing shows a change of "
-            f"{format_with_reduced_unit(result.statistic, unit)}, or {result.statistic / old_mean:.2%}, "
-            f"(from {format_with_reduced_unit(old_mean, unit)} to {format_with_reduced_unit(new_mean, unit)}) "
+            f"{change_unit}, or {change_p:.2%}, "
+            f"(from {old_unit} to {new_unit}) "
             f"for metric \033[1m{metric}\033[0m with \033[0;31m\033[1mp={result.pvalue}\033[0m. "
             f"This means that observing a change of this magnitude or worse, assuming that performance "
             f"characteristics did not change across the tested commits, has a probability of {result.pvalue:.2%}. "
@@ -302,7 +312,10 @@
         )
         messages.append(msg)
 
-    assert not messages, "\n" + "\n".join(messages)
+    if messages:
+        with open("test_results/ab.json", "w") as f:
+            json.dump({"fails": fails}, f, indent=2, sort_keys=True)
+        assert False, "\n" + "\n".join(messages)
 
     print("No regressions detected!")

From 3eb6a770971ef60051776b9beb57514cdf8fb17d Mon Sep 17 00:00:00 2001
From: Egor Lazarchuk
Date: Thu, 21 Nov 2024 16:59:08 +0000
Subject: [PATCH 2/3] feat(a/b): add support scripts for A/B visualization

Add scripts for combining and plotting results of A/B runs.
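
The records consumed by these scripts are the ones written to
test_results/ab.json by the previous commit: each failed data series is
stored as its dimension set plus a "diff" field. Roughly (all field values
below are illustrative, not taken from a real run):

    # One entry of the "fails" list in test_results/ab.json, shown as the
    # Python dict that ab_test.py dumps.
    fail = {
        "performance_test": "test_block_performance",
        "instance": "m6i.metal",
        "host_kernel": "linux-6.1",
        "guest_kernel": "5.10",
        "fio_mode": "randread",
        "io_engine": "Sync",
        "vcpus": "2",
        "diff": -0.042,  # relative change, result.statistic / old_mean
    }

combine.py groups such records into one JSON file per test type, and plot.py
then renders one bar chart per instance and host kernel from them.
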
Move all A/B-related scripts to the tools/ab directory.

Signed-off-by: Egor Lazarchuk
---
 tools/{ => ab}/ab_test.py |   0
 tools/ab/combine.py       |  45 +++++++++++++
 tools/ab/plot.py          | 138 ++++++++++++++++++++++++++++++++++++++
 tools/devtool             |   2 +-
 4 files changed, 184 insertions(+), 1 deletion(-)
 rename tools/{ => ab}/ab_test.py (100%)
 create mode 100644 tools/ab/combine.py
 create mode 100644 tools/ab/plot.py

diff --git a/tools/ab_test.py b/tools/ab/ab_test.py
similarity index 100%
rename from tools/ab_test.py
rename to tools/ab/ab_test.py
diff --git a/tools/ab/combine.py b/tools/ab/combine.py
new file mode 100644
index 00000000000..43619d69192
--- /dev/null
+++ b/tools/ab/combine.py
@@ -0,0 +1,45 @@
+import argparse
+import json
+import os
+from pathlib import Path
+
+parser = argparse.ArgumentParser(
+    description="Combine A/B test fails into groups per test type"
+)
+parser.add_argument(
+    "path",
+    help="Path to the directory with failed A/B runs",
+    type=Path,
+)
+args = parser.parse_args()
+
+BLOCK = "test_block_performance"
+NET_THROUGHPUT = "test_network_throughput"
+NET_LATENCY = "test_network_latency"
+
+block_data = []
+net_data = []
+net_lat_data = []
+for d in os.walk(args.path):
+    if "ab.json" in d[-1]:
+        path = d[0] + "/ab.json"
+        print(path)
+        with open(path, "r+") as f:
+            lines = f.read()
+            j = '{"data":' + lines + "}"
+            data = json.loads(j)
+            for e in data["data"]["fails"]:
+                match e["performance_test"]:
+                    case "test_block_performance":
+                        block_data.append(e)
+                    case "test_network_throughput":
+                        net_data.append(e)
+                    case "test_network_latency":
+                        net_lat_data.append(e)
+
+with open(f"{NET_LATENCY}.json", "w") as f:
+    json.dump({"fails": net_lat_data}, f, indent=2, sort_keys=True)
+with open(f"{NET_THROUGHPUT}.json", "w") as f:
+    json.dump({"fails": net_data}, f, indent=2, sort_keys=True)
+with open(f"{BLOCK}.json", "w") as f:
+    json.dump({"fails": block_data}, f, indent=2, sort_keys=True)
diff --git a/tools/ab/plot.py b/tools/ab/plot.py
new file mode 100644
index 00000000000..1eae52b4cc0
--- /dev/null
+++ b/tools/ab/plot.py
@@ -0,0 +1,138 @@
+import argparse
+import json
+import os
+from enum import Enum
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+plt.style.use("dark_background")
+
+
+def clamp(min_v, max_v, v):
+    return max(min_v, min(max_v, v))
+
+
+def lerp(color_a, color_b, t):
+    return (
+        clamp(0.0, 1.0, abs(color_a[0] * (1.0 - t) + color_b[0] * t)),
+        clamp(0.0, 1.0, abs(color_a[1] * (1.0 - t) + color_b[1] * t)),
+        clamp(0.0, 1.0, abs(color_a[2] * (1.0 - t) + color_b[2] * t)),
+    )
+
+
+GREY = (0.5, 0.5, 0.5)
+GREEN = (0.1, 0.8, 0.1)
+RED = (0.8, 0.0, 0.1)
+
+POSITIVE_COLOR = GREEN
+NEGATIVE_COLOR = RED
+
+
+class DataType(Enum):
+    Block = "block"
+    Net = "net"
+    NetLatency = "net_latency"
+
+
+parser = argparse.ArgumentParser(description="Plot results of A/B test")
+parser.add_argument("path", type=str)
+args = parser.parse_args()
+
+paths = [f"{args.path}/{f}" for f in os.listdir(args.path)]
+for path in paths:
+    print(f"processing: {path}")
+    with open(path) as f:
+        fails = json.load(f)["fails"]
+
+    if not fails:
+        print(f"skipping {path}. No data present")
+        continue
+
+    instances = set()
+    host_kernels = set()
+    aggregated = {}
+
+    match fails[0]["performance_test"]:
+        case "test_block_performance":
+            data_type = DataType.Block
+        case "test_network_tcp_throughput":
+            data_type = DataType.Net
+        case "test_network_latency":
+            data_type = DataType.NetLatency
+        case _:
skipping") + continue + + for fail in fails: + instances.add(fail["instance"]) + host_kernels.add(fail["host_kernel"]) + + if data_type == DataType.Block: + tag = ( + fail["instance"], + fail["host_kernel"], + fail["guest_kernel"], + fail["fio_mode"], + fail["vcpus"], + fail["io_engine"], + ) + elif data_type == DataType.Net: + tag = ( + fail["instance"], + fail["host_kernel"], + fail["guest_kernel"], + fail["mode"], + fail["vcpus"], + ) + elif data_type == DataType.NetLatency: + tag = ( + fail["instance"], + fail["host_kernel"], + fail["guest_kernel"], + ) + POSITIVE_COLOR = RED + NEGATIVE_COLOR = GREEN + + if tag not in aggregated: + aggregated[tag] = [] + aggregated[tag].append(fail["diff"]) + + for instance in sorted(instances): + fig, ax = plt.subplots(len(host_kernels), figsize=(16, 11)) + if len(host_kernels) == 1: + ax = [ax] + fig.tight_layout(pad=8.0) + + for i, host_kernel in enumerate(sorted(host_kernels)): + data = [] + for key, value in aggregated.items(): + if key[0] == instance and key[1] == host_kernel: + label = "\n".join(key[2:]) + values = np.array(value) + mean = np.mean(values) + std = np.std(values) + data.append((label, mean, std)) + data.sort() + labels = np.array([t[0] for t in data]) + means = np.array([t[1] for t in data]) + errors = np.array([t[2] for t in data]) + colors = [ + ( + lerp(GREY, POSITIVE_COLOR, t) + if 0.0 < t + else lerp(GREY, NEGATIVE_COLOR, -t) + ) + for t in [m / 100.0 for m in means] + ] + + bar = ax[i].bar(labels, means, yerr=errors, color=colors, ecolor="white") + bar_labels = [f"{m:.2f} / {s:.2f}" for (m, s) in zip(means, errors)] + ax[i].bar_label(bar, labels=bar_labels) + ax[i].set_ylabel("Percentage of change: mean / std") + ax[i].grid(color="grey", linestyle="-.", linewidth=0.5, alpha=0.5) + ax[i].set_title( + f"{data_type}\nInstance: {instance}\nHost kernel: {host_kernel}", + ) + + plt.savefig(f"{args.path}/{data_type}_{instance}.png") diff --git a/tools/devtool b/tools/devtool index c3b7b27ed98..258a904c0cc 100755 --- a/tools/devtool +++ b/tools/devtool @@ -760,7 +760,7 @@ cmd_test() { test_script="./tools/test.sh" if [ $do_ab_test -eq 1 ]; then - test_script="./tools/ab_test.py" + test_script="./tools/ab/ab_test.py" fi # Testing (running Firecracker via the jailer) needs root access, From 8163526c2da655a27da0ab411c4112875eca3fd9 Mon Sep 17 00:00:00 2001 From: Egor Lazarchuk Date: Thu, 21 Nov 2024 17:24:48 +0000 Subject: [PATCH 3/3] chore: update links in the docs with new ab_test.py location Update links in the docs with new ab_test.py location Signed-off-by: Egor Lazarchuk --- tests/README.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/README.md b/tests/README.md index ea46ff56786..c2b5e32b496 100644 --- a/tests/README.md +++ b/tests/README.md @@ -150,13 +150,13 @@ post-merge. Specific tests, such as our [snapshot restore latency tests](integration_tests/performance/test_snapshot_ab.py) contain no assertions themselves, but rather they emit data series using the `aws_embedded_metrics` library. When executed by the -[`tools/ab_test.py`](../tools/ab_test.py) orchestration script, these data +[`tools/ab/ab_test.py`](../tools/ab/ab_test.py) orchestration script, these data series are collected. The orchestration script executes each test twice with different Firecracker binaries, and then matches up corresponding data series from the _A_ and _B_ run. For each data series, it performs a non-parametric test. 
For each data series where the difference between the _A_ and _B_ run is
considered statistically significant, it will print out the associated metric.
Please see `tools/ab/ab_test.py --help` for information on how to configure
what the script considers significant.

Writing your own A/B-Test is easy: Simply write a test that outputs a data

metric for which they wish to support A/B-testing**. This is because
non-parametric tests operate on data series instead of individual data points.

When emitting metrics with `aws_embedded_metrics`, each metric (data series) is
associated with a set of dimensions. The `tools/ab/ab_test.py` script uses
these dimensions to match up data series between two test runs. It only matches
up two data series with the same name if their dimensions match.

Special care needs to be taken when pytest expands the argument passed to
`tools/ab/ab_test.py`'s `--test` option into multiple individual test cases. If
two test cases use the same dimensions for different data series, the script
will fail and print out the names of the violating data series. For this
reason, **A/B-Compatible tests should include a `performance_test` key in their
dimension set whose value is set to the name of the test.**

In addition to the above, care should be taken that the dimensions of the data
series emitted by some test case are unique to that test case. For example, if
we have a boottime test parameterized by number of vcpus, but the emitted
boottime data series' dimension set is just
`{"performance_test": "test_boottime"}`, then `tools/ab/ab_test.py` will not be
able to tell apart data series belonging to different microVM sizes, and
instead combine them (which is probably not desired). For this reason,
**A/B-Compatible tests should always include all pytest parameters in their
dimension set.**

Lastly, performance A/B-Testing through `tools/ab/ab_test.py` can only detect
performance differences that are present in the Firecracker binary. The
`tools/ab/ab_test.py` script only checks out the revisions it is passed in
order to run `cargo build` and generate a Firecracker binary. It does not run
integration tests in the context of the checked-out revision. In particular,
both the _A_ and the _B_ run will be triggered from within the same docker
container, and using the same revision of the integration test code. This means
it is not possible to use orchestrated A/B-Testing to assess the impact of,
say, changing only python code (such as enabling logging). Only Rust code can
be A/B-Tested. The exception to this is toolchain differences: if both
specified revisions have `rust-toolchain.toml` files, then
`tools/ab/ab_test.py` will compile using the toolchain specified by the
respective revision, instead of the toolchain installed in the docker container
from which the script is executed.
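
As a rough illustration of the dimension requirements above (this is a minimal
sketch only, not part of this patch: the measurement helper is made up, and
Firecracker's real tests obtain their metrics logger from the test framework),
an A/B-compatible data series could be emitted like this:

```python
# Minimal sketch of an A/B-compatible metric emission; the measurement below
# is a stand-in, only the dimension set and the repeated data points matter.
import time

from aws_embedded_metrics import metric_scope


def measure_boottime_ms():
    # Hypothetical stand-in for a real boottime measurement.
    start = time.monotonic()
    time.sleep(0.01)
    return (time.monotonic() - start) * 1000.0


@metric_scope
def emit_boottime_series(vcpu_count, metrics):
    # The test name and every test parameter go into the dimension set so that
    # ab_test.py can pair this series between the A and the B run.
    metrics.set_dimensions(
        {"performance_test": "test_boottime", "vcpus": str(vcpu_count)}
    )
    for _ in range(10):  # several data points, so a statistical test is possible
        metrics.put_metric("boottime_ms", measure_boottime_ms(), "Milliseconds")
```
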
@@ -256,25 +256,25 @@

This instructs `aws_embedded_metrics` to dump all data series that our A/B-Test
orchestration would analyze to `stdout`, and pytest will capture this output
into a file stored at `./test_results/test-report.json`.

The `tools/ab/ab_test.py` script can consume these test reports, so next
collect your two test report files to your local machine and run

```sh
tools/ab/ab_test.py analyze
```

This will then print the same analysis described in the previous sections.

#### Troubleshooting

If during `tools/ab/ab_test.py analyze` you get an error like

```bash
$ tools/ab/ab_test.py analyze
Traceback (most recent call last):
  File "/firecracker/tools/ab/ab_test.py", line 412, in <module>
    data_a = load_data_series(args.report_a)
  File "/firecracker/tools/ab/ab_test.py", line 122, in load_data_series
    for line in test["teardown"]["stdout"].splitlines():
KeyError: 'stdout'
```
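
The `KeyError` literally means that one of the report's test entries has no
captured `stdout` in its teardown phase for `load_data_series` to parse. A
quick way to list the affected test cases is a sketch like the following
(illustrative only and not part of this patch; it assumes the
pytest-json-report layout of `test-report.json`):

```python
# List test cases in a report that have no captured teardown stdout.
import json
import sys

with open(sys.argv[1], encoding="utf-8") as f:
    report = json.load(f)

for test in report["tests"]:
    if "stdout" not in test.get("teardown", {}):
        print(test.get("nodeid", "<unknown test>"))
```
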