-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathweb_demo.py
335 lines (266 loc) · 12.8 KB
/
web_demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
#
# --------------------------------------------------------------------------
# Licensed under the MIT License. See LICENSE file in the project root for
# license information.
# Copyright (c) Microsoft Corporation.
# --------------------------------------------------------------------------
#
"""
This module provides an interactive interface for running autoscaling simulations using.
Streamlit. It supports various operations like visualizing workloads from CSV files,
configuring simulation parameters, and running autoscaling algorithms.
Main Features:
- Workload Visualization: Display CPU usage trends from CSV files.
- Simulation Options: Run autoscaling simulations or tune simulation parameters.
- Parameter Input: Dynamically adjust simulation parameters based on user input.
Modules:
create_charts: Displays charts of CPU usage over time.
process_params_to_tune: Processes simulation parameters for tuning.
get_files_with_extension: Retrieves files with a specific extension from a directory.
"""
# pylint: disable=no-member # FIXME
import itertools
import json
import os
from typing import Optional
import pandas as pd
import streamlit as st
from utils import run_simulation, unflatten_dict # pylint: disable=import-error
from vasim.recommender.cluster_state_provider.ClusterStateConfig import (
ClusterStateConfig,
)
from vasim.simulator.analysis.pareto_visualization import (
create_pareto_curve_from_folder,
)
from vasim.simulator.ParameterTuning import tune_with_strategy
# Configure the Streamlit page: use the full browser width and set the app title.
# Must run before any other st.* call in the script.
st.set_page_config(layout="wide")
st.title("VASIM Autoscaling Simulator Toolkit Presentation")
def construct_config_metric_df(config_metrics_list) -> pd.DataFrame:
    """
    Build a DataFrame combining each tuning configuration with its resulting metrics.

    Args:
        config_metrics_list (list): Pairs of ``(modified_config, metrics)``. Entries
            whose metrics are ``None`` (failed simulation runs) are skipped.

    Returns:
        pd.DataFrame: One row per successful run, merging metric fields with the
        fields of the corresponding config.
    """
    collected_rows = []
    for cfg, run_metrics in config_metrics_list:
        if run_metrics is None:
            # A failed run leaves no metrics behind; report it and move on.
            print(f"Skipping {cfg.uuid} because of an error")
            continue
        # TODO: there is a known issue https://github.com/microsoft/vasim/issues/119
        collected_rows.append({**run_metrics, **cfg})
    return pd.DataFrame(collected_rows)
@st.cache_data()
def create_charts(chart_data: pd.DataFrame) -> None:
    """
    Render a line chart of CPU usage over time in the Streamlit sidebar.

    Args:
        chart_data (pd.DataFrame): Must contain a 'TIMESTAMP' column (time axis)
            and a 'CPU_USAGE_ACTUAL' column (observed CPU usage).
    """
    # Build an indexed frame so Streamlit uses TIMESTAMP as the x-axis.
    plot_df = pd.DataFrame(
        {
            "TIMESTAMP": chart_data["TIMESTAMP"],
            "CPU_USAGE_ACTUAL": chart_data["CPU_USAGE_ACTUAL"],
        }
    ).set_index("TIMESTAMP")
    st.sidebar.line_chart(plot_df)
    st.sidebar.success("Workload visualization finished.")
def process_params_to_tune(input_selected_params_to_tune):
    """
    Collect user-supplied values for each selected tuning parameter.

    Args:
        input_selected_params_to_tune (list): Parameter names chosen by the user.

    Returns:
        dict: Mapping of parameter name to the list of values the user entered
        for it (via ``process_parameter_input``).
    """
    return {name: process_parameter_input(name) for name in input_selected_params_to_tune}
def process_parameter_input(param_name):
    """
    Prompt the user for comma-separated values for one simulation parameter.

    Args:
        param_name (str): The name of the parameter being edited.

    Returns:
        list: Floats parsed from the user's comma-separated input; empty when the
        user has not entered anything yet.
    """
    st.subheader(f"Edit values for parameter: {param_name}")
    user_input = st.text_input(f"Enter values for {param_name} (comma-separated):")
    if not user_input:
        return []
    # Split on commas, trim surrounding whitespace, and convert each entry to float.
    return [float(token.strip()) for token in user_input.split(",")]
# Sidebar for simulation options
st.sidebar.title("Simulation Options")
# simulation_option selects which page the bottom-of-file if/elif dispatch renders.
simulation_option = st.sidebar.radio(
    "Select Simulation Option", ["Simulation Run", "Simulation Tuning", "Simulation Tuning History"]
)
# Sidebar for workload visualization and navigation
st.sidebar.title("Workload Visualization")
# Add Path Input for the user
st.sidebar.subheader("Input Data Directory")
# Root directory scanned below for both workload CSVs and JSON config files.
parent_data_input_directory = st.sidebar.text_input("Enter the directory path for CSV files:", "tests/test_data")
# Function to get all CSV files recursively from a directory
def get_files_with_extension(directory, format_suffix=".csv"):
    """
    Recursively collect every file under a directory with a given extension.

    Args:
        directory (str): Root directory to walk.
        format_suffix (str): File-name suffix to filter on (default ".csv").

    Returns:
        list: Full paths of all matching files, in os.walk order.
    """
    return [
        os.path.join(parent, name)
        for parent, _, names in os.walk(directory)
        for name in names
        if name.endswith(format_suffix)
    ]
# Get all CSV files from the directory input by the user
csv_files = get_files_with_extension(parent_data_input_directory, ".csv")
# Check if there are CSV files available
if not csv_files:
    st.sidebar.error("No CSV files found in the directory.")
else:
    # Display the list of CSV files in the sidebar
    selected_csv = st.sidebar.selectbox("Select a CSV file:", csv_files)
    selected_algorithm_names = st.sidebar.selectbox("Select an algorithm:", ["additive", "multiplicative"])
# Get all JSON files recursively from the data input directory
json_config_files = get_files_with_extension(parent_data_input_directory, ".json")
# Check if there are CSV files available
config_path_run: Optional[str] = None
if not json_config_files:
    st.sidebar.error("No json files found in the directory.")
else:
    # Display the list of CSV files in the sidebar
    config_path_run = st.sidebar.selectbox("Select a json file:", json_config_files)
# Check if file exists
if not config_path_run or not os.path.exists(config_path_run):
    st.error(f"Error loading JSON file: {config_path_run} does not exist")
    st.stop()
# Load the selected config JSON; data_run / df_run are consumed by the page
# dispatch further down the script.
with open(config_path_run, mode="r", encoding="utf-8") as json_file_run:
    data_run = json.load(json_file_run)
    df_run = pd.json_normalize(data_run)
# selected_csv is definitely already defined
# NOTE(review): if csv_files is empty, selected_csv is actually unbound here — the
# st.stop() above only guards the JSON path; verify the CSV path is also guarded.
data_dir = os.path.dirname(selected_csv)  # pylint: disable=possibly-used-before-assignment
if selected_csv:
    if st.sidebar.button("Visualize workload"):
        workload_df = pd.read_csv(selected_csv)
        # Parse timestamps, truncate to whole minutes, keep the last sample per minute.
        workload_df["TIMESTAMP"] = pd.to_datetime(workload_df["TIMESTAMP"], format="%Y.%m.%d-%H:%M:%S:%f")
        workload_df["TIMESTAMP"] = pd.DatetimeIndex(workload_df["TIMESTAMP"]).floor("min")
        workload_df = workload_df.drop_duplicates(subset=["TIMESTAMP"], keep="last")
        # NOTE(review): perf_log_resampled is computed but never used — the chart below
        # is built from the un-resampled workload_df; confirm whether the resampled
        # frame was meant to feed create_charts. Also "1T" is a deprecated alias for
        # "1min" in newer pandas.
        perf_log_resampled = workload_df.set_index("TIMESTAMP").resample("1T").ffill().reset_index()
        # Display the chart in the left sidebar
        data = pd.DataFrame({"TIMESTAMP": workload_df["TIMESTAMP"], "CPU_USAGE_ACTUAL": workload_df["CPU_USAGE_ACTUAL"]})
        create_charts(data)
# Page 1: Simulation Run
if simulation_option == "Simulation Run":
    st.title("Simulation Run")
    initial_cores_count_run = st.slider("Select the initial core count:", 1, 20, 7)
    # Display the DataFrame with an editable data editor
    edited_data_run = st.data_editor(df_run)
    # Convert the edited data back to JSON
    # (df_run is a single-row json_normalize frame, so records[0] holds the
    # flattened config; unflatten_dict restores its nested structure.)
    edited_json_run = edited_data_run.to_dict(orient="records")[0]
    edited_json_run = unflatten_dict(edited_json_run)
    # Display the edited JSON data
    st.json(edited_json_run)
    # Create a button to run the algorithm for simulation run
    if st.button("Run Simulation"):
        config_run = ClusterStateConfig(config_dict=edited_json_run)
        run_simulation(selected_algorithm_names, data_dir, initial_cores_count_run, config_run)
# Page 2: Simulation Tuning
elif simulation_option == "Simulation Tuning":
    st.title("Simulation Tuning")
    initial_cores_count_run = st.slider("Select the initial core count:", 1, 20, 7)
    # Display the full data
    st.json(data_run)
    # Define the sections available in metadata.json
    config_sections = ["algo_specific_config", "general_config", "prediction_config"]
    # Create a dictionary to hold selected parameters for each section
    params_to_tune = {}
    # Allow users to tune parameters from all sections
    for section in config_sections:
        st.subheader(f"Tuning {section}")
        selected_params = list(data_run[section].keys())
        # Multiselect to allow selecting parameters within each section
        selected_params_to_tune = st.multiselect(f"Select parameters to tune from {section}:", selected_params)
        # Process the selected parameters for tuning
        params_to_tune[section] = process_params_to_tune(selected_params_to_tune)
        st.divider()
    # Predictive parameters (example)
    # NOTE(review): predictive_params_to_tune and config_tun are assigned but never
    # used below — tune_with_strategy receives params_to_tune["prediction_config"]
    # and config_path_run instead; confirm whether these are dead code.
    predictive_params_to_tune = {"waiting_before_predict": [24 * 60]}
    # Create the ClusterStateConfig with all sections' data
    config_tun = ClusterStateConfig(config_dict=data_run)
    # Strategy selection
    strategy = st.radio("Select tuning strategy:", ["grid", "random"])
    # Number of combinations
    NUM_COMBINATIONS = None
    if strategy == "random":
        NUM_COMBINATIONS = st.number_input("Enter num_combinations:", min_value=1, value=500)
    else:
        # Calculate combinations for grid strategy
        # (cartesian product across every selected value list in every section)
        all_combinations = [list(itertools.product(*params.values())) for params in params_to_tune.values()]
        config_param_combinations = list(itertools.product(*[item for sublist in all_combinations for item in sublist]))
        NUM_COMBINATIONS = len(config_param_combinations)
        st.text(f"Number of combinations (calculated): {NUM_COMBINATIONS}")
        st.text("The 'num_combinations' field is disabled because the strategy is 'grid'.")
    # Number of workers and initial cores count
    num_workers = st.number_input("Enter num_workers:", min_value=1, value=10)
    # NOTE(review): initial_cores_count is never used — the tuning call below passes
    # initial_cores_count_run (the slider value); confirm which one is intended.
    initial_cores_count = st.number_input("Enter initial_cores_count:", min_value=1, value=10)
    # Update session state with selected parameters
    st.session_state.tuning_has_run = False
    # Available dimensions for analysis
    # NOTE(review): available_dimensions is not referenced anywhere below in this view.
    available_dimensions = [
        "average_slack",
        "average_insufficient_cpu",
        "sum_slack",
        "sum_insufficient_cpu",
        "num_scalings",
        "num_insufficient_cpu",
        "insufficient_observations_percentage",
        "slack_percentage",
        "median_insufficient_cpu",
        "median_slack",
        "max_slack",
    ]
    # Run tuning when button clicked
    if st.button("Run Tuning"):
        st.session_state.tuning_has_run = True
        st.write("Running tuning...")
        # Pass the section being tuned and parameters into the tuning function
        results = tune_with_strategy(
            config_path_run,
            strategy,
            num_combinations=NUM_COMBINATIONS,
            num_workers=num_workers,
            data_dir=data_dir,
            algorithm=selected_algorithm_names,
            initial_cpu_limit=initial_cores_count_run,
            algo_specific_params_to_tune=params_to_tune["algo_specific_config"],
            general_params_to_tune=params_to_tune["general_config"],
            predictive_params_to_tune=params_to_tune["prediction_config"],
        )
        # Tuning artifacts are written next to the data directory, in <data_dir>_tuning.
        st.write(f"Tuning results saved at: {data_dir}_tuning")
        config_metric_df = construct_config_metric_df(results)
        st.dataframe(config_metric_df)
        st.write("Getting best config:")
        # calculate config closest to zero
        pareto_2d = create_pareto_curve_from_folder(data_dir, data_dir + "_tuning")
        folder, _, _, _ = pareto_2d.find_closest_to_zero()
        # display the data_tuning/pareto_frontier.png
        st.image(data_dir + "_tuning/pareto_frontier.png")
        st.write(f"Folder with winning config: {data_dir}_tuning/{folder}")
        # open the file at the folder
        with open(data_dir + "_tuning/" + folder + "/metadata.json", mode="r", encoding="utf-8") as json_file:
            data = json.load(json_file)
            # NOTE(review): df is assigned but never displayed; only the raw JSON is shown.
            df = pd.json_normalize(data)
        st.write("metadata.json")
        st.json(data)
        st.write(f"Make sure to delete the {data_dir}_tuning folder before running the tuning again.")
# Page 3: Simulation Tuning History (not yet implemented)
else:
    st.write("WIP")