# Databricks notebook source
# MAGIC %md
# MAGIC
# MAGIC # Scaling up with Multiple Nodes
# MAGIC
# MAGIC Now that we have maxed out a single node, we can move on to multi-node training.
# COMMAND ----------
import horovod.torch as hvd
from TrainLoop.pl_train import main_hvd, build_trainer, main_train
import os
# COMMAND ----------
# MAGIC %md
# MAGIC
# MAGIC ## Setup Experiment
# MAGIC
# MAGIC These flags are the same as last time
# COMMAND ----------
# MLflow workaround: expose the workspace host and token as environment
# variables so that the Horovod worker processes can log back to MLflow
databricks_host = dbutils.secrets.get(scope="scaling_dl", key="host_workspace")
databricks_token = dbutils.secrets.get(scope="scaling_dl", key="host_token")
os.environ['DATABRICKS_HOST'] = databricks_host
os.environ['DATABRICKS_TOKEN'] = databricks_token
imagenette_data_path = '/dbfs/Users/[email protected]/data/imagenette2'
dais_root_folder = '/dbfs/Users/[email protected]/dais_exp'
# We set the experiment id explicitly so that multiple notebooks can all log to the same MLflow experiment
dais_experiment_id = 3156978719066434
# COMMAND ----------
from Models.resnet import ResnetClassification
from Dataloaders.imagenette import ImagenetteDataModule
# COMMAND ----------
data_module = ImagenetteDataModule(imagenette_data_path)
model = ResnetClassification(*data_module.image_shape, num_classes=data_module.num_classes, pretrain=False)
# COMMAND ----------
# MAGIC %md
# MAGIC
# MAGIC # Trigger runs
# MAGIC
# MAGIC We use `horovod.spark` to scale out across nodes,
# MAGIC because HorovodRunner currently doesn't distribute Python modules onto the worker nodes.
# MAGIC
# MAGIC This setup took 22 minutes to run.
# COMMAND ----------
import horovod.spark
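# COMMAND ----------
# MAGIC %md
# MAGIC For reference, here is a minimal sketch of what an entry point like `main_hvd`
# MAGIC typically does inside each Spark task. This is illustrative only: the real
# MAGIC implementation lives in `TrainLoop.pl_train`, and the sketch assumes a
# MAGIC PyTorch Lightning version that accepts `Trainer(strategy="horovod")`.
# COMMAND ----------
def sketch_main_hvd(mlflow_db_host, mlflow_db_token, data_module, model,
                    root_dir, epochs, run_name, experiment_id):
    """Illustrative sketch of a horovod.spark.run entry point (not the real main_hvd)."""
    import os
    import horovod.torch as hvd
    import pytorch_lightning as pl

    hvd.init()  # each Spark task becomes one Horovod process

    # Workers need these env vars so MLflow can log back to the workspace
    os.environ['DATABRICKS_HOST'] = mlflow_db_host
    os.environ['DATABRICKS_TOKEN'] = mlflow_db_token

    trainer = pl.Trainer(strategy="horovod",  # Horovod handles gradient allreduce
                         gpus=1,              # one GPU per Horovod process
                         max_epochs=epochs,
                         default_root_dir=root_dir)
    trainer.fit(model, datamodule=data_module)
    return model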
# COMMAND ----------
# num_processes should be set to (number of workers) * (GPUs per worker)
num_processes = 2
epochs = 15
model = horovod.spark.run(main_hvd,
                          kwargs={'mlflow_db_host': databricks_host,
                                  'mlflow_db_token': databricks_token,
                                  'data_module': data_module,
                                  'model': model,
                                  'root_dir': dais_root_folder,
                                  'epochs': epochs,
                                  'run_name': 'hvd_spark_2_wrk',
                                  'experiment_id': dais_experiment_id},
                          num_proc=num_processes,
                          verbose=2)
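# COMMAND ----------
# MAGIC %md
# MAGIC Optionally, we can sanity-check that the run landed in the shared experiment.
# MAGIC (A hedged check, assuming the workers logged their runs under `dais_experiment_id`.)
# COMMAND ----------
import mlflow

# List the runs recorded against the shared experiment id
runs = mlflow.search_runs(experiment_ids=[str(dais_experiment_id)])
display(runs[['run_id', 'status', 'start_time', 'end_time']])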
# COMMAND ----------
# MAGIC %md
# MAGIC
# MAGIC # Moving to 8 GPUs
# MAGIC
# MAGIC Now let's move up to 8 processes.
# MAGIC
# MAGIC This setup took 7 minutes to run
# COMMAND ----------
# num_processes should be set to (number of workers) * (GPUs per worker)
num_processes = 8
epochs = 15
model = horovod.spark.run(main_hvd,
                          kwargs={'mlflow_db_host': databricks_host,
                                  'mlflow_db_token': databricks_token,
                                  'data_module': data_module,
                                  'model': model,
                                  'root_dir': dais_root_folder,
                                  'epochs': epochs,
                                  'run_name': 'hvd_spark_8_wrk',
                                  'experiment_id': dais_experiment_id},
                          num_proc=num_processes,
                          verbose=2)
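# COMMAND ----------
# MAGIC %md
# MAGIC A quick back-of-the-envelope on scaling from the timings quoted above: 4x the
# MAGIC processes (2 -> 8) cut wall-clock time from ~22 to ~7 minutes, a ~3.1x
# MAGIC speedup, i.e. roughly 79% scaling efficiency.
# COMMAND ----------
# Rough scaling efficiency from the quoted wall-clock times
speedup = 22 / 7                 # ~3.14x faster
efficiency = speedup / (8 / 2)   # relative to the 4x increase in processes
print(f"speedup: {speedup:.2f}x, scaling efficiency: {efficiency:.0%}")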
# COMMAND ----------
# Inspect the object returned by horovod.spark.run
model
# COMMAND ----------