#!/bin/bash
# this script is executed on every node of an allocation;
# it uses the provided COMPUTE_* environment variables
# to construct a distributed TensorFlow cluster definition
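# illustrative values only - the actual values are set by the scheduler
# (the names and numbers below are assumptions, not from a real allocation):
#   COMPUTE_NODES="node01,node02,node03"  # comma-separated list of allocated nodes
#   COMPUTE_NODE_INDEX=0                  # index of this node within COMPUTE_NODES
#   COMPUTE_JOB_NUMBER=42                 # allocation number, used to derive ports below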
set -e
# activating standard TensorFlow Python virtual environment
source /usr/local/tensorflow/bin/activate
# the standard script to execute
base_cmd="bin/run-wer-automation.sh"
# reading the comma-separated node list into the array "nodes"
IFS=',' read -r -a nodes <<< "$COMPUTE_NODES"
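# e.g. (illustrative) COMPUTE_NODES="node01,node02,node03" -> nodes=(node01 node02 node03)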
# keep the first node for convenience
first=${nodes[0]}
# capture the hostname for debugging output
hostname=$(hostname)
if ((${#nodes[@]} == 1)); then
    # there is only one (this) node, so we are alone and don't need a cluster definition
    echo "[slurm ] Starting single node process on $hostname ..."
    $base_cmd --job_name localhost "$@"
else
    # there is more than one node, so we will build a cluster definition;
    # defining all cluster ports in a way that avoids collisions with other cluster allocations
    # (that could eventually get scheduled on the same node)
    ((port_base=10000 + (COMPUTE_JOB_NUMBER * 100) % 50000))
    ((coord_port=port_base))
    ((ps_port=port_base + 1))
    ((worker_port=port_base + 2))
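    # worked example (using the illustrative COMPUTE_JOB_NUMBER=42 from above;
    # note that % binds tighter than + in bash arithmetic):
    #   port_base = 10000 + (42 * 100) % 50000 = 14200
    #   -> coordinator port 14200, ps port 14201, first worker port 14202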
    for node in "${nodes[@]}"; do
        worker_hosts[$worker_port]="$node:$worker_port"
        ((worker_port=worker_port + 1))
    done
    # converting the worker_hosts array of host:port pairs into a comma-separated list
    worker_hosts=$(printf ",%s" "${worker_hosts[@]}")
    worker_hosts=${worker_hosts:1}
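    # e.g. (with the illustrative values above) nodes node01 and node02 yield
    # worker_hosts="node01:14202,node02:14203"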
    # shared cluster configuration
    # assert: for this job it should be exactly the same on all allocated nodes
    cluster="--coord_host $first --coord_port $coord_port --ps_hosts=$first:$ps_port --worker_hosts=$worker_hosts"
    # helpful for debugging potential networking issues
    echo "[slurm ] Starting allocated node no. $COMPUTE_NODE_INDEX on $hostname of cluster (coordinator: $first:$coord_port, ps: $first:$ps_port, workers: $worker_hosts) ..."
    # starting the parameter server side by side with the first worker on the first node;
    # so for the moment we only run one ps per allocation
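    # both the ps and the worker below merge stderr into stdout and prefix every
    # log line via sed (e.g. "[ps 0] ..." / "[worker 1] ..."), so interleaved
    # output from all processes of the allocation stays attributable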
    if ((COMPUTE_NODE_INDEX == 0)); then
        # CUDA_VISIBLE_DEVICES="" - as the parameter server does not require a GPU;
        # otherwise the GPU would be shared with the worker on the same machine;
        # it turned out that this would reduce available GPU memory for the worker by almost 50%
        CUDA_VISIBLE_DEVICES="" $base_cmd $cluster --job_name ps --task_index 0 "$@" 2>&1 | sed 's/^/[ps '"$COMPUTE_NODE_INDEX"'] /' &
    fi
    # starting the worker
    $base_cmd $cluster --job_name worker --task_index $COMPUTE_NODE_INDEX "$@" 2>&1 | sed 's/^/[worker '"$COMPUTE_NODE_INDEX"'] /'
fi