#!/bin/bash
#SBATCH --job-name=df
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=8000
#SBATCH --gres=gpu:1
#SBATCH -o /cluster/%u/logs/%j.out
#SBATCH -e /cluster/%u/logs/%j.out
#SBATCH --mail-type=ALL
# Time limit format: "hours:minutes:seconds"
#SBATCH --time=24:00:00
# Send the SIGUSR1 signal 7 hours (25200 s) before the time limit
#SBATCH --signal=B:USR1@25200
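#
# Usage (example path is illustrative):
#   sbatch scripts/sbatch_train.sh <model_base_dir> [<project_home>] [<branch_or_commit>]
#   e.g. sbatch scripts/sbatch_train.sh /net/cluster/$USER/models/dfnet_baseline
# $1: model base directory (required)
# $2: existing project code directory to reuse (optional; set automatically on resubmission)
# $3: git branch or commit to check out (optional; set automatically on resubmission)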
set -e
# Cluster directory containing data, project code, logging, summaries,
# checkpoints, etc.
export CLUSTER=/net/cluster/$USER
cd "$CLUSTER"
# Workaround for our cluster
export WORKON_HOME="/cluster/$USER/.cache"
export XDG_CACHE_DIR="/cluster/$USER/.cache"
export PYTHONUSERBASE="/cluster/$USER/.python_packages"
# Workaround for HDF5 on shared file systems. All HDF5 files are opened in
# read-only mode, so we do not need file locking.
export HDF5_USE_FILE_LOCKING='FALSE'
export RUST_BACKTRACE=1
PROJECT_NAME=DeepFilterNet
DATA_DIR=${DATA_DIR:-$CLUSTER/Data/HDF5} # Set to the directory containing the HDF5s
DATA_CFG=${DATA_CFG:-$DATA_DIR/datasets.cfg} # Default dataset configuration
DATA_CFG=$(readlink -f "$DATA_CFG")
PYTORCH_JIT=${PYTORCH_JIT:-1} # Set to 0 to disable PyTorch JIT compilation
COPY_DATA=${COPY_DATA:-1} # Copy datasets to node-local /scratch (0 to disable)
COPY_MAX_GB=${COPY_MAX_GB:-150} # Max GB of HDF5 datasets to copy; the rest will be linked
DF_CACHE_MAX_GB=${DF_CACHE_MAX_GB:-100} # Max GB for validation dataset caching
DEBUG=${DEBUG:-0} # Debug mode passed to the python train script
EXCLUDE=${EXCLUDE:-lme[49,170,171]} # Slurm nodes to exclude
if [ "$DEBUG" -eq 1 ]; then
DEBUG="--debug"
elif [ "$DEBUG" -eq 0 ]; then
DEBUG="--no-debug"
fi
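# The defaults above can be overridden at submission time via the environment
# (with sbatch's default --export=ALL), e.g. (illustrative):
#   DEBUG=1 COPY_DATA=0 sbatch scripts/sbatch_train.sh "$CLUSTER"/models/dfnet_baseline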
echo "Started sbatch script at $(date) in $(pwd)"
echo "Found cuda devices: $CUDA_VISIBLE_DEVICES"
nvidia-smi -L || echo "nvidia-smi not found"
echo "Running on host: $(hostname)"
# Check the model base directory argument
if [[ -z $1 ]]; then
    echo >&2 "No model base directory provided!"
    exit 1
fi
if [[ ! -d $1 ]]; then
    echo >&2 "Model base directory not found at $1!"
    exit 1
fi
BASE_DIR=$(readlink -f "$1")
echo "Got base_dir: $BASE_DIR"
MODEL_NAME=$(basename "$BASE_DIR")
# Git project setup.
# Creates a separate code directory so that changes to the source code do not
# affect the automatic resubmission process. Furthermore, a specific branch or
# commit can be specified, which the script will try to check out. By default,
# the currently active branch is used.
PROJECT_CLUSTER_HOME=$CLUSTER/$PROJECT_NAME/
if [[ -n $2 ]]; then
    if [[ ! -d $2 ]]; then
        echo >&2 "Project home not found at $2!"
        exit 1
    fi
    PROJECT_HOME=$2
else
    PROJECT_HOME=$CLUSTER/sbatch-$PROJECT_NAME/$MODEL_NAME/
    mkdir -p "$PROJECT_HOME"
    echo "Copying repo to $PROJECT_HOME"
    cd "$PROJECT_CLUSTER_HOME"
    rsync -avq --include .git \
        --exclude-from="$(git -C "$PROJECT_CLUSTER_HOME" ls-files --exclude-standard -oi --directory >.git/ignores.tmp && echo .git/ignores.tmp)" \
        "$PROJECT_CLUSTER_HOME" "$PROJECT_HOME" --delete
fi
if [ -n "$3" ]; then
    # Check out the branch specified by a previous job
    PROJECT_BRANCH_CUR=$3
else
    # Use the current branch of the project on /cluster
    PROJECT_BRANCH_CUR=$(git -C "$PROJECT_CLUSTER_HOME" rev-parse --abbrev-ref HEAD)
fi
# BRANCH (if set in the environment) takes precedence over the detected branch
PROJECT_BRANCH=${BRANCH:-$PROJECT_BRANCH_CUR}
echo "Running on branch $PROJECT_BRANCH in dir $PROJECT_HOME"
if [ "$PROJECT_BRANCH_CUR" != "$PROJECT_BRANCH" ]; then
stash="stash_$SLURM_JOB_ID"
git -C "$PROJECT_HOME" stash save "$stash"
git -C "$PROJECT_HOME" checkout "$PROJECT_BRANCH"
stash_idx=$(git -C "$PROJECT_HOME" stash list | grep "$stash" | cut -d: -f1)
if [ -n "$stash_idx" ] && [ "$stash_idx" != " " ]; then
# Try to apply current stash; If not possible just proceed.
if ! git -C "$PROJECT_HOME" stash pop "$stash_idx"; then
echo "Could not apply stash to branch $PROJECT_BRANCH"
git -C "$PROJECT_HOME" checkout -f
fi
fi
fi
# Setup conda environment.
# This installs a Miniconda environment if it does not exist yet, PyTorch with
# CUDA support, and the pip packages from requirements.txt.
. "$PROJECT_HOME"/scripts/setup_env.sh --source-only
setup_env "$CLUSTER" "$PROJECT_HOME" "$MODEL_NAME"
# Copy data from shared file system to a local folder
if [[ -d /scratch ]] && [[ $COPY_DATA -eq 1 ]]; then
    test -d "/scratch/$USER" || mkdir "/scratch/$USER"
    NEW_DATA_DIR=/scratch/"$USER"/"$PROJECT_NAME"
    echo "Setting up data dir in $NEW_DATA_DIR"
    mkdir -p "$NEW_DATA_DIR"
    python3 "$PROJECT_HOME"/scripts/copy_datadir.py cp "$DATA_DIR" "$NEW_DATA_DIR" "$DATA_CFG" \
        --lock "$MODEL_NAME" --max-gb "$COPY_MAX_GB" --other-hosts
    DATA_DIR="$NEW_DATA_DIR"
fi
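# Note: copy_datadir.py is expected to copy up to COPY_MAX_GB of the HDF5
# datasets to node-local scratch and link the remaining files, with --lock
# guarding the scratch copy against concurrent jobs of the same model; this
# description is inferred from the options used here and the defaults above.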
# Double-check that there is enough free space for validation dataset caching.
# stat -f reports free blocks (%a) and block size (%S); the expression converts
# their product to GB.
FREE_GB=$(($(stat -f --format="%a*%S/1024/1024/1024" "$DATA_DIR")))
echo "Requested DF_CACHE_MAX_GB: $DF_CACHE_MAX_GB"
echo "Free space in $DATA_DIR: ${FREE_GB} GB"
# Cap the cache size at the free space that is actually available
export DF_CACHE_MAX_GB=$(( FREE_GB < DF_CACHE_MAX_GB ? FREE_GB : DF_CACHE_MAX_GB ))
echo "Using DF_CACHE_MAX_GB: $DF_CACHE_MAX_GB"
# Signal handlers.
# Slurm sends SIGUSR1 (see --signal above) to indicate that the maximum
# training time will soon be exceeded. We forward it to the python training
# script, which needs to write a continue file if it wants to be restarted.
# After the training script returns, check for the continue file and resubmit.
function _cleanup_scratch {
    echo "Checking if we need to clean up scratch: $NEW_DATA_DIR"
    if [[ -d /scratch ]] && [[ $COPY_DATA -eq 1 ]]; then
        python3 "$PROJECT_HOME"/scripts/copy_datadir.py cleanup "$NEW_DATA_DIR" --lock "$MODEL_NAME"
    fi
}
function _at_exit {
    conda deactivate
    # Check whether training was completed: has_continue_file.py exits with 0
    # if the training script left a continue file in BASE_DIR.
    echo "Checking if we need to resubmit the training script"
    python3 "$PROJECT_HOME"/scripts/has_continue_file.py "$BASE_DIR"
    retVal=$?
    if [ $retVal -eq 0 ]; then
        echo "Training not completed. Resubmitting to continue training."
        sh -c "sbatch --exclude=$EXCLUDE \
            --job-name=$SLURM_JOB_NAME \
            $PROJECT_HOME/scripts/sbatch_train.sh $BASE_DIR $PROJECT_HOME $PROJECT_BRANCH"
        exit 0
    fi
    _cleanup_scratch
}
trap _at_exit EXIT
trap _cleanup_scratch ERR
function _usr1 {
    echo "Caught SIGUSR1 signal!"
    # Forward the signal to the training process and wait for it to finish
    kill -USR1 "$trainprocess" 2>/dev/null
    wait "$trainprocess"
}
trap _usr1 SIGUSR1
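# Expected protocol (inferred from the scripts used above): on SIGUSR1 the
# training script presumably saves its state and writes a continue file into
# BASE_DIR; has_continue_file.py then reports this to _at_exit, which resubmits
# the job with the same base dir, project home, and branch.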
cd "$PROJECT_HOME"/DeepFilterNet/df/
# Start training
printf "\n***Starting training***\n\n"
PYTHONPATH="$PROJECT_HOME/DeepFilterNet/" python train.py \
    --host-batchsize-config "$CLUSTER"/host_batchsize.ini \
    "$DATA_CFG" \
    "$DATA_DIR" \
    "$BASE_DIR" \
    "$DEBUG" &
trainprocess=$!
echo "Started trainprocess: $trainprocess"
wait $trainprocess
echo "Training stopped"