Fix coqui-ai#1986 - Remove distributed training support
tilmankamp committed Apr 1, 2019
1 parent a009361 commit a179a23
Showing 15 changed files with 226 additions and 1,169 deletions.
4 changes: 1 addition & 3 deletions .compute
@@ -7,7 +7,7 @@ python3 -m venv /tmp/venv
 source /tmp/venv/bin/activate
 
 pip install -r <(grep -v tensorflow requirements.txt)
-pip install tensorflow-gpu==1.13.0-rc2
+pip install tensorflow-gpu==1.13.1
 
 # Install ds_ctcdecoder package from TaskCluster
 pip install $(python3 util/taskcluster.py --decoder)
@@ -30,7 +30,5 @@ python -u DeepSpeech.py \
   --learning_rate 0.0001 \
   --dropout_rate 0.2 \
   --epoch 13 \
-  --display_step 0 \
-  --validation_step 1 \
   --checkpoint_dir "../keep" \
   --summary_dir "../keep/summaries"
1 change: 1 addition & 0 deletions .gitignore
@@ -8,6 +8,7 @@
 /runs
 /logs
 /exports
+/data/ldc93s1
 /native_client/setup.cfg
 /native_client/build
 /native_client/*.egg-info
518 changes: 205 additions & 313 deletions DeepSpeech.py

Large diffs are not rendered by default.

25 changes: 0 additions & 25 deletions README.md
@@ -48,7 +48,6 @@ See the output of `deepspeech -h` for more information on the use of `deepspeech
 - [Checkpointing](#checkpointing)
 - [Exporting a model for inference](#exporting-a-model-for-inference)
 - [Exporting a model for TFLite](#exporting-a-model-for-tflite)
-- [Distributed computing across more than one machine](#distributed-training-across-more-than-one-machine)
 - [Continuing training from a release model](#continuing-training-from-a-release-model)
 - [Contact/Getting Help](#contactgetting-help)
 
@@ -352,30 +351,6 @@ $ convert_graphdef_memmapped_format --in_graph=output_graph.pb --out_graph=outpu
 
 Upon successful run, it should report the conversion of a non-zero number of nodes. If it reports converting `0` nodes, something is wrong: make sure your model is a frozen one, and that you have not applied any incompatible changes (this includes `quantize_weights`).
 
-### Distributed training across more than one machine
-
-DeepSpeech has built-in support for [distributed TensorFlow](https://www.tensorflow.org/deploy/distributed). To get an idea on how this works, you can use the script `bin/run-cluster.sh` for running a cluster with workers just on the local machine.
-
-```bash
-$ bin/run-cluster.sh --help
-Usage: run-cluster.sh [--help] [--script script] [p:w:g] <arg>*
-
---help       print this help message
---script     run the provided script instead of DeepSpeech.py
-p            number of local parameter servers
-w            number of local workers
-g            number of local GPUs per worker
-<arg>*       remaining parameters will be forwarded to DeepSpeech.py or a provided script
-
-Example usage - The following example will create a local DeepSpeech.py cluster
-with 1 parameter server, and 2 workers with 1 GPU each:
-$ run-cluster.sh 1:2:1 --epoch 10
-```
-
-Be aware that for the help example to be able to run, you need at least two `CUDA`-capable GPUs (2 workers x 1 GPU). The script uses the environment variable `CUDA_VISIBLE_DEVICES` so that `DeepSpeech.py` sees only the provided number of GPUs per worker.
-
-The script is meant to be a template for your own distributed computing instrumentation. Just modify the startup code for the different servers (workers and parameter servers) accordingly. You could use SSH or something similar to run them on your remote hosts.
-
 ### Continuing training from a release model
 
 If you'd like to use one of the pre-trained models released by Mozilla to bootstrap your training process (transfer learning, fine tuning), you can do so by using the `--checkpoint_dir` flag in `DeepSpeech.py`. Specify the path where you downloaded the checkpoint from the release, and training will resume from the pre-trained model.
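The "continuing training" workflow described in the paragraph above boils down to pointing `--checkpoint_dir` at the downloaded release checkpoint and training as usual. A minimal sketch, assuming a downloaded release checkpoint and your own CSVs; paths and hyperparameters are placeholders, not part of this commit, and the network geometry must match the released models (they were trained with `--n_hidden 2048`):

```bash
# Illustrative only -- flags mirror the ones used in .compute and the bin/ scripts above.
python -u DeepSpeech.py \
  --n_hidden 2048 \
  --checkpoint_dir path/to/downloaded/release/checkpoint \
  --train_files my/train.csv \
  --dev_files my/dev.csv \
  --test_files my/test.csv \
  --learning_rate 0.0001 \
  --epoch 3
```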
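Likewise, the distributed-training section removed above amounted to starting one `DeepSpeech.py` process per role on each host. A rough sketch of the manual startup that the `run-cluster.sh` template automated, assuming the cluster flags referenced in `util/config.py` below (`--ps_hosts`, `--worker_hosts`, `--job_name`, `--task_index`); hostnames and ports are placeholders:

```bash
# Hypothetical remote startup over SSH -- not code from the repository.
CLUSTER="--ps_hosts ps0:2222 --worker_hosts worker0:2223,worker1:2223"

ssh ps0     "cd DeepSpeech && python -u DeepSpeech.py $CLUSTER --job_name ps --task_index 0" &
ssh worker0 "cd DeepSpeech && CUDA_VISIBLE_DEVICES=0 python -u DeepSpeech.py $CLUSTER --job_name worker --task_index 0 --epoch 10" &
ssh worker1 "cd DeepSpeech && CUDA_VISIBLE_DEVICES=0 python -u DeepSpeech.py $CLUSTER --job_name worker --task_index 1 --epoch 10" &
wait
```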
72 changes: 0 additions & 72 deletions bin/job-template.sbatch

This file was deleted.

99 changes: 0 additions & 99 deletions bin/run-cluster.sh

This file was deleted.

2 changes: 1 addition & 1 deletion bin/run-ldc93s1.sh
@@ -16,7 +16,7 @@ else
   checkpoint_dir=$(python -c 'from xdg import BaseDirectory as xdg; print(xdg.save_data_path("deepspeech/ldc93s1"))')
 fi
 
-python -u DeepSpeech.py \
+python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
   --train_files data/ldc93s1/ldc93s1.csv \
   --dev_files data/ldc93s1/ldc93s1.csv \
   --test_files data/ldc93s1/ldc93s1.csv \
4 changes: 2 additions & 2 deletions bin/run-tc-ldc93s1_checkpoint.sh
@@ -12,7 +12,7 @@ if [ ! -f "${ldc93s1_dir}/ldc93s1.csv" ]; then
   python -u bin/import_ldc93s1.py ${ldc93s1_dir}
 fi;
 
-python -u DeepSpeech.py --noshow_progressbar \
+python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
   --train_files ${ldc93s1_csv} --train_batch_size 1 \
   --dev_files ${ldc93s1_csv} --dev_batch_size 1 \
   --test_files ${ldc93s1_csv} --test_batch_size 1 \
@@ -22,7 +22,7 @@ python -u DeepSpeech.py --noshow_progressbar \
   --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
   --lm_trie_path 'data/smoke_test/vocab.trie' | tee /tmp/resume.log
 
-if ! grep "Training of Epoch $epoch_count" /tmp/resume.log; then
+if ! grep "Training epoch $epoch_count" /tmp/resume.log; then
     echo "Did not resume training from checkpoint"
     exit 1
 else
2 changes: 1 addition & 1 deletion bin/run-tc-ldc93s1_new.sh
@@ -12,7 +12,7 @@ if [ ! -f "${ldc93s1_dir}/ldc93s1.csv" ]; then
   python -u bin/import_ldc93s1.py ${ldc93s1_dir}
 fi;
 
-python -u DeepSpeech.py \
+python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
   --train_files ${ldc93s1_csv} --train_batch_size 1 \
   --train_cached_features_path "/tmp/ldc93s1.hdf5" \
   --dev_files ${ldc93s1_csv} --dev_batch_size 1 \
4 changes: 2 additions & 2 deletions bin/run-tc-ldc93s1_singleshotinference.sh
@@ -10,7 +10,7 @@ if [ ! -f "${ldc93s1_dir}/ldc93s1.csv" ]; then
   python -u bin/import_ldc93s1.py ${ldc93s1_dir}
 fi;
 
-python -u DeepSpeech.py \
+python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
   --train_files ${ldc93s1_csv} --train_batch_size 1 \
   --dev_files ${ldc93s1_csv} --dev_batch_size 1 \
   --test_files ${ldc93s1_csv} --test_batch_size 1 \
@@ -20,7 +20,7 @@ python -u DeepSpeech.py \
   --lm_binary_path 'data/smoke_test/vocab.pruned.lm' \
   --lm_trie_path 'data/smoke_test/vocab.trie'
 
-python -u DeepSpeech.py \
+python -u DeepSpeech.py --noshow_progressbar --noearly_stop \
   --train_files ${ldc93s1_csv} --train_batch_size 1 \
   --dev_files ${ldc93s1_csv} --dev_batch_size 1 \
   --test_files ${ldc93s1_csv} --test_batch_size 1 \
2 changes: 1 addition & 1 deletion bin/run-tc-ldc93s1_tflite.sh
@@ -10,7 +10,7 @@ if [ ! -f "${ldc93s1_dir}/ldc93s1.csv" ]; then
   python -u bin/import_ldc93s1.py ${ldc93s1_dir}
 fi;
 
-python -u DeepSpeech.py \
+python -u DeepSpeech.py --noshow_progressbar \
   --n_hidden 494 \
   --checkpoint_dir '/tmp/ckpt' \
   --export_dir '/tmp/train' \
54 changes: 7 additions & 47 deletions util/config.py
@@ -4,7 +4,6 @@
 import tensorflow as tf
 
 from attrdict import AttrDict
-from six.moves import zip, range, filter
 from util.flags import FLAGS
 from util.gpu import get_available_gpus
 from util.logging import log_error
@@ -27,30 +26,11 @@ def __getattr__(self, name):
 def initialize_globals():
     c = AttrDict()
 
-    # ps and worker hosts required for p2p cluster setup
-    FLAGS.ps_hosts = list(filter(len, FLAGS.ps_hosts.split(',')))
-    FLAGS.worker_hosts = list(filter(len, FLAGS.worker_hosts.split(',')))
+    # CPU device
+    c.cpu_device = '/cpu:0'
 
-    # Create a cluster from the parameter server and worker hosts.
-    c.cluster = tf.train.ClusterSpec({'ps': FLAGS.ps_hosts, 'worker': FLAGS.worker_hosts})
-
-    # The absolute number of computing nodes - regardless of cluster or single mode
-    num_workers = max(1, len(FLAGS.worker_hosts))
-
-    # If replica numbers are negative, we multiply their absolute values with the number of workers
-    if FLAGS.replicas < 0:
-        FLAGS.replicas = num_workers * -FLAGS.replicas
-    if FLAGS.replicas_to_agg < 0:
-        FLAGS.replicas_to_agg = num_workers * -FLAGS.replicas_to_agg
-
-    # The device path base for this node
-    c.worker_device = '/job:%s/task:%d' % (FLAGS.job_name, FLAGS.task_index)
-
-    # This node's CPU device
-    c.cpu_device = c.worker_device + '/cpu:0'
-
-    # This node's available GPU devices
-    c.available_devices = [c.worker_device + gpu for gpu in get_available_gpus()]
+    # Available GPU devices
+    c.available_devices = get_available_gpus()
 
     # If there is no GPU available, we fall back to CPU based operation
     if 0 == len(c.available_devices):
@@ -68,6 +48,9 @@ def initialize_globals():
     if len(FLAGS.checkpoint_dir) == 0:
         FLAGS.checkpoint_dir = xdg.save_data_path(os.path.join('deepspeech','checkpoints'))
 
+    if FLAGS.load not in ['last', 'best', 'init', 'auto']:
+        FLAGS.load = 'auto'
+
     # Set default summary dir
     if len(FLAGS.summary_dir) == 0:
         FLAGS.summary_dir = xdg.save_data_path(os.path.join('deepspeech','summaries'))
@@ -109,26 +92,6 @@ def initialize_globals():
     # Units in the sixth layer = number of characters in the target language plus one
     c.n_hidden_6 = c.alphabet.size() + 1 # +1 for CTC blank label
 
-    # Queues that are used to gracefully stop parameter servers.
-    # Each queue stands for one ps. A finishing worker sends a token to each queue before joining/quitting.
-    # Each ps will dequeue as many tokens as there are workers before joining/quitting.
-    # This ensures parameter servers won't quit, if still required by at least one worker and
-    # also won't wait forever (like with a standard `server.join()`).
-    done_queues = []
-    for i, ps in enumerate(FLAGS.ps_hosts):
-        # Queues are hosted by their respective owners
-        with tf.device('/job:ps/task:%d' % i):
-            done_queues.append(tf.FIFOQueue(1, tf.int32, shared_name=('queue%i' % i)))
-
-    # Placeholder to pass in the worker's index as token
-    c.token_placeholder = tf.placeholder(tf.int32)
-
-    # Enqueue operations for each parameter server
-    c.done_enqueues = [queue.enqueue(c.token_placeholder) for queue in done_queues]
-
-    # Dequeue operations for each parameter server
-    c.done_dequeues = [queue.dequeue() for queue in done_queues]
-
     if len(FLAGS.one_shot_infer) > 0:
         FLAGS.train = False
         FLAGS.test = False
@@ -137,7 +100,4 @@ def initialize_globals():
             log_error('Path specified in --one_shot_infer is not a valid file.')
             exit(1)
 
-    # Determine, if we are the chief worker
-    c.is_chief = len(FLAGS.worker_hosts) == 0 or (FLAGS.task_index == 0 and FLAGS.job_name == 'worker')
-
     ConfigSingleton._config = c
(The remaining changed files are not shown.)