Merge pull request yahoo#522 from yahoo/leewyang_update_examples
fix examples broken by external changes
leewyang authored May 11, 2020
2 parents 690b35f + 007bcb4 commit b802848
Showing 19 changed files with 215 additions and 2,743 deletions.
6 changes: 2 additions & 4 deletions examples/mnist/estimator/mnist_spark.py
@@ -1,7 +1,5 @@
# Adapted from: https://www.tensorflow.org/beta/tutorials/distribute/multi_worker_with_estimator

from __future__ import absolute_import, division, print_function, unicode_literals


def main_fun(args, ctx):
import numpy as np
@@ -62,7 +60,7 @@ def scale(image, label):

def serving_input_receiver_fn():
features = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, 28, 28, 1], name='features')
receiver_tensors = {'features': features}
receiver_tensors = {'conv2d_input': features}
return tf.estimator.export.ServingInputReceiver(receiver_tensors, receiver_tensors)

def model_fn(features, labels, mode):
@@ -154,4 +152,4 @@ def parse(ln):

cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, num_ps=0, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.SPARK, log_dir=args.model_dir, master_node='chief')
cluster.train(images_labels, args.epochs)
cluster.shutdown(grace_secs=120) # allow time for the chief to export model after data feeding
cluster.shutdown(grace_secs=60) # allow time for the chief to export model after data feeding
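The renamed receiver tensor (`conv2d_input` instead of `features`) changes the input key that clients of the exported SavedModel must use. A minimal sketch of calling the exported signature in TF 2.x, with a hypothetical export path that is not part of this commit:

```
# Sketch only: loading the estimator's exported SavedModel and calling its
# serving signature. The export directory below is a hypothetical example.
import numpy as np
import tensorflow as tf

export_dir = "/tmp/mnist_model/export/serving/1589200000"  # hypothetical path
loaded = tf.saved_model.load(export_dir)
infer = loaded.signatures["serving_default"]

# The serving input receiver now expects the key 'conv2d_input' rather than 'features'.
batch = tf.constant(np.zeros((1, 28, 28, 1), dtype=np.float32))
print(infer(conv2d_input=batch))
```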
4 changes: 1 addition & 3 deletions examples/mnist/estimator/mnist_spark_streaming.py
@@ -1,7 +1,5 @@
# Adapted from: https://www.tensorflow.org/beta/tutorials/distribute/multi_worker_with_estimator

from __future__ import absolute_import, division, print_function, unicode_literals


def main_fun(args, ctx):
import numpy as np
@@ -50,7 +48,7 @@ def scale(image, label):

def serving_input_receiver_fn():
features = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, 28, 28, 1], name='features')
receiver_tensors = {'features': features}
receiver_tensors = {'conv2d_input': features}
return tf.estimator.export.ServingInputReceiver(receiver_tensors, receiver_tensors)

def model_fn(features, labels, mode):
8 changes: 3 additions & 5 deletions examples/mnist/estimator/mnist_tf.py
@@ -1,7 +1,5 @@
# Adapted from: https://www.tensorflow.org/beta/tutorials/distribute/multi_worker_with_estimator

from __future__ import absolute_import, division, print_function, unicode_literals


def main_fun(args, ctx):
import tensorflow_datasets as tfds
@@ -30,7 +28,7 @@ def scale(image, label):

def serving_input_receiver_fn():
features = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, 28, 28, 1], name='features')
receiver_tensors = {'features': features}
receiver_tensors = {'conv2d_input': features}
return tf.estimator.export.ServingInputReceiver(receiver_tensors, receiver_tensors)

def model_fn(features, labels, mode):
@@ -39,7 +37,7 @@ def model_fn(features, labels, mode):
tf.keras.layers.MaxPooling2D(),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dense(10, activation='softmax')
tf.keras.layers.Dense(10)
])
logits = model(features, training=False)

@@ -107,4 +105,4 @@ def model_fn(features, labels, mode):
print("args:", args)

cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, num_ps=0, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.TENSORFLOW, log_dir=args.model_dir, master_node='chief', eval_node=True)
cluster.shutdown(grace_secs=120)
cluster.shutdown(grace_secs=60)
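With the final layer changed to `Dense(10)` (no softmax), the Keras model now emits raw logits, which matches the `logits = model(features, training=False)` line above. The loss inside `model_fn` then has to be computed from logits; the exact call sits outside the visible hunks, so the following is only a sketch of that pattern:

```
# Sketch only: computing loss and predictions from raw logits inside an
# Estimator-style model_fn; not copied from this example's hidden code.
import tensorflow as tf

def loss_and_predictions(logits, labels):
    # Softmax is folded into the loss instead of the last Dense layer.
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))
    predictions = tf.argmax(logits, axis=1)
    return loss, predictions
```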
32 changes: 21 additions & 11 deletions examples/resnet/README.md
@@ -1,6 +1,6 @@
# ResNet Image Classification

Original Source: https://github.com/tensorflow/models/tree/master/official/vision/image_classification
Original Source: https://github.com/tensorflow/models/tree/9b8544e0aa5db64161e4f784eac9a0836736183f/official/benchmark/models

This code is based on the Image Classification model from the official [TensorFlow Models](https://github.com/tensorflow/models) repository. This example already supports different forms of distribution via the `DistributionStrategy` API, so there isn't much additional work to convert it to TensorFlowOnSpark.
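For background, using the `DistributionStrategy` API typically amounts to building and compiling the Keras model inside a strategy scope; a minimal illustrative sketch (not taken from `resnet_cifar_main.py` itself):

```
# Generic DistributionStrategy pattern; layer sizes here are illustrative only.
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()  # or a multi-worker strategy
with strategy.scope():
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(32, 32, 3)),
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(10)
    ])
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])
```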

@@ -10,34 +10,44 @@ Notes:

#### Run the Single-Node Application

First, make sure that you can run the example per the [original instructions](https://github.com/tensorflow/models/tree/68c3c65596b8fc624be15aef6eac3dc8952cbf23/official/vision/image_classification). For now, we'll just use the CIFAR-10 dataset. After cloning the [tensorflow/models](https://github.com/tensorflow/models) repository (checking out the `v2.0` tag with `git checkout v2.0`), and downloading the dataset, you should be able to run the training as follows:
First, make sure that you can run the original example, as follows:
```
# Note: these instructions have been tested with the `v2.0` tag of tensorflow/models.
# clone the TensorFlow models repository
git clone https://github.com/tensorflow/models
cd models
# checkout the specific revision that this example was based upon
git checkout 9b8544e0aa5db64161e4f784eac9a0836736183f
# download the CIFAR10 dataset to /tmp/cifar10_data
python official/r1/resnet/cifar10_download_and_extract.py
# run the example
export TENSORFLOW_MODELS=$(pwd)
export CIFAR_DATA=/tmp/cifar10_data
export PYTHONPATH=${TENSORFLOW_MODELS}:$PYTHONPATH
export TENSORFLOW_MODELS=/path/to/tensorflow/models
export CIFAR_DATA=/path/to/cifar
export PYTHONPATH=${PYTHONPATH}:${TENSORFLOW_MODELS}
python resnet_cifar_main.py --data_dir=${CIFAR_DATA} --num_gpus=0 --train_epochs=1
```

If you have GPUs available, just set `--num_gpus` to the number of GPUs on your machine. Note: by default, `--train_epochs=182`, which runs for a long time on a CPU machine, so for brevity, we'll just run a single epoch in these examples.
If you have GPUs available, just set `--num_gpus` to the number of GPUs on your machine.

#### Run as a Distributed TensorFlow Application

Next, confirm that this application is capable of being distributed. We can test this on a single CPU machine by using two different terminal/shell sessions, as follows:
```
# in one shell/window
export PYTHONPATH=${PYTHONPATH}:${TENSORFLOW_MODELS}
export TF_CONFIG='{"cluster": { "worker": ["localhost:2222", "localhost:2223"]}, "task": {"type": "worker", "index": 0}}'
export TF_CONFIG='{"cluster": { "chief": ["localhost:2222"], "worker": ["localhost:2223"]}, "task": {"type": "chief", "index": 0}}'
python resnet_cifar_main.py --data_dir=${CIFAR_DATA} --num_gpus=0 --ds=multi_worker_mirrored --train_epochs=1
# in another shell/window
export PYTHONPATH=${PYTHONPATH}:${TENSORFLOW_MODELS}
export TF_CONFIG='{"cluster": { "worker": ["localhost:2222", "localhost:2223"]}, "task": {"type": "worker", "index": 1}}'
export TF_CONFIG='{"cluster": { "chief": ["localhost:2222"], "worker": ["localhost:2223"]}, "task": {"type": "worker", "index": 0}}'
python resnet_cifar_main.py --data_dir=${CIFAR_DATA} --num_gpus=0 --ds=multi_worker_mirrored --train_epochs=1
```

Note that we now configure the code to use the `MultiWorkerMirroredtrategy`. Also note that training will not begin until both nodes have started.
Note that we now configure the code to use the `MultiWorkerMirroredStrategy`. Also note that training will not begin until both nodes have started.
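The `TF_CONFIG` values above define a two-process cluster with one `chief` and one `worker`; each process reads its own role from that environment variable. An illustrative sketch of how a process sees its role (TF 2.0–2.3 exposes the strategy under `tf.distribute.experimental`):

```
# Illustrative only: inspecting TF_CONFIG and creating the strategy.
import json
import os

import tensorflow as tf

tf_config = json.loads(os.environ["TF_CONFIG"])
print("cluster spec:", tf_config["cluster"])   # {"chief": [...], "worker": [...]}
print("this task:", tf_config["task"])         # e.g. {"type": "chief", "index": 0}

# The strategy reads TF_CONFIG from the environment; as noted above, training
# does not begin until every process in the cluster has started.
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
print("replicas in sync:", strategy.num_replicas_in_sync)
```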

### Run as a TensorFlowOnSpark Application

@@ -62,7 +72,7 @@ ${SPARK_HOME}/bin/spark-submit \
${TFoS_HOME}/examples/resnet/resnet_cifar_spark.py \
--cluster_size ${SPARK_WORKER_INSTANCES} \
--epochs 1 \
--data_dir /Users/leewyang/datasets/cifar10/cifar-10-batches-bin \
--data_dir ${CIFAR_DATA} \
--num_gpus=0 \
--ds=multi_worker_mirrored \
--train_epochs 1
164 changes: 0 additions & 164 deletions examples/resnet/cifar_preprocessing.py

This file was deleted.
