Skip to content

Commit

Permalink
Fix spurious "did not start correctly" error. (tensorflow#5252)
Browse files Browse the repository at this point in the history
* Fix spurious "did not start correctly" error.

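The error "Generation subprocess did not start correctly" could be raised spuriously when the asynchronous generation subprocess had not yet written the subproc_alive file at the time the main process checked for it. The main process now polls for the file for up to 15 seconds before raising the error.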

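* Add a separate error message ("Generation subprocess unexpectedly died") for the case where the subproc_alive file disappears after startup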
  • Loading branch information
reedwm authored and Taylor Robie committed Sep 5, 2018
1 parent 5856878 commit 7babedc
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 7 deletions.
14 changes: 12 additions & 2 deletions official/recommendation/data_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,14 @@ def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
atexit.register(tf.gfile.DeleteRecursively,
ncf_dataset.cache_paths.cache_root)

for _ in range(15):
if tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
break
time.sleep(1) # allow `alive` file to be written
if not tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
raise ValueError("Generation subprocess did not start correctly. Data will "
"not be available; exiting to avoid waiting forever.")

return ncf_dataset


Expand Down Expand Up @@ -495,8 +503,10 @@ def make_train_input_fn(ncf_dataset):
"""Construct training input_fn for the current epoch."""

if not tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
raise ValueError("Generation subprocess did not start correctly. Data will "
"not be available; exiting to avoid waiting forever.")
# The generation subprocess must have been alive at some point, because we
# earlier checked that the subproc_alive file existed.
raise ValueError("Generation subprocess unexpectedly died. Data will not "
"be available; exiting to avoid waiting forever.")

train_epoch_dir = ncf_dataset.cache_paths.train_epoch_dir
while not tf.gfile.Exists(train_epoch_dir):
Expand Down
5 changes: 0 additions & 5 deletions official/recommendation/data_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,6 @@ def test_end_to_end(self):
batch_size=BATCH_SIZE, eval_batch_size=BATCH_SIZE, num_data_readers=2,
num_neg=NUM_NEG)

for _ in range(30):
if tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
break
time.sleep(1) # allow `alive` file to be written

g = tf.Graph()
with g.as_default():
input_fn, record_dir, batch_count = \
Expand Down

0 comments on commit 7babedc

Please sign in to comment.