Added a SummaryWriter singleton per output dir.
This should ensure:
 * A single summary file when evaluate is called multiple times.
 * A single summary file during training for global_step, session logs, and merged summaries.
Change: 123453434
ilblackdragon authored and tensorflower-gardener committed May 27, 2016
1 parent d437d2e commit 926a67e
Showing 3 changed files with 39 additions and 10 deletions.
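For context: every `SummaryWriter` constructed for a directory opens its own events file there, so building a fresh writer per `evaluate` call scattered summaries across several files. A minimal before/after sketch, assuming the 2016-era `tf.train.SummaryWriter` alias and a made-up `/tmp/model` directory:

    import tensorflow as tf
    from tensorflow.contrib.learn.python.learn import graph_actions

    # Before this commit: two writers on the same directory meant two events files.
    w1 = tf.train.SummaryWriter('/tmp/model')
    w2 = tf.train.SummaryWriter('/tmp/model')

    # After: call sites share one cached writer per directory.
    w3 = graph_actions.get_summary_writer('/tmp/model')
    w4 = graph_actions.get_summary_writer('/tmp/model')
    assert w3 is w4  # hence a single events file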
3 changes: 2 additions & 1 deletion tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -381,7 +381,8 @@ def _train_model(self,
       monitors += monitors_lib.get_default_monitors(
           loss_op=loss_op,
           summary_op=logging_ops.get_summary_op(),
-          save_summary_steps=100)
+          save_summary_steps=100,
+          summary_writer=graph_actions.get_summary_writer(self._model_dir))
 
       is_chief = self._config.task == 0
       if not is_chief:
34 changes: 30 additions & 4 deletions tensorflow/contrib/learn/python/learn/graph_actions.py
@@ -22,6 +22,7 @@
 
 import itertools
 import sys
+import threading
 import time
 
 import numpy as np
@@ -54,6 +55,30 @@
 Coordinator = coordinator.Coordinator
 SummaryWriter = summary_io.SummaryWriter
 
+# Singleton SummaryWriter per logdir folder.
+_SUMMARY_WRITERS = {}
+
+# Lock protecting _SUMMARY_WRITERS.
+_summary_writer_lock = threading.Lock()
+
+
+def get_summary_writer(logdir):
+  """Returns a single SummaryWriter per logdir for the current run.
+
+  Args:
+    logdir: str, folder to write summaries to.
+
+  Returns:
+    The existing `SummaryWriter` for `logdir`, or a new one if that
+    directory has not been written to yet.
+  """
+  with _summary_writer_lock:
+    if logdir not in _SUMMARY_WRITERS:
+      _SUMMARY_WRITERS[logdir] = SummaryWriter(
+          logdir, graph=ops.get_default_graph())
+    return _SUMMARY_WRITERS[logdir]
 
 
 class NanLossDuringTrainingError(RuntimeError):

@@ -196,6 +221,8 @@ def train(graph,
   if global_step_tensor is None:
     raise ValueError('No "global_step" was provided or found in the graph.')
 
+  summary_writer = get_summary_writer(output_dir)
+
   # TODO(ipolosukhin): Replace all functionality of Supervisor with Monitors.
   if not supervisor_is_chief:
     # monitors should run only in supervisor.
@@ -205,7 +232,7 @@
         loss_op=loss_op,
         summary_op=logging_ops.get_summary_op(),
         save_summary_steps=supervisor_save_summaries_steps,
-        output_dir=output_dir)
+        summary_writer=summary_writer)
 
   # Start monitors, can create graph parts.
   for monitor in monitors:
@@ -220,6 +247,7 @@
       saver=_make_saver(graph),
       global_step=global_step_tensor,
       summary_op=None,
+      summary_writer=summary_writer,
       save_model_secs=supervisor_save_model_secs,
       init_fn=init_fn)
   session = supervisor.PrepareSession(master=supervisor_master,
@@ -500,9 +528,7 @@ def evaluate(graph,
   if summary_op is not None and feed_fn is None:
     summary_writer = None
     try:
-      summary_writer = SummaryWriter(output_dir,
-                                     graph_def=session.graph_def)
-
+      summary_writer = get_summary_writer(output_dir)
       summary_str = session.run(summary_op)
       if summary_str:
         summary_writer.add_summary(summary_str, current_global_step)
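The getter introduced above is just a lock-guarded cache keyed by directory. A self-contained sketch of the same pattern with no TensorFlow dependency (all names here are illustrative, not from the commit):

    import threading

    _writers = {}                     # maps logdir -> writer instance
    _writers_lock = threading.Lock()  # guards _writers

    class FakeWriter(object):
      """Stand-in for summary_io.SummaryWriter."""

      def __init__(self, logdir):
        self.logdir = logdir

    def get_writer(logdir):
      """Returns the cached writer for logdir, creating it on first use."""
      with _writers_lock:
        if logdir not in _writers:
          _writers[logdir] = FakeWriter(logdir)
        return _writers[logdir]

    assert get_writer('/tmp/run') is get_writer('/tmp/run')
    assert get_writer('/tmp/run') is not get_writer('/tmp/other')

Holding the lock across both the membership test and the construction is what stops two threads from racing to create, and then orphan, a second writer for the same directory.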
12 changes: 7 additions & 5 deletions tensorflow/contrib/learn/python/learn/monitors.py
@@ -144,12 +144,13 @@ def every_n_step_end(self, step, outputs):
 class SummarySaver(EveryN):
   """Saves summaries every N steps."""
 
-  def __init__(self, summary_op, save_steps=100, output_dir=None):
+  def __init__(self, summary_op, save_steps=100, output_dir=None,
+               summary_writer=None):
     # TODO(ipolosukhin): Implement every N seconds.
     super(SummarySaver, self).__init__(every_n_steps=save_steps)
     self._summary_op = summary_op
-    self._summary_writer = None
-    if output_dir:
+    self._summary_writer = summary_writer
+    if summary_writer is None and output_dir:
       self._summary_writer = summary_io.SummaryWriter(output_dir)
 
   def set_estimator(self, estimator):
@@ -226,11 +227,12 @@ def every_n_step_end(self, step, outputs):


 def get_default_monitors(loss_op=None, summary_op=None, save_summary_steps=100,
-                         output_dir=None):
+                         output_dir=None, summary_writer=None):
   monitors = []
   if loss_op is not None:
     monitors.append(PrintTensor([loss_op.name]))
   if summary_op is not None:
     monitors.append(SummarySaver(summary_op, save_steps=save_summary_steps,
-                                 output_dir=output_dir))
+                                 output_dir=output_dir,
+                                 summary_writer=summary_writer))
   return monitors

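With the new keyword argument, an explicitly passed writer wins and `output_dir` is only a fallback. A hedged usage sketch against the pre-1.0 contrib.learn API (the path and the dummy summary are made up):

    import tensorflow as tf
    from tensorflow.contrib.learn.python.learn import graph_actions
    from tensorflow.contrib.learn.python.learn import monitors as monitors_lib

    tf.scalar_summary('loss', tf.constant(0.5))  # pre-1.0 summary API
    summary_op = tf.merge_all_summaries()
    shared = graph_actions.get_summary_writer('/tmp/model')

    # New path: the shared writer is used; output_dir is not consulted.
    saver = monitors_lib.SummarySaver(summary_op, save_steps=100,
                                      summary_writer=shared)

    # Legacy path: still accepted, but builds its own private writer.
    legacy = monitors_lib.SummarySaver(summary_op, save_steps=100,
                                       output_dir='/tmp/model')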