[TF] Progress bar for evaluation step. Timer for logging. (#448)
* Added a progress bar for the evaluation step

* Added a timer for the train step

Co-authored-by: Alexander Suslov <[email protected]>
andrey-churkin and alexsu52 authored Feb 5, 2021
1 parent 9a98492 commit 3b6847c
Showing 4 changed files with 107 additions and 34 deletions.
29 changes: 29 additions & 0 deletions beta/examples/tensorflow/common/utils.py
@@ -11,6 +11,7 @@
limitations under the License.
"""

+import time
import datetime
import json
import logging
@@ -172,3 +173,31 @@ def __call__(self, metrics, step):

def close(self):
self.writer.close()


+class Timer:
+    """A simple timer."""
+
+    def __init__(self):
+        self.reset()
+
+    def tic(self):
+        # using time.time instead of time.clock because time.clock
+        # does not normalize for multithreading
+        self.start_time = time.time()
+
+    def toc(self, average=True):
+        self.diff = time.time() - self.start_time
+        self.total_time += self.diff
+        self.calls += 1
+        self.average_time = self.total_time / self.calls
+        if average:
+            return self.average_time
+        return self.diff
+
+    def reset(self):
+        self.total_time = 0.
+        self.calls = 0
+        self.start_time = 0.
+        self.diff = 0.
+        self.average_time = 0.
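
A minimal usage sketch for the Timer class above (not part of the commit; the sleep is a stand-in for real work): each tic()/toc() pair measures one interval, toc(average=False) returns just the latest interval, and toc(average=True) returns the running mean of all intervals since the last reset().

import time

from beta.examples.tensorflow.common.utils import Timer

timer = Timer()
for _ in range(3):
    timer.tic()                      # start (or restart) an interval
    time.sleep(0.1)                  # stand-in for a train/test step
    last = timer.toc(average=False)  # ~0.1 sec: the latest interval only

print(timer.total_time)    # ~0.3 sec: sum of the three intervals
print(timer.average_time)  # ~0.1 sec: total_time / calls
timer.reset()              # zero all counters before timing a new phase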
58 changes: 40 additions & 18 deletions beta/examples/tensorflow/object_detection/main.py
@@ -30,6 +30,7 @@
from beta.examples.tensorflow.common.sample_config import create_sample_config
from beta.examples.tensorflow.common.scheduler import build_scheduler
from beta.examples.tensorflow.common.utils import SummaryWriter
+from beta.examples.tensorflow.common.utils import Timer
from beta.examples.tensorflow.common.utils import serialize_config
from beta.examples.tensorflow.common.utils import create_code_snapshot
from beta.examples.tensorflow.common.utils import configure_paths
@@ -41,8 +42,7 @@

def get_argument_parser():
parser = get_common_argument_parser(precision=False,
-                                        save_checkpoint_freq=False,
-                                        print_freq=False)
+                                        save_checkpoint_freq=False)

parser.add_argument(
'--mode',
@@ -166,16 +166,18 @@ def train_step(dataset_inputs):


def train(train_step, test_step, eval_metric, train_dist_dataset, test_dist_dataset, initial_epoch, initial_step,
-          epochs, steps_per_epoch, checkpoint_manager, compression_ctrl, log_dir, optimizer):
+          epochs, steps_per_epoch, checkpoint_manager, compression_ctrl, log_dir, optimizer, num_test_batches, print_freq):

train_summary_writer = SummaryWriter(log_dir, 'train')
validation_summary_writer = SummaryWriter(log_dir, 'validation')
compression_summary_writer = SummaryWriter(log_dir, 'compression')

+    timer = Timer()
+    timer.tic()

-    logger.info('Training started')
+    logger.info('Training...')
for epoch in range(initial_epoch, epochs):
-        logger.info('Epoch {}/{}'.format(epoch, epochs))
+        logger.info('Epoch: {}/{}'.format(epoch, epochs))
compression_ctrl.scheduler.epoch_step(epoch)

for step, x in enumerate(train_dist_dataset):
@@ -197,12 +199,13 @@ def train(train_step, test_step, eval_metric, train_dist_dataset, test_dist_data

train_summary_writer(metrics=train_metric_result, step=optimizer.iterations.numpy())

-            if step % 100 == 0:
-                logger.info('Step {}/{}'.format(step, steps_per_epoch))
+            if step % print_freq == 0:
+                time = timer.toc(average=False)
+                logger.info('Step: {}/{} Time: {:.3f} sec'.format(step, steps_per_epoch, time))
                logger.info('Training metric = {}'.format(train_metric_result))
+                timer.tic()

-        logger.info('Evaluation...')
-        test_metric_result = evaluate(test_step, eval_metric, test_dist_dataset)
+        test_metric_result = evaluate(test_step, eval_metric, test_dist_dataset, num_test_batches, print_freq)
validation_summary_writer(metrics=test_metric_result, step=optimizer.iterations.numpy())
eval_metric.reset_states()
logger.info('Validation metric = {}'.format(test_metric_result))
@@ -220,13 +223,32 @@ def train(train_step, test_step, eval_metric, train_dist_dataset, test_dist_data
compression_summary_writer.close()


-def evaluate(test_step, metric, test_dist_dataset):
+def evaluate(test_step, metric, test_dist_dataset, num_batches, print_freq):
"""Runs evaluation steps and aggregate metrics"""
-    for x in test_dist_dataset:
+    timer = Timer()
+    timer.tic()
+
+    logger.info('Testing...')
+    for batch_idx, x in enumerate(test_dist_dataset):
labels, outputs = test_step(x)
metric.update_state(labels, outputs)

-    return metric.result()
+        if batch_idx % print_freq == 0:
+            time = timer.toc(average=False)
+            logger.info('Predict for batch: {}/{} Time: {:.3f} sec'.format(batch_idx, num_batches, time))
+            timer.tic()
+
+    logger.info('Total time: {:.3f} sec'.format(timer.total_time))
+
+    timer.reset()
+
+    logger.info('Evaluating predictions...')
+    timer.tic()
+    result = metric.result()
+    timer.toc(average=False)
+    logger.info('Total time: {:.3f} sec'.format(timer.total_time))
+
+    return result
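
For illustration only (not part of the commit): with print_freq=100 and num_test_batches=500, the loop above would log along these lines, all timings invented:

Testing...
Predict for batch: 0/500 Time: 0.412 sec
Predict for batch: 100/500 Time: 41.337 sec
...
Predict for batch: 400/500 Time: 41.298 sec
Total time: 165.844 sec
Evaluating predictions...
Total time: 12.081 sec

Each 'Time' entry is the interval since the previous report, i.e. the cost of the last print_freq batches rather than a per-batch figure; and because toc() only fires on reporting steps, the batches after the final report (401-499 here) do not contribute to the first 'Total time'.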


def run(config):
@@ -235,14 +257,15 @@
# Create dataset
builders = get_dataset_builders(config, strategy.num_replicas_in_sync)
datasets = [builder.build() for builder in builders]
-    train_builder, _ = builders
+    train_builder, test_builder = builders
train_dataset, test_dataset = datasets
train_dist_dataset = strategy.experimental_distribute_dataset(train_dataset)
test_dist_dataset = strategy.experimental_distribute_dataset(test_dataset)

# Training parameters
epochs = config.epochs
steps_per_epoch = train_builder.steps_per_epoch
+    num_test_batches = test_builder.steps_per_epoch

# Create model builder
model_builder = get_model_builder(config)
@@ -276,20 +299,19 @@ def run(config):
config.ckpt_path,
steps_per_epoch)
else:
-        logger.info('initialization...')
+        logger.info('Initialization...')
compression_ctrl.initialize(dataset=train_dataset)

train_step = create_train_step_fn(strategy, compress_model, loss_fn, optimizer)
test_step = create_test_step_fn(strategy, compress_model, predict_post_process_fn)

if 'train' in config.mode:
-        logger.info('Training...')
train(train_step, test_step, eval_metric, train_dist_dataset, test_dist_dataset, initial_epoch, initial_step,
-              epochs, steps_per_epoch, checkpoint_manager, compression_ctrl, config.log_dir, optimizer)
+              epochs, steps_per_epoch, checkpoint_manager, compression_ctrl, config.log_dir, optimizer, num_test_batches,
+              config.print_freq)

logger.info('Evaluation...')
print_statistics(compression_ctrl.statistics())
-        metric_result = evaluate(test_step, eval_metric, test_dist_dataset)
+        metric_result = evaluate(test_step, eval_metric, test_dist_dataset, num_test_batches, config.print_freq)
logger.info('Validation metric = {}'.format(metric_result))

if config.metrics_dump is not None:
34 changes: 26 additions & 8 deletions beta/examples/tensorflow/segmentation/evaluation.py
@@ -30,6 +30,7 @@
from beta.examples.tensorflow.common.utils import get_saving_parameters
from beta.examples.tensorflow.common.utils import SummaryWriter
from beta.examples.tensorflow.common.utils import write_metrics
+from beta.examples.tensorflow.common.utils import Timer
from beta.examples.tensorflow.segmentation.models.model_selector import get_predefined_config
from beta.examples.tensorflow.segmentation.models.model_selector import get_model_builder

Expand All @@ -40,7 +41,6 @@ def get_argument_parser():
precision=False,
save_checkpoint_freq=False,
to_h5=False,
-                                        print_freq=False,
dataset_type=False)

parser.add_argument(
@@ -116,13 +116,32 @@ def load_checkpoint(checkpoint, ckpt_path):
return None


-def evaluate(test_step, metric, test_dist_dataset):
+def evaluate(test_step, metric, test_dist_dataset, num_batches, print_freq):
"""Runs evaluation steps and aggregate metrics"""
-    for x in test_dist_dataset:
+    timer = Timer()
+    timer.tic()
+
+    logger.info('Testing...')
+    for batch_idx, x in enumerate(test_dist_dataset):
labels, outputs = test_step(x)
metric.update_state(labels, outputs)

-    return metric.result()
+        if batch_idx % print_freq == 0:
+            time = timer.toc(average=False)
+            logger.info('Predict for batch: {}/{} Time: {:.3f} sec'.format(batch_idx, num_batches, time))
+            timer.tic()
+
+    logger.info('Total time: {:.3f} sec'.format(timer.total_time))
+
+    timer.reset()
+
+    logger.info('Evaluating predictions...')
+    timer.tic()
+    result = metric.result()
+    timer.toc(average=False)
+    logger.info('Total time: {:.3f} sec'.format(timer.total_time))
+
+    return result
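
The reset() between the two timed phases keeps the totals independent: the first 'Total time' covers only the prediction loop, the second only metric.result(), presumably the expensive aggregation step for these segmentation metrics. A self-contained sketch of that bracketing (sleeps stand in for the two phases; not part of the commit):

import time

from beta.examples.tensorflow.common.utils import Timer

timer = Timer()
timer.tic()
time.sleep(0.2)  # phase 1 stand-in: the per-batch prediction loop
timer.toc(average=False)
print('Prediction loop: {:.3f} sec'.format(timer.total_time))

timer.reset()    # forget phase 1 so its time is not mixed into phase 2
timer.tic()
time.sleep(0.1)  # phase 2 stand-in: the metric.result() call
timer.toc(average=False)
print('Metric computation: {:.3f} sec'.format(timer.total_time))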


def create_test_step_fn(strategy, model, predict_post_process_fn):
@@ -152,6 +171,7 @@ def run_evaluation(config, eval_timeout=None):

dataset_builder = get_dataset_builders(config, strategy.num_replicas_in_sync)
dataset = dataset_builder.build()
+    num_batches = dataset_builder.steps_per_epoch
test_dist_dataset = strategy.experimental_distribute_dataset(dataset)

# We use `model_batch_size` to create input layer for model
@@ -175,10 +195,9 @@
if config.ckpt_path:
load_checkpoint(checkpoint, config.ckpt_path)

-    logger.info('Evaluation...')
statistics = compression_ctrl.statistics()
print_statistics(statistics)
-    metric_result = evaluate(test_step, eval_metric, test_dist_dataset)
+    metric_result = evaluate(test_step, eval_metric, test_dist_dataset, num_batches, config.print_freq)
eval_metric.reset_states()
logger.info('Test metric = {}'.format(metric_result))

@@ -197,8 +216,7 @@
status.expect_partial()
logger.info('Checkpoint file {} found and restoring from checkpoint'.format(checkpoint_path))
logger.info('Checkpoint step: {}'.format(checkpoint.step.numpy()))
-            logger.info('Evaluation...')
-            metric_result = evaluate(test_step, eval_metric, test_dist_dataset)
+            metric_result = evaluate(test_step, eval_metric, test_dist_dataset, num_batches, config.print_freq)

current_step = checkpoint.step.numpy()
validation_summary_writer(metrics=metric_result, step=current_step)
20 changes: 12 additions & 8 deletions beta/examples/tensorflow/segmentation/train.py
@@ -35,6 +35,7 @@
from beta.examples.tensorflow.common.utils import create_code_snapshot
from beta.examples.tensorflow.common.utils import serialize_config
from beta.examples.tensorflow.common.utils import SummaryWriter
+from beta.examples.tensorflow.common.utils import Timer
from beta.examples.tensorflow.segmentation.models.model_selector import get_predefined_config
from beta.examples.tensorflow.segmentation.models.model_selector import get_model_builder

Expand All @@ -44,7 +45,6 @@ def get_argument_parser():
precision=False,
save_checkpoint_freq=False,
export_args=False,
-                                        print_freq=False,
dataset_type=False,
cpu_only=False,
metrics_dump=False)
@@ -153,14 +153,17 @@ def train_step(dataset_inputs):


def train(train_step, train_dist_dataset, initial_epoch, initial_step,
-          epochs, steps_per_epoch, checkpoint_manager, compression_ctrl, log_dir, optimizer):
+          epochs, steps_per_epoch, checkpoint_manager, compression_ctrl, log_dir, optimizer, print_freq):

train_summary_writer = SummaryWriter(log_dir, 'train')
compression_summary_writer = SummaryWriter(log_dir, 'compression')

-    logger.info('Training started')
+    timer = Timer()
+    timer.tic()
+
+    logger.info('Training...')
for epoch in range(initial_epoch, epochs):
-        logger.info('Epoch {}/{}'.format(epoch, epochs))
+        logger.info('Epoch: {}/{}'.format(epoch, epochs))
compression_ctrl.scheduler.epoch_step(epoch)

for step, x in enumerate(train_dist_dataset):
@@ -185,9 +188,11 @@ def train(train_step, train_dist_dataset, initial_epoch, initial_step,

train_summary_writer(metrics=train_metric_result, step=optimizer.iterations.numpy())

-            if step % 100 == 0:
-                logger.info('Step {}/{}'.format(step, steps_per_epoch))
+            if step % print_freq == 0:
+                time = timer.toc(average=False)
+                logger.info('Step: {}/{} Time: {:.3f} sec'.format(step, steps_per_epoch, time))
                logger.info('Training metric = {}'.format(train_metric_result))
+                timer.tic()

statistics = compression_ctrl.statistics()
print_statistics(statistics)
@@ -256,9 +261,8 @@ def run_train(config):

train_step = create_train_step_fn(strategy, compress_model, loss_fn, optimizer)

-    logger.info('Training...')
    train(train_step, train_dist_dataset, initial_epoch, initial_step,
-          epochs, steps_per_epoch, checkpoint_manager, compression_ctrl, config.log_dir, optimizer)
+          epochs, steps_per_epoch, checkpoint_manager, compression_ctrl, config.log_dir, optimizer, config.print_freq)

logger.info('Compression statistics')
print_statistics(compression_ctrl.statistics())
