Skip to content

Commit

Permalink
Added basic benchmark data collection to track training batch times.
Browse files Browse the repository at this point in the history
  • Loading branch information
gperdrizet committed Feb 14, 2024
1 parent b24ea89 commit c0b7382
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 65 deletions.
5 changes: 3 additions & 2 deletions skylines/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
# Option to resume a training run ######################################
########################################################################

RESUME = False
RESUME_RUN_DATE = '2024-02-09'
RESUME = True
RESUME_RUN_DATE = '2024-02-11'

########################################################################
# Paths and directories ################################################
Expand All @@ -35,6 +35,7 @@
MODEL_CHECKPOINT_DIR = f'{path}/data/training_checkpoints/{path_date}'
SPECIMEN_DIR = f'{path}/data/specimens/{path_date}'
IMAGE_OUTPUT_DIR = f'{path}/data/gan_output/{path_date}'
BENCHMARK_DATA_DIR = f'{path}/benchmarking'


########################################################################
Expand Down
178 changes: 119 additions & 59 deletions skylines/functions/training_functions.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,19 @@
import pickle
import time
import os.path
import numpy as np
import pandas as pd
import tensorflow as tf
from numpy import zeros
from numpy import ones
from numpy.random import randn
import matplotlib.pyplot as plt
import functions.model_definitions as models
import functions.training_functions as training_funcs
import tensorflow as tf

np.set_printoptions(threshold=np.inf)


# generate points in latent space as input for the generator

def generate_latent_points(latent_dim, n_samples):
    """Sample a batch of standard-normal latent vectors for the generator.

    All ``latent_dim * n_samples`` values are drawn with a single ``randn``
    call and then arranged as one row per sample.

    Args:
        latent_dim: Length of each latent vector.
        n_samples: Number of latent vectors to draw.

    Returns:
        Array of shape ``(n_samples, latent_dim)``.
    """
    total_values = latent_dim * n_samples
    flat_noise = randn(total_values)

    return flat_noise.reshape(n_samples, latent_dim)


# Handles resuming or not and getting models ready accordingly

def prepare_models(
Expand Down Expand Up @@ -91,51 +82,6 @@ def prepare_models(
return [latent_points, frame, discriminator_model, generator_model, gann_model]


# use the generator to generate n fake examples, with class labels

def generate_fake_samples(g_model, latent_dim, n_samples):
    """Produce generator outputs paired with 'fake' (0) class labels.

    Args:
        g_model: Generator model; its ``predict`` method is called on the
            sampled latent points.
        latent_dim: Length of each latent vector.
        n_samples: Number of fake examples to generate.

    Returns:
        Tuple ``(X, y)`` where ``X`` is whatever ``g_model.predict`` returns
        for the latent batch and ``y`` is an ``(n_samples, 1)`` array of
        zeros.
    """
    latent_batch = generate_latent_points(latent_dim, n_samples)
    fake_examples = g_model.predict(latent_batch)
    fake_labels = zeros((n_samples, 1))

    return fake_examples, fake_labels


# save 3x3 grid of generated images

def save_frame(g_model, latent_points, frame, image_output_dir):
    """Render a 3x3 grid of generated images and save it as a numbered JPEG.

    Args:
        g_model: Generator model; its ``predict`` method is called on
            ``latent_points``.
        latent_points: Latent vectors passed to the generator. The first
            nine resulting images are plotted.
        frame: Frame number used in the output filename.
        image_output_dir: Directory that receives ``frame{frame:07d}.jpg``.

    Returns:
        ``frame + 1``, the number to use for the next frame.
    """
    grid_size = 3

    # Shift and halve so that values in [-1, 1] map to [0, 1] for RGB plotting
    generated = (g_model.predict(latent_points) + 1.0) / 2.0

    figure = plt.figure(figsize=(13.3025, 13.3025), dpi=300)
    panels = []

    for cell in range(grid_size * grid_size):
        # Each new subplot becomes the current axes, so the pyplot calls
        # below draw onto it
        panels.append(figure.add_subplot(grid_size, grid_size, cell + 1))
        plt.imshow(generated[cell])
        plt.axis('off')

    # Remove whitespace between plots
    plt.subplots_adjust(wspace=-0.019, hspace=0)

    # Write the grid to disk and release the figure
    output_path = f'{image_output_dir}/frame{frame:07d}.jpg'
    plt.savefig(output_path, bbox_inches='tight', pad_inches=0)
    plt.close()

    return frame + 1


# train the generator and discriminator

def train(
Expand All @@ -152,20 +98,43 @@ def train(
model_checkpoint_dir,
checkpoint_save_frequency,
project_name,
image_output_dir
image_output_dir,
benchmarking_data_dir,
gpu_parallelism
):

# Calculate how many batches we need for each epoch
bat_per_epo = int((image_count) / n_batch)

# Construct filename for benchmarking results
num_gpus = len(tf.config.experimental.list_physical_devices("GPU"))
benchmark_datafile = f'{benchmarking_data_dir}/{gpu_parallelism}_{num_gpus}_gpus_batch_{n_batch}.csv'

# Make empty dict for benchmarking data
benchmark_data = {
'GPU parallelism': [],
'GPUs': [],
'Batch size': [],
'Training time (sec.)': []
}

# loop on epochs
training_times = []
for i in range(n_epochs):

iterator = iter(dataset)
training_time = 0

# loop on batches
for j in range(bat_per_epo):

# get randomly selected 'real' samples
X_real = iterator.get_next()
y_real = ones((n_batch, 1))

# Start training timer
training_start = time.time()

# train discriminator on real samples
d_loss1, _ = d_model.train_on_batch(X_real, y_real)

Expand All @@ -183,6 +152,9 @@ def train(
# train the generator via the discriminator's error
g_loss = gan_model.train_on_batch(X_gan, y_gan)

# Add batch time to total
training_time += time.time() - training_start

# summarize loss on this batch
print(f'{project_name}-{frame}: d1={d_loss1:.3f}, d2={d_loss2:.3f}, g={g_loss:.3f}')

Expand All @@ -195,6 +167,94 @@ def train(
j += 1
i += 1

training_times.append(training_time)

# Save benchmarking data every 10 epochs
if i % 10 == 0:

    # Build one row per epoch timed since the last save. The config columns
    # must be lists: str(x) * n is string repetition and would produce a
    # single concatenated string instead of n separate values.
    n_rows = len(training_times)
    benchmark_data['Training time (sec.)'] = training_times
    benchmark_data['GPU parallelism'] = [str(gpu_parallelism)] * n_rows
    benchmark_data['GPUs'] = [num_gpus] * n_rows
    benchmark_data['Batch size'] = [n_batch] * n_rows

    new_data_df = pd.DataFrame(benchmark_data)

    # Start a fresh timing window for the next save interval
    training_times = []

    # Check to see if we already have data for this config
    if os.path.isfile(benchmark_datafile):

        # The datafile already exists: read the old data and append the
        # new rows beneath it (row-wise, not as extra columns)
        old_data_df = pd.read_csv(benchmark_datafile)
        output_df = pd.concat(
            [old_data_df, new_data_df],
            axis=0,
            ignore_index=True
        )

    else:
        # If we don't have old data, just save the new data
        output_df = new_data_df

    # Save the data to the per-configuration CSV file (not the directory
    # path). The index is omitted so repeated read/append cycles do not
    # accumulate 'Unnamed: 0' columns.
    print(output_df.head())
    os.makedirs(benchmarking_data_dir, exist_ok=True)
    output_df.to_csv(benchmark_datafile, index=False)



# generate points in latent space as input for the generator

def generate_latent_points(latent_dim, n_samples):
    """Sample a batch of standard-normal latent vectors for the generator.

    All ``latent_dim * n_samples`` values are drawn with a single ``randn``
    call and then arranged as one row per sample.

    Args:
        latent_dim: Length of each latent vector.
        n_samples: Number of latent vectors to draw.

    Returns:
        Array of shape ``(n_samples, latent_dim)``.
    """
    total_values = latent_dim * n_samples
    flat_noise = randn(total_values)

    return flat_noise.reshape(n_samples, latent_dim)


# use the generator to generate n fake examples, with class labels

def generate_fake_samples(g_model, latent_dim, n_samples):
    """Produce generator outputs paired with 'fake' (0) class labels.

    Args:
        g_model: Generator model; its ``predict`` method is called on the
            sampled latent points.
        latent_dim: Length of each latent vector.
        n_samples: Number of fake examples to generate.

    Returns:
        Tuple ``(X, y)`` where ``X`` is whatever ``g_model.predict`` returns
        for the latent batch and ``y`` is an ``(n_samples, 1)`` array of
        zeros.
    """
    latent_batch = generate_latent_points(latent_dim, n_samples)
    fake_examples = g_model.predict(latent_batch)
    fake_labels = zeros((n_samples, 1))

    return fake_examples, fake_labels


# save 3x3 grid of generated images

def save_frame(g_model, latent_points, frame, image_output_dir):
    """Render a 3x3 grid of generated images and save it as a numbered JPEG.

    Args:
        g_model: Generator model; its ``predict`` method is called on
            ``latent_points``.
        latent_points: Latent vectors passed to the generator. The first
            nine resulting images are plotted.
        frame: Frame number used in the output filename.
        image_output_dir: Directory that receives ``frame{frame:07d}.jpg``.

    Returns:
        ``frame + 1``, the number to use for the next frame.
    """
    grid_size = 3

    # Shift and halve so that values in [-1, 1] map to [0, 1] for RGB plotting
    generated = (g_model.predict(latent_points) + 1.0) / 2.0

    figure = plt.figure(figsize=(13.3025, 13.3025), dpi=300)
    panels = []

    for cell in range(grid_size * grid_size):
        # Each new subplot becomes the current axes, so the pyplot calls
        # below draw onto it
        panels.append(figure.add_subplot(grid_size, grid_size, cell + 1))
        plt.imshow(generated[cell])
        plt.axis('off')

    # Remove whitespace between plots
    plt.subplots_adjust(wspace=-0.019, hspace=0)

    # Write the grid to disk and release the figure
    output_path = f'{image_output_dir}/frame{frame:07d}.jpg'
    plt.savefig(output_path, bbox_inches='tight', pad_inches=0)
    plt.close()

    return frame + 1


# Generates from model

Expand Down
13 changes: 9 additions & 4 deletions skylines/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@

if __name__ == '__main__':

# Check available GPUs
print("Num GPUs Available:", len(
tf.config.experimental.list_physical_devices('GPU')))

# Create or clear output directories as appropriate
_=data_funcs.prep_output_dir(config.MODEL_CHECKPOINT_DIR, config.RESUME)
_=data_funcs.prep_output_dir(config.SPECIMEN_DIR, config.RESUME)
Expand All @@ -28,9 +32,8 @@
# Get saved checkpoints, if any, ordered oldest to newest by modification
# time. Path.glob() yields matches in arbitrary order, so the list is sorted
# here to make its last element the most recently written checkpoint.
checkpoints=sorted(
    pathlib.Path(config.MODEL_CHECKPOINT_DIR).glob('generator_model_f*'),
    key=lambda checkpoint: checkpoint.stat().st_mtime
)

# Check available GPUs
print("Num GPUs Available:", len(
tf.config.experimental.list_physical_devices('GPU')))
# Discard the last saved checkpoint because one of the models may have an incomplete save
checkpoints=checkpoints[:-1]

# If we only have one GPU or are explicitly not using parallelism, prep the models
# outside of a tf.distribute strategy
Expand Down Expand Up @@ -94,5 +97,7 @@
config.MODEL_CHECKPOINT_DIR,
config.CHECKPOINT_SAVE_FREQUENCY,
config.PROJECT_NAME,
config.IMAGE_OUTPUT_DIR
config.IMAGE_OUTPUT_DIR,
config.BENCHMARK_DATA_DIR,
config.GPU_PARALLELISM
)

0 comments on commit c0b7382

Please sign in to comment.