Added basic benchmark data collection to track training batch times.

gperdrizet · Feb 14, 2024 · c0b7382 · c0b7382
1 parent b24ea89
commit c0b7382
Show file tree

Hide file tree

Showing 3 changed files with 131 additions and 65 deletions.
diff --git a/skylines/config.py b/skylines/config.py
@@ -9,8 +9,8 @@
 # Option to resume a training run ######################################
 ########################################################################
 
-RESUME = False
-RESUME_RUN_DATE = '2024-02-09'
+RESUME = True
+RESUME_RUN_DATE = '2024-02-11'
 
 ########################################################################
 # Paths and directories ################################################
@@ -35,6 +35,7 @@
 MODEL_CHECKPOINT_DIR = f'{path}/data/training_checkpoints/{path_date}'
 SPECIMEN_DIR = f'{path}/data/specimens/{path_date}'
 IMAGE_OUTPUT_DIR = f'{path}/data/gan_output/{path_date}'
+BENCHMARK_DATA_DIR = f'{path}/benchmarking'
 
 
 ########################################################################

diff --git a/skylines/functions/training_functions.py b/skylines/functions/training_functions.py
@@ -1,28 +1,19 @@
 import pickle
+import time
+import os.path
 import numpy as np
+import pandas as pd
+import tensorflow as tf
 from numpy import zeros
 from numpy import ones
 from numpy.random import randn
 import matplotlib.pyplot as plt
 import functions.model_definitions as models
 import functions.training_functions as training_funcs
-import tensorflow as tf
 
 np.set_printoptions(threshold=np.inf)
 
 
-# generate points in latent space as input for the generator
-
-def generate_latent_points(latent_dim, n_samples):
-    # generate points in the latent space
-    x_input = randn(latent_dim * n_samples)
-    # reshape into a batch of inputs for the network
-    x_input = x_input.reshape(n_samples, latent_dim)
-    # x_input = tf.convert_to_tensor(x_input, dtype=tf.float16)
-
-    return x_input
-
-
 # Handles resuming or not and getting models ready accordingly
 
 def prepare_models(
@@ -91,51 +82,6 @@ def prepare_models(
     return [latent_points, frame, discriminator_model, generator_model, gann_model]
 
 
-# use the generator to generate n fake examples, with class labels
-
-def generate_fake_samples(g_model, latent_dim, n_samples):
-    # generate points in latent space
-    x_input = generate_latent_points(latent_dim, n_samples)
-    # predict outputs
-    X = g_model.predict(x_input)
-    # create 'fake' class labels (0)
-    y = zeros((n_samples, 1))
-    return X, y
-
-
-# save 3x3 grid of generated images
-
-def save_frame(g_model, latent_points, frame, image_output_dir):
-    # create images from latent points
-    images = g_model.predict(latent_points)
-    # scale images into range 0.0, 1.0 for plotting as RGB
-    images = (images + 1.0) / 2.0
-
-    plot_dim = 3
-
-    fig = plt.figure(figsize=(13.3025, 13.3025), dpi=300)
-    ax = []
-
-    for i in range(plot_dim * plot_dim):
-        # create subplot and append to ax
-        ax.append(fig.add_subplot(plot_dim, plot_dim, i+1))
-        plt.imshow(images[i])
-        plt.axis('off')
-
-    # remove whitespace between plots
-    plt.subplots_adjust(wspace=-0.019, hspace=0)
-
-    # save plot to file
-    filename = f'{image_output_dir}/frame{frame:07d}.jpg'
-    #filename = './gan_output/frame%07d.jpg' % (frame)
-    plt.savefig(filename, bbox_inches='tight', pad_inches=0)
-    plt.close()
-
-    frame += 1
-
-    return frame
-
-
 # train the generator and discriminator
 
 def train(
@@ -152,20 +98,43 @@ def train(
     model_checkpoint_dir,
     checkpoint_save_frequency, 
     project_name,
-    image_output_dir
+    image_output_dir,
+    benchmarking_data_dir,
+    gpu_parallelism
 ):
 
+    # Calculate how many batches we need for each epoch
     bat_per_epo = int((image_count) / n_batch)
 
+    # Construct filename for benchmarking results
+    num_gpus = len(tf.config.experimental.list_physical_devices("GPU"))
+    benchmark_datafile = f'{benchmarking_data_dir}/{gpu_parallelism}_{num_gpus}_gpus_batch_{n_batch}.csv'
+
+    # Make empty dict for benchmarking data
+    benchmark_data = {
+        'GPU parallelism': [],
+        'GPUs': [],
+        'Batch size': [],
+        'Training time (sec.)': []
+    }
+
     # loop on epochs
+    training_times = []
     for i in range(n_epochs):
+
         iterator = iter(dataset)
+        training_time = 0
 
         # loop on batches
         for j in range(bat_per_epo):
+
             # get randomly selected 'real' samples
             X_real = iterator.get_next()
             y_real = ones((n_batch, 1))
+
+            # Start training timer
+            training_start = time.time()
+
             # train discriminator on real samples
             d_loss1, _ = d_model.train_on_batch(X_real, y_real)
 
@@ -183,6 +152,9 @@ def train(
             # train the generator via the discriminator's error
             g_loss = gan_model.train_on_batch(X_gan, y_gan)
 
+            # Add batch time to total
+            training_time += time.time() - training_start
+
             # summarize loss on this batch
             print(f'{project_name}-{frame}: d1={d_loss1:.3f}, d2={d_loss2:.3f}, g={g_loss:.3f}')
 
@@ -195,6 +167,94 @@ def train(
             j += 1
         i += 1
 
+        training_times.append(training_time)
+
+        # Save benchmarking data every 10 epochs
+        if i % 10 == 0:
+            # One row per epoch: repeat the run settings as list elements
+            # (str * int would concatenate them into a single long string)
+            n_rows = len(training_times)
+            benchmark_data['Training time (sec.)'] = training_times
+            benchmark_data['GPU parallelism'] = [gpu_parallelism] * n_rows
+            benchmark_data['GPUs'] = [num_gpus] * n_rows
+            benchmark_data['Batch size'] = [n_batch] * n_rows
+            new_data_df = pd.DataFrame(benchmark_data)
+
+            # Start a fresh list so epochs already saved are not written again
+            training_times = []
+
+            # Check to see if we already have data for this config
+            if os.path.isfile(benchmark_datafile):
+                # The datafile already exists: read the old rows and append the new ones below them
+                old_data_df = pd.read_csv(benchmark_datafile)
+                output_df = pd.concat([old_data_df, new_data_df], axis=0, ignore_index=True)
+            else:
+                # If we don't have old data, just save the new data
+                output_df = new_data_df
+
+            # Save to the per-config CSV file (not the directory), without the index column
+            os.makedirs(benchmarking_data_dir, exist_ok=True)
+            print(output_df.head())
+            output_df.to_csv(benchmark_datafile, index=False)
+
+
+
+# generate points in latent space as input for the generator
+
+def generate_latent_points(latent_dim, n_samples):
+    # generate points in the latent space
+    x_input = randn(latent_dim * n_samples)
+    # reshape into a batch of inputs for the network
+    x_input = x_input.reshape(n_samples, latent_dim)
+    # x_input = tf.convert_to_tensor(x_input, dtype=tf.float16)
+
+    return x_input
+
+
+# use the generator to generate n fake examples, with class labels
+
+def generate_fake_samples(g_model, latent_dim, n_samples):
+    # generate points in latent space
+    x_input = generate_latent_points(latent_dim, n_samples)
+    # predict outputs
+    X = g_model.predict(x_input)
+    # create 'fake' class labels (0)
+    y = zeros((n_samples, 1))
+    return X, y
+
+
+# save 3x3 grid of generated images
+
+def save_frame(g_model, latent_points, frame, image_output_dir):
+    # create images from latent points
+    images = g_model.predict(latent_points)
+    # scale images into range 0.0, 1.0 for plotting as RGB
+    images = (images + 1.0) / 2.0
+
+    plot_dim = 3
+
+    fig = plt.figure(figsize=(13.3025, 13.3025), dpi=300)
+    ax = []
+
+    for i in range(plot_dim * plot_dim):
+        # create subplot and append to ax
+        ax.append(fig.add_subplot(plot_dim, plot_dim, i+1))
+        plt.imshow(images[i])
+        plt.axis('off')
+
+    # remove whitespace between plots
+    plt.subplots_adjust(wspace=-0.019, hspace=0)
+
+    # save plot to file
+    filename = f'{image_output_dir}/frame{frame:07d}.jpg'
+    #filename = './gan_output/frame%07d.jpg' % (frame)
+    plt.savefig(filename, bbox_inches='tight', pad_inches=0)
+    plt.close()
+
+    frame += 1
+
+    return frame
+
 
 # Generates from model
 

diff --git a/skylines/train.py b/skylines/train.py
@@ -13,6 +13,10 @@
 
 if __name__ == '__main__':
 
+    # Check available GPUs
+    print("Num GPUs Available:", len(
+        tf.config.experimental.list_physical_devices('GPU')))
+
     # Create or clear output directories as appropriate
     _=data_funcs.prep_output_dir(config.MODEL_CHECKPOINT_DIR, config.RESUME)
     _=data_funcs.prep_output_dir(config.SPECIMEN_DIR, config.RESUME)
@@ -28,9 +32,8 @@
     # Get saved checkpoints, if any:
     checkpoints=list(pathlib.Path(config.MODEL_CHECKPOINT_DIR).glob('generator_model_f*'))
 
-    # Check available GPUs
-    print("Num GPUs Available:", len(
-        tf.config.experimental.list_physical_devices('GPU')))
+    # Discard the last saved checkpoint because one of the models may have an incomplete save
+    checkpoints=checkpoints[:-1]
 
     # If we only have one GPU or are explicitly not using parallelism, prep the models
     # outside of a tf.distribute strategy
@@ -94,5 +97,7 @@
         config.MODEL_CHECKPOINT_DIR,
         config.CHECKPOINT_SAVE_FREQUENCY,
         config.PROJECT_NAME,
-        config.IMAGE_OUTPUT_DIR
+        config.IMAGE_OUTPUT_DIR,
+        config.BENCHMARK_DATA_DIR,
+        config.GPU_PARALLELISM
     )