Set up weights and biases logging
djfoote committed May 12, 2023
1 parent 71b9136 commit 3b72e3f
Showing 3 changed files with 159 additions and 115 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -2,8 +2,9 @@
 notebooks/images
 notebooks/videos
 
-# Model checkpoints
+# Model checkpoints and wandb logs
 saved_reward_models
+wandb
 
 # Mac
 .DS_Store
78 changes: 53 additions & 25 deletions imitation_modules.py
@@ -1,6 +1,8 @@
 import abc
 import itertools
 import pickle
+import re
+import time
 from typing import Any, Dict, Tuple
 
 import numpy as np
@@ -414,29 +416,48 @@ def _train(self, dataset, epoch_multiplier=1.0):
         dataloader = self._make_data_loader(dataset)
         epochs = np.round(self.epochs * epoch_multiplier).astype(int)
         assert epochs > 0, "Must train for at least one epoch."
 
-        for _ in tqdm(range(epochs), desc="Training reward model"):
-            train_loss = 0.0
-            accumulated_size = 0
-            self.optim.zero_grad()
-            for fragments, feedback in dataloader:
-                loss = self._training_inner_loop(fragments, np.array(feedback))
-                loss *= len(fragments) / self.batch_size  # rescale loss to account for minibatching
-                train_loss += loss.item()
-                loss.backward()
-                accumulated_size += len(fragments)
-                if accumulated_size >= self.batch_size:
-                    self.optim.step()
-                    self.optim.zero_grad()
-                    accumulated_size = 0
-            if accumulated_size > 0:
-                self.optim.step()  # if there remains an incomplete batch
+        with self.logger.accumulate_means("reward"):
+            for epoch_num in tqdm(range(epochs), desc="Training reward model"):
+                with self.logger.add_key_prefix(f"epoch-{epoch_num}"):
+                    train_loss = 0.0
+                    accumulated_size = 0
+                    self.optim.zero_grad()
+                    for fragments, feedback in dataloader:
+                        with self.logger.add_key_prefix("train"):
+                            loss = self._training_inner_loop(fragments, np.array(feedback))
+                        loss *= len(fragments) / self.batch_size  # rescale loss to account for minibatching
+                        train_loss += loss.item()
+                        loss.backward()
+                        accumulated_size += len(fragments)
+                        if accumulated_size >= self.batch_size:
+                            self.optim.step()
+                            self.optim.zero_grad()
+                            accumulated_size = 0
+                    if accumulated_size > 0:
+                        self.optim.step()  # if there remains an incomplete batch
 
+        # after training all the epochs,
+        # record also the final value in a separate key for easy access.
+        keys = list(self.logger.name_to_value.keys())
+        outer_prefix = self.logger.get_accumulate_prefixes()
+        for key in keys:
+            base_path = f"{outer_prefix}reward/"  # existing prefix + accum_means ctx
+            epoch_path = f"mean/{base_path}epoch-{epoch_num}/"  # mean for last epoch
+            final_path = f"{base_path}final/"  # path to record last epoch
+            pattern = rf"{epoch_path}(.+)"
+            if regex_match := re.match(pattern, key):
+                (key_name,) = regex_match.groups()
+                val = self.logger.name_to_value[key]
+                new_key = f"{final_path}{key_name}"
+                self.logger.record(new_key, val)
 
     def _training_inner_loop(self, fragments, feedback):
         """Inner loop of training, for a single minibatch."""
-        # The imitation implementation returns a NamedTuple where `loss` has to be unpacked.
-        # I've decided to skip all that for now.
-        return self.loss.forward(fragments, feedback, self._feedback_model)
+        # The imitation implementation returns a NamedTuple where `loss` has to be unpacked. This is to pass accuracy
+        # through in addition to loss for logging. I've decided to skip all that for now.
+        loss = self.loss.forward(fragments, feedback, self._feedback_model)
+        self.logger.record("loss", loss)
+        return loss
 
 
 class ScalarRewardLearner(base.BaseImitationAlgorithm):
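
Note on the final-key bookkeeping above: after the epochs finish, the loop copies each statistic recorded for the last epoch from its epoch-numbered key to a stable "final/" key, so callers do not need to know how many epochs were run. The sketch below replays that regex logic on concrete keys. The plain dict stands in for self.logger.name_to_value, and the exact key layout ("mean/reward/epoch-N/train/<stat>") is an assumption inferred from the prefixes used in this diff, not something verified against imitation's HierarchicalLogger.

    import re

    # Stand-in for self.logger.name_to_value after three epochs recorded inside
    # accumulate_means("reward") -> add_key_prefix("epoch-N") -> add_key_prefix("train").
    # Key layout is assumed for illustration.
    name_to_value = {
        "mean/reward/epoch-0/train/loss": 0.91,
        "mean/reward/epoch-1/train/loss": 0.55,
        "mean/reward/epoch-2/train/loss": 0.31,
    }

    epoch_num = 2       # index of the last epoch, as left behind by the training loop
    outer_prefix = ""   # what get_accumulate_prefixes() would return with no enclosing context
    base_path = f"{outer_prefix}reward/"
    epoch_path = f"mean/{base_path}epoch-{epoch_num}/"
    final_path = f"{base_path}final/"

    # Copy every stat recorded for the final epoch to a stable "final/" key.
    for key, val in list(name_to_value.items()):
        if regex_match := re.match(rf"{epoch_path}(.+)", key):
            (key_name,) = regex_match.groups()
            name_to_value[f"{final_path}{key_name}"] = val

    print(name_to_value["reward/final/train/loss"])  # 0.31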
@@ -514,10 +535,9 @@ def train(self, total_timesteps, total_queries):
 
         timesteps_per_iteration, extra_timesteps = divmod(total_timesteps, self.num_iterations)
         reward_loss = None
-        reward_accuracy = None
 
         for i, num_queries in enumerate(schedule):
-            iter_log_str = f"Beggining iteration {i} of {self.num_iterations}"
+            iter_log_str = f"Beginning iteration {i} of {self.num_iterations}"
             if self._iteration != i:
                 iter_log_str += f" (global iteration {self._iteration})"
             self.logger.log(iter_log_str)
@@ -538,16 +558,24 @@ def train(self, total_timesteps, total_queries):
             self.logger.log("Gathering feedback")
             feedback = self.feedback_gatherer(fragments)
             self.dataset.push(fragments, feedback)
-            print(f"Best reward: {np.max(feedback)} | Worst reward: {np.min(feedback)}")
             self.logger.log(f"Dataset now contains {len(self.dataset.reward_labels)} feedback queries")
+            self.logger.record(f"dataset_size", len(self.dataset.reward_labels))
 
             ######################
             # Train reward model #
             ######################
 
+            # On the first iteration, we train the reward model for longer, as specified by initial_epoch_multiplier.
             epoch_multiplier = self.initial_epoch_multiplier if i == 0 else 1.0
 
+            start_time = time.time()
             self.reward_trainer.train(self.dataset, epoch_multiplier=epoch_multiplier)
+            self.logger.record("reward_train_time", time.time() - start_time)
+
+            base_key = self.logger.get_accumulate_prefixes() + "reward/final/train"
+            assert f"{base_key}/loss" in self.logger.name_to_value
+            reward_loss = self.logger.name_to_value[f"{base_key}/loss"]
+            self.logger.record("reward_loss", reward_loss)
 
             ###################
             # Train the agent #
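
The scalars recorded in this hunk (dataset_size, reward_train_time, reward_loss) presumably reach Weights & Biases through whatever output formats the logger was configured with when logger.dump(self._iteration) runs at the end of the iteration; that wiring is not shown in this commit. For reference only, here is a minimal sketch of reporting equivalent per-iteration scalars directly with the wandb client. The project name and all values are made up.

    import time

    import wandb

    run = wandb.init(project="reward-model-feedback")  # hypothetical project name

    for i in range(3):  # stand-in for the iteration loop in ScalarRewardLearner.train
        start_time = time.time()
        # ... gather feedback and train the reward model here ...
        wandb.log(
            {
                "dataset_size": 100 * (i + 1),  # made-up values for illustration
                "reward_train_time": time.time() - start_time,
                "reward_loss": 1.0 / (i + 1),
            },
            step=i,
        )

    run.finish()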
@@ -573,9 +601,9 @@ def train(self, total_timesteps, total_queries):
                 env=self.trajectory_generator.env,
                 num_trajs=1000,
             )
-            self.logger.log(f"Proportion of bad trajectories: {prop_bad}")
+            self.logger.record("prop_bad_rollouts", prop_bad)
             for condition, prop in prop_bad_per_condition.items():
-                self.logger.log(f"  Proportion of trajectories in condition {condition}: {prop}")
+                self.logger.record(f"prop_bad_rollouts_{condition}", prop)
 
             self.logger.dump(self._iteration)
 
@@ -584,4 +612,4 @@ def train(self, total_timesteps, total_queries):
 
             self._iteration += 1
 
-        return {"reward_loss": reward_loss, "reward_accuracy": reward_accuracy}
+        return {"reward_loss": reward_loss}
