Offline: Benchmark AntMaze-v2 and Adroit-v1 datasets (tinkoff-ai#52)
Vladislav Kurenkov authored Jun 10, 2023
1 parent dc077cc commit d81e7ec
Showing 176 changed files with 2,868 additions and 162 deletions.
14 changes: 8 additions & 6 deletions algorithms/offline/any_percent_bc.py
@@ -212,13 +212,12 @@ def keep_best_trajectories(
reward_scale = 1.0

sort_ord = np.argsort(returns, axis=0)[::-1].reshape(-1)
top_trajs = sort_ord[: int(frac * len(sort_ord))]
top_trajs = sort_ord[: max(1, int(frac * len(sort_ord)))]

order = []
for i in top_trajs:
order += ids_by_trajectories[i]
order = np.array(order)

dataset["observations"] = dataset["observations"][order]
dataset["actions"] = dataset["actions"][order]
dataset["next_observations"] = dataset["next_observations"][order]
@@ -390,10 +389,13 @@ def train(config: TrainConfig):
f"{eval_score:.3f} , D4RL score: {normalized_eval_score:.3f}"
)
print("---------------------------------------")
torch.save(
trainer.state_dict(),
os.path.join(config.checkpoints_path, f"checkpoint_{t}.pt"),
)

if config.checkpoints_path is not None:
torch.save(
trainer.state_dict(),
os.path.join(config.checkpoints_path, f"checkpoint_{t}.pt"),
)

wandb.log(
{"d4rl_normalized_score": normalized_eval_score},
step=trainer.total_it,
2 changes: 1 addition & 1 deletion algorithms/offline/awac.py
@@ -468,7 +468,7 @@ def train(config: TrainConfig):
if hasattr(env, "get_normalized_score"):
normalized_eval_scores = env.get_normalized_score(eval_scores) * 100.0
wandb.log(
{"normalized_eval_score": normalized_eval_scores.mean()}, step=t
{"d4rl_normalized_score": normalized_eval_scores.mean()}, step=t
)

if config.checkpoints_path is not None:
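
The key is renamed so all algorithms log the same `d4rl_normalized_score` metric. For reference, `env.get_normalized_score` rescales a raw return against D4RL's per-environment reference scores; a sketch of the underlying formula (the reference values below are made up for illustration):

```python
# D4RL normalization: 0.0 ~ random policy, 1.0 ~ reference expert.
def normalized_score(raw_return: float, ref_min: float, ref_max: float) -> float:
    return (raw_return - ref_min) / (ref_max - ref_min)

# Multiplying by 100, as in the logging code above, reports a percentage of expert performance.
print(normalized_score(raw_return=2000.0, ref_min=-50.0, ref_max=4000.0) * 100.0)  # ~50.6
```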
57 changes: 37 additions & 20 deletions algorithms/offline/cql.py
@@ -33,6 +33,7 @@ class TrainConfig:
max_timesteps: int = int(1e6) # Max time steps to run environment
checkpoints_path: Optional[str] = None # Save path
load_model: str = "" # Model load file name, "" doesn't load

# CQL
buffer_size: int = 2_000_000 # Replay buffer size
batch_size: int = 256 # Batch size for all networks
@@ -43,7 +44,6 @@ class TrainConfig:
policy_lr: float = 3e-5 # Policy learning rate
qf_lr: float = 3e-4 # Critics learning rate
soft_target_update_rate: float = 5e-3 # Target network update rate
bc_steps: int = int(0) # Number of BC steps at start
target_update_period: int = 1 # Frequency of target nets updates
cql_n_actions: int = 10 # Number of sampled actions
cql_importance_sample: bool = True # Use importance sampling
@@ -57,6 +57,13 @@
orthogonal_init: bool = True # Orthogonal initialization
normalize: bool = True # Normalize states
normalize_reward: bool = False # Normalize reward

# AntMaze hacks
bc_steps: int = int(0) # Number of BC steps at start
reward_scale: float = 5.0
reward_bias: float = -1.0
policy_log_std_multiplier: float = 1.0

# Wandb logging
project: str = "CORL"
group: str = "CQL-D4RL"
@@ -226,26 +233,35 @@ def return_reward_range(dataset, max_episode_steps):
return min(returns), max(returns)


def modify_reward(dataset, env_name, max_episode_steps=1000):
def modify_reward(dataset, env_name, bias, scale, max_episode_steps=1000):
if any(s in env_name for s in ("halfcheetah", "hopper", "walker2d")):
min_ret, max_ret = return_reward_range(dataset, max_episode_steps)
dataset["rewards"] /= max_ret - min_ret
dataset["rewards"] *= max_episode_steps
elif "antmaze" in env_name:
dataset["rewards"] -= 1.0
dataset["rewards"] = dataset["rewards"] * scale + bias


def extend_and_repeat(tensor: torch.Tensor, dim: int, repeat: int) -> torch.Tensor:
return tensor.unsqueeze(dim).repeat_interleave(repeat, dim=dim)
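
`extend_and_repeat` tiles each state once per sampled action (`cql_n_actions`); a small shape check:

```python
import torch

def extend_and_repeat(tensor: torch.Tensor, dim: int, repeat: int) -> torch.Tensor:
    return tensor.unsqueeze(dim).repeat_interleave(repeat, dim=dim)

x = torch.zeros(4, 17)                      # (batch, state_dim)
y = extend_and_repeat(x, dim=1, repeat=10)  # one copy per sampled action
print(y.shape)                              # torch.Size([4, 10, 17])
```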


def init_module_weights(module: torch.nn.Module, orthogonal_init: bool = False):
if isinstance(module, nn.Linear):
if orthogonal_init:
nn.init.orthogonal_(module.weight, gain=np.sqrt(2))
nn.init.constant_(module.bias, 0.0)
else:
nn.init.xavier_uniform_(module.weight, gain=1e-2)
def init_module_weights(module: torch.nn.Sequential, orthogonal_init: bool = False):
# Specific orthogonal initialization for inner layers
# If orthogonal init is off, we do not change default initialization
if orthogonal_init:
for submodule in module[:-1]:
if isinstance(submodule, nn.Linear):
nn.init.orthogonal_(submodule.weight, gain=np.sqrt(2))
nn.init.constant_(submodule.bias, 0.0)

# Last layer should be initialized differently as well
if orthogonal_init:
nn.init.orthogonal_(module[-1].weight, gain=1e-2)
else:
nn.init.xavier_uniform_(module[-1].weight, gain=1e-2)

nn.init.constant_(module[-1].bias, 0.0)
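
The rewritten initializer now takes the whole `nn.Sequential` instead of being `apply`-ed per layer: inner `Linear` layers get orthogonal weights with gain `sqrt(2)`, while the output layer always gets a small gain of `1e-2` (orthogonal or Xavier) so initial Q-values and pre-tanh action means start near zero. A minimal usage sketch with a stand-in network, assuming the `init_module_weights` defined above is in scope:

```python
import torch.nn as nn

# Stand-in for base_network / the Q-network trunk (sizes are illustrative).
net = nn.Sequential(
    nn.Linear(8, 256), nn.ReLU(),
    nn.Linear(256, 256), nn.ReLU(),
    nn.Linear(256, 1),
)

init_module_weights(net, orthogonal_init=True)
print(net[-1].weight.abs().max().item())  # small, on the order of 1e-2 or less
```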


class ReparameterizedTanhGaussian(nn.Module):
@@ -321,10 +337,7 @@ def __init__(
nn.Linear(256, 2 * action_dim),
)

if orthogonal_init:
self.base_network.apply(lambda m: init_module_weights(m, True))
else:
init_module_weights(self.base_network[-1], False)
init_module_weights(self.base_network)

self.log_std_multiplier = Scalar(log_std_multiplier)
self.log_std_offset = Scalar(log_std_offset)
@@ -383,10 +396,8 @@ def __init__(
nn.ReLU(),
nn.Linear(256, 1),
)
if orthogonal_init:
self.network.apply(lambda m: init_module_weights(m, True))
else:
init_module_weights(self.network[-1], False)

init_module_weights(self.network)

def forward(self, observations: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
multiple_actions = False
@@ -815,7 +826,9 @@ def train(config: TrainConfig):
dataset = d4rl.qlearning_dataset(env)

if config.normalize_reward:
modify_reward(dataset, config.env)
modify_reward(
dataset, config.env, bias=config.reward_bias, scale=config.reward_scale
)

if config.normalize:
state_mean, state_std = compute_mean_std(dataset["observations"], eps=1e-3)
@@ -859,7 +872,11 @@ def train(config: TrainConfig):
critic_2_optimizer = torch.optim.Adam(list(critic_2.parameters()), config.qf_lr)

actor = TanhGaussianPolicy(
state_dim, action_dim, max_action, orthogonal_init=config.orthogonal_init
state_dim,
action_dim,
max_action,
log_std_multiplier=config.policy_log_std_multiplier,
orthogonal_init=config.orthogonal_init,
).to(config.device)
actor_optimizer = torch.optim.Adam(actor.parameters(), config.policy_lr)
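
`policy_log_std_multiplier` (1.0 by default, adjusted as an AntMaze hack) scales the log-std head of the tanh-Gaussian actor. The sketch below shows roughly where such a multiplier enters; the offset value, the clamp range, and the distribution construction are assumptions in the spirit of common CQL implementations, not the exact code of this file:

```python
import torch
import torch.nn as nn
from torch.distributions import Normal, TransformedDistribution
from torch.distributions.transforms import TanhTransform

action_dim = 6
base_out = torch.randn(256, 2 * action_dim)            # stand-in for base_network(obs)
mean, log_std = torch.chunk(base_out, 2, dim=-1)

log_std_multiplier = nn.Parameter(torch.tensor(1.0))   # cf. Scalar(log_std_multiplier)
log_std_offset = nn.Parameter(torch.tensor(-1.0))      # illustrative offset value
log_std = torch.clamp(log_std_multiplier * log_std + log_std_offset, -20.0, 2.0)

dist = TransformedDistribution(Normal(mean, torch.exp(log_std)), TanhTransform(cache_size=1))
actions = dist.rsample()                                # squashed into (-1, 1)
```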

17 changes: 15 additions & 2 deletions algorithms/offline/iql.py
@@ -48,6 +48,7 @@ class TrainConfig:
iql_deterministic: bool = False # Use deterministic actor
normalize: bool = True # Normalize states
normalize_reward: bool = False # Normalize reward
actor_dropout: Optional[float] = None # Adroit uses dropout for policy network
# Wandb logging
project: str = "CORL"
group: str = "IQL-D4RL"
@@ -246,6 +247,7 @@ def __init__(
activation_fn: Callable[[], nn.Module] = nn.ReLU,
output_activation_fn: Callable[[], nn.Module] = None,
squeeze_output: bool = False,
dropout: Optional[float] = None,
):
super().__init__()
n_dims = len(dims)
@@ -256,6 +258,10 @@ def __init__(
for i in range(n_dims - 2):
layers.append(nn.Linear(dims[i], dims[i + 1]))
layers.append(activation_fn())

if dropout is not None:
layers.append(nn.Dropout(dropout))

layers.append(nn.Linear(dims[-2], dims[-1]))
if output_activation_fn is not None:
layers.append(output_activation_fn())
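
The new `dropout` argument (driven by `actor_dropout`, used for the Adroit tasks) inserts an `nn.Dropout` after the hidden activations of the policy MLP. Written out by hand for illustration, a two-hidden-layer actor with `dropout=0.1` ends up roughly as the stack below (sizes are illustrative; dropout is only active in training mode, so evaluation with the actor switched to `eval()` is unaffected):

```python
import torch.nn as nn

state_dim, act_dim = 39, 28  # illustrative sizes
actor_net = nn.Sequential(
    nn.Linear(state_dim, 256), nn.ReLU(), nn.Dropout(0.1),
    nn.Linear(256, 256), nn.ReLU(), nn.Dropout(0.1),
    nn.Linear(256, act_dim), nn.Tanh(),
)
```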
@@ -277,6 +283,7 @@ def __init__(
max_action: float,
hidden_dim: int = 256,
n_hidden: int = 2,
dropout: Optional[float] = None,
):
super().__init__()
self.net = MLP(
@@ -308,11 +315,13 @@ def __init__(
max_action: float,
hidden_dim: int = 256,
n_hidden: int = 2,
dropout: Optional[float] = None,
):
super().__init__()
self.net = MLP(
[state_dim, *([hidden_dim] * n_hidden), act_dim],
output_activation_fn=nn.Tanh,
dropout=dropout,
)
self.max_action = max_action

@@ -543,9 +552,13 @@ def train(config: TrainConfig):
q_network = TwinQ(state_dim, action_dim).to(config.device)
v_network = ValueFunction(state_dim).to(config.device)
actor = (
DeterministicPolicy(state_dim, action_dim, max_action)
DeterministicPolicy(
state_dim, action_dim, max_action, dropout=config.actor_dropout
)
if config.iql_deterministic
else GaussianPolicy(state_dim, action_dim, max_action)
else GaussianPolicy(
state_dim, action_dim, max_action, dropout=config.actor_dropout
)
).to(config.device)
v_optimizer = torch.optim.Adam(v_network.parameters(), lr=3e-4)
q_optimizer = torch.optim.Adam(q_network.parameters(), lr=3e-4)
11 changes: 7 additions & 4 deletions algorithms/offline/td3_bc.py
@@ -497,10 +497,13 @@ def train(config: TrainConfig):
f"{eval_score:.3f} , D4RL score: {normalized_eval_score:.3f}"
)
print("---------------------------------------")
torch.save(
trainer.state_dict(),
os.path.join(config.checkpoints_path, f"checkpoint_{t}.pt"),
)

if config.checkpoints_path is not None:
torch.save(
trainer.state_dict(),
os.path.join(config.checkpoints_path, f"checkpoint_{t}.pt"),
)

wandb.log(
{"d4rl_normalized_score": normalized_eval_score},
step=trainer.total_it,
19 changes: 19 additions & 0 deletions configs/offline/awac/antmaze/large_diverse_v2.yaml
@@ -0,0 +1,19 @@
awac_lambda: 0.1
batch_size: 256
buffer_size: 10000000
checkpoints_path: null
deterministic_torch: false
device: cuda
env_name: antmaze-large-diverse-v2
eval_frequency: 1000
gamma: 0.99
group: awac-antmaze-large-diverse-v2-multiseed-v0
hidden_dim: 256
learning_rate: 0.0003
n_test_episodes: 100
normalize_reward: true
num_train_ops: 1000000
project: CORL
seed: 42
tau: 0.005
test_seed: 69
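
The new AntMaze-v2 configs follow the same schema as the existing ones; the training scripts map them onto their `TrainConfig` dataclasses. A minimal sketch for inspecting one of them outside the repo's own loader:

```python
import yaml

with open("configs/offline/awac/antmaze/large_diverse_v2.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["env_name"])          # antmaze-large-diverse-v2
print(cfg["normalize_reward"])  # True
```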
@@ -2,12 +2,12 @@ awac_lambda: 0.1
batch_size: 256
buffer_size: 10000000
checkpoints_path: null
deterministic_torch: true
deterministic_torch: false
device: cuda
env_name: antmaze-large-play-v0
env_name: antmaze-large-play-v2
eval_frequency: 1000
gamma: 0.99
group: awac-antmaze-large-play-v0-multiseed-v0
group: awac-antmaze-large-play-v2-multiseed-v0
hidden_dim: 256
learning_rate: 0.0003
n_test_episodes: 100
19 changes: 19 additions & 0 deletions configs/offline/awac/antmaze/medium_diverse_v2.yaml
@@ -0,0 +1,19 @@
awac_lambda: 0.1
batch_size: 256
buffer_size: 10000000
checkpoints_path: null
deterministic_torch: false
device: cuda
env_name: antmaze-medium-diverse-v2
eval_frequency: 1000
gamma: 0.99
group: awac-antmaze-medium-diverse-v2-multiseed-v0
hidden_dim: 256
learning_rate: 0.0003
n_test_episodes: 100
normalize_reward: true
num_train_ops: 1000000
project: CORL
seed: 42
tau: 0.005
test_seed: 69
@@ -2,12 +2,12 @@ awac_lambda: 0.1
batch_size: 256
buffer_size: 10000000
checkpoints_path: null
deterministic_torch: true
deterministic_torch: false
device: cuda
env_name: antmaze-medium-play-v0
env_name: antmaze-medium-play-v2
eval_frequency: 1000
gamma: 0.99
group: awac-antmaze-medium-play-v0-multiseed-v0
group: awac-antmaze-medium-play-v2-multiseed-v0
hidden_dim: 256
learning_rate: 0.0003
n_test_episodes: 100
19 changes: 19 additions & 0 deletions configs/offline/awac/antmaze/umaze_diverse_v2.yaml
@@ -0,0 +1,19 @@
awac_lambda: 0.1
batch_size: 256
buffer_size: 10000000
checkpoints_path: null
deterministic_torch: false
device: cuda
env_name: antmaze-umaze-diverse-v2
eval_frequency: 1000
gamma: 0.99
group: awac-antmaze-umaze-diverse-v2-multiseed-v0
hidden_dim: 256
learning_rate: 0.0003
n_test_episodes: 100
normalize_reward: true
num_train_ops: 1000000
project: CORL
seed: 42
tau: 0.005
test_seed: 69
@@ -2,12 +2,12 @@ awac_lambda: 0.1
batch_size: 256
buffer_size: 10000000
checkpoints_path: null
deterministic_torch: true
deterministic_torch: false
device: cuda
env_name: antmaze-umaze-v0
env_name: antmaze-umaze-v2
eval_frequency: 1000
gamma: 0.99
group: awac-antmaze-umaze-v0-multiseed-v0
group: awac-antmaze-umaze-v2-multiseed-v0
hidden_dim: 256
learning_rate: 0.0003
n_test_episodes: 100
18 changes: 18 additions & 0 deletions configs/offline/awac/door/cloned_v1.yaml
@@ -0,0 +1,18 @@
awac_lambda: 0.1
batch_size: 256
buffer_size: 10000000
checkpoints_path: null
deterministic_torch: false
device: cuda
env_name: door-cloned-v1
eval_frequency: 1000
gamma: 0.99
group: awac-door-cloned-v1-multiseed-v0
hidden_dim: 256
learning_rate: 0.0003
n_test_episodes: 10
num_train_ops: 1000000
project: CORL
seed: 42
tau: 0.005
test_seed: 69
18 changes: 18 additions & 0 deletions configs/offline/awac/door/expert_v1.yaml
@@ -0,0 +1,18 @@
awac_lambda: 0.1
batch_size: 256
buffer_size: 10000000
checkpoints_path: null
deterministic_torch: false
device: cuda
env_name: door-expert-v1
eval_frequency: 1000
gamma: 0.99
group: awac-door-expert-v1-multiseed-v0
hidden_dim: 256
learning_rate: 0.0003
n_test_episodes: 10
num_train_ops: 1000000
project: CORL
seed: 42
tau: 0.005
test_seed: 69
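
The `door-*` configs target the Adroit v1 datasets. A sketch of pulling one of them through d4rl, assuming d4rl and its MuJoCo dependencies are installed:

```python
import gym
import d4rl  # noqa: F401  (registers the Adroit door-* environments)

env = gym.make("door-expert-v1")
dataset = d4rl.qlearning_dataset(env)
print(dataset["observations"].shape, dataset["actions"].shape)

# Evaluation returns can then be reported on the D4RL scale, as in the training scripts:
print(env.get_normalized_score(1500.0) * 100.0)  # 1500.0 is an illustrative raw return
```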