forked from sfujim/TD3
Showing 75 changed files with 499 additions and 0 deletions.
DDPG.py (new file)
@@ -0,0 +1,134 @@
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable  # PyTorch 0.3-era autograd API (pre-0.4)
import torch.nn.functional as F

import utils


# Implementation of Deep Deterministic Policy Gradients (DDPG)
# Paper: https://arxiv.org/abs/1509.02971
# [Not the implementation used in the TD3 paper]


def var(tensor, volatile=False):
    # Wrap a tensor as a (possibly CUDA) Variable; volatile=True disables
    # graph construction for inference-only passes in the old autograd API.
    if torch.cuda.is_available():
        return Variable(tensor, volatile=volatile).cuda()
    else:
        return Variable(tensor, volatile=volatile)
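Aside: the Variable/volatile API above dates from PyTorch 0.3 and was deprecated in PyTorch 0.4. A minimal sketch of an equivalent helper on modern PyTorch, for reference only (the names here are illustrative, not part of this commit):

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def to_device_tensor(array):
    # numpy array -> float32 tensor on the chosen device;
    # wrap inference-only calls in torch.no_grad() instead of volatile=True
    return torch.as_tensor(array, dtype=torch.float32, device=device)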

class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()

        self.l1 = nn.Linear(state_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, action_dim)

        self.max_action = max_action

    def forward(self, x):
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        # tanh squashes to [-1, 1]; scale to the environment's action range
        x = self.max_action * F.tanh(self.l3(x))
        return x


class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()

        self.l1 = nn.Linear(state_dim, 400)
        # As in the DDPG paper, the action enters at the second hidden layer
        self.l2 = nn.Linear(400 + action_dim, 300)
        self.l3 = nn.Linear(300, 1)

    def forward(self, x, u):
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(torch.cat([x, u], 1)))
        x = self.l3(x)
        return x

class DDPG(object):
    def __init__(self, state_dim, action_dim, max_action):
        self.actor = Actor(state_dim, action_dim, max_action)
        self.actor_target = Actor(state_dim, action_dim, max_action)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-4)

        self.critic = Critic(state_dim, action_dim)
        self.critic_target = Critic(state_dim, action_dim)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), weight_decay=1e-2)

        if torch.cuda.is_available():
            self.actor = self.actor.cuda()
            self.actor_target = self.actor_target.cuda()
            self.critic = self.critic.cuda()
            self.critic_target = self.critic_target.cuda()

        self.criterion = nn.MSELoss()
        self.state_dim = state_dim

    def select_action(self, state):
        state = var(torch.FloatTensor(state.reshape(-1, self.state_dim)), volatile=True)
        return self.actor(state).cpu().data.numpy().flatten()

    def train(self, replay_buffer, iterations, batch_size=64, discount=0.99, tau=0.001):

        for it in range(iterations):

            # Sample replay buffer
            x, y, u, r, d = replay_buffer.sample(batch_size)
            state = var(torch.FloatTensor(x))
            action = var(torch.FloatTensor(u))
            next_state = var(torch.FloatTensor(y), volatile=True)
            done = var(torch.FloatTensor(1 - d))  # masks out bootstrapping at terminal states
            reward = var(torch.FloatTensor(r))

            # Q target = reward + discount * Q(next_state, pi(next_state))
            target_Q = self.critic_target(next_state, self.actor_target(next_state))
            target_Q.volatile = False
            target_Q = reward + (done * discount * target_Q)

            # Get current Q estimate
            current_Q = self.critic(state, action)

            # Compute critic loss
            critic_loss = self.criterion(current_Q, target_Q)

            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Compute actor loss (deterministic policy gradient)
            actor_loss = -self.critic(state, self.actor(state)).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models (Polyak averaging)
            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def save(self, filename, directory):
        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))

    def load(self, filename, directory):
        self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
        self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))
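For context, a minimal training-loop sketch around this class. The environment and step counts are placeholders, and the utils.ReplayBuffer interface (add() takes a (state, next_state, action, reward, done) tuple; sample() returns a batch) is assumed from the sample() call in train() above:

import gym

env = gym.make("Pendulum-v0")  # placeholder continuous-control task
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

policy = DDPG(state_dim, action_dim, max_action)
replay_buffer = utils.ReplayBuffer()  # assumed interface

state = env.reset()
for t in range(10000):
    # Act with Gaussian exploration noise, clipped to the valid action range
    action = policy.select_action(np.array(state))
    action = (action + np.random.normal(0, 0.1, size=action_dim)).clip(-max_action, max_action)
    next_state, reward, done, _ = env.step(action)
    replay_buffer.add((state, next_state, action, reward, float(done)))
    state = env.reset() if done else next_state
    if t >= 1000:  # warm-up before learning
        policy.train(replay_buffer, iterations=1)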
TD3.py (new file)
@@ -0,0 +1,154 @@
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable  # PyTorch 0.3-era autograd API (pre-0.4)
import torch.nn.functional as F

import utils


# Implementation of Twin Delayed Deep Deterministic Policy Gradients (TD3)


def var(tensor, volatile=False):
    if torch.cuda.is_available():
        return Variable(tensor, volatile=volatile).cuda()
    else:
        return Variable(tensor, volatile=volatile)

class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()

        self.l1 = nn.Linear(state_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, action_dim)

        self.max_action = max_action

    def forward(self, x):
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        x = self.max_action * F.tanh(self.l3(x))
        return x


class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()

        # Q1 architecture
        self.l1 = nn.Linear(state_dim + action_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, 1)

        # Q2 architecture
        self.l4 = nn.Linear(state_dim + action_dim, 400)
        self.l5 = nn.Linear(400, 300)
        self.l6 = nn.Linear(300, 1)

    def forward(self, x, u):
        x1 = F.relu(self.l1(torch.cat([x, u], 1)))
        x1 = F.relu(self.l2(x1))
        x1 = self.l3(x1)

        x2 = F.relu(self.l4(torch.cat([x, u], 1)))
        x2 = F.relu(self.l5(x2))
        x2 = self.l6(x2)

        return x1, x2
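One note on the twin critic: the actor update below needs only the first head, yet forward() always computes both. The upstream sfujim/TD3 code includes a Q1-only helper for exactly this; a sketch of such a method, to sit inside Critic:

    def Q1(self, x, u):
        # Evaluate only the first critic head (used for the actor update)
        x1 = F.relu(self.l1(torch.cat([x, u], 1)))
        x1 = F.relu(self.l2(x1))
        return self.l3(x1)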

class TD3(object):
    def __init__(self, state_dim, action_dim, max_action):
        self.actor = Actor(state_dim, action_dim, max_action)
        self.actor_target = Actor(state_dim, action_dim, max_action)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

        self.critic = Critic(state_dim, action_dim)
        self.critic_target = Critic(state_dim, action_dim)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())

        if torch.cuda.is_available():
            self.actor = self.actor.cuda()
            self.actor_target = self.actor_target.cuda()
            self.critic = self.critic.cuda()
            self.critic_target = self.critic_target.cuda()

        self.criterion = nn.MSELoss()
        self.state_dim = state_dim
        self.max_action = max_action

    def select_action(self, state):
        state = var(torch.FloatTensor(state.reshape(-1, self.state_dim)), volatile=True)
        return self.actor(state).cpu().data.numpy().flatten()

    def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):

        for it in range(iterations):

            # Sample replay buffer
            x, y, u, r, d = replay_buffer.sample(batch_size)
            state = var(torch.FloatTensor(x))
            action = var(torch.FloatTensor(u))
            next_state = var(torch.FloatTensor(y), volatile=True)
            done = var(torch.FloatTensor(1 - d))  # masks out bootstrapping at terminal states
            reward = var(torch.FloatTensor(r))

            # Select action according to policy and add clipped noise
            # (target policy smoothing; noise drawn independently per action dimension)
            noise = np.clip(np.random.normal(0, policy_noise, size=u.shape), -noise_clip, noise_clip)
            next_action = self.actor_target(next_state) + var(torch.FloatTensor(noise))
            next_action = next_action.clamp(-self.max_action, self.max_action)

            # Q target = reward + discount * min(Qi(next_state, pi(next_state)))
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q.volatile = False
            target_Q = reward + (done * discount * target_Q)

            # Get current Q estimates
            current_Q1, current_Q2 = self.critic(state, action)

            # Compute critic loss (both heads regress to the same target)
            critic_loss = self.criterion(current_Q1, target_Q) + self.criterion(current_Q2, target_Q)

            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Delayed policy updates
            if it % policy_freq == 0:

                # Compute actor loss (maximize the first critic head only)
                Q1, Q2 = self.critic(state, self.actor(state))
                actor_loss = -Q1.mean()

                # Optimize the actor
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Update the frozen target models
                # (targets move only on delayed policy steps, as in the TD3 paper)
                for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def save(self, filename, directory):
        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))

    def load(self, filename, directory):
        self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
        self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))
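Training follows the same outer loop as the DDPG sketch above. For completeness, a short evaluation-and-checkpoint sketch; the directory is a placeholder, and the filename simply mirrors the learning-curve naming elsewhere in this commit:

def evaluate(policy, env, episodes=10):
    # Average undiscounted return over a few episodes, acting greedily
    total = 0.0
    for _ in range(episodes):
        state, done = env.reset(), False
        while not done:
            action = policy.select_action(np.array(state))
            state, reward, done, _ = env.step(action)
            total += reward
    return total / episodes

policy = TD3(state_dim, action_dim, max_action)
# ... fill the replay buffer and call policy.train(...) as in the DDPG sketch ...
print(evaluate(policy, env))
policy.save("TD3_InvertedDoublePendulum-v1_0", directory="./pytorch_models")
policy.load("TD3_InvertedDoublePendulum-v1_0", directory="./pytorch_models")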
Binary files added (learning curves, +1.65 KB each):
- learning_curves/InvertedDoublePendulum/TD3_InvertedDoublePendulum-v1_0.npy
- learning_curves/InvertedDoublePendulum/TD3_InvertedDoublePendulum-v1_1.npy
- learning_curves/InvertedDoublePendulum/TD3_InvertedDoublePendulum-v1_2.npy
- learning_curves/InvertedDoublePendulum/TD3_InvertedDoublePendulum-v1_3.npy
- learning_curves/InvertedDoublePendulum/TD3_InvertedDoublePendulum-v1_4.npy
- learning_curves/InvertedDoublePendulum/TD3_InvertedDoublePendulum-v1_5.npy
- learning_curves/InvertedDoublePendulum/TD3_InvertedDoublePendulum-v1_6.npy
- learning_curves/InvertedDoublePendulum/TD3_InvertedDoublePendulum-v1_7.npy
- learning_curves/InvertedDoublePendulum/TD3_InvertedDoublePendulum-v1_8.npy
- learning_curves/InvertedDoublePendulum/TD3_InvertedDoublePendulum-v1_9.npy
(The remaining binary files in this commit are not shown.)