Work on SAC and SAKC
Pdbz199 committed Jun 1, 2023
1 parent 32182a2 commit 01de0cb
Showing 17 changed files with 749 additions and 76 deletions.
12 changes: 8 additions & 4 deletions final/control/double_well/dynamics_env.py
@@ -54,13 +54,17 @@ def reset(self, seed=None, options={}):
 
         return self.state, {}
 
-    def step(self, action):
-        # Compute reward of system
-        reward = -cost(
-            np.vstack(self.state),
+    @staticmethod
+    def reward(state, action):
+        return -cost(
+            np.vstack(state),
             np.vstack(action)
         )[0, 0]
 
+    def step(self, action):
+        # Compute reward of system
+        reward = DoubleWell.reward(self.state, action)
+
         # Update state
         self.state = f(
             np.vstack(self.state),
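The change above pulls the reward computation out of step() into a static reward(state, action) method, so the reward can be evaluated for arbitrary (state, action) pairs without instantiating or stepping the environment. A minimal usage sketch, assuming final/control is on sys.path and using illustrative state/action dimensions:

import numpy as np
from double_well.dynamics_env import DoubleWell  # assumes final/control is on sys.path

# Rewards for a batch of logged (state, action) pairs, no env instance needed
states = np.random.rand(10, 2)    # hypothetical 2-D double-well state
actions = np.random.rand(10, 1)   # hypothetical 1-D control input
rewards = [DoubleWell.reward(s, a) for s, a in zip(states, actions)]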
1 change: 1 addition & 0 deletions final/control/fluid_flow/cost.py
@@ -1,6 +1,7 @@
 # Imports
 import numpy as np
 
 # from dynamics import state_dim #, action_dim
+from fluid_flow.dynamics import state_dim #, action_dim
 
 # Define cost/reward
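The package-qualified import lets cost.py be loaded from outside the fluid_flow directory (for example from the shared policy scripts), provided final/control is on sys.path. A hedged sketch of that import path; the relative path below is an assumption based on the directory layout visible in this commit:

import sys

# From final/control/policies/soft_actor_critic, two levels up is final/control,
# which contains the fluid_flow package directory (assumption).
sys.path.append('../../')

from fluid_flow.dynamics import state_dim   # the import added above
from fluid_flow.cost import cost            # cost() as used by dynamics_env.py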
12 changes: 8 additions & 4 deletions final/control/fluid_flow/dynamics_env.py
@@ -54,13 +54,17 @@ def reset(self, seed=None, options={}):
 
         return self.state, {}
 
-    def step(self, action):
-        # Compute reward of system
-        reward = -cost(
-            np.vstack(self.state),
+    @staticmethod
+    def reward(state, action):
+        return -cost(
+            np.vstack(state),
             np.vstack(action)
         )[0, 0]
 
+    def step(self, action):
+        # Compute reward of system
+        reward = FluidFlow.reward(self.state, action)
+
         # Update state
         self.state = f(
             np.vstack(self.state),
1 change: 1 addition & 0 deletions final/control/linear_system/cost.py
@@ -1,6 +1,7 @@
 # Imports
 import numpy as np
 
 # from dynamics import action_dim, state_dim
+from linear_system.dynamics import action_dim, state_dim
 
 # Define cost/reward
12 changes: 8 additions & 4 deletions final/control/linear_system/dynamics_env.py
@@ -62,13 +62,17 @@ def reset(self, seed=None, options={}):
 
         return self.state, {}
 
-    def step(self, action):
-        # Compute reward of system
-        reward = -cost(
-            np.vstack(self.state),
+    @staticmethod
+    def reward(state, action):
+        return -cost(
+            np.vstack(state),
             np.vstack(action)
         )[0, 0]
 
+    def step(self, action):
+        # Compute reward of system
+        reward = LinearSystem.reward(self.state, action)
+
         # Update state
         self.state = f(
             np.vstack(self.state),
1 change: 1 addition & 0 deletions final/control/lorenz/cost.py
@@ -1,6 +1,7 @@
 # Imports
 import numpy as np
 
 # from dynamics import state_dim, x_e, y_e, z_e #, action_dim
+from lorenz.dynamics import state_dim, x_e, y_e, z_e #, action_dim
 
 # Define cost/reward
12 changes: 8 additions & 4 deletions final/control/lorenz/dynamics_env.py
@@ -63,13 +63,17 @@ def reset(self, seed=None, options={"state": None}):
 
         return self.state, {}
 
-    def step(self, action):
-        # Compute reward of system
-        reward = -cost(
-            np.vstack(self.state),
+    @staticmethod
+    def reward(state, action):
+        return -cost(
+            np.vstack(state),
             np.vstack(action)
         )[0, 0]
 
+    def step(self, action):
+        # Compute reward of system
+        reward = Lorenz.reward(self.state, action)
+
         # Update state
         self.state = f(
             np.vstack(self.state),
43 changes: 23 additions & 20 deletions final/control/policies/soft_actor_critic/main.py
@@ -59,7 +59,7 @@
 
 # Environment
 # env = NormalizedActions(gym.make(args.env_name))
-env = gym.make(args.env_name)
+training_env = gym.make(args.env_name)
 # env.seed(args.seed)
 # env.action_space.seed(args.seed)
 sac_env = gym.make(args.env_name)
@@ -70,15 +70,22 @@
 torch.manual_seed(args.seed)
 np.random.seed(args.seed)
 
-# Agent
-agent = SAC(sac_env.observation_space.shape[0], sac_env.action_space, args)
-agent.load_checkpoint(ckpt_path=f"checkpoints/sac_checkpoint_{args.env_name}_")
-
 #%% Load LQR policy
 # Append to sys path for loading tensor and LQR policy
 sys.path.append('../../../../')
-with open('../../lorenz/analysis/tmp/lqr/policy.pickle', 'rb') as handle:
+system_name = "linear_system"
+
+# Load LQR policy
+with open(f'../../{system_name}/analysis/tmp/lqr/policy.pickle', 'rb') as handle:
     lqr_policy = pickle.load(handle)
 
+# Load Koopman tensor with pickle
+with open(f'../../{system_name}/analysis/tmp/path_based_tensor.pickle', 'rb') as handle:
+    tensor = pickle.load(handle)
+
+# Agent
+agent = SAC(training_env, args)
+# agent.load_checkpoint(ckpt_path=f"checkpoints/sac_checkpoint_{args.env_name}_")
+
 # Tensorboard
 writer = SummaryWriter('runs/{}_SAC_{}_{}_{}'.format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
     args.policy, "autotune" if args.automatic_entropy_tuning else ""))
@@ -95,12 +102,12 @@
     episode_reward = 0
     episode_steps = 0
     done = False
-    state, _ = env.reset()
+    state, _ = sac_env.reset()
     # done = True
 
     while not done:
         if args.start_steps > total_numsteps:
-            action = env.action_space.sample() # Sample random action
+            action = sac_env.action_space.sample() # Sample random action
         else:
             action = agent.select_action(state) # Sample action from policy
 
@@ -117,14 +124,14 @@
                 writer.add_scalar('entropy_temprature/alpha', alpha, updates)
                 updates += 1
 
-        next_state, reward, done, _, __ = env.step(action) # Step
+        next_state, reward, done, _, __ = sac_env.step(action) # Step
         episode_steps += 1
         total_numsteps += 1
         episode_reward += reward
 
         # Ignore the "done" signal if it comes from hitting the time horizon.
         # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
-        mask = 1 if episode_steps == env._max_episode_steps else float(not done)
+        mask = 1 if episode_steps == sac_env._max_episode_steps else float(not done)
 
         memory.push(state, action, reward, next_state, mask) # Append transition to memory
 
@@ -136,18 +143,17 @@
     writer.add_scalar('reward/train', episode_reward, i_episode)
     print("Episode: {}, total numsteps: {}, episode steps: {}, reward: {}".format(i_episode, total_numsteps, episode_steps, round(episode_reward, 2)))
 
-    if i_episode % 10 == 0: # or True
-        # agent.save_checkpoint(args.env_name)
+    if i_episode % 10 == 0:# or True:
+        agent.save_checkpoint(args.env_name)
 
     if args.eval is True:
-        # avg_reward = 0
         sac_avg_reward = 0
-        lqr_avg_reward = 0
+        # lqr_avg_reward = 0
        episodes = 200
         # episodes = 1
 
         for _ in range(episodes):
-            # initial_state = np.array([0, 0, 0])
+            # initial_state = np.array([10, 10, 10])
 
             # sac_env.reset(options={"state": initial_state})
             sac_env.reset()
@@ -165,10 +171,6 @@
                 sac_action = agent.select_action(sac_state, evaluate=True)
                 # lqr_action = lqr_policy.get_action(np.vstack(lqr_state))[0]
 
-                # next_state, reward, done, _, __ = env.step(action)
-                # env.render()
-                # episode_reward += reward
-
                 sac_state, sac_reward, done, _, __ = sac_env.step(sac_action)
                 # sac_env.render()
                 sac_episode_reward += sac_reward
@@ -179,6 +181,7 @@
 
             # print("SAC Reward:", sac_episode_reward)
             # print("LQR Reward", lqr_episode_reward, "\n")
+
             sac_avg_reward += sac_episode_reward
             # lqr_avg_reward += lqr_episode_reward
         sac_avg_reward /= episodes
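The evaluation block that main.py is converging toward compares the trained SAC policy against the pickled LQR policy on the same system. A condensed sketch of that comparison as a function; it reuses only calls visible in the diff (agent.select_action, the 5-tuple env.step, lqr_policy.get_action) and assumes a separate environment instance is created for the LQR rollout:

import numpy as np

def evaluate(agent, sac_env, lqr_env, lqr_policy, episodes=200):
    """Average per-episode reward for the SAC agent and the pickled LQR policy."""
    sac_avg_reward, lqr_avg_reward = 0.0, 0.0

    for _ in range(episodes):
        sac_state, _ = sac_env.reset()
        lqr_state, _ = lqr_env.reset()
        sac_episode_reward, lqr_episode_reward = 0.0, 0.0

        # Roll out the SAC policy deterministically
        sac_done = False
        while not sac_done:
            sac_action = agent.select_action(sac_state, evaluate=True)
            sac_state, sac_reward, sac_done, _, __ = sac_env.step(sac_action)
            sac_episode_reward += sac_reward

        # Roll out the pickled LQR policy on a second env instance
        lqr_done = False
        while not lqr_done:
            lqr_action = lqr_policy.get_action(np.vstack(lqr_state))[0]
            lqr_state, lqr_reward, lqr_done, _, __ = lqr_env.step(lqr_action)
            lqr_episode_reward += lqr_reward

        sac_avg_reward += sac_episode_reward
        lqr_avg_reward += lqr_episode_reward

    return sac_avg_reward / episodes, lqr_avg_reward / episodes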
7 changes: 4 additions & 3 deletions final/control/policies/soft_actor_critic/model.py
@@ -1,3 +1,4 @@
+import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -32,16 +33,16 @@ def forward(self, state):
         return x
 
 class QNetwork(nn.Module):
-    def __init__(self, num_inputs, num_actions, hidden_dim):
+    def __init__(self, state_dim, action_dim, hidden_dim):
         super(QNetwork, self).__init__()
 
         # Q1 architecture
-        self.linear1 = nn.Linear(num_inputs + num_actions, hidden_dim)
+        self.linear1 = nn.Linear(state_dim + action_dim, hidden_dim)
         self.linear2 = nn.Linear(hidden_dim, hidden_dim)
         self.linear3 = nn.Linear(hidden_dim, 1)
 
         # Q2 architecture
-        self.linear4 = nn.Linear(num_inputs + num_actions, hidden_dim)
+        self.linear4 = nn.Linear(state_dim + action_dim, hidden_dim)
         self.linear5 = nn.Linear(hidden_dim, hidden_dim)
         self.linear6 = nn.Linear(hidden_dim, 1)
 
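The renamed constructor arguments make the twin-Q layout explicit: both Q heads consume a concatenated (state, action) vector of size state_dim + action_dim. A self-contained sketch of that architecture; the forward pass is inferred from the layer shapes rather than copied from the repository:

import torch
import torch.nn as nn
import torch.nn.functional as F

class TwinQSketch(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim):
        super().__init__()
        # Q1 head
        self.linear1 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)
        # Q2 head
        self.linear4 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.linear5 = nn.Linear(hidden_dim, hidden_dim)
        self.linear6 = nn.Linear(hidden_dim, 1)

    def forward(self, state, action):
        xu = torch.cat([state, action], dim=1)   # (batch, state_dim + action_dim)
        q1 = self.linear3(F.relu(self.linear2(F.relu(self.linear1(xu)))))
        q2 = self.linear6(F.relu(self.linear5(F.relu(self.linear4(xu)))))
        return q1, q2

# Example: batch of 32 transitions for a 3-D state, 1-D action system
q_net = TwinQSketch(state_dim=3, action_dim=1, hidden_dim=256)
q1, q2 = q_net(torch.randn(32, 3), torch.randn(32, 1))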
26 changes: 20 additions & 6 deletions final/control/policies/soft_actor_critic/sac.py
@@ -3,10 +3,19 @@
 import torch.nn.functional as F
 from torch.optim import Adam
 from utils import soft_update, hard_update
-from model import GaussianPolicy, QNetwork, DeterministicPolicy
+from model import (
+    GaussianPolicy,
+    QNetwork,
+    DeterministicPolicy
+)
 
 class SAC(object):
-    def __init__(self, num_inputs, action_space, args):
+    def __init__(self, env, args):
+
+        self.env = env
+
+        state_dim = env.observation_space.shape[0]
+        action_space = env.action_space
 
         self.gamma = args.gamma
         self.tau = args.tau
@@ -18,10 +27,10 @@ def __init__(self, num_inputs, action_space, args):
 
         self.device = torch.device("cuda" if args.cuda else "cpu")
 
-        self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device)
+        self.critic = QNetwork(state_dim, action_space.shape[0], args.hidden_size).to(device=self.device)
         self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
 
-        self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
+        self.critic_target = QNetwork(state_dim, action_space.shape[0], args.hidden_size).to(device=self.device)
         hard_update(self.critic_target, self.critic)
 
         if self.policy_type == "Gaussian":
@@ -31,13 +40,18 @@
                 self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
                 self.alpha_optim = Adam([self.log_alpha], lr=args.lr)
 
-            self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
+            self.policy = GaussianPolicy(state_dim, action_space.shape[0], args.hidden_size, action_space).to(self.device)
             self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
 
         else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
-            self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
+            self.policy = DeterministicPolicy(
+                state_dim,
+                action_space.shape[0],
+                args.hidden_size,
+                action_space
+            ).to(self.device)
             self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
 
     def select_action(self, state, evaluate=False):
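With this change the constructor takes the environment itself and derives state_dim and action_space from it, which is what allows main.py to build the agent as SAC(training_env, args). A sketch of the call-site difference, reusing names already defined in main.py (args is the parsed argparse namespace with gamma, tau, lr, hidden_size, policy, automatic_entropy_tuning, cuda, etc.):

training_env = gym.make(args.env_name)

# Old signature: dimensions and action space passed in explicitly
# agent = SAC(training_env.observation_space.shape[0], training_env.action_space, args)

# New signature: the agent stores the env and reads
# env.observation_space.shape[0] and env.action_space itself
agent = SAC(training_env, args)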