Work on SAC and SAKC
Pdbz199 committed Jun 1, 2023
1 parent 32182a2 commit 01de0cb
Showing 17 changed files with 749 additions and 76 deletions.
12 changes: 8 additions & 4 deletions final/control/double_well/dynamics_env.py
@@ -54,13 +54,17 @@ def reset(self, seed=None, options={}):
 
         return self.state, {}
 
-    def step(self, action):
-        # Compute reward of system
-        reward = -cost(
-            np.vstack(self.state),
+    @staticmethod
+    def reward(state, action):
+        return -cost(
+            np.vstack(state),
             np.vstack(action)
         )[0, 0]
 
+    def step(self, action):
+        # Compute reward of system
+        reward = DoubleWell.reward(self.state, action)
+
         # Update state
         self.state = f(
             np.vstack(self.state),
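The change above pulls the reward computation out of step() into a static reward(state, action) method, so the reward can be evaluated for arbitrary (state, action) pairs without instantiating or stepping the environment. A minimal usage sketch, assuming final/control is on sys.path and using illustrative state/action dimensions:

import numpy as np
from double_well.dynamics_env import DoubleWell  # assumes final/control is on sys.path

# Rewards for a batch of logged (state, action) pairs, no env instance needed
states = np.random.rand(10, 2)    # hypothetical 2-D double-well state
actions = np.random.rand(10, 1)   # hypothetical 1-D control input
rewards = [DoubleWell.reward(s, a) for s, a in zip(states, actions)]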
1 change: 1 addition & 0 deletions final/control/fluid_flow/cost.py
@@ -1,6 +1,7 @@
 # Imports
 import numpy as np
 
 # from dynamics import state_dim #, action_dim
+from fluid_flow.dynamics import state_dim #, action_dim
 
 # Define cost/reward
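The package-qualified import lets cost.py be loaded from outside the fluid_flow directory (for example from the shared policy scripts), provided final/control is on sys.path. A hedged sketch of that import path; the relative path below is an assumption based on the directory layout visible in this commit:

import sys

# From final/control/policies/soft_actor_critic, two levels up is final/control,
# which contains the fluid_flow package directory (assumption).
sys.path.append('../../')

from fluid_flow.dynamics import state_dim   # the import added above
from fluid_flow.cost import cost            # cost() as used by dynamics_env.py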
12 changes: 8 additions & 4 deletions final/control/fluid_flow/dynamics_env.py
@@ -54,13 +54,17 @@ def reset(self, seed=None, options={}):
 
         return self.state, {}
 
-    def step(self, action):
-        # Compute reward of system
-        reward = -cost(
-            np.vstack(self.state),
+    @staticmethod
+    def reward(state, action):
+        return -cost(
+            np.vstack(state),
             np.vstack(action)
         )[0, 0]
 
+    def step(self, action):
+        # Compute reward of system
+        reward = FluidFlow.reward(self.state, action)
+
         # Update state
         self.state = f(
             np.vstack(self.state),
1 change: 1 addition & 0 deletions final/control/linear_system/cost.py
@@ -1,6 +1,7 @@
 # Imports
 import numpy as np
 
 # from dynamics import action_dim, state_dim
+from linear_system.dynamics import action_dim, state_dim
 
 # Define cost/reward
12 changes: 8 additions & 4 deletions final/control/linear_system/dynamics_env.py
@@ -62,13 +62,17 @@ def reset(self, seed=None, options={}):
 
         return self.state, {}
 
-    def step(self, action):
-        # Compute reward of system
-        reward = -cost(
-            np.vstack(self.state),
+    @staticmethod
+    def reward(state, action):
+        return -cost(
+            np.vstack(state),
             np.vstack(action)
         )[0, 0]
 
+    def step(self, action):
+        # Compute reward of system
+        reward = LinearSystem.reward(self.state, action)
+
         # Update state
         self.state = f(
             np.vstack(self.state),
1 change: 1 addition & 0 deletions final/control/lorenz/cost.py
@@ -1,6 +1,7 @@
 # Imports
 import numpy as np
 
 # from dynamics import state_dim, x_e, y_e, z_e #, action_dim
+from lorenz.dynamics import state_dim, x_e, y_e, z_e #, action_dim
 
 # Define cost/reward
12 changes: 8 additions & 4 deletions final/control/lorenz/dynamics_env.py
@@ -63,13 +63,17 @@ def reset(self, seed=None, options={"state": None}):
 
         return self.state, {}
 
-    def step(self, action):
-        # Compute reward of system
-        reward = -cost(
-            np.vstack(self.state),
+    @staticmethod
+    def reward(state, action):
+        return -cost(
+            np.vstack(state),
             np.vstack(action)
         )[0, 0]
 
+    def step(self, action):
+        # Compute reward of system
+        reward = Lorenz.reward(self.state, action)
+
         # Update state
         self.state = f(
             np.vstack(self.state),
43 changes: 23 additions & 20 deletions final/control/policies/soft_actor_critic/main.py
@@ -59,7 +59,7 @@
 
 # Environment
 # env = NormalizedActions(gym.make(args.env_name))
-env = gym.make(args.env_name)
+training_env = gym.make(args.env_name)
 # env.seed(args.seed)
 # env.action_space.seed(args.seed)
 sac_env = gym.make(args.env_name)
@@ -70,15 +70,22 @@
 torch.manual_seed(args.seed)
 np.random.seed(args.seed)
 
-# Agent
-agent = SAC(sac_env.observation_space.shape[0], sac_env.action_space, args)
-agent.load_checkpoint(ckpt_path=f"checkpoints/sac_checkpoint_{args.env_name}_")
-
 #%% Load LQR policy
 # Append to sys path for loading tensor and LQR policy
 sys.path.append('../../../../')
-with open('../../lorenz/analysis/tmp/lqr/policy.pickle', 'rb') as handle:
+system_name = "linear_system"
+
+# Load LQR policy
+with open(f'../../{system_name}/analysis/tmp/lqr/policy.pickle', 'rb') as handle:
     lqr_policy = pickle.load(handle)
 
+# Load Koopman tensor with pickle
+with open(f'../../{system_name}/analysis/tmp/path_based_tensor.pickle', 'rb') as handle:
+    tensor = pickle.load(handle)
+
+# Agent
+agent = SAC(training_env, args)
+# agent.load_checkpoint(ckpt_path=f"checkpoints/sac_checkpoint_{args.env_name}_")
+
 # Tensorboard
 writer = SummaryWriter('runs/{}_SAC_{}_{}_{}'.format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
     args.policy, "autotune" if args.automatic_entropy_tuning else ""))
@@ -95,12 +102,12 @@
     episode_reward = 0
     episode_steps = 0
     done = False
-    state, _ = env.reset()
+    state, _ = sac_env.reset()
     # done = True
 
     while not done:
         if args.start_steps > total_numsteps:
-            action = env.action_space.sample() # Sample random action
+            action = sac_env.action_space.sample() # Sample random action
         else:
             action = agent.select_action(state) # Sample action from policy
 
@@ -117,14 +124,14 @@
                 writer.add_scalar('entropy_temprature/alpha', alpha, updates)
                 updates += 1
 
-        next_state, reward, done, _, __ = env.step(action) # Step
+        next_state, reward, done, _, __ = sac_env.step(action) # Step
         episode_steps += 1
         total_numsteps += 1
         episode_reward += reward
 
         # Ignore the "done" signal if it comes from hitting the time horizon.
         # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
-        mask = 1 if episode_steps == env._max_episode_steps else float(not done)
+        mask = 1 if episode_steps == sac_env._max_episode_steps else float(not done)
 
         memory.push(state, action, reward, next_state, mask) # Append transition to memory
 
@@ -136,18 +143,17 @@
     writer.add_scalar('reward/train', episode_reward, i_episode)
     print("Episode: {}, total numsteps: {}, episode steps: {}, reward: {}".format(i_episode, total_numsteps, episode_steps, round(episode_reward, 2)))
 
-    if i_episode % 10 == 0: # or True
-        # agent.save_checkpoint(args.env_name)
+    if i_episode % 10 == 0:# or True:
+        agent.save_checkpoint(args.env_name)
 
     if args.eval is True:
-        # avg_reward = 0
         sac_avg_reward = 0
-        lqr_avg_reward = 0
+        # lqr_avg_reward = 0
        episodes = 200
         # episodes = 1
 
         for _ in range(episodes):
-            # initial_state = np.array([0, 0, 0])
+            # initial_state = np.array([10, 10, 10])
 
             # sac_env.reset(options={"state": initial_state})
             sac_env.reset()
@@ -165,10 +171,6 @@
                 sac_action = agent.select_action(sac_state, evaluate=True)
                 # lqr_action = lqr_policy.get_action(np.vstack(lqr_state))[0]
 
-                # next_state, reward, done, _, __ = env.step(action)
-                # env.render()
-                # episode_reward += reward
-
                 sac_state, sac_reward, done, _, __ = sac_env.step(sac_action)
                 # sac_env.render()
                 sac_episode_reward += sac_reward
@@ -179,6 +181,7 @@
 
             # print("SAC Reward:", sac_episode_reward)
             # print("LQR Reward", lqr_episode_reward, "\n")
+
             sac_avg_reward += sac_episode_reward
             # lqr_avg_reward += lqr_episode_reward
         sac_avg_reward /= episodes
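The evaluation block that main.py is converging toward compares the trained SAC policy against the pickled LQR policy on the same system. A condensed sketch of that comparison as a function; it reuses only calls visible in the diff (agent.select_action, the 5-tuple env.step, lqr_policy.get_action) and assumes a separate environment instance is created for the LQR rollout:

import numpy as np

def evaluate(agent, sac_env, lqr_env, lqr_policy, episodes=200):
    """Average per-episode reward for the SAC agent and the pickled LQR policy."""
    sac_avg_reward, lqr_avg_reward = 0.0, 0.0

    for _ in range(episodes):
        sac_state, _ = sac_env.reset()
        lqr_state, _ = lqr_env.reset()
        sac_episode_reward, lqr_episode_reward = 0.0, 0.0

        # Roll out the SAC policy deterministically
        sac_done = False
        while not sac_done:
            sac_action = agent.select_action(sac_state, evaluate=True)
            sac_state, sac_reward, sac_done, _, __ = sac_env.step(sac_action)
            sac_episode_reward += sac_reward

        # Roll out the pickled LQR policy on a second env instance
        lqr_done = False
        while not lqr_done:
            lqr_action = lqr_policy.get_action(np.vstack(lqr_state))[0]
            lqr_state, lqr_reward, lqr_done, _, __ = lqr_env.step(lqr_action)
            lqr_episode_reward += lqr_reward

        sac_avg_reward += sac_episode_reward
        lqr_avg_reward += lqr_episode_reward

    return sac_avg_reward / episodes, lqr_avg_reward / episodes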
7 changes: 4 additions & 3 deletions final/control/policies/soft_actor_critic/model.py
@@ -1,3 +1,4 @@
+import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -32,16 +33,16 @@ def forward(self, state):
         return x
 
 class QNetwork(nn.Module):
-    def __init__(self, num_inputs, num_actions, hidden_dim):
+    def __init__(self, state_dim, action_dim, hidden_dim):
         super(QNetwork, self).__init__()
 
         # Q1 architecture
-        self.linear1 = nn.Linear(num_inputs + num_actions, hidden_dim)
+        self.linear1 = nn.Linear(state_dim + action_dim, hidden_dim)
         self.linear2 = nn.Linear(hidden_dim, hidden_dim)
         self.linear3 = nn.Linear(hidden_dim, 1)
 
         # Q2 architecture
-        self.linear4 = nn.Linear(num_inputs + num_actions, hidden_dim)
+        self.linear4 = nn.Linear(state_dim + action_dim, hidden_dim)
         self.linear5 = nn.Linear(hidden_dim, hidden_dim)
         self.linear6 = nn.Linear(hidden_dim, 1)
 
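The renamed constructor arguments make the twin-Q layout explicit: both Q heads consume a concatenated (state, action) vector of size state_dim + action_dim. A self-contained sketch of that architecture; the forward pass is inferred from the layer shapes rather than copied from the repository:

import torch
import torch.nn as nn
import torch.nn.functional as F

class TwinQSketch(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim):
        super().__init__()
        # Q1 head
        self.linear1 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)
        # Q2 head
        self.linear4 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.linear5 = nn.Linear(hidden_dim, hidden_dim)
        self.linear6 = nn.Linear(hidden_dim, 1)

    def forward(self, state, action):
        xu = torch.cat([state, action], dim=1)   # (batch, state_dim + action_dim)
        q1 = self.linear3(F.relu(self.linear2(F.relu(self.linear1(xu)))))
        q2 = self.linear6(F.relu(self.linear5(F.relu(self.linear4(xu)))))
        return q1, q2

# Example: batch of 32 transitions for a 3-D state, 1-D action system
q_net = TwinQSketch(state_dim=3, action_dim=1, hidden_dim=256)
q1, q2 = q_net(torch.randn(32, 3), torch.randn(32, 1))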
26 changes: 20 additions & 6 deletions final/control/policies/soft_actor_critic/sac.py
@@ -3,10 +3,19 @@
 import torch.nn.functional as F
 from torch.optim import Adam
 from utils import soft_update, hard_update
-from model import GaussianPolicy, QNetwork, DeterministicPolicy
+from model import (
+    GaussianPolicy,
+    QNetwork,
+    DeterministicPolicy
+)
 
 class SAC(object):
-    def __init__(self, num_inputs, action_space, args):
+    def __init__(self, env, args):
+
+        self.env = env
+
+        state_dim = env.observation_space.shape[0]
+        action_space = env.action_space
 
         self.gamma = args.gamma
         self.tau = args.tau
@@ -18,10 +27,10 @@ def __init__(self, num_inputs, action_space, args):
 
         self.device = torch.device("cuda" if args.cuda else "cpu")
 
-        self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device)
+        self.critic = QNetwork(state_dim, action_space.shape[0], args.hidden_size).to(device=self.device)
         self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
 
-        self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
+        self.critic_target = QNetwork(state_dim, action_space.shape[0], args.hidden_size).to(device=self.device)
         hard_update(self.critic_target, self.critic)
 
         if self.policy_type == "Gaussian":
@@ -31,13 +40,18 @@
                 self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
                 self.alpha_optim = Adam([self.log_alpha], lr=args.lr)
 
-            self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
+            self.policy = GaussianPolicy(state_dim, action_space.shape[0], args.hidden_size, action_space).to(self.device)
             self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
 
         else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
-            self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
+            self.policy = DeterministicPolicy(
+                state_dim,
+                action_space.shape[0],
+                args.hidden_size,
+                action_space
+            ).to(self.device)
             self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
 
     def select_action(self, state, evaluate=False):
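With this change the constructor takes the environment itself and derives state_dim and action_space from it, which is what allows main.py to build the agent as SAC(training_env, args). A sketch of the call-site difference, reusing names already defined in main.py (args is the parsed argparse namespace with gamma, tau, lr, hidden_size, policy, automatic_entropy_tuning, cuda, etc.):

training_env = gym.make(args.env_name)

# Old signature: dimensions and action space passed in explicitly
# agent = SAC(training_env.observation_space.shape[0], training_env.action_space, args)

# New signature: the agent stores the env and reads
# env.observation_space.shape[0] and env.action_space itself
agent = SAC(training_env, args)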