Replaced V(x) weights with value iteration method + system envs have static methods to compute rewards and costs
Pdbz199 committed Jun 25, 2023
1 parent ef2f1cf commit 6610883
Showing 5 changed files with 149 additions and 74 deletions.
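
Each environment now exposes its cost and reward as static methods with a vstack flag, so callers can score either a single (state, action) pair or a pre-shaped batch. A minimal usage sketch, assuming the repo is importable as a package and a 3-dimensional state (both assumptions for illustration; only the signatures come from the diffs below):

import numpy as np

from final.control.linear_system.dynamics_env import LinearSystem  # import path assumed from the file layout

# Single pair: vstack=True (the default) stacks the inputs into column vectors
# before calling cost(), which returns a 1x1 matrix; step() reduces it with [0, 0].
state = np.array([0.3, -0.1, 0.7])   # state_dim assumed to be 3 here
action = np.array([0.05])
reward = LinearSystem.reward_fn(state, action)[0, 0]

# Batched call: vstack=False passes the arrays straight through to cost(),
# which is how sac.py scores every candidate action against a batch of states.
x_batch = np.random.rand(3, 32)      # (state_dim, batch_size)
all_actions = np.random.rand(1, 7)   # (action_dim, number of candidate actions), layout assumed
costs = LinearSystem.cost_fn(x_batch, all_actions, vstack=False)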
16 changes: 10 additions & 6 deletions final/control/double_well/dynamics_env.py
@@ -55,15 +55,19 @@ def reset(self, seed=None, options={}):
         return self.state, {}
 
     @staticmethod
-    def reward(state, action):
-        return -cost(
-            np.vstack(state),
-            np.vstack(action)
-        )[0, 0]
+    def cost_fn(state, action, vstack=True):
+        return cost(
+            np.vstack(state) if vstack else state,
+            np.vstack(action) if vstack else action
+        )
+
+    @staticmethod
+    def reward_fn(state, action, vstack=True):
+        return -DoubleWell.cost_fn(state, action, vstack=vstack)
 
     def step(self, action):
         # Compute reward of system
-        reward = DoubleWell.reward(self.state, action)
+        reward = DoubleWell.reward_fn(self.state, action)[0, 0]
 
         # Update state
         self.state = f(
16 changes: 10 additions & 6 deletions final/control/fluid_flow/dynamics_env.py
@@ -55,15 +55,19 @@ def reset(self, seed=None, options={}):
         return self.state, {}
 
     @staticmethod
-    def reward(state, action):
-        return -cost(
-            np.vstack(state),
-            np.vstack(action)
-        )[0, 0]
+    def cost_fn(state, action, vstack=True):
+        return cost(
+            np.vstack(state) if vstack else state,
+            np.vstack(action) if vstack else action
+        )
+
+    @staticmethod
+    def reward_fn(state, action, vstack=True):
+        return -FluidFlow.cost_fn(state, action, vstack=vstack)
 
     def step(self, action):
         # Compute reward of system
-        reward = FluidFlow.reward(self.state, action)
+        reward = FluidFlow.reward_fn(self.state, action)[0, 0]
 
         # Update state
         self.state = f(
14 changes: 9 additions & 5 deletions final/control/linear_system/dynamics_env.py
@@ -63,15 +63,19 @@ def reset(self, seed=None, options={}):
         return self.state, {}
 
     @staticmethod
-    def reward(state, action):
-        return -cost(
-            np.vstack(state),
-            np.vstack(action)
+    def cost_fn(state, action, vstack=True):
+        return cost(
+            np.vstack(state) if vstack else state,
+            np.vstack(action) if vstack else action
         )
 
+    @staticmethod
+    def reward_fn(state, action, vstack=True):
+        return -LinearSystem.cost_fn(state, action, vstack=vstack)
+
     def step(self, action):
         # Compute reward of system
-        reward = LinearSystem.reward(self.state, action)[0, 0]
+        reward = LinearSystem.reward_fn(self.state, action)[0, 0]
 
         # Update state
         self.state = f(
16 changes: 10 additions & 6 deletions final/control/lorenz/dynamics_env.py
@@ -64,15 +64,19 @@ def reset(self, seed=None, options={"state": None}):
         return self.state, {}
 
     @staticmethod
-    def reward(state, action):
-        return -cost(
-            np.vstack(state),
-            np.vstack(action)
-        )[0, 0]
+    def cost_fn(state, action, vstack=True):
+        return cost(
+            np.vstack(state) if vstack else state,
+            np.vstack(action) if vstack else action
+        )
+
+    @staticmethod
+    def reward_fn(state, action, vstack=True):
+        return -Lorenz.cost_fn(state, action, vstack=vstack)
 
     def step(self, action):
         # Compute reward of system
-        reward = Lorenz.reward(self.state, action)
+        reward = Lorenz.reward_fn(self.state, action)[0, 0]
 
         # Update state
         self.state = f(
161 changes: 110 additions & 51 deletions final/control/policies/soft_actor_koopman_critic_test/sac.py
@@ -14,7 +14,8 @@
     QNetwork
 )
 
-epsilon = np.finfo(np.float64).eps
+# epsilon = np.finfo(np.float64).eps
+epsilon = np.finfo(np.float32).eps
 
 # Load LQR policy
 sys.path.append('../../../../')
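
The float32 machine epsilon is about nine orders of magnitude larger than the float64 one, so the additive fudge factor in the softmax below is far less likely to vanish; lining up with torch's default float32 tensors is a presumed motivation, not one stated in the commit:

import numpy as np

print(np.finfo(np.float64).eps)  # 2.220446049250313e-16
print(np.finfo(np.float32).eps)  # 1.1920928955078125e-07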
@@ -99,61 +100,119 @@ def update_critic_weights(self, x_batch, u_batch, r_batch, x_prime_batch):
        Update the weights for the value function in the dictionary space.
        """

        # Get batch size from input
        batch_size = x_batch.shape[0]

        """ SAC method """

        if False:
            with torch.no_grad():
                # Get actions and log probabilities from current policy
                u_prime_batch, log_prob_prime_batch, _ = self.policy.sample(x_prime_batch)

                # Prepare batches for numpy functions
                numpy_x_batch = x_batch.numpy().T # (state_dim, batch_size)
                numpy_u_batch = u_batch.numpy().T # (action_dim, batch_size)
                numpy_r_batch = r_batch.numpy().T # (1, batch_size)
                numpy_x_prime_batch = x_prime_batch.numpy().T # (state_dim, batch_size)

                numpy_u_prime_batch = u_prime_batch.numpy().T # (1, batch_size)
                numpy_log_prob_prime_batch = log_prob_prime_batch.numpy().T # (1, batch_size)

                # Compute rewards and expected phi(x')s
                r_prime_batch = torch.zeros((batch_size, 1))
                # expected_phi_x_prime_batch = np.zeros((self.koopman_tensor.Phi_X.shape[0], batch_size))
                for i in range(batch_size):
                    r_prime_batch[i, 0] = self.env.reward(
                        np.vstack(numpy_x_prime_batch[:, i]),
                        np.vstack(numpy_u_prime_batch[:, i])
                    )[0, 0]

                    # expected_phi_x_prime_batch[:, i] = self.koopman_tensor.phi_f(
                    #     np.vstack(numpy_x_batch[:, i]),
                    #     np.vstack(numpy_u_batch[:, i])
                    # )[:, 0]
                # normalized_r_prime_batch = (r_prime_batch - r_prime_batch.mean()) / (r_prime_batch.std() + epsilon)
                phi_x_batch = self.koopman_tensor.phi(numpy_x_batch) # (phi_dim, batch_size)

                # Compute target Q(s', a')
                target_Q_x_u_prime_batch = self.critic_target(r_prime_batch, x_prime_batch, u_prime_batch).numpy() - \
                    self.alpha*numpy_log_prob_prime_batch # (1, batch_size)
                # target_Q_x_u_prime_batch = self.critic_target(normalized_r_prime_batch, x_prime_batch, u_prime_batch).numpy() - \
                #     self.alpha*numpy_log_prob_prime_batch # (1, batch_size)
                Q_x_u_prime_batch = numpy_r_batch + self.gamma*target_Q_x_u_prime_batch # (1, batch_size)

                # Update value function weights
                self.critic.w = torch.linalg.lstsq(
                    # torch.Tensor(expected_phi_x_prime_batch.T),
                    torch.Tensor(phi_x_batch.T),
                    torch.Tensor((Q_x_u_prime_batch - numpy_r_batch).T)
                ).solution

                # norms = np.linalg.norm(
                #     (Q_x_u_prime_batch - numpy_r_batch) - (self.critic.w.numpy().T @ expected_phi_x_prime_batch),
                #     axis=0
                # ) / np.linalg.norm(
                #     Q_x_u_prime_batch - numpy_r_batch,
                #     axis=0
                # ).mean()
                # print(norms.mean())

        """ Value iteration method """

        with torch.no_grad():
            # Get batch size from input
            batch_size = x_batch.shape[0]

            # Get actions and log probabilities from current policy
            u_prime_batch, log_prob_prime_batch, _ = self.policy.sample(x_prime_batch)

            # Prepare batches for numpy functions
            numpy_x_batch = x_batch.numpy().T # (state_dim, batch_size)
            numpy_u_batch = u_batch.numpy().T # (action_dim, batch_size)
            numpy_r_batch = r_batch.numpy().T # (1, batch_size)
            numpy_x_prime_batch = x_prime_batch.numpy().T # (state_dim, batch_size)

            numpy_u_prime_batch = u_prime_batch.numpy().T # (1, batch_size)
            numpy_log_prob_prime_batch = log_prob_prime_batch.numpy().T # (1, batch_size)

            # Compute rewards and expected phi(x')s
            r_prime_batch = torch.zeros((batch_size, 1))
            # expected_phi_x_prime_batch = np.zeros((self.koopman_tensor.Phi_X.shape[0], batch_size))
            for i in range(batch_size):
                r_prime_batch[i, 0] = self.env.reward(
                    np.vstack(numpy_x_prime_batch[:, i]),
                    np.vstack(numpy_u_prime_batch[:, i])
                )[0, 0]

                # expected_phi_x_prime_batch[:, i] = self.koopman_tensor.phi_f(
                #     np.vstack(numpy_x_batch[:, i]),
                #     np.vstack(numpy_u_batch[:, i])
                # )[:, 0]
            # normalized_r_prime_batch = (r_prime_batch - r_prime_batch.mean()) / (r_prime_batch.std() + epsilon)
            phi_x_batch = self.koopman_tensor.phi(numpy_x_batch) # (phi_dim, batch_size)

            # Compute target Q(s', a')
            target_Q_x_u_prime_batch = self.critic_target(r_prime_batch, x_prime_batch, u_prime_batch).numpy() - \
                self.alpha*numpy_log_prob_prime_batch # (1, batch_size)
            # target_Q_x_u_prime_batch = self.critic_target(normalized_r_prime_batch, x_prime_batch, u_prime_batch).numpy() - \
            #     self.alpha*numpy_log_prob_prime_batch # (1, batch_size)
            Q_x_u_prime_batch = numpy_r_batch + self.gamma*target_Q_x_u_prime_batch # (1, batch_size)

            # Update value function weights
            # Get random batch of X and Phi_X from tensor training data
            x_batch_indices = np.random.choice(
                self.koopman_tensor.X.shape[1],
                batch_size,
                replace=False
            )
            x_batch = self.koopman_tensor.X[:, x_batch_indices] # (X.shape[0], batch_size)
            phi_x_batch = self.koopman_tensor.Phi_X[:, x_batch_indices] # (dim_phi, batch_size)

            # Compute costs indexed by the action and the state
            costs = torch.Tensor(self.env.cost_fn(x_batch, self.all_actions, vstack=False)) # (all_actions.shape[1], batch_size)

            # Compute V(x')s
            K_us = self.koopman_tensor.K_(self.all_actions) # (all_actions.shape[1], phi_dim, phi_dim)
            phi_x_prime_batch = np.zeros((self.all_actions.shape[1], self.koopman_tensor.phi_dim, batch_size))
            V_x_prime_batch = torch.zeros((self.all_actions.shape[1], batch_size))
            for action_index in range(phi_x_prime_batch.shape[0]):
                phi_x_prime_hat_batch = K_us[action_index] @ phi_x_batch # (phi_dim, batch_size)
                phi_x_prime_batch[action_index] = phi_x_prime_hat_batch
                V_x_prime_batch[action_index] = self.critic.w.T @ torch.Tensor(phi_x_prime_batch[action_index]) # (1, batch_size)

            # Compute policy distribution
            inner_pi_us_values = -(costs + self.gamma*V_x_prime_batch) # (all_actions.shape[1], batch_size)
            inner_pi_us = inner_pi_us_values / self.regularization_lambda # (all_actions.shape[1], batch_size)
            real_inner_pi_us = torch.real(inner_pi_us) # (all_actions.shape[1], batch_size)

            # Max trick
            max_inner_pi_u = torch.amax(real_inner_pi_us, axis=0) # (batch_size,)
            diff = real_inner_pi_us - max_inner_pi_u # (all_actions.shape[1], batch_size)

            # Softmax distribution
            pi_us = torch.exp(diff) + epsilon # (all_actions.shape[1], batch_size)
            Z_x = torch.sum(pi_us, axis=0) # (batch_size,)
            pis_response = pi_us / Z_x # (all_actions.shape[1], batch_size)

            # Compute log pi
            log_pis = torch.log(pis_response) # (all_actions.shape[1], batch_size)

            # Compute expectations
            expectation_term_1 = torch.sum(
                (costs + \
                 self.regularization_lambda*log_pis + \
                 self.gamma*V_x_prime_batch) * pis_response,
                dim=0
            ).reshape(1, -1) # (1, batch_size)

            # Optimize value function weights using OLS as in Lewis
            self.critic.w = torch.linalg.lstsq(
                # torch.Tensor(expected_phi_x_prime_batch.T),
                torch.Tensor(phi_x_batch.T),
                expectation_term_1.T
            ).solution

            # norms = np.linalg.norm(
            #     (Q_x_u_prime_batch - numpy_r_batch) - (self.critic.w.numpy().T @ expected_phi_x_prime_batch),
            #     axis=0
            # ) / np.linalg.norm(
            #     Q_x_u_prime_batch - numpy_r_batch,
            #     axis=0
            # ).mean()
            # print(norms.mean())

    def update_parameters(self, memory, batch_size, updates):
        # Sample a batch from memory
        state_batch, action_batch, log_prob_batch, reward_batch, next_state_batch, mask_batch = memory.sample(batch_size=batch_size)
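
For readers who want the value iteration update above in isolation, here is a self-contained sketch with synthetic stand-ins for the Koopman tensor, the discretized action set, and the cost function; the sizes, gamma, and lambda values are made up and w plays the role of self.critic.w over the dictionary features:

import numpy as np
import torch

# Made-up problem sizes standing in for the Koopman tensor quantities.
phi_dim, batch_size, num_actions = 10, 32, 7
gamma, regularization_lambda = 0.99, 1.0
epsilon = float(np.finfo(np.float32).eps)

phi_x = torch.rand(phi_dim, batch_size)           # dictionary features of sampled states
K_us = torch.rand(num_actions, phi_dim, phi_dim)  # one Koopman operator per candidate action
costs = torch.rand(num_actions, batch_size)       # cost of each action in each state
w = torch.zeros(phi_dim, 1)                       # current value-function weights

# V(x') for every candidate action: push phi(x) through K_u, then apply w.
V_x_prime = torch.stack([(w.T @ (K_us[a] @ phi_x))[0] for a in range(num_actions)])

# Softmax policy over actions with the max trick for numerical stability.
inner = -(costs + gamma * V_x_prime) / regularization_lambda
inner = inner - torch.amax(inner, dim=0)          # subtract the per-state maximum
pi = torch.exp(inner) + epsilon
pi = pi / pi.sum(dim=0)                           # (num_actions, batch_size)

# Bellman-style target: E_pi[ cost + lambda*log(pi) + gamma*V(x') ] for each state.
target = torch.sum((costs + regularization_lambda * torch.log(pi) + gamma * V_x_prime) * pi, dim=0)

# Fit w by least squares so that w^T phi(x) approximates the target.
w = torch.linalg.lstsq(phi_x.T, target.reshape(-1, 1)).solution
print(w.shape)  # torch.Size([10, 1])

In the actual method the fitted weights feed back into the next call through self.critic.w, so repeated calls amount to fitted value iteration over the dictionary features rather than a gradient-based critic update.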
