Debugging Multiagent algorithms. Added plan_on function to multiagent q_learner to run updates as a planner instead of learner.
danieldritter committed Jan 17, 2021
1 parent 5af13b6 commit 0fcab63
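
The plan_on function named in the commit message is not rendered in this view (only three of the seven changed files appear below). As a rough, hypothetical sketch of what "running updates as a planner instead of learner" could mean, with every helper and attribute marked below assumed rather than taken from the repository:

# Hypothetical sketch only; the actual plan_on body is not shown in this diff.
# A planner sweeps every state and joint action and applies the same update
# rule the learner uses, instead of stepping through sampled episodes.
def plan_on(self, problem, iterations=100):
    q_values = self.initialize_q_values(problem)      # assumed helper
    for _ in range(iterations):
        for state in problem.state_list:              # assumed attribute
            if problem.is_terminal(state):
                continue
            for joint_action in problem.joint_action_list:
                next_state = problem.next_state_dist(state, joint_action).sample()      # assumed API
                joint_rewards = problem.joint_rewards(state, joint_action, next_state)  # assumed API
                for agent in self.learning_agents:
                    q_target = self.update(agent, joint_action, q_values, joint_rewards,
                                           state, next_state, problem)
                    # Assumed blending; the base class may combine the update differently.
                    q_values[agent][state][joint_action] = (
                        (1 - self.lr) * q_values[agent][state][joint_action]
                        + self.lr * q_target)
    return q_values
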
Showing 7 changed files with 32,465 additions and 32,670 deletions.
64,823 changes: 32,243 additions & 32,580 deletions demos/MultiAgentQLearning.ipynb

Large diffs are not rendered by default.

146 changes: 96 additions & 50 deletions msdm/algorithms/correlatedq.py
@@ -9,7 +9,10 @@
from tqdm import tqdm
from cvxopt.modeling import op
from cvxopt.modeling import variable
from cvxopt import solvers
import cvxopt
import numpy as np
from scipy.optimize import linprog


class CorrelatedQLearner(TabularMultiAgentQLearner):
@@ -35,53 +38,97 @@ def __init__(self,learning_agents: Iterable,
else:
raise Exception("Please enter one of ['Utilitarian','Egalitarian','Republican','Libertarian'] for the objective_func parameter")
self.equilibrium_type = objective_func
self.curr_equilibrium = None
self.curr_equilibrium_policy = None

def utilitarian_Q(self,q_values,next_state,problem,agent_name=None):
q_matrix = np.zeros((len(self.all_agents),len(problem.joint_action_list)))
for agent_i,agent in enumerate(self.all_agents):
for ai, action in enumerate(problem.joint_action_list):
q_matrix[agent_i][ai] = q_values[agent][next_state][action]
q_matrix = np.sum(q_matrix,axis=0)
return q_matrix

def utilitarian_Q(self,agent_name,q_values,policy,curr_state,problem):
agents = []
agents.extend(self.learning_agents)
agents.extend(list(self.other_policies.keys()))
total_val = 0.0
for agent in agents:
for ai,action in enumerate(problem.joint_action_list):
total_val += policy[ai]*q_values[agent][curr_state][action]
# Negated so the LP solver will maximize instead of minimize this value
return -total_val

def egalitarian_Q(self,agent_name,q_values,policy,curr_state,problem):
agents = []
agents.extend(self.learning_agents)
agents.extend(list(self.other_policies.keys()))
agent_vals = []
for agent in agents:
agent_total = 0.0
for ai,action in enumerate(problem.joint_action_list):
agent_total += policy[ai]*q_values[agent][curr_state][action]
agent_vals.append(agent_total)
# Negated so the LP solver will maximize instead of minimize this value
return -cvxopt.modeling.min(agent_vals)
def egalitarian_Q(self,q_values,next_state,problem,agent_name=None):
q_matrix = np.zeros((len(self.all_agents),len(problem.joint_action_list)))
for agent_i,agent in enumerate(self.all_agents):
for ai, action in enumerate(problem.joint_action_list):
q_matrix[agent_i][ai] = q_values[agent][next_state][action]
return np.amin(q_matrix,axis=0)

def republican_Q(self,agent_name,q_values,policy,curr_state,problem):
agents = []
agents.extend(self.learning_agents)
agents.extend(list(self.other_policies.keys()))
agent_vals = []
for agent in agents:
agent_total = 0.0
# subtracting the values to allow for the min function at the end (max throws an error in the LP)
for ai,action in enumerate(problem.joint_action_list):
agent_total -= policy[ai]*q_values[agent][curr_state][action]
agent_vals.append(agent_total)
# Negated so the LP solver will maximize instead of minimize this value
return -cvxopt.modeling.min(agent_vals)
def republican_Q(self,q_values,next_state,problem,agent_name=None):
q_matrix = np.zeros((len(self.all_agents),len(problem.joint_action_list)))
for agent_i,agent in enumerate(self.all_agents):
for ai, action in enumerate(problem.joint_action_list):
q_matrix[agent_i][ai] = q_values[agent][next_state][action]
return np.amax(q_matrix,axis=0)

def libertarian_Q(self,agent_name,q_values,policy,curr_state,problem):
agent_total = 0.0
for ai,action in enumerate(problem.joint_action_list):
agent_total += policy[ai]*q_values[agent_name][curr_state][action]
# Negated so the LP solver will maximize instead of minimize this value
return -agent_total
def libertarian_Q(self,q_values,next_state,problem,agent_name):
q_matrix = np.zeros((len(problem.joint_action_list)))
for ai, action in enumerate(problem.joint_action_list):
q_matrix[ai] = q_values[agent_name][next_state][action]
return q_matrix
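
# Taken together, each helper above produces one objective coefficient per joint
# action a at the next state s':
#   utilitarian:  sum_i Q_i(s', a)      egalitarian:  min_i Q_i(s', a)
#   republican:   max_i Q_i(s', a)      libertarian:  Q_j(s', a) for a single agent j
# The equilibrium LP below then maximizes sum_a sigma(a) * c_a over correlated
# policies sigma, subject to the no-deviation (correlated equilibrium) constraints.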

def update(self,agent_name,actions,q_values,joint_rewards,curr_state,next_state,problem):
if problem.is_terminal(next_state):
return self.lr*(joint_rewards[agent_name])
if problem.is_terminal(curr_state):
return 0.0

if self.equilibrium_type != "Libertarian":
# compute equilibrium for first agent, then reuse for others
if agent_name == self.learning_agents[0]:
self.curr_equilibrium_policy,self.curr_equilibrium = self.cvxopt_equilibrium(q_values,next_state,problem)
expected_val = 0.0
for ai, action in enumerate(problem.joint_action_list):
q_val = q_values[agent_name][next_state][action]
expected_val += self.curr_equilibrium_policy[ai]*q_val
q_del = joint_rewards[agent_name] + self.dr*expected_val
return q_del
# Have to recompute the equilibrium separately for each agent for lCEQ
else:
self.curr_equilibrium_policy,self.curr_equilibrium = self.compute_equilibrium(q_values,next_state,problem,agent_name=agent_name)
expected_val = 0.0
for ai, action in enumerate(problem.joint_action_list):
q_val = q_values[agent_name][next_state][action]
expected_val += self.curr_equilibrium_policy[ai]*q_val
q_del = joint_rewards[agent_name] + self.dr*expected_val
return q_del

def compute_equilibrium(self,q_values,next_state,problem,agent_name=None):
next_actions = problem.joint_actions(next_state)
joint_actions = problem.joint_action_list
# Assumes all agents have same action space
individual_actions = list(next_actions[self.all_agents[0]])
num_variables = len(joint_actions)
num_inequality_constraints = len(individual_actions)*len(self.all_agents)
num_equality_constraints = 1
A_ineq = np.zeros((num_inequality_constraints,len(joint_actions)))
for ai,action in enumerate(joint_actions):
curr_index = 0
for agent_i,agent in enumerate(self.all_agents):
altered_action = copy.deepcopy(action)
q_val = q_values[agent][next_state][action]
for iai,indiv_action in enumerate(individual_actions):
if indiv_action == action[agent]:
curr_index +=1
continue
else:
altered_action[agent] = indiv_action
altered_q_val = q_values[agent][next_state][altered_action]
constraint_val = (q_val - altered_q_val)
A_ineq[curr_index,ai] = constraint_val
curr_index += 1

A_eq = np.ones((1,num_variables))
b_ineq = np.zeros((num_inequality_constraints))
c = self.objective_func(q_values,next_state,problem,agent_name)
lp = linprog(-c,A_ub=-A_ineq,b_ub=b_ineq,A_eq=A_eq,b_eq=1)
equilibrium = -1*lp.fun
policy = lp.x
return policy,equilibrium
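
# The sign flips above follow scipy.optimize.linprog conventions: linprog always
# minimizes c @ x subject to A_ub @ x <= b_ub, so the objective is negated to
# maximize and A_ineq is negated to express the ">= 0" incentive constraints.
# A standalone illustration of that pattern with arbitrary toy numbers:
import numpy as np
from scipy.optimize import linprog

c = np.array([1.0, 2.0, 0.5])          # coefficients to maximize
A = np.array([[1.0, -1.0, 0.0],
              [0.0, 1.0, -1.0]])       # want A @ x >= 0
res = linprog(-c,                                    # negate: linprog minimizes
              A_ub=-A, b_ub=np.zeros(2),             # -A @ x <= 0  <=>  A @ x >= 0
              A_eq=np.ones((1, 3)), b_eq=[1.0])      # x sums to 1; default bounds keep x >= 0
print(res.x, -res.fun)                 # maximizing distribution and maximized value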

def cvxopt_equilibrium(self,q_values,next_state,problem,agent_name=None):
cvxopt.solvers.options['show_progress'] = False
next_actions = problem.joint_actions(next_state)
joint_actions = problem.joint_action_list
@@ -94,10 +141,10 @@ def update(self,agent_name,actions,q_values,joint_rewards,curr_state,next_state,
for agent in action:
indiv_actions = list(separate_actions[agent])
altered_action = copy.deepcopy(action)
q_val = q_values[agent][curr_state][action]
q_val = q_values[agent][next_state][action]
for indiv_action in indiv_actions:
altered_action[agent] = indiv_action
altered_q_val = q_values[agent][curr_state][altered_action]
altered_q_val = q_values[agent][next_state][altered_action]
constraint_val = (q_val - altered_q_val)*policy_var
total_sum[agent] += constraint_val
for agent in total_sum:
@@ -108,11 +155,10 @@ def update(self,agent_name,actions,q_values,joint_rewards,curr_state,next_state,
constraints.append(sum_constraint)
constraints.append(non_negative)
constraints.append(less_than_one)
objective_value = self.objective_func(agent_name,q_values,policies,curr_state,problem)
lp = op(objective_value,constraints)
objective_value = cvxopt.matrix(self.objective_func(q_values,next_state,problem,agent_name))
objective_value = cvxopt.modeling.dot(objective_value,policies)
lp = op(-objective_value,constraints)
lp.solve()
equilibrium = float(lp.objective.value()[0])
q_del = self.lr*(joint_rewards[agent_name] + self.dr*equilibrium)
return q_del


policies = list(policies.value)
return policies,equilibrium
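
# The cvxopt path uses the same negate-to-maximize idiom, since cvxopt.modeling.op
# also minimizes its objective. A minimal standalone sketch of that pattern, with
# an arbitrary coefficient vector standing in for the objective_func output:
import cvxopt
from cvxopt.modeling import variable, op

cvxopt.solvers.options['show_progress'] = False
coeffs = cvxopt.matrix([1.0, 2.0, 0.5])
p = variable(3, "policy")
constraints = [(cvxopt.modeling.sum(p) == 1.0), (p >= 0.0), (p <= 1.0)]
lp = op(-cvxopt.modeling.dot(coeffs, p), constraints)   # op minimizes, so negate
lp.solve()
maximized_value = -float(lp.objective.value()[0])
policy = list(p.value)                                  # solution as plain floats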
22 changes: 13 additions & 9 deletions msdm/algorithms/friendfoeq.py
@@ -25,8 +25,11 @@ def __init__(self,learning_agents: Iterable,
alg_name=alg_name,render=render,render_from=render_from)
self.friends = friends
self.foes = foes
self.equilibria = []

def update(self,agent_name,actions,q_values,joint_rewards,curr_state,next_state,problem):
if problem.is_terminal(next_state):
return self.lr*joint_rewards[agent_name]
# Pure friend-Q case:
if len(self.foes[agent_name]) == 0:
ffq_equilibrium = max(q_values[agent_name][next_state].items(),key=lambda x:x[1])[1]
@@ -58,21 +61,22 @@ def update(self,agent_name,actions,q_values,joint_rewards,curr_state,next_state,
joint_action[friend] = action[friend_order[friend]]
for foe in foe_order:
joint_action[foe] = foe_action[foe_order[foe]]
payoff_matrix[i,j] = q_vals[joint_action]

payoff_matrix[i][j] = q_vals[joint_action]
cvxopt.solvers.options['show_progress'] = False
payoff_matrix = cvxopt.matrix(payoff_matrix)
# pi = variable(len(friendly_actions),"policy")
pi = variable(len(foe_actions),"policy")
cvx_payoff_matrix = cvxopt.matrix(payoff_matrix.T)
pi = variable(len(friendly_actions),"policy")
c1 = (cvxopt.modeling.sum(pi) == 1.0)
c2 = (pi >= 0.0)
c4 = (pi <= 1.0)
minimax_value = variable(1,"minimax_value")
c3 = (minimax_value >= payoff_matrix*pi)
c3 = (minimax_value >= cvx_payoff_matrix*pi)
constraints = [c1,c2,c3,c4]
lp = op(minimax_value,constraints)
lp = op(-minimax_value,constraints)
lp.solve()
ffq_equilibrium = float(lp.objective.value()[0])
q_del = self.lr*(joint_rewards[agent_name] + self.dr*ffq_equilibrium)
policy = np.array(pi.value)
expected_val = np.amin(np.dot(payoff_matrix.T,policy))
ffq_equilibrium = expected_val
self.equilibria.append((ffq_equilibrium,next_state))
q_del = (joint_rewards[agent_name] + self.dr*ffq_equilibrium)
return q_del
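
# The rewritten ending reads the security value directly off the payoff matrix:
# the learner's mixed policy is evaluated against every foe action and the worst
# case is taken. A small standalone check of that formula on rock-paper-scissors
# payoffs (toy numbers, not from this repository):
import numpy as np

payoff_matrix = np.array([[ 0.0, -1.0,  1.0],
                          [ 1.0,  0.0, -1.0],
                          [-1.0,  1.0,  0.0]])   # row player's payoffs
policy = np.array([1/3, 1/3, 1/3])               # uniform play is minimax-optimal here
expected_val = np.amin(np.dot(payoff_matrix.T, policy))  # worst case over foe actions
print(expected_val)                              # 0.0, the value of the game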


