Merge pull request real-stanford#7 from columbia-ai-robotics/cchi/bug_fix_eval_sample

fixed bug where only n_envs samples of metrics are used
cheng-chi authored Jun 1, 2023
2 parents 27395b7 + 5e36d50 commit 74b6391
Showing 6 changed files with 54 additions and 6 deletions.
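
The change is the same in all six env runners: each runner evaluates n_inits initial conditions using len(self.env_fns) parallel environments, but the old aggregation loop only iterated over the first len(self.env_fns) rollouts when computing logged metrics. A minimal sketch of the before/after behavior (illustrative only; the function name, the mean_score key, and the toy data are assumptions, not the runner's actual code):

```python
import collections
import numpy as np

def aggregate_scores(all_rewards, env_prefixs, n_envs, fixed=True):
    # all_rewards:  one reward array per rollout (n_inits rollouts in total)
    # env_prefixs:  logging prefix (e.g. 'train/' or 'test/') for each rollout
    # n_envs:       number of parallel envs, i.e. len(self.env_fns) in the runner
    n_inits = len(all_rewards)
    max_rewards = collections.defaultdict(list)
    # buggy version stopped at n_envs; the fix iterates over all n_inits rollouts
    for i in range(n_inits if fixed else n_envs):
        max_rewards[env_prefixs[i]].append(np.max(all_rewards[i]))
    return {prefix + 'mean_score': np.mean(v) for prefix, v in max_rewards.items()}

# toy usage: 8 rollouts evaluated with only 4 parallel envs
rewards = [np.random.rand(5) for _ in range(8)]
prefixs = ['test/'] * 8
print(aggregate_scores(rewards, prefixs, n_envs=4, fixed=False))  # averages first 4 rollouts only
print(aggregate_scores(rewards, prefixs, n_envs=4, fixed=True))   # averages all 8 rollouts
```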
10 changes: 9 additions & 1 deletion diffusion_policy/env_runner/blockpush_lowdim_runner.py
@@ -235,7 +235,15 @@ def run(self, policy: BaseLowdimPolicy):
prefix_counts = collections.defaultdict(lambda : 0)

log_data = dict()
-for i in range(len(self.env_fns)):
+# results reported in the paper were generated using the commented-out line below,
+# which will only report and average metrics from the first n_envs initial conditions and seeds
+# fortunately this won't invalidate our conclusions since
+# 1. This bug only affects the variance of metrics, not their mean
+# 2. All baseline methods are evaluated using the same code
+# to completely reproduce the reported numbers, uncomment this line:
+# for i in range(len(self.env_fns)):
+# and comment out this line:
+for i in range(n_inits):
seed = self.env_seeds[i]
prefix = self.env_prefixs[i]
this_rewards = all_rewards[i]
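
The comment added above argues that the bug inflates only the variance of the reported metrics, not their expected value. A quick toy check of that claim under the assumption that rollout scores are iid (made-up numbers, not results from the paper):

```python
import numpy as np

rng = np.random.default_rng(0)
n_inits, n_envs, n_trials = 50, 25, 5000
buggy, fixed = [], []
for _ in range(n_trials):
    scores = rng.uniform(0.0, 1.0, size=n_inits)  # stand-in per-rollout max rewards
    buggy.append(scores[:n_envs].mean())          # old aggregation: first n_envs only
    fixed.append(scores.mean())                   # new aggregation: all n_inits
print(np.mean(buggy), np.mean(fixed))  # nearly identical expected value
print(np.std(buggy), np.std(fixed))    # buggy estimate has roughly sqrt(2)x the spread
```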
10 changes: 9 additions & 1 deletion diffusion_policy/env_runner/kitchen_lowdim_runner.py
@@ -279,7 +279,15 @@ def run(self, policy: BaseLowdimPolicy):
log_data = dict()
prefix_total_reward_map = collections.defaultdict(list)
prefix_n_completed_map = collections.defaultdict(list)
-for i in range(len(self.env_fns)):
+# results reported in the paper were generated using the commented-out line below,
+# which will only report and average metrics from the first n_envs initial conditions and seeds
+# fortunately this won't invalidate our conclusions since
+# 1. This bug only affects the variance of metrics, not their mean
+# 2. All baseline methods are evaluated using the same code
+# to completely reproduce the reported numbers, uncomment this line:
+# for i in range(len(self.env_fns)):
+# and comment out this line:
+for i in range(n_inits):
seed = self.env_seeds[i]
prefix = self.env_prefixs[i]
this_rewards = all_rewards[i]
10 changes: 9 additions & 1 deletion diffusion_policy/env_runner/pusht_image_runner.py
@@ -221,7 +221,15 @@ def run(self, policy: BaseImagePolicy):
# log
max_rewards = collections.defaultdict(list)
log_data = dict()
-for i in range(len(self.env_fns)):
+# results reported in the paper were generated using the commented-out line below,
+# which will only report and average metrics from the first n_envs initial conditions and seeds
+# fortunately this won't invalidate our conclusions since
+# 1. This bug only affects the variance of metrics, not their mean
+# 2. All baseline methods are evaluated using the same code
+# to completely reproduce the reported numbers, uncomment this line:
+# for i in range(len(self.env_fns)):
+# and comment out this line:
+for i in range(n_inits):
seed = self.env_seeds[i]
prefix = self.env_prefixs[i]
max_reward = np.max(all_rewards[i])
10 changes: 9 additions & 1 deletion diffusion_policy/env_runner/pusht_keypoints_runner.py
@@ -243,7 +243,15 @@ def run(self, policy: BaseLowdimPolicy):
# log
max_rewards = collections.defaultdict(list)
log_data = dict()
-for i in range(len(self.env_fns)):
+# results reported in the paper were generated using the commented-out line below,
+# which will only report and average metrics from the first n_envs initial conditions and seeds
+# fortunately this won't invalidate our conclusions since
+# 1. This bug only affects the variance of metrics, not their mean
+# 2. All baseline methods are evaluated using the same code
+# to completely reproduce the reported numbers, uncomment this line:
+# for i in range(len(self.env_fns)):
+# and comment out this line:
+for i in range(n_inits):
seed = self.env_seeds[i]
prefix = self.env_prefixs[i]
max_reward = np.max(all_rewards[i])
10 changes: 9 additions & 1 deletion diffusion_policy/env_runner/robomimic_image_runner.py
@@ -324,7 +324,15 @@ def run(self, policy: BaseImagePolicy):
# log
max_rewards = collections.defaultdict(list)
log_data = dict()
-for i in range(len(self.env_fns)):
+# results reported in the paper were generated using the commented-out line below,
+# which will only report and average metrics from the first n_envs initial conditions and seeds
+# fortunately this won't invalidate our conclusions since
+# 1. This bug only affects the variance of metrics, not their mean
+# 2. All baseline methods are evaluated using the same code
+# to completely reproduce the reported numbers, uncomment this line:
+# for i in range(len(self.env_fns)):
+# and comment out this line:
+for i in range(n_inits):
seed = self.env_seeds[i]
prefix = self.env_prefixs[i]
max_reward = np.max(all_rewards[i])
10 changes: 9 additions & 1 deletion diffusion_policy/env_runner/robomimic_lowdim_runner.py
@@ -317,7 +317,15 @@ def run(self, policy: BaseLowdimPolicy):
# log
max_rewards = collections.defaultdict(list)
log_data = dict()
-for i in range(len(self.env_fns)):
+# results reported in the paper were generated using the commented-out line below,
+# which will only report and average metrics from the first n_envs initial conditions and seeds
+# fortunately this won't invalidate our conclusions since
+# 1. This bug only affects the variance of metrics, not their mean
+# 2. All baseline methods are evaluated using the same code
+# to completely reproduce the reported numbers, uncomment this line:
+# for i in range(len(self.env_fns)):
+# and comment out this line:
+for i in range(n_inits):
seed = self.env_seeds[i]
prefix = self.env_prefixs[i]
max_reward = np.max(all_rewards[i])
