Merge pull request #65 from rail-berkeley/gripper-penalty-update
Gripper penalty and blocking environment
jianlanluo authored Jun 25, 2024
2 parents 0187476 + d5dfaf0 commit 2f9f315
Showing 10 changed files with 57 additions and 19 deletions.
10 changes: 9 additions & 1 deletion README.md
@@ -21,7 +21,15 @@ SERL provides a set of libraries, env wrappers, and examples to train RL policies
- [Contribution](#contribution)
- [Citation](#citation)

## Major bug fix
## Major updates
#### June 24, 2024
For people who use SERL for tasks that involve controlling the gripper (e.g., picking up objects), we strongly recommend adding a small penalty for changes in the gripper action, as this greatly improves training speed.
For details, please refer to [PR #65](https://github.com/rail-berkeley/serl/pull/65).
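
The snippet below is only an illustration of the idea, not the code added in this PR (the actual change is in the `franka_env.py` diff further down); `reward_with_gripper_penalty` and `GRIPPER_PENALTY` are hypothetical names, and a sparse 0/1 success reward is assumed:

```python
GRIPPER_PENALTY = 0.1  # small relative to the sparse success reward of 1.0

def reward_with_gripper_penalty(goal_reached: bool, gripper_state_changed: bool) -> float:
    """Sparse success reward minus a small penalty when the gripper actually toggles."""
    reward = 1.0 if goal_reached else 0.0
    if gripper_state_changed:
        reward -= GRIPPER_PENALTY
    return reward
```

Penalizing only actual open/close transitions, rather than every gripper command, discourages the policy from toggling the gripper needlessly without punishing it for simply holding a grasp.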


We also recommend providing online interventions during training, in addition to loading offline demos. If you have a Franka robot and a SpaceMouse, this can be as easy as touching the SpaceMouse during training.
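
As a rough sketch of what online interventions can look like (SERL provides its own SpaceMouse intervention wrapper; the `read_spacemouse` helper and the wrapper below are hypothetical and assume a gymnasium-style environment):

```python
import numpy as np
import gymnasium as gym

class SpaceMouseInterventionSketch(gym.Wrapper):
    """Illustration only: override the policy action whenever the human
    moves the SpaceMouse, and record that an intervention happened."""

    def __init__(self, env, read_spacemouse):
        super().__init__(env)
        # Hypothetical callable returning the current 7-DoF expert action
        # (all zeros while the device is untouched).
        self._read_spacemouse = read_spacemouse

    def step(self, action):
        expert_action = self._read_spacemouse()
        intervened = np.linalg.norm(expert_action) > 1e-3
        if intervened:
            action = expert_action
        obs, rew, done, truncated, info = self.env.step(action)
        info["intervened"] = intervened
        info["action_taken"] = action
        return obs, rew, done, truncated, info
```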

#### April 25, 2024
We fixed a major issue in the intervention action frame. See release [v0.1.1](https://github.com/rail-berkeley/serl/releases/tag/v0.1.1). Please update your code to the main branch.

## Installation
@@ -21,7 +21,7 @@ class BinEnvConfig(DefaultEnvConfig):
]
)
RESET_POSE = TARGET_POSE + np.array([0.0, 0.0, 0.1, 0.0, 0.0, 0.0])
REWARD_THRESHOLD: np.ndarray = np.array([0.01, 0.01, 0.01, 0.2, 0.2, 0.2])
REWARD_THRESHOLD: np.ndarray = np.zeros(6)
ACTION_SCALE = np.array([0.05, 0.1, 1])
RANDOM_RESET = False
RANDOM_XY_RANGE = 0.1
3 changes: 2 additions & 1 deletion serl_robot_infra/franka_env/envs/cable_env/config.py
@@ -21,7 +21,8 @@ class CableEnvConfig(DefaultEnvConfig):
]
)
RESET_POSE = TARGET_POSE + np.array([0.0, 0.0, 0.1, 0.0, 0.0, 0.0])
REWARD_THRESHOLD: np.ndarray = np.array([0.01, 0.01, 0.01, 0.2, 0.2, 0.2])
REWARD_THRESHOLD: np.ndarray = np.zeros(6)
APPLY_GRIPPER_PENALTY = False
ACTION_SCALE = np.array([0.05, 0.3, 1])
RANDOM_RESET = True
RANDOM_XY_RANGE = 0.1
@@ -19,6 +19,7 @@ def go_to_rest(self, joint_reset=False):
Move to the rest position defined in base class.
Add a small z offset before going to rest to avoid collision with object.
"""
self._send_gripper_command(-1)
self._update_currpos()
self._send_pos_command(self.currpos)
time.sleep(0.5)
41 changes: 31 additions & 10 deletions serl_robot_infra/franka_env/envs/franka_env.py
@@ -59,6 +59,9 @@ class DefaultEnvConfig:
ABS_POSE_LIMIT_LOW = np.zeros((6,))
COMPLIANCE_PARAM: Dict[str, float] = {}
PRECISION_PARAM: Dict[str, float] = {}
BINARY_GRIPPER_THREASHOLD: float = 0.5
APPLY_GRIPPER_PENALTY: bool = True
GRIPPER_PENALTY: float = 0.1


##############################################################################
@@ -94,6 +97,7 @@ def __init__(
self.currjacobian = np.zeros((6, 7))

self.curr_gripper_pos = 0
self.gripper_binary_state = 0 # 0 for open, 1 for closed
self.lastsent = time.time()
self.randomreset = config.RANDOM_RESET
self.random_xy_range = config.RANDOM_XY_RANGE
@@ -201,7 +205,7 @@ def step(self, action: np.ndarray) -> tuple:

gripper_action = action[6] * self.action_scale[2]

self._send_gripper_command(gripper_action)
gripper_action_effective = self._send_gripper_command(gripper_action)
self._send_pos_command(self.clip_safety_box(self.nextpos))

self.curr_path_length += 1
@@ -210,11 +214,11 @@

self._update_currpos()
ob = self._get_obs()
reward = self.compute_reward(ob)
done = self.curr_path_length >= self.max_episode_length or reward
return ob, int(reward), done, False, {}
reward = self.compute_reward(ob, gripper_action_effective)
done = self.curr_path_length >= self.max_episode_length or reward == 1
return ob, reward, done, False, {}

def compute_reward(self, obs) -> bool:
def compute_reward(self, obs, gripper_action_effective) -> bool:
"""We are using a sparse reward function."""
current_pose = obs["state"]["tcp_pose"]
# convert from quat to euler first
@@ -223,10 +227,15 @@ def compute_reward(self, obs, gripper_action_effective) -> bool:
current_pose = np.hstack([current_pose[:3], euler_angles])
delta = np.abs(current_pose - self._TARGET_POSE)
if np.all(delta < self._REWARD_THRESHOLD):
return True
reward = 1
else:
# print(f'Goal not reached, the difference is {delta}, the desired threshold is {_REWARD_THRESHOLD}')
return False
reward = 0

if self.config.APPLY_GRIPPER_PENALTY and gripper_action_effective:
reward -= self.config.GRIPPER_PENALTY

return reward

def crop_image(self, name, image) -> np.ndarray:
"""Crop realsense images to be a square."""
@@ -379,12 +388,24 @@ def _send_pos_command(self, pos: np.ndarray):
def _send_gripper_command(self, pos: float, mode="binary"):
"""Internal function to send gripper command to the robot."""
if mode == "binary":
if (pos >= -1) and (pos <= -0.9): # close gripper
if (
pos <= -self.config.BINARY_GRIPPER_THREASHOLD
and self.gripper_binary_state == 0
): # close gripper
requests.post(self.url + "close_gripper")
elif (pos >= 0.9) and (pos <= 1): # open gripper
time.sleep(0.6)
self.gripper_binary_state = 1
return True
elif (
pos >= self.config.BINARY_GRIPPER_THREASHOLD
and self.gripper_binary_state == 1
): # open gripper
requests.post(self.url + "open_gripper")
time.sleep(0.6)
self.gripper_binary_state = 0
return True
else: # do nothing to the gripper
return
return False
elif mode == "continuous":
raise NotImplementedError("Continuous gripper control is optional")

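To recap the `franka_env.py` changes above: `APPLY_GRIPPER_PENALTY` and `GRIPPER_PENALTY` are new config fields, `_send_gripper_command` now returns `True` only when the binary gripper state actually toggles (the added `time.sleep(0.6)` after an open/close appears to be the "blocking" part of the commit title), and `compute_reward` subtracts the penalty whenever such a toggle occurred. A hypothetical task config (the class name and import path below are assumed from the file layout, not part of the diff) can opt out the way the pcb/peg/cable configs do:

```python
from franka_env.envs.franka_env import DefaultEnvConfig

class MyTaskConfig(DefaultEnvConfig):
    # Tasks that never use the gripper can disable the penalty entirely;
    # gripper tasks keep the default APPLY_GRIPPER_PENALTY = True.
    APPLY_GRIPPER_PENALTY = False
    GRIPPER_PENALTY = 0.1  # only applied when the flag above is True
```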
1 change: 1 addition & 0 deletions serl_robot_infra/franka_env/envs/pcb_env/config.py
@@ -22,6 +22,7 @@ class PCBEnvConfig(DefaultEnvConfig):
)
RESET_POSE = TARGET_POSE + np.array([0.0, 0.0, 0.04, 0.0, 0.0, 0.0])
REWARD_THRESHOLD = [0.003, 0.003, 0.001, 0.1, 0.1, 0.1]
APPLY_GRIPPER_PENALTY = False
ACTION_SCALE = (0.02, 0.2, 1)
RANDOM_RESET = True
RANDOM_XY_RANGE = 0.05
@@ -27,6 +27,7 @@ def go_to_rest(self, joint_reset=False):
Move to the rest position defined in base class.
Add a small z offset before going to rest to avoid collision with object.
"""
self._send_gripper_command(-1)
self._update_currpos()
self._send_pos_command(self.currpos)
time.sleep(0.5)
1 change: 1 addition & 0 deletions serl_robot_infra/franka_env/envs/peg_env/config.py
@@ -22,6 +22,7 @@ class PegEnvConfig(DefaultEnvConfig):
)
RESET_POSE = TARGET_POSE + np.array([0.0, 0.0, 0.1, 0.0, 0.0, 0.0])
REWARD_THRESHOLD: np.ndarray = np.array([0.01, 0.01, 0.01, 0.2, 0.2, 0.2])
APPLY_GRIPPER_PENALTY = False
ACTION_SCALE = np.array([0.02, 0.1, 1])
RANDOM_RESET = True
RANDOM_XY_RANGE = 0.05
@@ -18,6 +18,7 @@ def go_to_rest(self, joint_reset=False):
Move to the rest position defined in base class.
Add a small z offset before going to rest to avoid collision with object.
"""
self._send_gripper_command(-1)
self._update_currpos()
self._send_pos_command(self.currpos)
time.sleep(0.5)
15 changes: 9 additions & 6 deletions serl_robot_infra/franka_env/envs/wrappers.py
@@ -49,8 +49,9 @@ def compute_reward(self, obs):

def step(self, action):
obs, rew, done, truncated, info = self.env.step(action)
rew = self.compute_reward(self.env.get_front_cam_obs())
done = done or rew
success = self.compute_reward(self.env.get_front_cam_obs())
rew += success
done = done or success
return obs, rew, done, truncated, info


@@ -72,8 +73,9 @@ def compute_reward(self, obs):

def step(self, action):
obs, rew, done, truncated, info = self.env.step(action)
rew = self.compute_reward(self.env.get_front_cam_obs())
done = done or rew
success = self.compute_reward(self.env.get_front_cam_obs())
rew += success
done = done or success
return obs, rew, done, truncated, info


@@ -94,8 +96,9 @@ def compute_reward(self, obs):

def step(self, action):
obs, rew, done, truncated, info = self.env.step(action)
rew = self.compute_reward(obs)
done = done or rew
success = self.compute_reward(obs)
rew += success
done = done or success
return obs, rew, done, truncated, info


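The wrapper change above replaces `rew = ...` with `rew += ...` so the wrapper's success signal (e.g., from a camera-based reward classifier) is added on top of the base environment's reward instead of overwriting it, which preserves the gripper penalty introduced in `franka_env.py`. A toy illustration with made-up numbers:

```python
base_rew = -0.1           # base env reward: goal pose not reached, gripper toggled
success = 1               # binary success from the camera-based classifier
rew = base_rew + success  # 0.9 -- the penalty survives instead of being overwritten
```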
