Implement BoN for training and eval #528

Open. Wants to merge 40 commits into base branch main.

Commits (40):
f04446d - Implementing support for dense rewards (Dahoas, Jun 5, 2023)
13a01fc - added "num_return_sequences" param which corresponds to n in Best-of-… (SharathRaparthy, Jun 16, 2023)
5421a73 - updates to "num_return_sequences" param (SharathRaparthy, Jun 16, 2023)
2f3ac28 - BoN implementation (SharathRaparthy, Jun 16, 2023)
2f1dace - Changed back to default. (SharathRaparthy, Jun 19, 2023)
f58170d - TopK sampling instead of Top1 (SharathRaparthy, Jun 19, 2023)
be8bc1a - summed along dim=1 (SharathRaparthy, Jun 26, 2023)
608d812 - Generating samples in chunks (SharathRaparthy, Jun 26, 2023)
d8557e7 - added gen_chunk_size parameter (SharathRaparthy, Jun 26, 2023)
8ef9c36 - chunking in forward prop (SharathRaparthy, Jun 26, 2023)
4c1d82d - chunking generations in train and eval (SharathRaparthy, Jun 26, 2023)
ecd5107 - Implementing support for dense rewards (Dahoas, Jun 5, 2023)
4071604 - Fix distributed ref_mean, ref_var bug for dense rewards (Dahoas, Jun 15, 2023)
5f41413 - Make generation respect max seq length (Dahoas, Jun 23, 2023)
22ae83f - Make experience before first round of training (Dahoas, Jun 23, 2023)
7d0a4be - Refactoring .generate/.generate_eval (Dahoas, Jun 27, 2023)
b79dd19 - Fix BoN metric support (Dahoas, Jun 29, 2023)
cb49dc5 - Enforce chunk_size param for eval generation when present (Dahoas, Jul 3, 2023)
e290412 - Fix: Don't shuffle prompt dataset (Dahoas, Jul 4, 2023)
391d04c - Move inputs to device (Dahoas, Jul 18, 2023)
8de84e4 - Fix style (Dahoas, Jul 18, 2023)
3d7e0d5 - Fix chunked generation (Dahoas, Jul 21, 2023)
1fda0ce - fix(accelerate_base_trainer): order of keyword arguments (maxreciprocate, Jul 22, 2023)
4ac1707 - Merging main (Dahoas, Aug 7, 2023)
de3d854 - Merge branch 'BoN' of https://github.com/CarperAI/trlx into BoN (Dahoas, Aug 7, 2023)
3ce3c2b - Removing old example (Dahoas, Aug 7, 2023)
2635de5 - Fix: remove extraneous method args (Dahoas, Aug 7, 2023)
1be2c3c - Fix: Always set generate_experience_kwargs (Dahoas, Aug 7, 2023)
3cba0db - Fix: Remove mask from RunningMoments update call (Dahoas, Aug 7, 2023)
0cb91c4 - Fix: style (Dahoas, Aug 7, 2023)
cc92911 - Fix: rename 'gen_chunk_size' to 'chunk_size' (Dahoas, Aug 7, 2023)
4297f98 - Fix: generated samples padding (Dahoas, Aug 7, 2023)
36f06af - Remove prints (Dahoas, Aug 7, 2023)
a2980dd - Rename 'num_train_sequences' to 'num_topk_samples' (Dahoas, Aug 21, 2023)
3d5a639 - Address nits (Dahoas, Aug 21, 2023)
87837b6 - Fix: style (Dahoas, Aug 21, 2023)
ed93be8 - Set 'num_return_sequences' to 1 by default (Dahoas, Aug 21, 2023)
24925c8 - Fix: typo (Dahoas, Aug 21, 2023)
a022d3f - Merge branch 'main' into BoN (maxreciprocate, Sep 1, 2023)
9680c9f - Merge branch 'main' into bon-x (maxreciprocate, Sep 1, 2023)
Changes from 1 commit:
TopK sampling instead of Top1
SharathRaparthy authored and Dahoas committed Jul 18, 2023
commit f58170dc3022f1c21f7bd53c5c88882984240751
1 change: 1 addition & 0 deletions trlx/data/default_configs.py
@@ -50,6 +50,7 @@ def default_ppo_config():
ref_std=None,
cliprange_reward=10,
num_return_sequences=1,
+ num_train_sequences=1,
gen_kwargs=dict(
max_new_tokens=40,
top_k=0,
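For context, a minimal sketch of how these two fields could be set together when enabling Best-of-N style rollouts; the values used here (4 candidates per prompt, top 2 kept for training) are illustrative assumptions, not defaults from this PR:

from trlx.data.default_configs import default_ppo_config

# Start from the stock PPO config touched in this diff.
config = default_ppo_config()
# Generate 4 candidate completions per prompt (the "n" in Best-of-N).
config.method.num_return_sequences = 4
# Keep only the 2 highest-scoring candidates from each group of 4 for PPO training.
config.method.num_train_sequences = 2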
1 change: 1 addition & 0 deletions trlx/models/modeling_ppo.py
@@ -131,6 +131,7 @@ class PPOConfig(MethodConfig):
cliprange_reward: float
gen_kwargs: dict
num_return_sequences: int
+ num_train_sequences: int
gen_experience_kwargs: Optional[dict] = None

def get_advantages_and_returns(
21 changes: 9 additions & 12 deletions trlx/trainer/accelerate_ppo_trainer.py
@@ -320,10 +320,10 @@ def make_experience(self, num_rollouts: int = 1024, iter_count: int = 0): # noq
else:
scores = all_scores[0].clone().detach()
# Best-of-N Sampling.
- max_score_indices = self.get_max_indices(scores, self.config.method.num_return_sequences, device)
- scores = scores.index_select(0, max_score_indices)
- samples = samples.index_select(0, max_score_indices)
- prompt_tensors = prompt_tensors.index_select(0, max_score_indices)
+ train_indices = self.get_topk_indices(input_tensor=scores, window_size=self.config.method.num_return_sequences, k=self.config.method.num_train_sequences, device=device)
+ scores = scores.index_select(0, train_indices)
+ samples = samples.index_select(0, train_indices)
+ prompt_tensors = prompt_tensors.index_select(0, train_indices)

str_samples, str_prompts, str_outputs = self.decode(prompt_tensors, samples, append_eos_token=True)

@@ -514,14 +514,11 @@ def make_experience(self, num_rollouts: int = 1024, iter_count: int = 0): # noq
self.push_to_store(ppo_rl_elements)

@staticmethod
- def get_max_indices(input_tensor, window_size, device):
+ def get_topk_indices(input_tensor, window_size: int, k: int, device):
Review comment (Collaborator): Nit: maybe a docstring should be added specifying that this isn't the same as a regular topk but rather a topk over window_size.

# Use unfold to create the sliding windows
unfolded = input_tensor.unfold(0, window_size, window_size)

- # Find the max values and indices along the unfolded dimension
- values, indices = unfolded.max(dim=2)
+ # Find the topk values and indices along the unfolded dimension
+ _, indices = torch.topk(unfolded, k, dim=2)
# Adjust indices to be relative to original tensor
- indices += torch.arange(0, input_tensor.size(0) - window_size + 1, window_size).to(device).unsqueeze(1)
- return indices.squeeze()
+ indices = indices.squeeze(1) + torch.arange(0, input_tensor.size(0) - window_size + 1, window_size).to(device).unsqueeze(1)
+ return indices.reshape(-1)
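To illustrate the reviewer's point that this is a per-window top-k rather than a global one, here is a self-contained sketch of the helper as it reads after this commit, plus a toy usage example. The score values are made up, and the (batch, 1) column shape of the scores is an assumption inferred from the dim=2 top-k; the trainer may feed a differently shaped tensor.

import torch


def get_topk_indices(input_tensor, window_size: int, k: int, device):
    """Top-k per window of `window_size`, not a global top-k over the whole tensor."""
    # Split dim 0 into consecutive, non-overlapping windows of `window_size`
    # (one window per prompt, holding its num_return_sequences candidates).
    unfolded = input_tensor.unfold(0, window_size, window_size)
    # Top-k indices within each window (indices are relative to the window).
    _, indices = torch.topk(unfolded, k, dim=2)
    # Shift each window's indices by its starting offset in the original tensor.
    indices = indices.squeeze(1) + torch.arange(
        0, input_tensor.size(0) - window_size + 1, window_size
    ).to(device).unsqueeze(1)
    return indices.reshape(-1)


# Two prompts with 4 sampled completions each; keep the best 2 per prompt.
scores = torch.tensor([[0.1], [0.9], [0.5], [0.3], [0.7], [0.2], [0.8], [0.4]])
idx = get_topk_indices(scores, window_size=4, k=2, device="cpu")
print(idx)  # tensor([1, 2, 6, 4]): rows 1 and 2 win the first window, rows 6 and 4 the second

The flattened index list can then be passed to tensor.index_select(0, idx), which is how the make_experience hunk above keeps only the selected scores, samples, and prompts.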