Skip to content

Commit

Permalink
🔧 Add scale fix arg to configs
Browse files · Browse the repository at this point in the history
  • Loading branch information
JohnGiorgi committed May 20, 2021
1 parent cebf234 commit 7eabbc5
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 0 deletions.
6 changes: 6 additions & 0 deletions training_config/contrastive_only.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@ local min_length = 32;
"type": "nt_xent",
"temperature": 0.05,
},
// There was a small bug in the original implementation that caused gradients derived from
// the contrastive loss to be scaled by 1/N, where N is the number of GPUs used during
// training. This has been fixed. To reproduce results from the paper, set this to false.
// Note that this will have no effect if you are not using distributed training with more
// than 1 GPU.
"scale_fix": false
},
"data_loader": {
"batch_size": 4,
Expand Down
6 changes: 6 additions & 0 deletions training_config/declutr.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@ local min_length = 32;
"type": "nt_xent",
"temperature": 0.05,
},
// There was a small bug in the original implementation that caused gradients derived from
// the contrastive loss to be scaled by 1/N, where N is the number of GPUs used during
// training. This has been fixed. To reproduce results from the paper, set this to false.
// Note that this will have no effect if you are not using distributed training with more
// than 1 GPU.
"scale_fix": false
},
"data_loader": {
"batch_size": 4,
Expand Down
6 changes: 6 additions & 0 deletions training_config/declutr_base.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@ local min_length = 32;
"type": "nt_xent",
"temperature": 0.05,
},
// There was a small bug in the original implementation that caused gradients derived from
// the contrastive loss to be scaled by 1/N, where N is the number of GPUs used during
// training. This has been fixed. To reproduce results from the paper, set this to false.
// Note that this will have no effect if you are not using distributed training with more
// than 1 GPU.
"scale_fix": false
},
"data_loader": {
"batch_size": 4,
Expand Down
6 changes: 6 additions & 0 deletions training_config/declutr_small.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@ local min_length = 32;
"type": "nt_xent",
"temperature": 0.05,
},
// There was a small bug in the original implementation that caused gradients derived from
// the contrastive loss to be scaled by 1/N, where N is the number of GPUs used during
// training. This has been fixed. To reproduce results from the paper, set this to false.
// Note that this will have no effect if you are not using distributed training with more
// than 1 GPU.
"scale_fix": false
},
"data_loader": {
"batch_size": 4,
Expand Down
6 changes: 6 additions & 0 deletions training_config/mlm_only.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ local min_length = 32;
},
},
},
// There was a small bug in the original implementation that caused gradients derived from
// the contrastive loss to be scaled by 1/N, where N is the number of GPUs used during
// training. This has been fixed. To reproduce results from the paper, set this to false.
// Note that this will have no effect if you are not using distributed training with more
// than 1 GPU.
"scale_fix": false
},
"data_loader": {
"batch_size": 4,
Expand Down

0 comments on commit 7eabbc5

Please sign in to comment.