Merge pull request #3 from Katsumata420/introduce-hf-datasets

Added cache_dir to Args
Katsumata420 · Dec 15, 2022 · 92b1f02
2 parents ddcd365 + 36ce7a1
commit 92b1f02
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 1 deletion.
diff --git a/blink/biencoder/train_biencoder.py b/blink/biencoder/train_biencoder.py
@@ -142,7 +142,10 @@ def main(params):
 
     # Load train data
     train_samples = load_dataset(
-        "json", data_files={"train": os.path.join(params["data_path"], "train.jsonl")}, streaming=False
+        "json",
+        data_files={"train": os.path.join(params["data_path"], "train.jsonl")},
+        streaming=False,
+        cache_dir=params["cache_dir"],
     )["train"]
     logger.info("Read %d train samples." % len(train_samples))
 

diff --git a/blink/common/params.py b/blink/common/params.py
@@ -163,6 +163,12 @@ def add_model_args(self, args=None):
             required=True,
             help="The output directory where generated output file (model, etc.) is to be dumped.",
         )
+        parser.add_argument(
+            "--cache_dir",
+            default=None,
+            type=str,
+            help="dataset cache dir. Default path may be /home/user/.cache/...",
+        )
 
 
     def add_training_args(self, args=None):