
Commit

Merge pull request karpathy#73 from danielgross/master
Use relative paths
karpathy authored Jan 20, 2023
2 parents 2c7806d + edb7a7e commit a6bffee
Showing 3 changed files with 7 additions and 6 deletions.
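
The change is the same in all three dataset preparation scripts: output paths that were previously resolved against the current working directory are now anchored to the directory containing the script itself, so each prepare.py can be run from anywhere in the repository. A minimal sketch of the pattern (extracted from the diff below, not the full scripts):

import os

# before: resolved against the current working directory, so running
# `python data/shakespeare/prepare.py` from the repo root would write
# train.bin into the repo root
filename = 'train.bin'

# after: anchored to the script's own directory via __file__, so the
# output always lands next to prepare.py
filename = os.path.join(os.path.dirname(__file__), 'train.bin')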
3 changes: 2 additions & 1 deletion data/openwebtext/prepare.py
@@ -1,6 +1,7 @@
 # saves the openwebtext dataset to a binary file for training. following was helpful:
 # https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py

+import os
 from tqdm import tqdm
 import numpy as np
 import tiktoken
@@ -50,7 +51,7 @@ def process(example):
 # concatenate all the ids in each dataset into one large file we can use for training
 for split, dset in tokenized.items():
     arr_len = np.sum(dset['len'])
-    filename = f'{split}.bin'
+    filename = os.path.join(os.path.dirname(__file__), f'{split}.bin')
     dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
     arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))

4 changes: 2 additions & 2 deletions data/shakespeare/prepare.py
@@ -25,8 +25,8 @@
 # export to bin files
 train_ids = np.array(train_ids, dtype=np.uint16)
 val_ids = np.array(val_ids, dtype=np.uint16)
-train_ids.tofile('train.bin')
-val_ids.tofile('val.bin')
+train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
+val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))

 # train.bin has 301,966 tokens
 # val.bin has 36,059 tokens
6 changes: 3 additions & 3 deletions data/shakespeare_char/prepare.py
@@ -47,16 +47,16 @@ def decode(l):
 # export to bin files
 train_ids = np.array(train_ids, dtype=np.uint16)
 val_ids = np.array(val_ids, dtype=np.uint16)
-train_ids.tofile('train.bin')
-val_ids.tofile('val.bin')
+train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
+val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))

 # save the meta information as well, to help us encode/decode later
 meta = {
     'vocab_size': vocab_size,
     'itos': itos,
     'stoi': stoi,
 }
-with open('meta.pkl', 'wb') as f:
+with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f:
     pickle.dump(meta, f)

 # length of dataset in characters: 1115394
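Note that when a prepare.py is invoked from its own directory, os.path.dirname(__file__) is the empty string and os.path.join('', 'train.bin') degenerates to 'train.bin', so behavior there is unchanged; the fix only matters when a script is run from elsewhere, e.g. python data/openwebtext/prepare.py from the repository root, which previously would have written the .bin files (and meta.pkl for shakespeare_char) into the current directory instead of next to the script.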
