Skip to content

Commit

Permalink
change default chunk size to equal context size
Browse files Browse the repository at this point in the history
  • Loading branch information
sid committed Dec 10, 2020
1 parent f512ae1 commit 225d983
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion data/create_tfrecords.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@
parser.add_argument("--minimum_size", type=int, default=100, help="Minimum size a document has to be to be included")
parser.add_argument("--no_ftfy", action="store_true", help="If set skips unicode normalization with ftfy")
parser.add_argument("--separator", nargs="+", type=int, default=[0], help="separator to place between files in chunk mode")
parser.add_argument("--chunk_size", type=int, default=2049, help="How big a chunk should be in chunk mode. Choose one more than the desired size.")
parser.add_argument("--chunk_size", type=int, default=2048, help="How big a chunk should be in chunk mode. "
"Should equal your model's context size")
parser.add_argument("--write_dataset_config", action="store_true", help="Write the dataset config file on completion")
parser.add_argument("--processes", type=int, default=0, help="Number of processes to use. Defaults to cpu count.")

Expand Down Expand Up @@ -205,6 +206,7 @@ def create_tfrecords_mp(files, args):
files = get_files(args.input_dir)
# --chunk_size is supplied as the model's context size. The data is shifted by
# one to the right to form the targets, so each stored chunk needs one extra
# token; that increment is applied here instead of being asked of the user.
args.chunk_size += 1
logging.warning(f"--chunk_size should equal your model's context size ({args.chunk_size - 1}); "
                f"one extra label token is added internally, so chunks are written with {args.chunk_size} tokens.")
# Respect the user's --processes value; 0 (the default) means "use every core".
# NOTE(review): a hard-coded `args.processes = 2` used to sit here and made the
# fallback below unreachable - it looked like a debugging leftover.
if args.processes == 0:
    args.processes = cpu_count()
Expand Down

0 comments on commit 225d983

Please sign in to comment.