forked from facebookresearch/fairseq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcount_docs.py
58 lines (49 loc) · 1.74 KB
/
count_docs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Count the number of documents and average number of lines and tokens per
document in a large file. Documents should be separated by a single empty line.
"""
import argparse
import gzip
import sys
import numpy as np
def main():
parser = argparse.ArgumentParser()
parser.add_argument('input')
parser.add_argument('--gzip', action='store_true')
args = parser.parse_args()
def gopen():
if args.gzip:
return gzip.open(args.input, 'r')
else:
return open(args.input, 'r', encoding='utf-8')
num_lines = []
num_toks = []
with gopen() as h:
num_docs = 1
num_lines_in_doc = 0
num_toks_in_doc = 0
for i, line in enumerate(h):
if len(line.strip()) == 0: # empty line indicates new document
num_docs += 1
num_lines.append(num_lines_in_doc)
num_toks.append(num_toks_in_doc)
num_lines_in_doc = 0
num_toks_in_doc = 0
else:
num_lines_in_doc += 1
num_toks_in_doc += len(line.rstrip().split())
if i % 1000000 == 0:
print(i, file=sys.stderr, end="", flush=True)
elif i % 100000 == 0:
print(".", file=sys.stderr, end="", flush=True)
print(file=sys.stderr, flush=True)
print("found {} docs".format(num_docs))
print("average num lines per doc: {}".format(np.mean(num_lines)))
print("average num toks per doc: {}".format(np.mean(num_toks)))
if __name__ == '__main__':
main()