# scripts/count_docs.py

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Count the number of documents and average number of lines and tokens per
document in a large file. Documents should be separated by a single empty line.
"""

import argparse
import gzip
import sys

import numpy as np


def main():
    """Print document count and average lines/tokens per document.

    Command-line arguments:
        input: path of the file to scan.
        --gzip: treat ``input`` as gzip-compressed.

    A document is a maximal run of consecutive non-blank lines; blank
    (whitespace-only) lines act as separators. The final document is counted
    even when the file does not end with a blank line, and leading, trailing
    or repeated blank lines do not create empty documents.

    Tokens are whitespace-separated fields of each non-blank line.
    Progress is reported on stderr (the line index every 1,000,000 lines and a
    dot every 100,000 lines); the summary goes to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input')
    parser.add_argument('--gzip', action='store_true')
    args = parser.parse_args()

    def gopen():
        # Open both variants in UTF-8 text mode so that lines are always
        # ``str`` and whitespace splitting behaves the same with or without
        # --gzip (binary mode would only split on ASCII whitespace).
        if args.gzip:
            return gzip.open(args.input, 'rt', encoding='utf-8')
        else:
            return open(args.input, 'r', encoding='utf-8')

    num_lines = []  # lines per completed document
    num_toks = []  # tokens per completed document
    num_lines_in_doc = 0
    num_toks_in_doc = 0
    with gopen() as h:
        for i, line in enumerate(h):
            toks = line.split()
            if toks:
                num_lines_in_doc += 1
                num_toks_in_doc += len(toks)
            elif num_lines_in_doc > 0:
                # Blank line closes the current document. Blank lines that
                # do not follow any content are ignored instead of being
                # recorded as zero-length documents.
                num_lines.append(num_lines_in_doc)
                num_toks.append(num_toks_in_doc)
                num_lines_in_doc = 0
                num_toks_in_doc = 0
            if i % 1000000 == 0:
                print(i, file=sys.stderr, end="", flush=True)
            elif i % 100000 == 0:
                print(".", file=sys.stderr, end="", flush=True)
        print(file=sys.stderr, flush=True)

    # The last document is usually not followed by a blank line; without this
    # flush it would be missing from the averages.
    if num_lines_in_doc > 0:
        num_lines.append(num_lines_in_doc)
        num_toks.append(num_toks_in_doc)

    # Derive the count from the recorded documents so that it always matches
    # the number of samples the averages are computed over.
    num_docs = len(num_lines)

    # np.mean([]) yields nan but also emits a RuntimeWarning; report nan
    # directly when the input contains no documents.
    avg_lines = np.mean(num_lines) if num_lines else float('nan')
    avg_toks = np.mean(num_toks) if num_toks else float('nan')

    print("found {} docs".format(num_docs))
    print("average num lines per doc: {}".format(avg_lines))
    print("average num toks per doc: {}".format(avg_toks))


# Run the counter only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()