Commit

Folder and files for unichem plugin
polyg314 committed Jul 28, 2020
1 parent addbedb commit f924c2a
Showing 8 changed files with 778 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/hub/dataload/sources/unichem/__init__.py
@@ -0,0 +1,2 @@
from .dump import Unichem_biothings_sdkDumper
from .upload import Unichem_biothings_sdkUploader
49 changes: 49 additions & 0 deletions src/hub/dataload/sources/unichem/csvsort/PKG-INFO
@@ -0,0 +1,49 @@
Metadata-Version: 1.0
Name: csvsort
Version: 1.6.1
Summary: Sort large CSV files on disk rather than in memory
Home-page: https://github.com/richardpenman/csvsort
Author: Richard Penman
Author-email: [email protected]
License: lgpl
Description: ========
CSV Sort
========

For sorting CSV files on disk that do not fit into memory. The merge sort algorithm is used to break up the original file into smaller chunks, sort these in memory, and then merge these sorted files.


=============
Example usage
=============

.. sourcecode:: python

    >>> from csvsort import csvsort
    >>> # sort this CSV on the 5th and 3rd columns (columns are 0 indexed)
    >>> csvsort('test1.csv', [4,2])
    >>> # sort this CSV with no header on 4th column and save results to separate file
    >>> csvsort('test2.csv', [3], output_filename='test3.csv', has_header=False)
    >>> # sort this TSV on the first column and use a maximum of 10MB per split
    >>> csvsort('test3.tsv', [0], max_size=10, delimiter='\t')
    >>> # sort this CSV on the first column and force quotes around every field (default is csv.QUOTE_MINIMAL)
    >>> import csv
    >>> csvsort('test4.csv', [0], quoting=csv.QUOTE_ALL)

..


=======
Install
=======

Supports python 2 & 3:

.. sourcecode:: bash

    $ pip install csvsort
    $ pip3 install csvsort

..

Platform: UNKNOWN
39 changes: 39 additions & 0 deletions src/hub/dataload/sources/unichem/csvsort/README.rst
@@ -0,0 +1,39 @@
========
CSV Sort
========

For sorting CSV files on disk that do not fit into memory. The merge sort algorithm is used to break up the original file into smaller chunks, sort these in memory, and then merge these sorted files.


=============
Example usage
=============

.. sourcecode:: python

    >>> from csvsort import csvsort
    >>> # sort this CSV on the 5th and 3rd columns (columns are 0 indexed)
    >>> csvsort('test1.csv', [4,2])
    >>> # sort this CSV with no header on 4th column and save results to separate file
    >>> csvsort('test2.csv', [3], output_filename='test3.csv', has_header=False)
    >>> # sort this TSV on the first column and use a maximum of 10MB per split
    >>> csvsort('test3.tsv', [0], max_size=10, delimiter='\t')
    >>> # sort this CSV on the first column and force quotes around every field (default is csv.QUOTE_MINIMAL)
    >>> import csv
    >>> csvsort('test4.csv', [0], quoting=csv.QUOTE_ALL)

..

=======
Install
=======

Supports python 2 & 3:

.. sourcecode:: bash

    $ pip install csvsort
    $ pip3 install csvsort

..
236 changes: 236 additions & 0 deletions src/hub/dataload/sources/unichem/csvsort/__init__.py
@@ -0,0 +1,236 @@
# -*- coding: utf-8 -*-

import csv, heapq, logging, multiprocessing, os, sys, tempfile
if sys.version_info.major == 2:
    from io import open
from optparse import OptionParser
csv.field_size_limit(2**30)  # can't use sys.maxsize because of Windows error


class CsvSortError(Exception):
    pass


def csvsort(input_filename,
            columns,
            output_filename=None,
            max_size=100,
            has_header=True,
            delimiter=',',
            show_progress=False,
            parallel=True,
            quoting=csv.QUOTE_MINIMAL,
            encoding=None,
            numeric_column=False):
    """Sort the CSV file on disk rather than in memory.

    The merge sort algorithm is used to break the file into smaller sub files.

    Args:
        input_filename: the CSV filename to sort.
        columns: a list of columns to sort on (can be 0 based indices or
            header keys).
        output_filename: optional filename for the sorted file. If not given
            then the input file will be overwritten.
        max_size: the maximum size (in MB) of CSV file to load in memory at
            once.
        has_header: whether the CSV contains a header to keep separate from
            the sorting.
        delimiter: character used to separate fields, default ','.
        show_progress (Boolean): a flag for whether or not to show progress.
            The default is False, which does not print any merge information.
        parallel: whether to sort the splits across multiple processes.
            Default is True.
        quoting: how much quoting is needed in the final CSV file. Default is
            csv.QUOTE_MINIMAL.
        encoding: the name of the encoding to use when opening or writing the
            CSV files. Default is None, which uses the system default.
        numeric_column: whether sort-column values should be compared as
            integers rather than strings. Default is False.
    """

    with open(input_filename, newline='', encoding=encoding) as input_fp:
        reader = csv.reader(input_fp, delimiter=delimiter)
        if has_header:
            header = next(reader)
        else:
            header = None

        columns = parse_columns(columns, header)

        filenames = csvsplit(reader, max_size)
        if show_progress:
            logging.info('Merging %d splits' % len(filenames))

        if parallel:
            # note: using Pool as a context manager requires Python 3
            concurrency = multiprocessing.cpu_count()
            with multiprocessing.Pool(processes=concurrency) as pool:
                map_args = [(filename, columns, numeric_column, encoding)
                            for filename in filenames]
                pool.starmap(memorysort, map_args)
        else:
            for filename in filenames:
                memorysort(filename, columns, numeric_column, encoding)
        sorted_filename = mergesort(filenames, columns, numeric_column,
                                    encoding=encoding)

    # XXX make more efficient by passing quoting, delimiter, and moving result
    # generate the final output file
    with open(output_filename or input_filename, 'w', newline='',
              encoding=encoding) as output_fp:
        writer = csv.writer(output_fp, delimiter=delimiter, quoting=quoting)
        if header:
            writer.writerow(header)
        with open(sorted_filename, newline='', encoding=encoding) as sorted_fp:
            for row in csv.reader(sorted_fp):
                writer.writerow(row)

    os.remove(sorted_filename)
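
# In outline: csvsplit() streams the input into temporary files of at most
# max_size MB, memorysort() sorts each split in place (across CPU cores when
# parallel=True), and mergesort() nway-merges the sorted splits with
# heapq.merge until a single sorted file remains.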


def parse_columns(columns, header):
    """Check the provided column indices/names and convert names to indices.
    """
    for i, column in enumerate(columns):
        if isinstance(column, int):
            if header:
                if column >= len(header):
                    raise CsvSortError(
                        'Column index is out of range: "{}"'.format(column))
        else:
            # find index of column from header
            if header is None:
                raise CsvSortError(
                    'CSV needs a header to find index of this column name:' +
                    ' "{}"'.format(column))
            else:
                if column in header:
                    columns[i] = header.index(column)
                else:
                    raise CsvSortError(
                        'Column name is not in header: "{}"'.format(column))
    return columns


def csvsplit(reader, max_size):
    """Split into smaller CSV files of maximum size and return the filenames.
    """
    max_size = max_size * 1024 * 1024  # convert to bytes
    writer = None
    current_size = 0
    split_filenames = []

    # break CSV file into smaller merge files
    for row in reader:
        if writer is None:
            ntf = tempfile.NamedTemporaryFile(delete=False, mode='w')
            writer = csv.writer(ntf)
            split_filenames.append(ntf.name)

        writer.writerow(row)
        # sys.getsizeof measures the in-memory row object, not bytes on disk,
        # so max_size is only an approximate bound on the split file size
        current_size += sys.getsizeof(row)
        if current_size > max_size:
            writer = None
            current_size = 0
    return split_filenames


def memorysort(filename, columns, numeric_column, encoding=None):
    """Sort this CSV file in memory on the given columns.
    """
    with open(filename, newline='', encoding=encoding) as input_fp:
        rows = [row for row in csv.reader(input_fp) if row]

    rows.sort(key=lambda row: get_key(row, columns, numeric_column))
    with open(filename, 'w', newline='', encoding=encoding) as output_fp:
        writer = csv.writer(output_fp)
        for row in rows:
            writer.writerow(row)


def get_key(row, columns, numeric_column):
    """Get the sort key for this row.
    """
    if numeric_column:
        return [int(row[column]) for column in columns]
    else:
        return [row[column] for column in columns]


def decorated_csv(filename, columns, numeric_column, encoding=None):
    """Iterate the rows of this sorted CSV file as (sort key, row) pairs.
    """
    with open(filename, newline='', encoding=encoding) as fp:
        for row in csv.reader(fp):
            yield get_key(row, columns, numeric_column), row
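
# heapq.merge (used in mergesort below) consumes these (key, row) pairs
# lazily, so the merge phase holds only one pending row per split file in
# memory at a time.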


def mergesort(sorted_filenames, columns, numeric_column, nway=2, encoding=None):
    """Merge the sorted CSV files, nway at a time, into a single output file.
    """
    merge_n = 0
    while len(sorted_filenames) > 1:
        merge_filenames, sorted_filenames = \
            sorted_filenames[:nway], sorted_filenames[nway:]

        with tempfile.NamedTemporaryFile(delete=False, mode='w') as output_fp:
            writer = csv.writer(output_fp)
            merge_n += 1
            for _, row in heapq.merge(*[decorated_csv(filename, columns,
                                                      numeric_column, encoding)
                                        for filename in merge_filenames]):
                writer.writerow(row)

        sorted_filenames.append(output_fp.name)

        for filename in merge_filenames:
            os.remove(filename)
    return sorted_filenames[0]


def main():
    parser = OptionParser()
    parser.add_option(
        '-c',
        '--column',
        dest='columns',
        action='append',
        help='column of CSV to sort on')
    parser.add_option(
        '-s',
        '--size',
        dest='max_size',
        type='float',
        default=100,
        help='maximum size of each split CSV file in MB (default 100)')
    parser.add_option(
        '-n',
        '--no-header',
        dest='has_header',
        action='store_false',
        default=True,
        help='set CSV file has no header')
    parser.add_option(
        '-d',
        '--delimiter',
        default=',',
        help='set CSV delimiter (default ",")')
    parser.add_option(
        '-e',
        '--encoding',
        default=None,
        help='character encoding (eg utf-8) to use when reading/writing files (default uses system default)')
    args, input_files = parser.parse_args()

    if not input_files:
        parser.error('What CSV file should be sorted?')
    elif not args.columns:
        parser.error('Which columns should be sorted on?')
    else:
        # interpret backslash escapes such as '\t' in the delimiter;
        # str has no .decode in Python 3, so branch on the version
        if sys.version_info.major == 2:
            args.delimiter = args.delimiter.decode('string_escape')
        else:
            args.delimiter = args.delimiter.encode().decode('unicode_escape')
        args.columns = [int(column) if column.isdigit() else column
                        for column in args.columns]
        csvsort(
            input_files[0],
            columns=args.columns,
            max_size=args.max_size,
            has_header=args.has_header,
            delimiter=args.delimiter,
            encoding=args.encoding)


if __name__ == '__main__':
    main()
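
A quick usage sketch of the vendored module above (hedged: the file name and
data are invented for illustration; numeric_column and parallel are simply the
keyword arguments visible in the csvsort() signature):

    import csv
    from csvsort import csvsort

    # build a tiny throwaway input file
    with open('example.csv', 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'name'])
        writer.writerows([['3', 'c'], ['1', 'a'], ['2', 'b']])

    # sort in place, comparing the first column as integers;
    # parallel=False keeps the demo in a single process
    csvsort('example.csv', [0], numeric_column=True, parallel=False)

    with open('example.csv', newline='') as fp:
        print(list(csv.reader(fp)))  # header row first, then ids 1, 2, 3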
18 changes: 18 additions & 0 deletions src/hub/dataload/sources/unichem/csvsort/setup.py
@@ -0,0 +1,18 @@
import os
from distutils.core import setup


def read(filename):
    return open(os.path.join(os.path.dirname(__file__), filename)).read()


setup(
    name='csvsort',
    version='1.6.1',
    packages=['csvsort'],
    package_dir={'csvsort': '.'},
    author='Richard Penman',
    author_email='[email protected]',
    description='Sort large CSV files on disk rather than in memory',
    long_description=read('README.rst'),
    url='https://github.com/richardpenman/csvsort',
    license='lgpl',
)