diff --git a/csvs_to_sqlite/cli.py b/csvs_to_sqlite/cli.py index 57b5d9d..6f9f29a 100644 --- a/csvs_to_sqlite/cli.py +++ b/csvs_to_sqlite/cli.py @@ -27,6 +27,7 @@ @click.argument('dbname', nargs=1) @click.option('--separator', '-s', default=',', help='Field separator in input .csv') @click.option('--quoting', '-q', default=0, help='Control field quoting behavior per csv.QUOTE_* constants. Use one of QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).') +@click.option('--skip-errors', is_flag=True, help='Skip lines with too many fields instead of stopping the import') @click.option('--replace-tables', is_flag=True, help='Replace tables if they already exist') @click.option('--extract-column', '-c', multiple=True, help=( "One or more columns to 'extract' into a separate lookup table. " @@ -45,7 +46,7 @@ "One or more columns to use to populate a full-text index" )) @click.version_option() -def cli(paths, dbname, separator, quoting, replace_tables, extract_column, fts): +def cli(paths, dbname, separator, quoting, skip_errors, replace_tables, extract_column, fts): """ PATHS: paths to individual .csv files or to directories containing .csvs @@ -72,7 +73,7 @@ def cli(paths, dbname, separator, quoting, replace_tables, extract_column, fts): csvs = csvs_from_paths(paths) for name, path in csvs.items(): try: - df = load_csv(path, separator, quoting) + df = load_csv(path, separator, skip_errors, quoting) df.table_name = name dataframes.append(df) except LoadCsvError as e: diff --git a/csvs_to_sqlite/utils.py b/csvs_to_sqlite/utils.py index 9668c1e..77303d4 100644 --- a/csvs_to_sqlite/utils.py +++ b/csvs_to_sqlite/utils.py @@ -10,11 +10,11 @@ class LoadCsvError(Exception): pass -def load_csv(filepath, separator, quoting, encodings_to_try=('utf8', 'latin-1')): +def load_csv(filepath, separator, skip_errors, quoting, encodings_to_try=('utf8', 'latin-1')): try: for encoding in encodings_to_try: try: - return pd.read_csv(filepath, sep=separator, quoting=quoting, low_memory=True, encoding=encoding) + return pd.read_csv(filepath, sep=separator, quoting=quoting, error_bad_lines=not skip_errors, low_memory=True, encoding=encoding) except UnicodeDecodeError: continue except pd.errors.ParserError as e: