Skip to content

Commit

Permalink
Fixed csv double quote escaping behavior. New behavior is as in csv s…
Browse files Browse the repository at this point in the history
…tandards.

Added two backward compatibility flags to allow returning to the (broken) 1.4.0 functionality if needed

Added tests to match

Quoting output properly will be done separately
  • Loading branch information
harelba committed Oct 25, 2014
1 parent 73fdaf9 commit ddf52cb
Show file tree
Hide file tree
Showing 2 changed files with 256 additions and 4 deletions.
15 changes: 13 additions & 2 deletions bin/q
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
#
# Run with --help for command line details
#
q_version = "1.4.1"
q_version = "1.5.0" # not released yet

import os
import sys
Expand Down Expand Up @@ -141,6 +141,10 @@ input_data_option_group.add_option("-c", "--column-count", dest="column_count",
help="Specific column count when using relaxed or strict mode")
input_data_option_group.add_option("-k", "--keep-leading-whitespace", dest="keep_leading_whitespace_in_values", default=False, action="store_true",
help="Keep leading whitespace in values. Default behavior strips leading whitespace off values, in order to provide out-of-the-box usability for simple use cases. If you need to preserve whitespace, use this flag.")
# Backward-compatibility switches for double-quote escaping inside quoted fields.
# NOTE: both dests are named "disable_*" but default to True and are flipped to
# False by the flag (action="store_false"), so a True value means the
# corresponding escaping style is still ENABLED.
input_data_option_group.add_option(
    "--disable-double-double-quoting",
    dest="disable_double_double_quoting",
    default=True,
    action="store_false",
    help="Disable support for double double-quoting for escaping the double quote character. By default, you can use \"\" inside double quoted fields to escape double quotes. Mainly for backward compatibility.")
input_data_option_group.add_option(
    "--disable-escaped-double-quoting",
    dest="disable_escaped_double_quoting",
    default=True,
    action="store_false",
    help="Disable support for escaped double-quoting for escaping the double quote character. By default, you can use \\\" inside double quoted fields to escape double quotes. Mainly for backward compatibility.")
parser.add_option_group(input_data_option_group)
#-----------------------------------------------
output_data_option_group = OptionGroup(parser,"Output Options")
Expand All @@ -155,6 +159,7 @@ output_data_option_group.add_option("-f", "--formatting", dest="formatting", def
help="Output-level formatting, in the format X=fmt,Y=fmt etc, where X,Y are output column numbers (e.g. 1 for first SELECT column etc.")
output_data_option_group.add_option("-E", "--output-encoding", dest="output_encoding", default=default_output_encoding,
help="Output encoding. Defaults to 'none', leading to selecting the system/terminal encoding")
# -M will be added here for supporting output quoting mode in the future
parser.add_option_group(output_data_option_group)
#-----------------------------------------------
query_option_group = OptionGroup(parser,"Query Related Options")
Expand Down Expand Up @@ -972,7 +977,13 @@ else:
skip_initial_space = True

q_dialect = {'skipinitialspace': skip_initial_space, 'quoting': 0,
'delimiter': options.delimiter, 'quotechar': '"', 'doublequote': False}
'delimiter': options.delimiter, 'quotechar': '"' }

# NOTE: despite their "disable_" names, both options default to True and are
# set to False only when the matching --disable-* flag is passed
# (action="store_false"). A truthy value therefore means "feature enabled".

# Treat "" inside a quoted field as one literal double quote unless
# --disable-double-double-quoting was given.
q_dialect['doublequote'] = options.disable_double_double_quoting

# Register backslash as the escape character (so \" yields a literal double
# quote) unless --disable-escaped-double-quoting was given.
if options.disable_escaped_double_quoting:
    q_dialect['escapechar'] = '\\'

csv.register_dialect('q', **q_dialect)
file_reading_method = 'csv'

Expand Down
245 changes: 243 additions & 2 deletions test/test-suite
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def run_command(cmd_to_run):
p = Popen(cmd_to_run, stdout=PIPE, stderr=PIPE, shell=True)
o, e = p.communicate()
# remove last newline
o = o.strip()
o = o.rstrip()
e = e.strip()
# split rows
if o != '':
Expand Down Expand Up @@ -72,6 +72,26 @@ sample_data_with_empty_string_no_header = "\n".join(
sample_data_with_header = header_row + "\n" + sample_data_no_header
sample_data_with_missing_header_names = "name,value1\n" + sample_data_no_header

# Space-delimited fixture with six columns (named by its first row) covering
# each quoting style: unquoted, plain double-quoted, ""-escaped, backslash-escaped,
# and the two escaped styles again with a newline embedded in the quoted value.
# The second and last rows are identical "control" rows surrounding the row
# under test. Because the literal is not a raw string, each \\" below is
# written to the file as a single backslash followed by a double quote.
sample_quoted_data = '''non_quoted regular_double_quoted double_double_quoted escaped_double_quoted multiline_double_double_quoted multiline_escaped_double_quoted
control-value-1 "control-value-2" control-value-3 "control-value-4" control-value-5 "control-value-6"
non-quoted-value "this is a quoted value" "this is a ""double double"" quoted value" "this is an escaped \\"quoted value\\"" "this is a double double quoted ""multiline
value""." "this is an escaped \\"multiline
value\\"."
control-value-1 "control-value-2" control-value-3 "control-value-4" control-value-5 "control-value-6"
'''

# Two-column fixture: a plain quoted value and a value that uses "" to escape
# double quotes. Used by the --disable-double-double-quoting test.
double_double_quoted_data = '''regular_double_quoted double_double_quoted
"this is a quoted value" "this is a quoted value with ""double double quotes"""
'''

# Two-column fixture: a plain quoted value and a value that uses a backslash to
# escape double quotes. Used by the --disable-escaped-double-quoting test.
escaped_double_quoted_data = '''regular_double_quoted escaped_double_quoted
"this is a quoted value" "this is a quoted value with \\"escaped double quotes\\""
'''

# Three-column fixture combining both escaping styles in one row. Used to check
# how many columns are detected for each combination of the --disable-* flags.
combined_quoted_data = '''regular_double_quoted double_double_quoted escaped_double_quoted
"this is a quoted value" "this is a quoted value with ""double double quotes""" "this is a quoted value with \\"escaped double quotes\\""
'''

# Values with leading whitespace
sample_data_rows_with_spaces = ['a,1,0', ' b, 2,0', 'c,,0']
sample_data_with_spaces_no_header = "\n".join(
Expand Down Expand Up @@ -715,8 +735,229 @@ class BasicTests(AbstractQTestCase):

self.assertTrue(e[0].startswith("Could not read query from file"))


def test_non_quoted_values_in_quoted_data(self):
    """Select column c1 (unquoted values) from the quoted sample data and check every row."""
    tmp_data_file = self.create_file_with_data(sample_quoted_data)
    try:
        cmd = '../bin/q -d " " "select c1 from %s"' % tmp_data_file.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 4)

        # assertEqual rather than assertTrue: assertTrue(value, msg) only checks
        # that value is truthy and uses its second argument as the failure
        # message, so it would never compare against the expected text.
        self.assertEqual(o[0], 'non_quoted')
        self.assertEqual(o[1], 'control-value-1')
        self.assertEqual(o[2], 'non-quoted-value')
        self.assertEqual(o[3], 'control-value-1')
    finally:
        # Remove the temporary data file even when an assertion fails.
        self.cleanup(tmp_data_file)

def test_regular_quoted_values_in_quoted_data(self):
    """Select column c2 (plain double-quoted values) from the quoted sample data and check every row."""
    tmp_data_file = self.create_file_with_data(sample_quoted_data)
    try:
        cmd = '../bin/q -d " " "select c2 from %s"' % tmp_data_file.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 4)

        # assertEqual rather than assertTrue: assertTrue(value, msg) only checks
        # that value is truthy and uses its second argument as the failure
        # message, so it would never compare against the expected text.
        self.assertEqual(o[0], 'regular_double_quoted')
        self.assertEqual(o[1], 'control-value-2')
        self.assertEqual(o[2], 'this is a quoted value')
        self.assertEqual(o[3], 'control-value-2')
    finally:
        # Remove the temporary data file even when an assertion fails.
        self.cleanup(tmp_data_file)

def test_double_double_quoted_values_in_quoted_data(self):
    """Select column c3 (values escaping quotes as "") from the quoted sample data and check every row."""
    tmp_data_file = self.create_file_with_data(sample_quoted_data)
    try:
        cmd = '../bin/q -d " " "select c3 from %s"' % tmp_data_file.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 4)

        # assertEqual rather than assertTrue: assertTrue(value, msg) only checks
        # that value is truthy and uses its second argument as the failure
        # message, so it would never compare against the expected text.
        self.assertEqual(o[0], 'double_double_quoted')
        self.assertEqual(o[1], 'control-value-3')
        self.assertEqual(o[2], 'this is a "double double" quoted value')
        self.assertEqual(o[3], 'control-value-3')
    finally:
        # Remove the temporary data file even when an assertion fails.
        self.cleanup(tmp_data_file)

def test_escaped_double_quoted_values_in_quoted_data(self):
    """Select column c4 (values escaping quotes with a backslash) from the quoted sample data and check every row."""
    tmp_data_file = self.create_file_with_data(sample_quoted_data)
    try:
        cmd = '../bin/q -d " " "select c4 from %s"' % tmp_data_file.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 4)

        # assertEqual rather than assertTrue: assertTrue(value, msg) only checks
        # that value is truthy and uses its second argument as the failure
        # message, so it would never compare against the expected text.
        self.assertEqual(o[0], 'escaped_double_quoted')
        self.assertEqual(o[1], 'control-value-4')
        self.assertEqual(o[2], 'this is an escaped "quoted value"')
        self.assertEqual(o[3], 'control-value-4')
    finally:
        # Remove the temporary data file even when an assertion fails.
        self.cleanup(tmp_data_file)

def test_multiline_double_double_quoted_values_in_quoted_data(self):
    """Select column c5 (a ""-escaped value spanning two lines) and check every row.

    The query replaces the embedded newline (X'0A') with '::' so that each
    result row stays on a single output line.
    """
    tmp_data_file = self.create_file_with_data(sample_quoted_data)
    try:
        # FIXME Need to convert \0a to proper encoding suitable for the person running the tests.
        cmd = '../bin/q -d " " "select replace(c5,X\'0A\',\'::\') from %s"' % tmp_data_file.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 4)

        # assertEqual rather than assertTrue: assertTrue(value, msg) only checks
        # that value is truthy and uses its second argument as the failure
        # message, so it would never compare against the expected text.
        self.assertEqual(o[0], 'multiline_double_double_quoted')
        self.assertEqual(o[1], 'control-value-5')
        # The newline is replaced by '::' in the query, so the expected text
        # contains '::' and not a newline character.
        # NOTE(review): assumes the fixture's continuation line starts with a
        # single space, as the expected value of the escaped multiline test
        # implies - confirm against the fixture's exact whitespace.
        self.assertEqual(o[2], 'this is a double double quoted "multiline:: value".')
        self.assertEqual(o[3], 'control-value-5')
    finally:
        # Remove the temporary data file even when an assertion fails.
        self.cleanup(tmp_data_file)

def test_multiline_escaped_double_quoted_values_in_quoted_data(self):
    """Select column c6 (a backslash-escaped value spanning two lines) and check every row.

    The query replaces the embedded newline (X'0A') with '::' so that each
    result row stays on a single output line.
    """
    tmp_data_file = self.create_file_with_data(sample_quoted_data)
    try:
        # FIXME Need to convert \0a to proper encoding suitable for the person running the tests.
        cmd = '../bin/q -d " " "select replace(c6,X\'0A\',\'::\') from %s"' % tmp_data_file.name
        retcode, o, e = run_command(cmd)

        self.assertEqual(retcode, 0)
        self.assertEqual(len(e), 0)
        self.assertEqual(len(o), 4)

        # assertEqual rather than assertTrue: assertTrue(value, msg) only checks
        # that value is truthy and uses its second argument as the failure
        # message, so it would never compare against the expected text.
        self.assertEqual(o[0], 'multiline_escaped_double_quoted')
        self.assertEqual(o[1], 'control-value-6')
        self.assertEqual(o[2], 'this is an escaped "multiline:: value".')
        self.assertEqual(o[3], 'control-value-6')
    finally:
        # Remove the temporary data file even when an assertion fails.
        self.cleanup(tmp_data_file)

def test_disable_double_double_quoted_data_flag__values(self):
    """Check the values produced for c2, c3 and c4 with --disable-double-double-quoting.

    This test (and flag) is meant to verify backward compatibility only. It is
    possible that this flag will be removed completely in the future.
    """
    tmp_data_file = self.create_file_with_data(double_double_quoted_data)

    # (selected column, expected output rows). The fixture declares only two
    # columns, so for c3 and c4 the first output row is expected to be empty.
    expected_rows_by_column = [
        ('c2', ['double_double_quoted', 'this is a quoted value with "double']),
        ('c3', ['', 'double']),
        ('c4', ['', 'quotes"""']),
    ]

    try:
        for column, expected_rows in expected_rows_by_column:
            cmd = '../bin/q -d " " --disable-double-double-quoting "select %s from %s"' % (column, tmp_data_file.name)
            retcode, o, e = run_command(cmd)

            self.assertEqual(retcode, 0)
            self.assertEqual(len(e), 0)
            self.assertEqual(len(o), 2)

            self.assertEqual(o[0], expected_rows[0])
            self.assertEqual(o[1], expected_rows[1])
    finally:
        # Remove the temporary data file even when an assertion fails.
        self.cleanup(tmp_data_file)

def test_disable_escaped_double_quoted_data_flag__values(self):
    """Check the values produced for c2, c3 and c4 with --disable-escaped-double-quoting.

    This test (and flag) is meant to verify backward compatibility only. It is
    possible that this flag will be removed completely in the future.
    """
    tmp_data_file = self.create_file_with_data(escaped_double_quoted_data)

    # (selected column, expected output rows). The fixture declares only two
    # columns, so for c3 and c4 the first output row is expected to be empty.
    expected_rows_by_column = [
        ('c2', ['escaped_double_quoted', 'this is a quoted value with \\escaped']),
        ('c3', ['', 'double']),
        ('c4', ['', 'quotes\\""']),
    ]

    try:
        for column, expected_rows in expected_rows_by_column:
            cmd = '../bin/q -d " " --disable-escaped-double-quoting "select %s from %s"' % (column, tmp_data_file.name)
            retcode, o, e = run_command(cmd)

            self.assertEqual(retcode, 0)
            self.assertEqual(len(e), 0)
            self.assertEqual(len(o), 2)

            self.assertEqual(o[0], expected_rows[0])
            self.assertEqual(o[1], expected_rows[1])
    finally:
        # Remove the temporary data file even when an assertion fails.
        self.cleanup(tmp_data_file)

def test_combined_quoted_data_flags__number_of_columns_detected(self):
    """Check how many columns -A reports for each combination of the --disable-* quoting flags.

    This test (and flags) is meant to verify backward compatibility only. It is
    possible that these flags will be removed completely in the future.
    """
    tmp_data_file = self.create_file_with_data(combined_quoted_data)

    # (flags inserted into the command line, expected number of fields).
    # Each non-empty flag string ends with a space so that the assembled
    # command matches the hand-written form exactly.
    expected_field_count_by_flags = [
        ('--disable-double-double-quoting --disable-escaped-double-quoting ', 7),
        ('--disable-escaped-double-quoting ', 5),
        ('--disable-double-double-quoting ', 5),
        ('', 3),  # no flags: only 3 fields, which is the correct amount
    ]

    try:
        for flags, expected_field_count in expected_field_count_by_flags:
            cmd = '../bin/q -d " " %s"select * from %s" -A' % (flags, tmp_data_file.name)
            retcode, o, e = run_command(cmd)

            self.assertEqual(retcode, 0)
            self.assertEqual(len(e), 0)
            o = o[1:]  # remove the first "Table for file..." line in the output

            self.assertEqual(len(o), expected_field_count)
    finally:
        # Remove the temporary data file even when an assertion fails.
        self.cleanup(tmp_data_file)

class ParsingModeTests(AbstractQTestCase):

Expand Down

0 comments on commit ddf52cb

Please sign in to comment.