Skip to content

Commit

Permalink
Support multiple custom missing value options in Readers (#918)
Browse files Browse the repository at this point in the history
* Switch ReadOptions.missingValueIndicator(String) to varargs

changes made. needs tests

* apply changes to JsonReadOptions

* Added test. Simplified a few other test methods

* Moved initialization of the locale readOption to the builder

This simplifies it by removing the null check, and makes it consistent with other options
  • Loading branch information
lwhite1 authored May 9, 2021
1 parent 74a54aa commit b52b943
Show file tree
Hide file tree
Showing 17 changed files with 65 additions and 56 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ public BooleanParser(ColumnType columnType) {

public BooleanParser(BooleanColumnType booleanColumnType, ReadOptions readOptions) {
super(booleanColumnType);
if (readOptions.missingValueIndicator() != null) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicator());
if (readOptions.missingValueIndicators().length > 0) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicators());
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ public DateParser(ColumnType type, ReadOptions readOptions) {
if (readOptions.locale() != null) {
locale = readOptions.locale();
}
if (readOptions.missingValueIndicator() != null) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicator());
if (readOptions.missingValueIndicators().length > 0) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicators());
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ public DateTimeParser(DateTimeColumnType dateTimeColumnType, ReadOptions readOpt
if (readOptions.locale() != null) {
locale = readOptions.locale();
}
if (readOptions.missingValueIndicator() != null) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicator());
if (readOptions.missingValueIndicators().length > 0) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicators());
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ public DoubleParser(ColumnType columnType) {

public DoubleParser(DoubleColumnType doubleColumnType, ReadOptions readOptions) {
super(doubleColumnType);
if (readOptions.missingValueIndicator() != null) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicator());
if (readOptions.missingValueIndicators().length > 0) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicators());
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ public FloatParser(ColumnType columnType) {

public FloatParser(FloatColumnType columnType, ReadOptions readOptions) {
super(columnType);
if (readOptions.missingValueIndicator() != null) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicator());
if (readOptions.missingValueIndicators().length > 0) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicators());
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ public IntParser(ColumnType columnType) {

public IntParser(IntColumnType columnType, ReadOptions readOptions) {
super(columnType);
if (readOptions.missingValueIndicator() != null) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicator());
if (readOptions.missingValueIndicators().length > 0) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicators());
}
ignoreZeroDecimal = readOptions.ignoreZeroDecimal();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ public LongParser(ColumnType columnType) {

public LongParser(LongColumnType columnType, ReadOptions readOptions) {
super(columnType);
if (readOptions.missingValueIndicator() != null) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicator());
if (readOptions.missingValueIndicators().length > 0) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicators());
}
ignoreZeroDecimal = readOptions.ignoreZeroDecimal();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ public ShortParser(ShortColumnType columnType) {

public ShortParser(ShortColumnType columnType, ReadOptions readOptions) {
super(columnType);
if (readOptions.missingValueIndicator() != null) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicator());
if (readOptions.missingValueIndicators().length > 0) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicators());
}
ignoreZeroDecimal = readOptions.ignoreZeroDecimal();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ public StringParser(ColumnType columnType) {

public StringParser(ColumnType columnType, ReadOptions readOptions) {
super(columnType);
if (readOptions.missingValueIndicator() != null) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicator());
if (readOptions.missingValueIndicators().length > 0) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicators());
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@ public TimeParser(ColumnType columnType, ReadOptions readOptions) {
if (readOptions.locale() != null) {
locale = readOptions.locale();
}
if (readOptions.missingValueIndicator() != null) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicator());
if (readOptions.missingValueIndicators().length > 0) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicators());
}
}

Expand Down
24 changes: 9 additions & 15 deletions core/src/main/java/tech/tablesaw/io/ReadOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ public class ReadOptions {
protected final String dateTimeFormat;
protected final String timeFormat;
protected final Locale locale;
protected final String missingValueIndicator;
protected final String[] missingValueIndicators;
protected final boolean minimizeColumnSizes;
protected final int maxCharsPerColumn;
protected final boolean ignoreZeroDecimal;
Expand All @@ -105,7 +105,7 @@ protected ReadOptions(ReadOptions.Builder builder) {
dateFormat = builder.dateFormat;
timeFormat = builder.timeFormat;
dateTimeFormat = builder.dateTimeFormat;
missingValueIndicator = builder.missingValueIndicator;
missingValueIndicators = builder.missingValueIndicators;
minimizeColumnSizes = builder.minimizeColumnSizes;
header = builder.header;
maxCharsPerColumn = builder.maxCharsPerColumn;
Expand All @@ -115,8 +115,8 @@ protected ReadOptions(ReadOptions.Builder builder) {
dateFormatter = builder.dateFormatter;
timeFormatter = builder.timeFormatter;
dateTimeFormatter = builder.dateTimeFormatter;

allowDuplicateColumnNames = builder.allowDuplicateColumnNames;
locale = builder.locale;

if (builder.columnTypes != null)
columnTypeReadOptions = new ByIdxColumnTypeReadOptions(builder.columnTypes);
Expand All @@ -128,12 +128,6 @@ else if (builder.completeColumnTypeFunction != null)
else if (builder.columnTypeFunction != null)
columnTypeReadOptions = new PartialFunctionColumnTypeReadOptions(builder.columnTypeFunction);
else columnTypeReadOptions = ColumnTypeReadOptions.EMPTY;

if (builder.locale == null) {
locale = Locale.getDefault();
} else {
locale = builder.locale;
}
}

public Source source() {
Expand All @@ -160,8 +154,8 @@ public boolean minimizeColumnSizes() {
return minimizeColumnSizes;
}

public String missingValueIndicator() {
return missingValueIndicator;
public String[] missingValueIndicators() {
return missingValueIndicators;
}

public Locale locale() {
Expand Down Expand Up @@ -227,8 +221,8 @@ protected static class Builder {
protected DateTimeFormatter timeFormatter;
protected String dateTimeFormat;
protected DateTimeFormatter dateTimeFormatter;
protected Locale locale;
protected String missingValueIndicator;
protected Locale locale = Locale.getDefault();
protected String[] missingValueIndicators = new String[0];
protected boolean minimizeColumnSizes = false;
protected boolean header = true;
protected int maxCharsPerColumn = 4096;
Expand Down Expand Up @@ -321,8 +315,8 @@ public Builder dateTimeFormat(DateTimeFormatter dateFormat) {
return this;
}

public Builder missingValueIndicator(String missingValueIndicator) {
this.missingValueIndicator = missingValueIndicator;
public Builder missingValueIndicator(String... missingValueIndicators) {
this.missingValueIndicators = missingValueIndicators;
return this;
}

Expand Down
4 changes: 2 additions & 2 deletions core/src/main/java/tech/tablesaw/io/csv/CsvReadOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -357,8 +357,8 @@ public Builder locale(Locale locale) {
}

@Override
public Builder missingValueIndicator(String missingValueIndicator) {
super.missingValueIndicator(missingValueIndicator);
public Builder missingValueIndicator(String... missingValueIndicators) {
super.missingValueIndicator(missingValueIndicators);
return this;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ public Builder locale(Locale locale) {
}

@Override
public Builder missingValueIndicator(String missingValueIndicator) {
public Builder missingValueIndicator(String... missingValueIndicator) {
super.missingValueIndicator(missingValueIndicator);
return this;
}
Expand Down
39 changes: 27 additions & 12 deletions core/src/test/java/tech/tablesaw/io/csv/CsvReaderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -477,33 +477,48 @@ public void testLocalDateTimeDetectionFrench() {
}

@Test
public void testWithMissingValue() throws IOException {
void testWithMissingValue() throws IOException {

CsvReadOptions options =
CsvReadOptions.builder("../data/missing_values.csv")
.dateFormat(DateTimeFormatter.ofPattern("yyyy.MM.dd"))
.header(true)
.missingValueIndicator("-")
.build();
CsvReadOptions.builder("../data/missing_values.csv").missingValueIndicator("-").build();

Table t = Table.read().csv(options);
assertEquals(1, t.stringColumn(0).countMissing());
assertEquals(1, t.numberColumn(1).countMissing());
assertEquals(1, t.numberColumn(2).countMissing());
}

/** Tests the auto-detection of missing values, using multiple missing value indicators */
/**
* Tests using multiple, non-standard missing value indicators, some columns also contains NA or
* N/A, which are treated as regular string values rather than missing values. This has the
* side-effect of treating those otherwise numeric columns as StringColumns
*/
@Test
public void testWithMissingValue2() throws IOException {
void testWithMissingValues() throws IOException {

Reader reader =
new StringReader(
"Products,Sales,Market_Share\n"
+ "a,?,-\n"
+ "b,12200,NA\n"
+ "c,60000,33\n"
+ "d,N/A,10\n"
+ ",32000,42");
CsvReadOptions options =
CsvReadOptions.builder("../data/missing_values2.csv")
.dateFormat(DateTimeFormatter.ofPattern("yyyy.MM.dd"))
.header(true)
.build();
CsvReadOptions.builder(reader).sample(false).missingValueIndicator("-", "?", "").build();

Table t = Table.read().csv(options);
assertEquals(1, t.stringColumn(0).countMissing());
assertEquals(1, t.stringColumn(1).countMissing());
assertEquals(1, t.stringColumn(2).countMissing());
}

/** Tests the auto-detection of missing values, using multiple missing value indicators */
@Test
void testWithMissingValue2() throws IOException {

Table t = Table.read().csv("../data/missing_values2.csv");
assertEquals(1, t.stringColumn(0).countMissing());
assertEquals(1, t.numberColumn(1).countMissing());
assertEquals(1, t.numberColumn(2).countMissing());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,8 @@ public Builder locale(Locale locale) {
}

@Override
public Builder missingValueIndicator(String missingValueIndicator) {
super.missingValueIndicator(missingValueIndicator);
public Builder missingValueIndicator(String... missingValueIndicators) {
super.missingValueIndicator(missingValueIndicators);
return this;
}

Expand Down
4 changes: 2 additions & 2 deletions html/src/main/java/tech/tablesaw/io/html/HtmlReadOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,8 @@ public Builder locale(Locale locale) {
}

@Override
public Builder missingValueIndicator(String missingValueIndicator) {
super.missingValueIndicator(missingValueIndicator);
public Builder missingValueIndicator(String... missingValueIndicators) {
super.missingValueIndicator(missingValueIndicators);
return this;
}

Expand Down
4 changes: 2 additions & 2 deletions json/src/main/java/tech/tablesaw/io/json/JsonReadOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,8 @@ public Builder locale(Locale locale) {
}

@Override
public Builder missingValueIndicator(String missingValueIndicator) {
super.missingValueIndicator(missingValueIndicator);
public Builder missingValueIndicator(String... missingValueIndicators) {
super.missingValueIndicator(missingValueIndicators);
return this;
}

Expand Down

0 comments on commit b52b943

Please sign in to comment.