Skip to content

Commit

Permalink
Issue datacleaner#506: Big refactor of RowAnnotationFactor implementa…
Browse files Browse the repository at this point in the history
…tions to get

memory footprint under control. Introducing "max sets" flag.
  • Loading branch information
kaspersorensen committed Jul 17, 2015
1 parent 7fb50aa commit 5f161aa
Show file tree
Hide file tree
Showing 54 changed files with 443 additions and 2,162 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,7 @@
*/
package org.datacleaner.storage;

import java.util.Map;

import org.datacleaner.api.Component;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.api.Provided;

/**
Expand All @@ -36,7 +32,7 @@
* The RowAnnotationFactory is injectable into any {@link Component} (analyzer,
* transformer, filter) using the {@link Provided} annotation.
*/
public interface RowAnnotationFactory {
public interface RowAnnotationFactory extends RowAnnotationSampleContainer, RowAnnotationHandler {

/**
* Creates a new annotation
Expand All @@ -45,57 +41,4 @@ public interface RowAnnotationFactory {
*/
public RowAnnotation createAnnotation();

/**
* Annotates an array of rows (all assumed to have distinct count = 1).
*
* @param rows
* @param annotation
*/
public void annotate(InputRow[] rows, RowAnnotation annotation);

/**
* Annotates/labels a row with an annotation. The row will be retrievable
* using the getRows(...) method later in the process.
*
* @param row
* @param distinctCount
* @param annotation
*/
public void annotate(InputRow row, int distinctCount, RowAnnotation annotation);

/**
* Removes/resets all annotations of a specific kind. This method can be
* used for situations where eg. an analyzer is annotating extreme values
* (highest/lowest values etc.) and the threshold is changing, cancelling
* all previous annotations.
*
* @param annotation
*/
public void reset(RowAnnotation annotation);

/**
* Gets all the available rows with a given annotation.
*
* @param annotation
* @return
*/
public InputRow[] getRows(RowAnnotation annotation);

/**
* Gets a summarized view of the distinct values and their counts for a
* single column and annotation.
*
* @param annotation
* @param inputColumn
* @return
*/
public Map<Object, Integer> getValueCounts(RowAnnotation annotation, InputColumn<?> inputColumn);

/**
* Transfers registered annotated rows from one annotation to the other.
*
* @param from
* @param to
*/
public void transferAnnotations(RowAnnotation from, RowAnnotation to);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.storage;

import org.datacleaner.api.InputRow;

/**
* Represents a component that is capable of connecting {@link RowAnnotation} to
* {@link InputRow}s, typically to publish them via a
* {@link RowAnnotationSampleContainer}.
*/
public interface RowAnnotationHandler {

    /**
     * Labels a single row with the given annotation. The row will be sampled
     * and will usually be retrievable later in the process (presumably via
     * {@link RowAnnotationSampleContainer#getSampleRows(RowAnnotation)}).
     *
     * @param row
     *            the row to label
     * @param annotation
     *            the annotation to attach to the row
     */
    void annotate(InputRow row, RowAnnotation annotation);

    /**
     * Labels a row with the given annotation, additionally passing a distinct
     * count for that row.
     *
     * NOTE(review): the distinct count presumably expresses how many identical
     * occurrences the row represents - confirm against the implementations.
     *
     * @param row
     *            the row to label
     * @param distinctCount
     *            the distinct count associated with the row
     * @param annotation
     *            the annotation to attach to the row
     */
    void annotate(InputRow row, int distinctCount, RowAnnotation annotation);

    /**
     * Transfers the annotated rows that are registered on one annotation to
     * another annotation.
     *
     * @param from
     *            the annotation to transfer from
     * @param to
     *            the annotation to transfer to
     */
    void transferAnnotations(RowAnnotation from, RowAnnotation to);

    /**
     * Removes/resets every annotation of a specific kind. Intended for cases
     * where e.g. an analyzer annotates extreme values (highest/lowest values
     * etc.) and the threshold changes, which cancels all previous
     * annotations.
     *
     * @param annotation
     *            the annotation to reset
     */
    void resetAnnotation(RowAnnotation annotation);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.storage;

import java.util.List;

import org.datacleaner.api.InputRow;

/**
* A component for retrieving sample {@link InputRow}s that are annotated
* with a {@link RowAnnotation}.
*/
public interface RowAnnotationSampleContainer {

    /**
     * Tells whether sample rows are available for a specific
     * {@link RowAnnotation}.
     *
     * @param annotation
     *            the annotation to look up
     * @return whether sample rows are available for the annotation
     */
    boolean hasSampleRows(RowAnnotation annotation);

    /**
     * Retrieves all the available sample rows that carry a given annotation.
     *
     * @param annotation
     *            the annotation to look up
     * @return the available sample rows for the annotation
     */
    List<InputRow> getSampleRows(RowAnnotation annotation);
}
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@
import org.datacleaner.result.Crosstab;
import org.datacleaner.result.CrosstabDimension;
import org.datacleaner.result.CrosstabNavigator;
import org.datacleaner.storage.InMemoryRowAnnotationFactory;
import org.datacleaner.storage.RowAnnotation;
import org.datacleaner.storage.RowAnnotationFactory;
import org.datacleaner.storage.RowAnnotations;
import org.datacleaner.util.ValueCombination;

@Named("Boolean analyzer")
Expand Down Expand Up @@ -83,7 +83,7 @@ public int compare(Entry<ValueCombination<Boolean>, RowAnnotation> o1,

public BooleanAnalyzer(InputColumn<Boolean>[] columns) {
_columns = columns;
_annotationFactory = new InMemoryRowAnnotationFactory();
_annotationFactory = RowAnnotations.getDefaultFactory();
}

public BooleanAnalyzer() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
package org.datacleaner.beans;

import java.util.Collection;
import java.util.List;

import javax.inject.Inject;

Expand Down Expand Up @@ -48,12 +49,14 @@ public CompletenessAnalyzerResult reduce(Collection<? extends CompletenessAnalyz

int totalRowCount = 0;
for (CompletenessAnalyzerResult result : results) {
final InputRow[] rows = result.getRows();
final List<InputRow> sampleRows = result.getSampleRows();
final int invalidRowCount = result.getInvalidRowCount();
if (invalidRowCount == rows.length) {
if (invalidRowCount == sampleRows.size()) {
// if the rows are included for preview/sampling - then
// re-annotate them in the master result
_rowAnnotationFactory.annotate(rows, annotation);
for (InputRow sampleRow : sampleRows) {
_rowAnnotationFactory.annotate(sampleRow, annotation);
}
} else {
// else we just transfer annotation counts
_rowAnnotationFactory.transferAnnotations(result.getAnnotation(), annotation);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,18 +93,18 @@ public synchronized void run(final Date value, final InputRow row, final int dis
} else {
if (localDate.isAfter(_maxDate)) {
_maxDate = localDate;
_annotationFactory.reset(_maxDateAnnotation);
_annotationFactory.resetAnnotation(_maxDateAnnotation);
} else if (localDate.isBefore(_minDate)) {
_minDate = localDate;
_annotationFactory.reset(_minDateAnnotation);
_annotationFactory.resetAnnotation(_minDateAnnotation);
}

if (localTime.isAfter(_maxTime)) {
_maxTime = localTime;
_annotationFactory.reset(_maxTimeAnnotation);
_annotationFactory.resetAnnotation(_maxTimeAnnotation);
} else if (localTime.isBefore(_minTime)) {
_minTime = localTime;
_annotationFactory.reset(_minTimeAnnotation);
_annotationFactory.resetAnnotation(_minTimeAnnotation);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@
import org.datacleaner.result.Crosstab;
import org.datacleaner.result.CrosstabDimension;
import org.datacleaner.result.CrosstabNavigator;
import org.datacleaner.storage.InMemoryRowAnnotationFactory;
import org.datacleaner.storage.RowAnnotation;
import org.datacleaner.storage.RowAnnotationFactory;
import org.datacleaner.storage.RowAnnotations;

/**
* Number analyzer, which provides statistical information for number values:
Expand Down Expand Up @@ -105,7 +105,7 @@ public NumberAnalyzer() {
public NumberAnalyzer(InputColumn<? extends Number>... columns) {
this();
_columns = columns;
_annotationFactory = new InMemoryRowAnnotationFactory();
_annotationFactory = RowAnnotations.getDefaultFactory();
init();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,10 @@ public synchronized void run(InputRow row, Number value, int distinctCount) {
double min = _statistics.getMin();

if (max < doubleValue) {
_annotationFactory.reset(_maxAnnotation);
_annotationFactory.resetAnnotation(_maxAnnotation);
}
if (min > doubleValue) {
_annotationFactory.reset(_minAnnotation);
_annotationFactory.resetAnnotation(_minAnnotation);
}

for (int i = 0; i < distinctCount; i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@
import org.datacleaner.result.Crosstab;
import org.datacleaner.result.CrosstabDimension;
import org.datacleaner.result.CrosstabNavigator;
import org.datacleaner.storage.InMemoryRowAnnotationFactory;
import org.datacleaner.storage.RowAnnotation;
import org.datacleaner.storage.RowAnnotationFactory;
import org.datacleaner.storage.RowAnnotations;
import org.datacleaner.util.AverageBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -96,7 +96,7 @@ public StringAnalyzer() {
@SafeVarargs
public StringAnalyzer(InputColumn<String>... columns) {
_columns = columns;
_annotationFactory = new InMemoryRowAnnotationFactory();
_annotationFactory = RowAnnotations.getDefaultFactory();
init();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,15 +177,15 @@ public synchronized void run(InputRow row, final String value, int distinctCount
}

if (_maxChars < numChars) {
_annotationFactory.reset(_maxCharsAnnotation);
_annotationFactory.resetAnnotation(_maxCharsAnnotation);
_maxChars = numChars;
}
if (_maxChars == numChars) {
_annotationFactory.annotate(row, distinctCount, _maxCharsAnnotation);
}

if (_minChars > numChars) {
_annotationFactory.reset(_minCharsAnnotation);
_annotationFactory.resetAnnotation(_minCharsAnnotation);
_minChars = numChars;
}
if (_minChars == numChars) {
Expand All @@ -194,30 +194,30 @@ public synchronized void run(InputRow row, final String value, int distinctCount

if (_maxWords < numWords) {
_maxWords = numWords;
_annotationFactory.reset(_maxWordsAnnotation);
_annotationFactory.resetAnnotation(_maxWordsAnnotation);
}
if (_maxWords == numWords) {
_annotationFactory.annotate(row, distinctCount, _maxWordsAnnotation);
}
if (_minWords > numWords) {
_minWords = numWords;
_annotationFactory.reset(_minWordsAnnotation);
_annotationFactory.resetAnnotation(_minWordsAnnotation);
}
if (_minWords == numWords) {
_annotationFactory.annotate(row, distinctCount, _minWordsAnnotation);
}

if (_maxWhitespace < numWhitespace) {
_maxWhitespace = numWhitespace;
_annotationFactory.reset(_maxWhitespaceAnnotation);
_annotationFactory.resetAnnotation(_maxWhitespaceAnnotation);
}
if (_maxWhitespace == numWhitespace) {
_annotationFactory.annotate(row, distinctCount, _maxWhitespaceAnnotation);
}

if (_minWhitespace > numWhitespace) {
_minWhitespace = numWhitespace;
_annotationFactory.reset(_minWhitespaceAnnotation);
_annotationFactory.resetAnnotation(_minWhitespaceAnnotation);
}
if (_minWhitespace == numWhitespace) {
_annotationFactory.annotate(row, distinctCount, _minWhitespaceAnnotation);
Expand Down
Loading

0 comments on commit 5f161aa

Please sign in to comment.