Issue datacleaner#506: Big refactor of RowAnnotationFactory implementations to get memory footprint under control. Introducing "max sets" flag.
robinroby · Jul 17, 2015 · 5f161aa · 5f161aa
1 parent 7fb50aa
commit 5f161aa
Show file tree

Hide file tree

Showing 54 changed files with 443 additions and 2,162 deletions.
diff --git a/api/src/main/java/org/datacleaner/storage/RowAnnotationFactory.java b/api/src/main/java/org/datacleaner/storage/RowAnnotationFactory.java
@@ -19,11 +19,7 @@
  */
 package org.datacleaner.storage;
 
-import java.util.Map;
-
 import org.datacleaner.api.Component;
-import org.datacleaner.api.InputColumn;
-import org.datacleaner.api.InputRow;
 import org.datacleaner.api.Provided;
 
 /**
@@ -36,7 +32,7 @@
  * The RowAnnotationFactory is injectable into any {@link Component} (analyzer,
  * transformer, filter) using the {@link Provided} annotation.
  */
-public interface RowAnnotationFactory {
+public interface RowAnnotationFactory extends RowAnnotationSampleContainer, RowAnnotationHandler {
 
     /**
      * Creates a new annotation
@@ -45,57 +41,4 @@ public interface RowAnnotationFactory {
      */
     public RowAnnotation createAnnotation();
 
-    /**
-     * Annotates an array of rows (all assumed to have distinct count = 1).
-     * 
-     * @param rows
-     * @param annotation
-     */
-    public void annotate(InputRow[] rows, RowAnnotation annotation);
-
-    /**
-     * Annotates/labels a row with an annotation. The row will be retrievable
-     * using the getRows(...) method later in the process.
-     * 
-     * @param row
-     * @param distinctCount
-     * @param annotation
-     */
-    public void annotate(InputRow row, int distinctCount, RowAnnotation annotation);
-
-    /**
-     * Removes/resets all annotations of a specific kind. This method can be
-     * used for situations where eg. an analyzer is annotating extreme values
-     * (highest/lowest values etc.) and the threshold is changing, cancelling
-     * all previous annotations.
-     * 
-     * @param annotation
-     */
-    public void reset(RowAnnotation annotation);
-
-    /**
-     * Gets all the available rows with a given annotation.
-     * 
-     * @param annotation
-     * @return
-     */
-    public InputRow[] getRows(RowAnnotation annotation);
-
-    /**
-     * Gets a summarized view of the distinct values and their counts for a
-     * single column and annotation.
-     * 
-     * @param annotation
-     * @param inputColumn
-     * @return
-     */
-    public Map<Object, Integer> getValueCounts(RowAnnotation annotation, InputColumn<?> inputColumn);
-
-    /**
-     * Transfers registered annotated rows from one annotation to the other.
-     * 
-     * @param from
-     * @param to
-     */
-    public void transferAnnotations(RowAnnotation from, RowAnnotation to);
 }
diff --git a/api/src/main/java/org/datacleaner/storage/RowAnnotationHandler.java b/api/src/main/java/org/datacleaner/storage/RowAnnotationHandler.java
@@ -0,0 +1,59 @@
+/**
+ * DataCleaner (community edition)
+ * Copyright (C) 2014 Neopost - Customer Information Management
+ *
+ * This copyrighted material is made available to anyone wishing to use, modify,
+ * copy, or redistribute it subject to the terms and conditions of the GNU
+ * Lesser General Public License, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this distribution; if not, write to:
+ * Free Software Foundation, Inc.
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA  02110-1301  USA
+ */
+package org.datacleaner.storage;
+
+import org.datacleaner.api.InputRow;
+
+/**
+ * Represents a component that is capable of connecting {@link RowAnnotation}s
+ * to {@link InputRow}s, typically to publish them via a
+ * {@link RowAnnotationSampleContainer}.
+ */
+public interface RowAnnotationHandler {
+
+    /**
+     * Annotates/labels a row with an annotation. The row will be sampled and is
+     * usually retrievable later in the process using
+     * {@link RowAnnotationSampleContainer#getSampleRows(RowAnnotation)}.
+     * 
+     * @param row
+     *            the row to annotate
+     * @param annotation
+     *            the annotation to label the row with
+     */
+    public void annotate(InputRow row, RowAnnotation annotation);
+    /**
+     * Annotates/labels a row with an annotation, passing along a distinct
+     * count for the row.
+     * 
+     * @param row
+     *            the row to annotate
+     * @param distinctCount
+     *            NOTE(review): presumably the number of identical occurrences
+     *            that the row represents - confirm against the implementations
+     * @param annotation
+     *            the annotation to label the row with
+     */
+    public void annotate(InputRow row, int distinctCount, RowAnnotation annotation);
+
+    /**
+     * Transfers registered annotated rows from one annotation to the other.
+     * 
+     * @param from
+     *            the annotation whose registered rows are transferred
+     * @param to
+     *            the annotation that the rows are transferred to
+     */
+    public void transferAnnotations(RowAnnotation from, RowAnnotation to);
+
+    /**
+     * Removes/resets all annotations of a specific kind. This method can be
+     * used for situations where eg. an analyzer is annotating extreme values
+     * (highest/lowest values etc.) and the threshold is changing, cancelling
+     * all previous annotations.
+     * 
+     * @param annotation
+     *            the annotation to remove/reset
+     */
+    public void resetAnnotation(RowAnnotation annotation);
+}
diff --git a/api/src/main/java/org/datacleaner/storage/RowAnnotationSampleContainer.java b/api/src/main/java/org/datacleaner/storage/RowAnnotationSampleContainer.java
@@ -0,0 +1,48 @@
+/**
+ * DataCleaner (community edition)
+ * Copyright (C) 2014 Neopost - Customer Information Management
+ *
+ * This copyrighted material is made available to anyone wishing to use, modify,
+ * copy, or redistribute it subject to the terms and conditions of the GNU
+ * Lesser General Public License, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this distribution; if not, write to:
+ * Free Software Foundation, Inc.
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA  02110-1301  USA
+ */
+package org.datacleaner.storage;
+
+import java.util.List;
+
+import org.datacleaner.api.InputRow;
+
+/**
+ * A component for retrieving sample {@link InputRow}s that are annotated
+ * with a {@link RowAnnotation}.
+ */
+public interface RowAnnotationSampleContainer {
+
+    /**
+     * Determines if there are sample rows available for a specific
+     * {@link RowAnnotation}.
+     * 
+     * @param annotation the annotation to check for sample rows
+     * @return true if sample rows are available for the annotation
+     */
+    public boolean hasSampleRows(RowAnnotation annotation);
+
+    /**
+     * Gets all the available sample rows with a given annotation.
+     * 
+     * @param annotation the annotation to get sample rows for
+     * @return the available sample rows that carry the given annotation
+     */
+    public List<InputRow> getSampleRows(RowAnnotation annotation);
+}
diff --git a/components/basic-analyzers/src/main/java/org/datacleaner/beans/BooleanAnalyzer.java b/components/basic-analyzers/src/main/java/org/datacleaner/beans/BooleanAnalyzer.java
@@ -39,9 +39,9 @@
 import org.datacleaner.result.Crosstab;
 import org.datacleaner.result.CrosstabDimension;
 import org.datacleaner.result.CrosstabNavigator;
-import org.datacleaner.storage.InMemoryRowAnnotationFactory;
 import org.datacleaner.storage.RowAnnotation;
 import org.datacleaner.storage.RowAnnotationFactory;
+import org.datacleaner.storage.RowAnnotations;
 import org.datacleaner.util.ValueCombination;
 
 @Named("Boolean analyzer")
@@ -83,7 +83,7 @@ public int compare(Entry<ValueCombination<Boolean>, RowAnnotation> o1,
 
     public BooleanAnalyzer(InputColumn<Boolean>[] columns) {
         _columns = columns;
-        _annotationFactory = new InMemoryRowAnnotationFactory();
+        _annotationFactory = RowAnnotations.getDefaultFactory();
     }
 
     public BooleanAnalyzer() {

diff --git a/...asic-analyzers/src/main/java/org/datacleaner/beans/CompletenessAnalyzerResultReducer.java b/...asic-analyzers/src/main/java/org/datacleaner/beans/CompletenessAnalyzerResultReducer.java
@@ -20,6 +20,7 @@
 package org.datacleaner.beans;
 
 import java.util.Collection;
+import java.util.List;
 
 import javax.inject.Inject;
 
@@ -48,12 +49,14 @@ public CompletenessAnalyzerResult reduce(Collection<? extends CompletenessAnalyz
 
         int totalRowCount = 0;
         for (CompletenessAnalyzerResult result : results) {
-            final InputRow[] rows = result.getRows();
+            final List<InputRow> sampleRows = result.getSampleRows();
             final int invalidRowCount = result.getInvalidRowCount();
-            if (invalidRowCount == rows.length) {
+            if (invalidRowCount == sampleRows.size()) {
                 // if the rows are included for preview/sampling - then
                 // re-annotate them in the master result
-                _rowAnnotationFactory.annotate(rows, annotation);
+                for (InputRow sampleRow : sampleRows) {
+                    _rowAnnotationFactory.annotate(sampleRow, annotation);
+                }
             } else {
                 // else we just transfer annotation counts
                 _rowAnnotationFactory.transferAnnotations(result.getAnnotation(), annotation);

diff --git a/...asic-analyzers/src/main/java/org/datacleaner/beans/DateAndTimeAnalyzerColumnDelegate.java b/...asic-analyzers/src/main/java/org/datacleaner/beans/DateAndTimeAnalyzerColumnDelegate.java
@@ -93,18 +93,18 @@ public synchronized void run(final Date value, final InputRow row, final int dis
             } else {
                 if (localDate.isAfter(_maxDate)) {
                     _maxDate = localDate;
-                    _annotationFactory.reset(_maxDateAnnotation);
+                    _annotationFactory.resetAnnotation(_maxDateAnnotation);
                 } else if (localDate.isBefore(_minDate)) {
                     _minDate = localDate;
-                    _annotationFactory.reset(_minDateAnnotation);
+                    _annotationFactory.resetAnnotation(_minDateAnnotation);
                 }
 
                 if (localTime.isAfter(_maxTime)) {
                     _maxTime = localTime;
-                    _annotationFactory.reset(_maxTimeAnnotation);
+                    _annotationFactory.resetAnnotation(_maxTimeAnnotation);
                 } else if (localTime.isBefore(_minTime)) {
                     _minTime = localTime;
-                    _annotationFactory.reset(_minTimeAnnotation);
+                    _annotationFactory.resetAnnotation(_minTimeAnnotation);
                 }
             }
 

diff --git a/components/basic-analyzers/src/main/java/org/datacleaner/beans/NumberAnalyzer.java b/components/basic-analyzers/src/main/java/org/datacleaner/beans/NumberAnalyzer.java
@@ -41,9 +41,9 @@
 import org.datacleaner.result.Crosstab;
 import org.datacleaner.result.CrosstabDimension;
 import org.datacleaner.result.CrosstabNavigator;
-import org.datacleaner.storage.InMemoryRowAnnotationFactory;
 import org.datacleaner.storage.RowAnnotation;
 import org.datacleaner.storage.RowAnnotationFactory;
+import org.datacleaner.storage.RowAnnotations;
 
 /**
  * Number analyzer, which provides statistical information for number values:
@@ -105,7 +105,7 @@ public NumberAnalyzer() {
     public NumberAnalyzer(InputColumn<? extends Number>... columns) {
         this();
         _columns = columns;
-        _annotationFactory = new InMemoryRowAnnotationFactory();
+        _annotationFactory = RowAnnotations.getDefaultFactory();
         init();
     }
 

diff --git a/...nts/basic-analyzers/src/main/java/org/datacleaner/beans/NumberAnalyzerColumnDelegate.java b/...nts/basic-analyzers/src/main/java/org/datacleaner/beans/NumberAnalyzerColumnDelegate.java
@@ -59,10 +59,10 @@ public synchronized void run(InputRow row, Number value, int distinctCount) {
 			double min = _statistics.getMin();
 
 			if (max < doubleValue) {
-				_annotationFactory.reset(_maxAnnotation);
+				_annotationFactory.resetAnnotation(_maxAnnotation);
 			}
 			if (min > doubleValue) {
-				_annotationFactory.reset(_minAnnotation);
+				_annotationFactory.resetAnnotation(_minAnnotation);
 			}
 
 			for (int i = 0; i < distinctCount; i++) {

diff --git a/components/basic-analyzers/src/main/java/org/datacleaner/beans/StringAnalyzer.java b/components/basic-analyzers/src/main/java/org/datacleaner/beans/StringAnalyzer.java
@@ -37,9 +37,9 @@
 import org.datacleaner.result.Crosstab;
 import org.datacleaner.result.CrosstabDimension;
 import org.datacleaner.result.CrosstabNavigator;
-import org.datacleaner.storage.InMemoryRowAnnotationFactory;
 import org.datacleaner.storage.RowAnnotation;
 import org.datacleaner.storage.RowAnnotationFactory;
+import org.datacleaner.storage.RowAnnotations;
 import org.datacleaner.util.AverageBuilder;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -96,7 +96,7 @@ public StringAnalyzer() {
 	@SafeVarargs
 	public StringAnalyzer(InputColumn<String>... columns) {
 		_columns = columns;
-		_annotationFactory = new InMemoryRowAnnotationFactory();
+		_annotationFactory = RowAnnotations.getDefaultFactory();
 		init();
 	}
 

diff --git a/...nts/basic-analyzers/src/main/java/org/datacleaner/beans/StringAnalyzerColumnDelegate.java b/...nts/basic-analyzers/src/main/java/org/datacleaner/beans/StringAnalyzerColumnDelegate.java
@@ -177,15 +177,15 @@ public synchronized void run(InputRow row, final String value, int distinctCount
             }
 
             if (_maxChars < numChars) {
-                _annotationFactory.reset(_maxCharsAnnotation);
+                _annotationFactory.resetAnnotation(_maxCharsAnnotation);
                 _maxChars = numChars;
             }
             if (_maxChars == numChars) {
                 _annotationFactory.annotate(row, distinctCount, _maxCharsAnnotation);
             }
 
             if (_minChars > numChars) {
-                _annotationFactory.reset(_minCharsAnnotation);
+                _annotationFactory.resetAnnotation(_minCharsAnnotation);
                 _minChars = numChars;
             }
             if (_minChars == numChars) {
@@ -194,30 +194,30 @@ public synchronized void run(InputRow row, final String value, int distinctCount
 
             if (_maxWords < numWords) {
                 _maxWords = numWords;
-                _annotationFactory.reset(_maxWordsAnnotation);
+                _annotationFactory.resetAnnotation(_maxWordsAnnotation);
             }
             if (_maxWords == numWords) {
                 _annotationFactory.annotate(row, distinctCount, _maxWordsAnnotation);
             }
             if (_minWords > numWords) {
                 _minWords = numWords;
-                _annotationFactory.reset(_minWordsAnnotation);
+                _annotationFactory.resetAnnotation(_minWordsAnnotation);
             }
             if (_minWords == numWords) {
                 _annotationFactory.annotate(row, distinctCount, _minWordsAnnotation);
             }
 
             if (_maxWhitespace < numWhitespace) {
                 _maxWhitespace = numWhitespace;
-                _annotationFactory.reset(_maxWhitespaceAnnotation);
+                _annotationFactory.resetAnnotation(_maxWhitespaceAnnotation);
             }
             if (_maxWhitespace == numWhitespace) {
                 _annotationFactory.annotate(row, distinctCount, _maxWhitespaceAnnotation);
             }
 
             if (_minWhitespace > numWhitespace) {
                 _minWhitespace = numWhitespace;
-                _annotationFactory.reset(_minWhitespaceAnnotation);
+                _annotationFactory.resetAnnotation(_minWhitespaceAnnotation);
             }
             if (_minWhitespace == numWhitespace) {
                 _annotationFactory.annotate(row, distinctCount, _minWhitespaceAnnotation);