Commit

Issue datacleaner#395: Added @ExternalDocumentation annotation and put video links to various components

kaspersorensen committed May 12, 2015
1 parent f0ea2ee commit 8c6ea63
Showing 8 changed files with 274 additions and 146 deletions.
api/src/main/java/org/datacleaner/api/ExternalDocumentation.java (106 additions, 0 deletions)
@@ -0,0 +1,106 @@
/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.api;

import java.lang.annotation.Documented;
import java.lang.annotation.ElementType;
import java.lang.annotation.Inherited;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;

/**
* Annotation used to provide links to extra documentation (beyond
* {@link Description} and similar annotations) about a {@link Component}.
*/
@Retention(RetentionPolicy.RUNTIME)
@Target({ ElementType.TYPE, ElementType.FIELD, ElementType.METHOD })
@Documented
@Inherited
public @interface ExternalDocumentation {

/**
* Defines the types of {@link DocumentationLink}s that are available.
*/
public static enum DocumentationType {

/**
* Written, reference-style documentation.
*/
REFERENCE,

/**
* Tutorials or use cases that demonstrate the component in a
* particular scenario.
*/
TUTORIAL,

/**
* A video explaining the component.
*/
VIDEO,

/**
* A technical background piece, typically for engineers or people with
* a specialized interest in this component.
*/
TECH
}

/**
* Represents a single link to an external documentation item about the
* component.
*/
public static @interface DocumentationLink {

/**
* Gets the title/name of the documentation item.
*
* @return the title of the documentation item
*/
public String title();

/**
* Gets the HTTP(S) URL where this documentation item resides.
*
* @return the URL of the documentation item
*/
public String url();

/**
* Gets the {@link DocumentationType} of this link.
*
* @return the type of this documentation link
*/
public DocumentationType type();

/**
* Defines the version of DataCleaner that this documentation item was
* based on.
*
* @return the DataCleaner version that the documentation item was based on
*/
public String version();
}

/**
* Gets the external documentation links that are defined for this
* component.
*
* @return the external documentation links for the component
*/
public DocumentationLink[] value();
}
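Because the annotation has runtime retention and is marked @Inherited, tools can look it up reflectively on a component class. The helper below is a minimal illustrative sketch, not part of this commit; the class and method names are invented for the example, and only standard Java reflection plus the annotation defined above are used.

import org.datacleaner.api.ExternalDocumentation;
import org.datacleaner.api.ExternalDocumentation.DocumentationLink;

// Illustrative helper, not part of this commit: prints the documentation
// links declared on a component class, if any.
public class ExternalDocumentationReader {

    public static void printDocumentationLinks(Class<?> componentClass) {
        // getAnnotation(..) also finds the annotation on superclasses,
        // because ExternalDocumentation is marked @Inherited.
        final ExternalDocumentation documentation = componentClass.getAnnotation(ExternalDocumentation.class);
        if (documentation == null) {
            System.out.println(componentClass.getSimpleName() + " declares no external documentation");
            return;
        }
        for (DocumentationLink link : documentation.value()) {
            System.out.println(link.type() + ": " + link.title() + " <" + link.url() + "> (based on DataCleaner " + link.version() + ")");
        }
    }
}

With such a helper, a UI could, for example, list a component's VIDEO links next to its @Description text.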
RegexParserTransformer.java

@@ -27,6 +27,9 @@
import org.datacleaner.api.Categorized;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.ExternalDocumentation;
import org.datacleaner.api.ExternalDocumentation.DocumentationLink;
import org.datacleaner.api.ExternalDocumentation.DocumentationType;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.api.OutputColumns;
@@ -35,6 +38,7 @@

@Named("Regex parser")
@Description("Parses strings using a regular expression and transforms it into substrings based on regex groups")
@ExternalDocumentation({ @DocumentationLink(title = "Regex parsing with DataCleaner", url = "https://www.youtube.com/watch?v=VA6dw5Nv2AM", type = DocumentationType.VIDEO, version = "3.0") })
@Categorized(StringManipulationCategory.class)
public class RegexParserTransformer implements Transformer {

CharacterSetDistributionAnalyzer.java

@@ -31,10 +31,13 @@
import org.datacleaner.api.Concurrent;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.ExternalDocumentation;
import org.datacleaner.api.Initialize;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.api.Provided;
import org.datacleaner.api.ExternalDocumentation.DocumentationLink;
import org.datacleaner.api.ExternalDocumentation.DocumentationType;
import org.datacleaner.result.AnnotatedRowsResult;
import org.datacleaner.result.CharacterSetDistributionResult;
import org.datacleaner.result.Crosstab;
@@ -47,125 +50,118 @@

@Named("Character set distribution")
@Description("Inspects and maps text characters according to character set affinity, such as Latin, Hebrew, Cyrillic, Chinese and more.")
@ExternalDocumentation({ @DocumentationLink(title = "Internationalization in DataCleaner", url = "https://www.youtube.com/watch?v=ApA-nhtLbhI", type = DocumentationType.VIDEO, version = "3.0") })
@Concurrent(true)
public class CharacterSetDistributionAnalyzer implements
Analyzer<CharacterSetDistributionResult> {

private static final Map<String, UnicodeSet> UNICODE_SETS = createUnicodeSets();

@Inject
@Configured
InputColumn<String>[] _columns;

@Inject
@Provided
RowAnnotationFactory _annotationFactory;

private final Map<InputColumn<String>, CharacterSetDistributionAnalyzerColumnDelegate> _columnDelegates = new HashMap<InputColumn<String>, CharacterSetDistributionAnalyzerColumnDelegate>();

@Initialize
public void init() {
for (InputColumn<String> column : _columns) {
CharacterSetDistributionAnalyzerColumnDelegate delegate = new CharacterSetDistributionAnalyzerColumnDelegate(
_annotationFactory, UNICODE_SETS);
_columnDelegates.put(column, delegate);
}
}

/**
* Creates a map of unicode sets, with their names as keys.
*
* There's a usable list of Unicode scripts on this page:
* http://unicode.org/cldr/utility/properties.jsp?a=Script#Script
*
* Additionally, this page has some explanations on some of the more exotic
* scripts, like Japanese:
* http://userguide.icu-project.org/transforms/general#TOC-Japanese
*
* @return
*/
protected static Map<String, UnicodeSet> createUnicodeSets() {
Map<String, UnicodeSet> unicodeSets = new TreeMap<String, UnicodeSet>();
unicodeSets.put("Latin, ASCII", new UnicodeSet("[:ASCII:]"));
unicodeSets.put("Latin, non-ASCII",
subUnicodeSet("[:Latin:]", "[:ASCII:]"));
unicodeSets.put("Arabic", new UnicodeSet("[:Script=Arabic:]"));
unicodeSets.put("Armenian", new UnicodeSet("[:Script=Armenian:]"));
unicodeSets.put("Bengali", new UnicodeSet("[:Script=Bengali:]"));
unicodeSets.put("Cyrillic", new UnicodeSet("[:Script=Cyrillic:]"));
unicodeSets.put("Devanagari", new UnicodeSet("[:Script=Devanagari:]"));
unicodeSets.put("Greek", new UnicodeSet("[:Script=Greek:]"));
unicodeSets.put("Han", new UnicodeSet("[:Script=Han:]"));
unicodeSets.put("Gujarati", new UnicodeSet("[:Script=Gujarati:]"));
unicodeSets.put("Georgian", new UnicodeSet("[:Script=Georgian:]"));
unicodeSets.put("Gurmukhi", new UnicodeSet("[:Script=Gurmukhi:]"));
unicodeSets.put("Hangul", new UnicodeSet("[:Script=Hangul:]"));
unicodeSets.put("Hebrew", new UnicodeSet("[:Script=Hebrew:]"));
unicodeSets.put("Hiragana", new UnicodeSet("[:Script=Hiragana:]"));
// unicodeSets.put("Kanji", new UnicodeSet("[:Script=Kanji:]"));
unicodeSets.put("Kannada", new UnicodeSet("[:Script=Kannada:]"));
unicodeSets.put("Katakana", new UnicodeSet("[:Script=Katakana:]"));
unicodeSets.put("Malayalam", new UnicodeSet("[:Script=Malayalam:]"));
// unicodeSets.put("Mandarin", new UnicodeSet("[:Script=Mandarin:]"));
unicodeSets.put("Oriya", new UnicodeSet("[:Script=Oriya:]"));
unicodeSets.put("Syriac", new UnicodeSet("[:Script=Syriac:]"));
unicodeSets.put("Tamil", new UnicodeSet("[:Script=Tamil:]"));
unicodeSets.put("Telugu", new UnicodeSet("[:Script=Telugu:]"));
unicodeSets.put("Thaana", new UnicodeSet("[:Script=Thaana:]"));
unicodeSets.put("Thai", new UnicodeSet("[:Script=Thai:]"));
return unicodeSets;
}

private static UnicodeSet subUnicodeSet(String pattern1, String pattern2) {
UnicodeSet unicodeSet = new UnicodeSet();
unicodeSet.addAll(new UnicodeSet(pattern1));
unicodeSet.removeAll(new UnicodeSet(pattern2));
return unicodeSet;
}

@Override
public void run(InputRow row, int distinctCount) {
for (InputColumn<String> column : _columns) {
String value = row.getValue(column);
CharacterSetDistributionAnalyzerColumnDelegate delegate = _columnDelegates
.get(column);
delegate.run(value, row, distinctCount);
}
}

@Override
public CharacterSetDistributionResult getResult() {
CrosstabDimension measureDimension = new CrosstabDimension("Measures");
Set<String> unicodeSetNames = UNICODE_SETS.keySet();
for (String name : unicodeSetNames) {
measureDimension.addCategory(name);
}

CrosstabDimension columnDimension = new CrosstabDimension("Column");

Crosstab<Number> crosstab = new Crosstab<Number>(Number.class,
columnDimension, measureDimension);

for (InputColumn<String> column : _columns) {
String columnName = column.getName();
CharacterSetDistributionAnalyzerColumnDelegate delegate = _columnDelegates
.get(column);
columnDimension.addCategory(columnName);

CrosstabNavigator<Number> nav = crosstab.navigate().where(
columnDimension, columnName);

for (String name : unicodeSetNames) {
RowAnnotation annotation = delegate.getAnnotation(name);
int rowCount = annotation.getRowCount();
nav.where(measureDimension, name).put(rowCount);
if (rowCount > 0) {
nav.attach(new AnnotatedRowsResult(annotation,
_annotationFactory, column));
}
}
}
return new CharacterSetDistributionResult(_columns, unicodeSetNames,
crosstab);
}
public class CharacterSetDistributionAnalyzer implements Analyzer<CharacterSetDistributionResult> {

private static final Map<String, UnicodeSet> UNICODE_SETS = createUnicodeSets();

@Inject
@Configured
InputColumn<String>[] _columns;

@Inject
@Provided
RowAnnotationFactory _annotationFactory;

private final Map<InputColumn<String>, CharacterSetDistributionAnalyzerColumnDelegate> _columnDelegates = new HashMap<InputColumn<String>, CharacterSetDistributionAnalyzerColumnDelegate>();

@Initialize
public void init() {
for (InputColumn<String> column : _columns) {
CharacterSetDistributionAnalyzerColumnDelegate delegate = new CharacterSetDistributionAnalyzerColumnDelegate(
_annotationFactory, UNICODE_SETS);
_columnDelegates.put(column, delegate);
}
}

/**
* Creates a map of unicode sets, with their names as keys.
*
* There's a usable list of Unicode scripts on this page:
* http://unicode.org/cldr/utility/properties.jsp?a=Script#Script
*
* Additionally, this page has some explanations on some of the more exotic
* scripts, like Japanese:
* http://userguide.icu-project.org/transforms/general#TOC-Japanese
*
* @return
*/
protected static Map<String, UnicodeSet> createUnicodeSets() {
Map<String, UnicodeSet> unicodeSets = new TreeMap<String, UnicodeSet>();
unicodeSets.put("Latin, ASCII", new UnicodeSet("[:ASCII:]"));
unicodeSets.put("Latin, non-ASCII", subUnicodeSet("[:Latin:]", "[:ASCII:]"));
unicodeSets.put("Arabic", new UnicodeSet("[:Script=Arabic:]"));
unicodeSets.put("Armenian", new UnicodeSet("[:Script=Armenian:]"));
unicodeSets.put("Bengali", new UnicodeSet("[:Script=Bengali:]"));
unicodeSets.put("Cyrillic", new UnicodeSet("[:Script=Cyrillic:]"));
unicodeSets.put("Devanagari", new UnicodeSet("[:Script=Devanagari:]"));
unicodeSets.put("Greek", new UnicodeSet("[:Script=Greek:]"));
unicodeSets.put("Han", new UnicodeSet("[:Script=Han:]"));
unicodeSets.put("Gujarati", new UnicodeSet("[:Script=Gujarati:]"));
unicodeSets.put("Georgian", new UnicodeSet("[:Script=Georgian:]"));
unicodeSets.put("Gurmukhi", new UnicodeSet("[:Script=Gurmukhi:]"));
unicodeSets.put("Hangul", new UnicodeSet("[:Script=Hangul:]"));
unicodeSets.put("Hebrew", new UnicodeSet("[:Script=Hebrew:]"));
unicodeSets.put("Hiragana", new UnicodeSet("[:Script=Hiragana:]"));
// unicodeSets.put("Kanji", new UnicodeSet("[:Script=Kanji:]"));
unicodeSets.put("Kannada", new UnicodeSet("[:Script=Kannada:]"));
unicodeSets.put("Katakana", new UnicodeSet("[:Script=Katakana:]"));
unicodeSets.put("Malayalam", new UnicodeSet("[:Script=Malayalam:]"));
// unicodeSets.put("Mandarin", new UnicodeSet("[:Script=Mandarin:]"));
unicodeSets.put("Oriya", new UnicodeSet("[:Script=Oriya:]"));
unicodeSets.put("Syriac", new UnicodeSet("[:Script=Syriac:]"));
unicodeSets.put("Tamil", new UnicodeSet("[:Script=Tamil:]"));
unicodeSets.put("Telugu", new UnicodeSet("[:Script=Telugu:]"));
unicodeSets.put("Thaana", new UnicodeSet("[:Script=Thaana:]"));
unicodeSets.put("Thai", new UnicodeSet("[:Script=Thai:]"));
return unicodeSets;
}

private static UnicodeSet subUnicodeSet(String pattern1, String pattern2) {
UnicodeSet unicodeSet = new UnicodeSet();
unicodeSet.addAll(new UnicodeSet(pattern1));
unicodeSet.removeAll(new UnicodeSet(pattern2));
return unicodeSet;
}

@Override
public void run(InputRow row, int distinctCount) {
for (InputColumn<String> column : _columns) {
String value = row.getValue(column);
CharacterSetDistributionAnalyzerColumnDelegate delegate = _columnDelegates.get(column);
delegate.run(value, row, distinctCount);
}
}

@Override
public CharacterSetDistributionResult getResult() {
CrosstabDimension measureDimension = new CrosstabDimension("Measures");
Set<String> unicodeSetNames = UNICODE_SETS.keySet();
for (String name : unicodeSetNames) {
measureDimension.addCategory(name);
}

CrosstabDimension columnDimension = new CrosstabDimension("Column");

Crosstab<Number> crosstab = new Crosstab<Number>(Number.class, columnDimension, measureDimension);

for (InputColumn<String> column : _columns) {
String columnName = column.getName();
CharacterSetDistributionAnalyzerColumnDelegate delegate = _columnDelegates.get(column);
columnDimension.addCategory(columnName);

CrosstabNavigator<Number> nav = crosstab.navigate().where(columnDimension, columnName);

for (String name : unicodeSetNames) {
RowAnnotation annotation = delegate.getAnnotation(name);
int rowCount = annotation.getRowCount();
nav.where(measureDimension, name).put(rowCount);
if (rowCount > 0) {
nav.attach(new AnnotatedRowsResult(annotation, _annotationFactory, column));
}
}
}
return new CharacterSetDistributionResult(_columns, unicodeSetNames, crosstab);
}
}
(The remaining 5 changed files are not shown here.)
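As a closing illustration, here is a hypothetical component (not part of this commit; the class name, titles and URLs are placeholders) showing that several DocumentationLinks can be declared inside one @ExternalDocumentation annotation, printed via the illustrative reader sketched after the annotation's source above.

import org.datacleaner.api.Description;
import org.datacleaner.api.ExternalDocumentation;
import org.datacleaner.api.ExternalDocumentation.DocumentationLink;
import org.datacleaner.api.ExternalDocumentation.DocumentationType;

// Hypothetical, placeholder component used only to demonstrate multiple links.
@Description("Demo component")
@ExternalDocumentation({
        @DocumentationLink(title = "Demo component reference", url = "https://example.org/docs/demo-reference", type = DocumentationType.REFERENCE, version = "4.0"),
        @DocumentationLink(title = "Demo component screencast", url = "https://example.org/videos/demo", type = DocumentationType.VIDEO, version = "4.0") })
public class DemoDocumentedComponent {

    public static void main(String[] args) {
        // Prints both declared links using the illustrative reader above.
        ExternalDocumentationReader.printDocumentationLinks(DemoDocumentedComponent.class);
    }
}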
