/** * DataCleaner (community edition) * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.datacleaner.beans; import java.util.HashMap; import java.util.Map; import java.util.Set; import java.util.TreeMap; import javax.inject.Inject; import javax.inject.Named; import org.datacleaner.api.Analyzer; import org.datacleaner.api.Concurrent; import org.datacleaner.api.Configured; import org.datacleaner.api.Description; import org.datacleaner.api.ExternalDocumentation; import org.datacleaner.api.ExternalDocumentation.DocumentationLink; import org.datacleaner.api.ExternalDocumentation.DocumentationType; import org.datacleaner.api.Initialize; import org.datacleaner.api.InputColumn; import org.datacleaner.api.InputRow; import org.datacleaner.api.Provided; import org.datacleaner.result.AnnotatedRowsResult; import org.datacleaner.result.CharacterSetDistributionResult; import org.datacleaner.result.Crosstab; import org.datacleaner.result.CrosstabDimension; import org.datacleaner.result.CrosstabNavigator; import org.datacleaner.storage.RowAnnotation; import org.datacleaner.storage.RowAnnotationFactory; import com.ibm.icu.text.UnicodeSet; @Named("Character set distribution") @Description("Inspects and maps text characters according to character set affinity, " + "such as Latin, Hebrew, Cyrillic, Chinese and more.") @ExternalDocumentation({ @DocumentationLink(title = "Internationalization in DataCleaner", url = "https://www.youtube.com/watch?v=ApA-nhtLbhI", type = DocumentationType.VIDEO, version = "3.0") }) @Concurrent(true) public class CharacterSetDistributionAnalyzer implements Analyzer<CharacterSetDistributionResult> { private static final Map<String, UnicodeSet> UNICODE_SETS = createUnicodeSets(); private final Map<InputColumn<String>, CharacterSetDistributionAnalyzerColumnDelegate> _columnDelegates = new HashMap<>(); @Inject @Configured InputColumn<String>[] _columns; @Inject @Provided RowAnnotationFactory _annotationFactory; /** * Creates a map of unicode sets, with their names as keys. * * There's a usable list of Unicode scripts on this page: * http://unicode.org/cldr/utility/properties.jsp?a=Script#Script * * Additionally, this page has some explanations on some of the more exotic * sources, like japanese: * http://userguide.icu-project.org/transforms/general#TOC-Japanese * * @return */ protected static Map<String, UnicodeSet> createUnicodeSets() { final Map<String, UnicodeSet> unicodeSets = new TreeMap<>(); unicodeSets.put("Latin, ASCII", new UnicodeSet("[:ASCII:]")); unicodeSets.put("Latin, non-ASCII", subUnicodeSet("[:Latin:]", "[:ASCII:]")); unicodeSets.put("Arabic", new UnicodeSet("[:Script=Arabic:]")); unicodeSets.put("Armenian", new UnicodeSet("[:Script=Armenian:]")); unicodeSets.put("Bengali", new UnicodeSet("[:Script=Bengali:]")); unicodeSets.put("Cyrillic", new UnicodeSet("[:Script=Cyrillic:]")); unicodeSets.put("Devanagari", new UnicodeSet("[:Script=Devanagari:]")); unicodeSets.put("Greek", new UnicodeSet("[:Script=Greek:]")); unicodeSets.put("Han", new UnicodeSet("[:Script=Han:]")); unicodeSets.put("Gujarati", new UnicodeSet("[:Script=Gujarati:]")); unicodeSets.put("Georgian", new UnicodeSet("[:Script=Georgian:]")); unicodeSets.put("Gurmukhi", new UnicodeSet("[:Script=Gurmukhi:]")); unicodeSets.put("Hangul", new UnicodeSet("[:Script=Hangul:]")); unicodeSets.put("Hebrew", new UnicodeSet("[:Script=Hebrew:]")); unicodeSets.put("Hiragana", new UnicodeSet("[:Script=Hiragana:]")); // unicodeSets.put("Kanji", new UnicodeSet("[:Script=Kanji:]")); unicodeSets.put("Kannada", new UnicodeSet("[:Script=Kannada:]")); unicodeSets.put("Katakana", new UnicodeSet("[:Script=Katakana:]")); unicodeSets.put("Malayalam", new UnicodeSet("[:Script=Malayalam:]")); // unicodeSets.put("Mandarin", new UnicodeSet("[:Script=Mandarin:]")); unicodeSets.put("Oriya", new UnicodeSet("[:Script=Oriya:]")); unicodeSets.put("Syriac", new UnicodeSet("[:Script=Syriac:]")); unicodeSets.put("Tamil", new UnicodeSet("[:Script=Tamil:]")); unicodeSets.put("Telugu", new UnicodeSet("[:Script=Telugu:]")); unicodeSets.put("Thaana", new UnicodeSet("[:Script=Thaana:]")); unicodeSets.put("Thai", new UnicodeSet("[:Script=Thai:]")); return unicodeSets; } private static UnicodeSet subUnicodeSet(final String pattern1, final String pattern2) { final UnicodeSet unicodeSet = new UnicodeSet(); unicodeSet.addAll(new UnicodeSet(pattern1)); unicodeSet.removeAll(new UnicodeSet(pattern2)); return unicodeSet; } @Initialize public void init() { for (final InputColumn<String> column : _columns) { final CharacterSetDistributionAnalyzerColumnDelegate delegate = new CharacterSetDistributionAnalyzerColumnDelegate(_annotationFactory, UNICODE_SETS); _columnDelegates.put(column, delegate); } } @Override public void run(final InputRow row, final int distinctCount) { for (final InputColumn<String> column : _columns) { final String value = row.getValue(column); final CharacterSetDistributionAnalyzerColumnDelegate delegate = _columnDelegates.get(column); delegate.run(value, row, distinctCount); } } @Override public CharacterSetDistributionResult getResult() { final CrosstabDimension measureDimension = new CrosstabDimension("Measures"); final Set<String> unicodeSetNames = UNICODE_SETS.keySet(); for (final String name : unicodeSetNames) { measureDimension.addCategory(name); } final CrosstabDimension columnDimension = new CrosstabDimension("Column"); final Crosstab<Number> crosstab = new Crosstab<>(Number.class, columnDimension, measureDimension); for (final InputColumn<String> column : _columns) { final String columnName = column.getName(); final CharacterSetDistributionAnalyzerColumnDelegate delegate = _columnDelegates.get(column); columnDimension.addCategory(columnName); final CrosstabNavigator<Number> nav = crosstab.navigate().where(columnDimension, columnName); for (final String name : unicodeSetNames) { final RowAnnotation annotation = delegate.getAnnotation(name); final int rowCount = annotation.getRowCount(); nav.where(measureDimension, name).put(rowCount); if (rowCount > 0) { nav.attach(new AnnotatedRowsResult(annotation, _annotationFactory, column)); } } } return new CharacterSetDistributionResult(_columns, unicodeSetNames, crosstab); } }