/** * AnalyzerBeans * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.eobjects.analyzer.beans; import java.util.HashMap; import java.util.Map; import java.util.Set; import java.util.TreeMap; import javax.inject.Inject; import org.eobjects.analyzer.beans.api.Analyzer; import org.eobjects.analyzer.beans.api.AnalyzerBean; import org.eobjects.analyzer.beans.api.Concurrent; import org.eobjects.analyzer.beans.api.Configured; import org.eobjects.analyzer.beans.api.Description; import org.eobjects.analyzer.beans.api.Initialize; import org.eobjects.analyzer.beans.api.Provided; import org.eobjects.analyzer.data.InputColumn; import org.eobjects.analyzer.data.InputRow; import org.eobjects.analyzer.result.AnnotatedRowsResult; import org.eobjects.analyzer.result.CharacterSetDistributionResult; import org.eobjects.analyzer.result.Crosstab; import org.eobjects.analyzer.result.CrosstabDimension; import org.eobjects.analyzer.result.CrosstabNavigator; import org.eobjects.analyzer.storage.RowAnnotation; import org.eobjects.analyzer.storage.RowAnnotationFactory; import com.ibm.icu.text.UnicodeSet; @AnalyzerBean("Character set distribution") @Description("Inspects and maps text characters according to character set affinity, such as Latin, Hebrew, Cyrillic, Chinese and more.") @Concurrent(true) public class CharacterSetDistributionAnalyzer implements Analyzer<CharacterSetDistributionResult> { private static final Map<String, UnicodeSet> UNICODE_SETS = createUnicodeSets(); @Inject @Configured InputColumn<String>[] _columns; @Inject @Provided RowAnnotationFactory _annotationFactory; private final Map<InputColumn<String>, CharacterSetDistributionAnalyzerColumnDelegate> _columnDelegates = new HashMap<InputColumn<String>, CharacterSetDistributionAnalyzerColumnDelegate>(); @Initialize public void init() { for (InputColumn<String> column : _columns) { CharacterSetDistributionAnalyzerColumnDelegate delegate = new CharacterSetDistributionAnalyzerColumnDelegate( _annotationFactory, UNICODE_SETS); _columnDelegates.put(column, delegate); } } /** * Creates a map of unicode sets, with their names as keys. * * There's a usable list of Unicode scripts on this page: * http://unicode.org/cldr/utility/properties.jsp?a=Script#Script * * Additionally, this page has some explanations on some of the more exotic * sources, like japanese: * http://userguide.icu-project.org/transforms/general#TOC-Japanese * * @return */ protected static Map<String, UnicodeSet> createUnicodeSets() { Map<String, UnicodeSet> unicodeSets = new TreeMap<String, UnicodeSet>(); unicodeSets.put("Latin, ASCII", new UnicodeSet("[:ASCII:]")); unicodeSets.put("Latin, non-ASCII", subUnicodeSet("[:Latin:]", "[:ASCII:]")); unicodeSets.put("Arabic", new UnicodeSet("[:Script=Arabic:]")); unicodeSets.put("Armenian", new UnicodeSet("[:Script=Armenian:]")); unicodeSets.put("Bengali", new UnicodeSet("[:Script=Bengali:]")); unicodeSets.put("Cyrillic", new UnicodeSet("[:Script=Cyrillic:]")); unicodeSets.put("Devanagari", new UnicodeSet("[:Script=Devanagari:]")); unicodeSets.put("Greek", new UnicodeSet("[:Script=Greek:]")); unicodeSets.put("Han", new UnicodeSet("[:Script=Han:]")); unicodeSets.put("Gujarati", new UnicodeSet("[:Script=Gujarati:]")); unicodeSets.put("Georgian", new UnicodeSet("[:Script=Georgian:]")); unicodeSets.put("Gurmukhi", new UnicodeSet("[:Script=Gurmukhi:]")); unicodeSets.put("Hangul", new UnicodeSet("[:Script=Hangul:]")); unicodeSets.put("Hebrew", new UnicodeSet("[:Script=Hebrew:]")); unicodeSets.put("Hiragana", new UnicodeSet("[:Script=Hiragana:]")); // unicodeSets.put("Kanji", new UnicodeSet("[:Script=Kanji:]")); unicodeSets.put("Kannada", new UnicodeSet("[:Script=Kannada:]")); unicodeSets.put("Katakana", new UnicodeSet("[:Script=Katakana:]")); unicodeSets.put("Malayalam", new UnicodeSet("[:Script=Malayalam:]")); // unicodeSets.put("Mandarin", new UnicodeSet("[:Script=Mandarin:]")); unicodeSets.put("Oriya", new UnicodeSet("[:Script=Oriya:]")); unicodeSets.put("Syriac", new UnicodeSet("[:Script=Syriac:]")); unicodeSets.put("Tamil", new UnicodeSet("[:Script=Tamil:]")); unicodeSets.put("Telugu", new UnicodeSet("[:Script=Telugu:]")); unicodeSets.put("Thaana", new UnicodeSet("[:Script=Thaana:]")); unicodeSets.put("Thai", new UnicodeSet("[:Script=Thai:]")); return unicodeSets; } private static UnicodeSet subUnicodeSet(String pattern1, String pattern2) { UnicodeSet unicodeSet = new UnicodeSet(); unicodeSet.addAll(new UnicodeSet(pattern1)); unicodeSet.removeAll(new UnicodeSet(pattern2)); return unicodeSet; } @Override public void run(InputRow row, int distinctCount) { for (InputColumn<String> column : _columns) { String value = row.getValue(column); CharacterSetDistributionAnalyzerColumnDelegate delegate = _columnDelegates .get(column); delegate.run(value, row, distinctCount); } } @Override public CharacterSetDistributionResult getResult() { CrosstabDimension measureDimension = new CrosstabDimension("Measures"); Set<String> unicodeSetNames = UNICODE_SETS.keySet(); for (String name : unicodeSetNames) { measureDimension.addCategory(name); } CrosstabDimension columnDimension = new CrosstabDimension("Column"); Crosstab<Number> crosstab = new Crosstab<Number>(Number.class, columnDimension, measureDimension); for (InputColumn<String> column : _columns) { String columnName = column.getName(); CharacterSetDistributionAnalyzerColumnDelegate delegate = _columnDelegates .get(column); columnDimension.addCategory(columnName); CrosstabNavigator<Number> nav = crosstab.navigate().where( columnDimension, columnName); for (String name : unicodeSetNames) { RowAnnotation annotation = delegate.getAnnotation(name); int rowCount = annotation.getRowCount(); nav.where(measureDimension, name).put(rowCount); if (rowCount > 0) { nav.attach(new AnnotatedRowsResult(annotation, _annotationFactory, column)); } } } return new CharacterSetDistributionResult(_columns, unicodeSetNames, crosstab); } }