/** * AnalyzerBeans * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.eobjects.analyzer.beans.stringpattern; import java.io.Serializable; import java.text.DecimalFormatSymbols; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import org.eobjects.analyzer.beans.api.Analyzer; import org.eobjects.analyzer.beans.api.AnalyzerBean; import org.eobjects.analyzer.beans.api.ColumnProperty; import org.eobjects.analyzer.beans.api.Concurrent; import org.eobjects.analyzer.beans.api.Configured; import org.eobjects.analyzer.beans.api.Description; import org.eobjects.analyzer.beans.api.Initialize; import org.eobjects.analyzer.beans.api.Provided; import org.eobjects.analyzer.data.InputColumn; import org.eobjects.analyzer.data.InputRow; import org.eobjects.analyzer.result.AnnotatedRowsResult; import org.eobjects.analyzer.result.Crosstab; import org.eobjects.analyzer.result.CrosstabDimension; import org.eobjects.analyzer.result.CrosstabNavigator; import org.eobjects.analyzer.storage.RowAnnotation; import org.eobjects.analyzer.storage.RowAnnotationFactory; import 
org.eobjects.analyzer.util.NullTolerableComparator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Analyzer that feeds the String values of a column to a
 * {@link DefaultPatternFinder} and reports, per discovered pattern, the
 * number of matching rows and a sample value.
 *
 * If a group column is configured, one {@link DefaultPatternFinder} is kept
 * per distinct group value (including the {@code null} group) and the result
 * contains one crosstab per group. Otherwise a single pattern finder, stored
 * under the {@code null} key, is used.
 *
 * The analyzer is declared {@code @Concurrent(true)}; access to the map of
 * pattern finders while rows are being consumed is guarded by this instance's
 * monitor.
 */
@AnalyzerBean("Pattern finder")
@Description("The Pattern Finder will inspect your String values and generate and match string patterns that suit your data.\nIt can be used for a lot of purposes but is excellent for verifying or getting ideas about the format of the string-values in a column.")
@Concurrent(true)
public class PatternFinderAnalyzer implements Analyzer<PatternFinderResult> {

    private static final Logger logger = LoggerFactory.getLogger(PatternFinderAnalyzer.class);

    /** Category of the measures dimension holding a sample string per pattern. */
    public static final String MEASURE_SAMPLE = "Sample";

    /** Category of the measures dimension holding the row count per pattern. */
    public static final String MEASURE_MATCH_COUNT = "Match count";

    /** Name of the crosstab dimension that holds the measures. */
    public static final String DIMENSION_NAME_MEASURES = "Measures";

    /** Name of the crosstab dimension that holds the symbolic patterns. */
    public static final String DIMENSION_NAME_PATTERN = "Pattern";

    /** The column whose values are inspected. */
    @Configured(order = 1)
    @ColumnProperty(escalateToMultipleJobs = true)
    InputColumn<String> column;

    @Configured(required = false, order = 2)
    @Description("Optional column to group patterns by")
    InputColumn<String> groupColumn;

    @Configured(required = false, order = 3)
    @Description("Separate text tokens based on case")
    Boolean discriminateTextCase = true;

    @Configured(required = false, order = 4)
    @Description("Separate number tokens based on negativity")
    Boolean discriminateNegativeNumbers = false;

    @Configured(required = false, order = 5)
    @Description("Separate number tokens for decimals")
    Boolean discriminateDecimals = true;

    @Configured(required = false, order = 6)
    @Description("Use '?'-tokens for mixed text and numbers")
    Boolean enableMixedTokens = true;

    @Configured(required = false, order = 7)
    @Description("Ignore whitespace differences")
    Boolean ignoreRepeatedSpaces = false;

    @Configured(required = false, value = "Upper case patterns expand in size", order = 8)
    @Description("Auto-adjust/expand uppercase text tokens")
    boolean upperCaseExpandable = false;

    @Configured(required = false, value = "Lower case patterns expand in size", order = 9)
    @Description("Auto-adjust/expand lowercase text tokens")
    boolean lowerCaseExpandable = true;

    @Configured(required = false, value = "Predefined token name", order = 10)
    String predefinedTokenName;

    @Configured(required = false, value = "Predefined token regexes", order = 11)
    String[] predefinedTokenPatterns;

    // The three symbol properties default to the symbols that
    // DecimalFormatSymbols.getInstance() reports at construction time.
    @Configured(required = false, order = 12)
    Character decimalSeparator = DecimalFormatSymbols.getInstance().getDecimalSeparator();

    @Configured(required = false, order = 13)
    Character thousandsSeparator = DecimalFormatSymbols.getInstance().getGroupingSeparator();

    @Configured(required = false, order = 14)
    Character minusSign = DecimalFormatSymbols.getInstance().getMinusSign();

    // Pattern finders keyed by group value. A plain HashMap is used because
    // the key may be null (no group column, or a null group value); all access
    // from run(...) goes through getPatternFinderForGroup, which locks.
    private Map<String, DefaultPatternFinder> _patternFinders;

    private TokenizerConfiguration _configuration;

    @Provided
    RowAnnotationFactory _rowAnnotationFactory;

    /**
     * Builds the {@link TokenizerConfiguration} from the configured
     * properties and resets the map of pattern finders. Properties of a
     * nullable type that are {@code null} are not applied, leaving whatever
     * the configuration object set up by itself.
     */
    @Initialize
    public void init() {
        if (enableMixedTokens != null) {
            _configuration = new TokenizerConfiguration(enableMixedTokens);
        } else {
            _configuration = new TokenizerConfiguration();
        }

        _configuration.setUpperCaseExpandable(upperCaseExpandable);
        _configuration.setLowerCaseExpandable(lowerCaseExpandable);

        if (discriminateNegativeNumbers != null) {
            _configuration.setDiscriminateNegativeNumbers(discriminateNegativeNumbers);
        }

        if (discriminateDecimals != null) {
            _configuration.setDiscriminateDecimalNumbers(discriminateDecimals);
        }

        if (discriminateTextCase != null) {
            _configuration.setDiscriminateTextCase(discriminateTextCase);
        }

        if (ignoreRepeatedSpaces != null) {
            // ignoring repeated spaces means NOT discriminating whitespace
            // tokens by their length
            boolean ignoreSpacesLength = ignoreRepeatedSpaces.booleanValue();
            _configuration.setDistriminateTokenLength(TokenType.WHITESPACE, !ignoreSpacesLength);
        }

        if (decimalSeparator != null) {
            _configuration.setDecimalSeparator(decimalSeparator);
        }

        if (thousandsSeparator != null) {
            _configuration.setThousandsSeparator(thousandsSeparator);
        }

        if (minusSign != null) {
            _configuration.setMinusSign(minusSign);
        }

        // a predefined token is only registered when both its name and its
        // regexes have been supplied
        if (predefinedTokenName != null && predefinedTokenPatterns != null) {
            Set<String> tokenRegexes = new HashSet<String>();
            for (String predefinedTokenPattern : predefinedTokenPatterns) {
                tokenRegexes.add(predefinedTokenPattern);
            }
            _configuration.getPredefinedTokens().add(
                    new PredefinedTokenDefinition(predefinedTokenName, tokenRegexes));
        }

        _patternFinders = new HashMap<String, DefaultPatternFinder>();
    }

    /**
     * Consumes a single row: reads the group value (or {@code null} when no
     * group column is configured) and the inspected value, and hands them to
     * the pattern finder of that group.
     *
     * @param row
     *            the row to read the values from
     * @param distinctCount
     *            passed on unchanged to the pattern finder
     */
    @Override
    public void run(InputRow row, int distinctCount) {
        final String group;
        if (groupColumn == null) {
            group = null;
        } else {
            group = row.getValue(groupColumn);
        }
        final String value = row.getValue(column);

        run(group, value, row, distinctCount);
    }

    /**
     * Routes a value to the pattern finder that belongs to the given group.
     */
    private void run(String group, String value, InputRow row, int distinctCount) {
        DefaultPatternFinder patternFinder = getPatternFinderForGroup(group);
        patternFinder.run(row, value, distinctCount);
    }

    /**
     * Returns the pattern finder for a group, creating and registering it on
     * first use.
     *
     * Both the lookup and the insert happen while holding this instance's
     * monitor: the backing {@link HashMap} is not safe to read while another
     * thread is inserting into it, so an unlocked "fast path" read is not an
     * option here.
     *
     * @param group
     *            the group value, may be {@code null}
     * @return the pattern finder registered for the group, never {@code null}
     */
    private DefaultPatternFinder getPatternFinderForGroup(String group) {
        synchronized (this) {
            DefaultPatternFinder patternFinder = _patternFinders.get(group);
            if (patternFinder == null) {
                patternFinder = new DefaultPatternFinder(_configuration, _rowAnnotationFactory);
                _patternFinders.put(group, patternFinder);
            }
            return patternFinder;
        }
    }

    /**
     * Builds the result from the pattern finders collected so far.
     *
     * Without a group column the result wraps a single crosstab (a pattern
     * finder for the {@code null} group is created if no row was consumed).
     * With a group column the result wraps one crosstab per group value,
     * sorted by group value using a null-tolerant comparator.
     */
    @Override
    public PatternFinderResult getResult() {
        if (groupColumn == null) {
            Crosstab<?> crosstab = createCrosstab(getPatternFinderForGroup(null));
            return new PatternFinderResult(column, crosstab, _configuration);
        } else {
            final Map<String, Crosstab<?>> crosstabs = new TreeMap<String, Crosstab<?>>(
                    NullTolerableComparator.get(String.class));
            final Set<Entry<String, DefaultPatternFinder>> patternFinderEntries = _patternFinders.entrySet();
            for (Entry<String, DefaultPatternFinder> entry : patternFinderEntries) {
                final DefaultPatternFinder patternFinder = entry.getValue();
                final Crosstab<Serializable> crosstab = createCrosstab(patternFinder);
                crosstabs.put(entry.getKey(), crosstab);
            }
            if (logger.isInfoEnabled()) {
                logger.info("Grouped result contains {} groups", crosstabs.size());
            }
            return new PatternFinderResult(column, groupColumn, crosstabs, _configuration);
        }
    }

    /**
     * Creates an empty crosstab with a measures dimension (categories
     * {@link #MEASURE_MATCH_COUNT} and {@link #MEASURE_SAMPLE}) and a
     * pattern dimension without categories.
     *
     * @return the new, empty crosstab
     */
    public static Crosstab<Serializable> createCrosstab() {
        CrosstabDimension measuresDimension = new CrosstabDimension(DIMENSION_NAME_MEASURES);
        measuresDimension.addCategory(MEASURE_MATCH_COUNT);
        measuresDimension.addCategory(MEASURE_SAMPLE);
        CrosstabDimension patternDimension = new CrosstabDimension(DIMENSION_NAME_PATTERN);
        Crosstab<Serializable> crosstab = new Crosstab<Serializable>(Serializable.class, measuresDimension,
                patternDimension);
        return crosstab;
    }

    /**
     * Creates a crosstab filled with the patterns of a single pattern finder.
     * For every pattern the match count (with the annotated rows attached)
     * and the pattern's sample string are stored.
     *
     * @param patternFinder
     *            the pattern finder whose annotations are reported
     * @return the populated crosstab
     */
    private Crosstab<Serializable> createCrosstab(DefaultPatternFinder patternFinder) {
        final Crosstab<Serializable> crosstab = createCrosstab();

        final Set<Entry<TokenPattern, RowAnnotation>> entrySet = patternFinder.getAnnotations().entrySet();

        // sort the entries so that the ones with the highest amount of
        // matches are at the top
        final Set<Entry<TokenPattern, RowAnnotation>> sortedEntrySet = new TreeSet<Entry<TokenPattern, RowAnnotation>>(
                new Comparator<Entry<TokenPattern, RowAnnotation>>() {
                    @Override
                    public int compare(Entry<TokenPattern, RowAnnotation> o1, Entry<TokenPattern, RowAnnotation> o2) {
                        final int count1 = o1.getValue().getRowCount();
                        final int count2 = o2.getValue().getRowCount();
                        if (count1 != count2) {
                            // descending by row count; compared explicitly
                            // rather than by subtraction
                            return (count2 > count1) ? 1 : -1;
                        }
                        // equal counts: fall back to the symbolic pattern
                        // string to get a stable order
                        return o1.getKey().toSymbolicString().compareTo(o2.getKey().toSymbolicString());
                    }
                });
        sortedEntrySet.addAll(entrySet);

        for (Entry<TokenPattern, RowAnnotation> entry : sortedEntrySet) {
            final TokenPattern pattern = entry.getKey();
            final RowAnnotation annotation = entry.getValue();

            final CrosstabNavigator<Serializable> nav = crosstab.navigate();
            nav.where(DIMENSION_NAME_PATTERN, pattern.toSymbolicString());

            // match count cell, with the matching rows attached to it
            nav.where(DIMENSION_NAME_MEASURES, MEASURE_MATCH_COUNT);
            int size = annotation.getRowCount();
            nav.put(size, true);
            nav.attach(new AnnotatedRowsResult(annotation, _rowAnnotationFactory, column));

            // sample cell
            nav.where(DIMENSION_NAME_MEASURES, MEASURE_SAMPLE);
            nav.put(pattern.getSampleString(), true);
        }
        return crosstab;
    }

    // setter methods for unittesting purposes

    public void setRowAnnotationFactory(RowAnnotationFactory rowAnnotationFactory) {
        _rowAnnotationFactory = rowAnnotationFactory;
    }

    public void setColumn(InputColumn<String> column) {
        this.column = column;
    }

    public void setPredefinedTokenName(String predefinedTokenName) {
        this.predefinedTokenName = predefinedTokenName;
    }

    public void setPredefinedTokenPatterns(String[] predefinedTokenPatterns) {
        this.predefinedTokenPatterns = predefinedTokenPatterns;
    }

    public void setDiscriminateTextCase(Boolean discriminateTextCase) {
        this.discriminateTextCase = discriminateTextCase;
    }

    public void setDiscriminateNegativeNumbers(Boolean discriminateNegativeNumbers) {
        this.discriminateNegativeNumbers = discriminateNegativeNumbers;
    }

    public void setDiscriminateDecimals(Boolean discriminateDecimals) {
        this.discriminateDecimals = discriminateDecimals;
    }

    public void setEnableMixedTokens(Boolean enableMixedTokens) {
        this.enableMixedTokens = enableMixedTokens;
    }

    public void setUpperCaseExpandable(boolean upperCaseExpandable) {
        this.upperCaseExpandable = upperCaseExpandable;
    }

    public void setLowerCaseExpandable(boolean lowerCaseExpandable) {
        this.lowerCaseExpandable = lowerCaseExpandable;
    }

    public void setDecimalSeparator(Character decimalSeparator) {
        this.decimalSeparator = decimalSeparator;
    }

    public void setIgnoreRepeatedSpaces(Boolean ignoreRepeatedSpaces) {
        this.ignoreRepeatedSpaces = ignoreRepeatedSpaces;
    }

    public void setMinusSign(Character minusSign) {
        this.minusSign = minusSign;
    }

    public void setThousandsSeparator(Character thousandsSeparator) {
        this.thousandsSeparator = thousandsSeparator;
    }

    public void setGroupColumn(InputColumn<String> groupColumn) {
        this.groupColumn = groupColumn;
    }
}