/** * DataCleaner (community edition) * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.datacleaner.beans.stringpattern; import java.io.Serializable; import java.text.DecimalFormatSymbols; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import javax.inject.Named; import org.datacleaner.api.Analyzer; import org.datacleaner.api.ColumnProperty; import org.datacleaner.api.Concurrent; import org.datacleaner.api.Configured; import org.datacleaner.api.Description; import org.datacleaner.api.ExternalDocumentation; import org.datacleaner.api.ExternalDocumentation.DocumentationLink; import org.datacleaner.api.ExternalDocumentation.DocumentationType; import org.datacleaner.api.Initialize; import org.datacleaner.api.InputColumn; import org.datacleaner.api.InputRow; import org.datacleaner.api.Provided; import org.datacleaner.result.AnnotatedRowsResult; import org.datacleaner.result.Crosstab; import org.datacleaner.result.CrosstabDimension; import org.datacleaner.result.CrosstabNavigator; import org.datacleaner.storage.RowAnnotation; import org.datacleaner.storage.RowAnnotationFactory; import org.datacleaner.util.NullTolerableComparator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @Named("Pattern finder") @Description( "The Pattern Finder will inspect your String values and generate and match string patterns that suit " + "your data.\nIt can be used for a lot of purposes but is excellent for verifying or getting ideas about " + "the format of the string-values in a column.") @ExternalDocumentation(value = { @DocumentationLink(title = "Kasper's Source: Pattern Finder 2.0", url = "http://kasper.eobjects.org/2010/09/pattern-finder-20-latest-feature-in.html", type = DocumentationType.TECH, version = "2.0") }) @Concurrent(true) public class PatternFinderAnalyzer implements Analyzer<PatternFinderResult> { public static final String PROPERTY_COLUMN = "Column"; public static final String PROPERTY_GROUP_COLUMN = "Group column"; public static final String PROPERTY_DISCRIMINATE_TEXT_CASE = "Discriminate text case"; public static final String PROPERTY_DISCRIMINATE_NEGATIVE_NUMBERS = "Discriminate negative numbers"; public static final String PROPERTY_DISCRIMINATE_DECIMALS = "Discriminate decimals"; public static final String PROPERTY_ENABLE_MIXED_TOKENS = "Enable mixed tokens"; public static final String PROPERTY_IGNORE_REPEATED_SPACES = "Ignore repeated spaces"; public static final String MEASURE_SAMPLE = "Sample"; public static final String MEASURE_MATCH_COUNT = "Match count"; public static final String DIMENSION_NAME_MEASURES = "Measures"; public static final String DIMENSION_NAME_PATTERN = "Pattern"; private static final Logger logger = LoggerFactory.getLogger(PatternFinderAnalyzer.class); @Configured(order = 1, value = PROPERTY_COLUMN) @ColumnProperty(escalateToMultipleJobs = true) InputColumn<String> column; @Configured(required = false, order = 2, value = PROPERTY_GROUP_COLUMN) @Description("Optional column to group patterns by") InputColumn<String> groupColumn; @Configured(required = false, order = 3, value = PROPERTY_DISCRIMINATE_TEXT_CASE) @Description("Separate text tokens based on case") boolean discriminateTextCase = true; @Configured(required = false, order = 4, value = PROPERTY_DISCRIMINATE_NEGATIVE_NUMBERS) @Description("Separate number tokens based on negativity") boolean discriminateNegativeNumbers = false; @Configured(required = false, order = 5, value = PROPERTY_DISCRIMINATE_DECIMALS) @Description("Separate number tokens for decimals") boolean discriminateDecimals = true; @Configured(required = false, order = 6, value = PROPERTY_ENABLE_MIXED_TOKENS) @Description("Use '?'-tokens for mixed text and numbers") boolean enableMixedTokens = true; @Configured(required = false, order = 7, value = PROPERTY_IGNORE_REPEATED_SPACES) @Description("Ignore whitespace differences") boolean ignoreRepeatedSpaces = false; @Configured(required = false, value = "Upper case patterns expand in size", order = 8) @Description("Auto-adjust/expand uppercase text tokens") boolean upperCaseExpandable = false; @Configured(required = false, value = "Lower case patterns expand in size", order = 9) @Description("Auto-adjust/expand lowercase text tokens") boolean lowerCaseExpandable = true; @Configured(required = false, value = "Predefined token name", order = 10) String predefinedTokenName; @Configured(required = false, value = "Predefined token regexes", order = 11) String[] predefinedTokenPatterns; @Configured(required = false, order = 12) Character decimalSeparator = DecimalFormatSymbols.getInstance().getDecimalSeparator(); @Configured(required = false, order = 13) Character thousandsSeparator = DecimalFormatSymbols.getInstance().getGroupingSeparator(); @Configured(required = false, order = 14) Character minusSign = DecimalFormatSymbols.getInstance().getMinusSign(); @Provided RowAnnotationFactory _rowAnnotationFactory; private Map<String, DefaultPatternFinder> _patternFinders; private TokenizerConfiguration _configuration; public static Crosstab<Serializable> createCrosstab() { final CrosstabDimension measuresDimension = new CrosstabDimension(DIMENSION_NAME_MEASURES); measuresDimension.addCategory(MEASURE_MATCH_COUNT); measuresDimension.addCategory(MEASURE_SAMPLE); final CrosstabDimension patternDimension = new CrosstabDimension(DIMENSION_NAME_PATTERN); return new Crosstab<>(Serializable.class, measuresDimension, patternDimension); } @Initialize public void init() { _configuration = new TokenizerConfiguration(enableMixedTokens); _configuration.setUpperCaseExpandable(upperCaseExpandable); _configuration.setLowerCaseExpandable(lowerCaseExpandable); _configuration.setDiscriminateNegativeNumbers(discriminateNegativeNumbers); _configuration.setDiscriminateDecimalNumbers(discriminateDecimals); _configuration.setDiscriminateTextCase(discriminateTextCase); _configuration.setDistriminateTokenLength(TokenType.WHITESPACE, !ignoreRepeatedSpaces); if (decimalSeparator != null) { _configuration.setDecimalSeparator(decimalSeparator); } if (thousandsSeparator != null) { _configuration.setThousandsSeparator(thousandsSeparator); } if (minusSign != null) { _configuration.setMinusSign(minusSign); } if (predefinedTokenName != null && predefinedTokenPatterns != null) { final Set<String> tokenRegexes = new HashSet<>(); for (final String predefinedTokenPattern : predefinedTokenPatterns) { tokenRegexes.add(predefinedTokenPattern); } _configuration.getPredefinedTokens().add(new PredefinedTokenDefinition(predefinedTokenName, tokenRegexes)); } _patternFinders = new HashMap<>(); } @Override public void run(final InputRow row, final int distinctCount) { final String group; if (groupColumn == null) { group = null; } else { group = row.getValue(groupColumn); } final String value = row.getValue(column); run(group, value, row, distinctCount); } private void run(final String group, final String value, final InputRow row, final int distinctCount) { final DefaultPatternFinder patternFinder = getPatternFinderForGroup(group); patternFinder.run(row, value, distinctCount); } private DefaultPatternFinder getPatternFinderForGroup(final String group) { DefaultPatternFinder patternFinder = _patternFinders.get(group); if (patternFinder == null) { synchronized (this) { patternFinder = _patternFinders.get(group); if (patternFinder == null) { patternFinder = new DefaultPatternFinder(_configuration, _rowAnnotationFactory); _patternFinders.put(group, patternFinder); } } } return patternFinder; } @Override public PatternFinderResult getResult() { if (groupColumn == null) { final Crosstab<?> crosstab = createCrosstab(getPatternFinderForGroup(null)); return new PatternFinderResult(column, crosstab, _configuration); } else { final Map<String, Crosstab<?>> crosstabs = new TreeMap<>(NullTolerableComparator.get(String.class)); final Set<Entry<String, DefaultPatternFinder>> patternFinderEntries = _patternFinders.entrySet(); for (final Entry<String, DefaultPatternFinder> entry : patternFinderEntries) { final DefaultPatternFinder patternFinder = entry.getValue(); final Crosstab<Serializable> crosstab = createCrosstab(patternFinder); crosstabs.put(entry.getKey(), crosstab); } if (logger.isInfoEnabled()) { logger.info("Grouped result contains {} groups", crosstabs.size()); } return new PatternFinderResult(column, groupColumn, crosstabs, _configuration); } } private Crosstab<Serializable> createCrosstab(final DefaultPatternFinder patternFinder) { final Crosstab<Serializable> crosstab = createCrosstab(); final Set<Entry<TokenPattern, RowAnnotation>> entrySet = patternFinder.getAnnotations().entrySet(); // sort the entries so that the ones with the highest amount of // matches are at the top final Set<Entry<TokenPattern, RowAnnotation>> sortedEntrySet = new TreeSet<>((o1, o2) -> { int result = o2.getValue().getRowCount() - o1.getValue().getRowCount(); if (result == 0) { result = o1.getKey().toSymbolicString().compareTo(o2.getKey().toSymbolicString()); } return result; }); sortedEntrySet.addAll(entrySet); for (final Entry<TokenPattern, RowAnnotation> entry : sortedEntrySet) { final TokenPattern pattern = entry.getKey(); final CrosstabNavigator<Serializable> nav = crosstab.navigate(); nav.where(DIMENSION_NAME_PATTERN, pattern.toSymbolicString()); nav.where(DIMENSION_NAME_MEASURES, MEASURE_MATCH_COUNT); final RowAnnotation annotation = entry.getValue(); final int size = annotation.getRowCount(); nav.put(size, true); nav.attach(AnnotatedRowsResult.createIfSampleRowsAvailable(annotation, _rowAnnotationFactory, column)); nav.where(DIMENSION_NAME_MEASURES, MEASURE_SAMPLE); nav.put(pattern.getSampleString(), true); } return crosstab; } // setter methods for unittesting purposes public void setRowAnnotationFactory(final RowAnnotationFactory rowAnnotationFactory) { _rowAnnotationFactory = rowAnnotationFactory; } public void setColumn(final InputColumn<String> column) { this.column = column; } public void setPredefinedTokenName(final String predefinedTokenName) { this.predefinedTokenName = predefinedTokenName; } public void setPredefinedTokenPatterns(final String[] predefinedTokenPatterns) { this.predefinedTokenPatterns = predefinedTokenPatterns; } public void setDiscriminateTextCase(final boolean discriminateTextCase) { this.discriminateTextCase = discriminateTextCase; } public void setDiscriminateNegativeNumbers(final boolean discriminateNegativeNumbers) { this.discriminateNegativeNumbers = discriminateNegativeNumbers; } public void setDiscriminateDecimals(final boolean discriminateDecimals) { this.discriminateDecimals = discriminateDecimals; } public void setEnableMixedTokens(final boolean enableMixedTokens) { this.enableMixedTokens = enableMixedTokens; } public void setUpperCaseExpandable(final boolean upperCaseExpandable) { this.upperCaseExpandable = upperCaseExpandable; } public void setLowerCaseExpandable(final boolean lowerCaseExpandable) { this.lowerCaseExpandable = lowerCaseExpandable; } public void setDecimalSeparator(final Character decimalSeparator) { this.decimalSeparator = decimalSeparator; } public void setIgnoreRepeatedSpaces(final boolean ignoreRepeatedSpaces) { this.ignoreRepeatedSpaces = ignoreRepeatedSpaces; } public void setMinusSign(final Character minusSign) { this.minusSign = minusSign; } public void setThousandsSeparator(final Character thousandsSeparator) { this.thousandsSeparator = thousandsSeparator; } public void setGroupColumn(final InputColumn<String> groupColumn) { this.groupColumn = groupColumn; } }