/** * AnalyzerBeans * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.eobjects.analyzer.beans.stringpattern; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; /*** * A string pattern finder. This component can consume rows and produce string * patterns. It does not contain the actual logic to store/persist the rows, but * has callback methods so that it's rather easy to implement this on your own. * * * * @param <R> * the type representing the row. Enables the user of the class to * use his own row type, such as InputRow, String[] or even just * Object. */ public abstract class PatternFinder<R> { private final Map<String, Set<TokenPattern>> _patterns; private final TokenizerConfiguration _configuration; private final Tokenizer _tokenizer; public PatternFinder(Tokenizer tokenizer, TokenizerConfiguration configuration) { _configuration = configuration; _tokenizer = tokenizer; _patterns = new HashMap<String, Set<TokenPattern>>(); } public PatternFinder(TokenizerConfiguration configuration) { this(new DefaultTokenizer(configuration), configuration); } /** * This method should be invoked by the user of the PatternFinder. Invoke it * for each value in your dataset. Repeated values are handled correctly but * if available it is more effecient to handle only the distinct values and * their corresponding distinct counts. * * @param row * the row containing the value * @param value * the string value to be tokenized and matched against other * patterns * @param distinctCount * the count of the value */ public void run(R row, String value, int distinctCount) { final List<Token> tokens; boolean match = false; try { tokens = _tokenizer.tokenize(value); } catch (RuntimeException e) { throw new IllegalStateException("Error occurred while tokenizing value: " + value, e); } final String patternCode = getPatternCode(tokens); Set<TokenPattern> patterns; synchronized (this) { patterns = _patterns.get(patternCode); if (patterns == null) { patterns = new HashSet<TokenPattern>(); _patterns.put(patternCode, patterns); } for (TokenPattern pattern : patterns) { if (pattern.match(tokens)) { storeMatch(pattern, row, value, distinctCount); match = true; } } if (!match) { final TokenPattern pattern; try { pattern = new TokenPatternImpl(value, tokens, _configuration); } catch (RuntimeException e) { throw new IllegalStateException("Error occurred while creating pattern for: " + tokens, e); } storeNewPattern(pattern, row, value, distinctCount); patterns.add(pattern); } } } /** * Creates an almost unique String code for a list of tokens. This code is * used to improve search time when looking for potential matching patterns. * * @param tokens * @return */ private String getPatternCode(List<Token> tokens) { final StringBuilder sb = new StringBuilder(); sb.append(tokens.size()); for (Token token : tokens) { sb.append(token.getType().ordinal()); } return sb.toString(); } public Set<TokenPattern> getPatterns() { final Set<TokenPattern> result = new HashSet<TokenPattern>(); final Collection<Set<TokenPattern>> values = _patterns.values(); for (Set<TokenPattern> set : values) { result.addAll(set); } return result; } /** * This method is invoked every time a new pattern is created (ie. when a * match could not be found in the existing patterns). * * @param pattern * the newly produced pattern * @param row * the row that was handed to the run(...) method * @param value * the value that was handed to the run(...) method * @param distinctCount * the distinctCount that was handed to the run(...) method */ protected abstract void storeNewPattern(TokenPattern pattern, R row, String value, int distinctCount); /** * This method is invoked every time a tokenized value matches an existing * pattern. All existing patterns will previously have been created using * the storeNewPattern(...) method. * * @param pattern * the existing pattern * @param row * the row that was handed to the run(...) method * @param value * the value that was handed to the run(...) method * @param distinctCount * the distinctCount that was handed to the run(...) method */ protected abstract void storeMatch(TokenPattern pattern, R row, String value, int distinctCount); }