/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.beans.stringpattern;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
/**
 * A string pattern finder. This component consumes rows and produces string
 * patterns. It does not contain the actual logic to store/persist the rows;
 * instead it invokes the {@code storeNewPattern(...)} and
 * {@code storeMatch(...)} callbacks so that subclasses can implement storage
 * on their own.
 *
 * <p>
 * Known patterns are grouped into "buckets" keyed by a pattern code (see
 * {@code getPatternCode(...)}). The buckets are kept in a
 * {@link ConcurrentHashMap} and every bucket is guarded by its own monitor,
 * both when {@code run(...)} searches/extends it and when
 * {@link #getPatterns()} copies it. Note that the callbacks are invoked
 * while the monitor of the affected bucket is held.
 *
 * @param <R>
 *            the type representing the row. Enables the user of the class
 *            to use their own row type, such as InputRow, String[] or even
 *            just Object.
 */
public abstract class PatternFinder<R> {

    /** Known patterns, grouped by pattern code. */
    private final ConcurrentHashMap<String, Collection<TokenPattern>> _patterns;
    private final TokenizerConfiguration _configuration;
    private final Tokenizer _tokenizer;

    /**
     * Creates a pattern finder that uses the given tokenizer.
     *
     * @param tokenizer
     *            the tokenizer used to split every value into tokens
     * @param configuration
     *            the configuration handed to every newly created pattern
     */
    public PatternFinder(final Tokenizer tokenizer, final TokenizerConfiguration configuration) {
        _configuration = configuration;
        _tokenizer = tokenizer;
        _patterns = new ConcurrentHashMap<>();
    }

    /**
     * Creates a pattern finder that uses a {@link DefaultTokenizer} built
     * from the given configuration.
     *
     * @param configuration
     *            the configuration for both the tokenizer and the patterns
     */
    public PatternFinder(final TokenizerConfiguration configuration) {
        this(new DefaultTokenizer(configuration), configuration);
    }

    /**
     * This method should be invoked by the user of the PatternFinder. Invoke
     * it for each value in your dataset. Repeated values are handled
     * correctly but if available it is more efficient to handle only the
     * distinct values and their corresponding distinct counts.
     *
     * @param row
     *            the row containing the value
     * @param value
     *            the string value to be tokenized and matched against other
     *            patterns
     * @param distinctCount
     *            the count of the value
     * @throws IllegalStateException
     *             if tokenizing the value or creating a new pattern fails
     *             with a {@link RuntimeException} (which is kept as the
     *             cause)
     */
    public void run(final R row, final String value, final int distinctCount) {
        final List<Token> tokens;
        try {
            tokens = _tokenizer.tokenize(value);
        } catch (final RuntimeException e) {
            throw new IllegalStateException("Error occurred while tokenizing value: " + value, e);
        }

        final Collection<TokenPattern> patterns = getOrCreatePatterns(getPatternCode(tokens));

        // lock on "patterns" since it is going to be the same collection for
        // all matching pattern codes. The lock makes "search, then add if
        // nothing matched" atomic per bucket.
        synchronized (patterns) {
            for (final TokenPattern pattern : patterns) {
                if (pattern.match(tokens)) {
                    storeMatch(pattern, row, value, distinctCount);
                    return;
                }
            }

            // no existing pattern matched: create and register a new one
            final TokenPattern pattern;
            try {
                pattern = new TokenPatternImpl(value, tokens, _configuration);
            } catch (final RuntimeException e) {
                throw new IllegalStateException("Error occurred while creating pattern for: " + tokens, e);
            }

            // the callback runs first; if it throws, the pattern is not
            // registered
            storeNewPattern(pattern, row, value, distinctCount);
            patterns.add(pattern);
        }
    }

    /**
     * Gets the collection ("bucket") of known {@link TokenPattern}s that is
     * registered for the pattern code, creating and registering an empty one
     * if there is none yet.
     *
     * @param patternCode
     *            the pattern code, as produced by {@code getPatternCode(...)}
     * @return the one shared bucket for the pattern code, never null
     */
    private Collection<TokenPattern> getOrCreatePatterns(final String patternCode) {
        // first try the cheapest get(..) method
        final Collection<TokenPattern> patterns = _patterns.get(patternCode);
        if (patterns != null) {
            return patterns;
        }

        // then try the concurrent version which requires a collection; if
        // another thread registered a bucket in the meantime, use that one
        final Collection<TokenPattern> newPatterns = new ArrayList<>(3);
        final Collection<TokenPattern> existingPatterns = _patterns.putIfAbsent(patternCode, newPatterns);
        return existingPatterns == null ? newPatterns : existingPatterns;
    }

    /**
     * Creates an almost unique String code for a list of tokens: the number
     * of tokens followed by the ordinal of each token's type. This code is
     * used to improve search time when looking for potential matching
     * patterns. The parts are appended without a separator, so two different
     * token lists may in principle share a code; that only means they share
     * a bucket, since {@code run(...)} still invokes {@code match(...)} on
     * every candidate pattern.
     *
     * @param tokens
     *            the tokens of a single value
     * @return the pattern code for the tokens
     */
    private String getPatternCode(final List<Token> tokens) {
        final StringBuilder sb = new StringBuilder();
        sb.append(tokens.size());
        for (final Token token : tokens) {
            sb.append(token.getType().ordinal());
        }
        return sb.toString();
    }

    /**
     * Gets all patterns found so far.
     *
     * <p>
     * Every bucket is copied while holding the same monitor that
     * {@code run(...)} holds when it adds a pattern, so a concurrent
     * {@code run(...)} cannot modify a bucket while it is being copied.
     * Because bucket monitors are acquired here, avoid invoking this method
     * from within {@code storeNewPattern(...)} or {@code storeMatch(...)}
     * while other threads are still calling {@code run(...)}: two threads
     * that each hold one bucket and wait for the other's would deadlock.
     *
     * @return a new set containing the patterns of all buckets; modifying it
     *         does not affect this pattern finder
     */
    public Collection<TokenPattern> getPatterns() {
        final Set<TokenPattern> result = new HashSet<>();
        for (final Collection<TokenPattern> bucket : _patterns.values()) {
            synchronized (bucket) {
                result.addAll(bucket);
            }
        }
        return result;
    }

    /**
     * This method is invoked every time a new pattern is created (ie. when a
     * match could not be found in the existing patterns). It is invoked
     * before the pattern is added to the known patterns, while the lock of
     * the pattern's bucket is held.
     *
     * @param pattern
     *            the newly produced pattern
     * @param row
     *            the row that was handed to the run(...) method
     * @param value
     *            the value that was handed to the run(...) method
     * @param distinctCount
     *            the distinctCount that was handed to the run(...) method
     */
    protected abstract void storeNewPattern(TokenPattern pattern, R row, String value, int distinctCount);

    /**
     * This method is invoked every time a tokenized value matches an existing
     * pattern. All existing patterns will previously have been created using
     * the storeNewPattern(...) method. It is invoked while the lock of the
     * pattern's bucket is held.
     *
     * @param pattern
     *            the existing pattern
     * @param row
     *            the row that was handed to the run(...) method
     * @param value
     *            the value that was handed to the run(...) method
     * @param distinctCount
     *            the distinctCount that was handed to the run(...) method
     */
    protected abstract void storeMatch(TokenPattern pattern, R row, String value, int distinctCount);
}