/**
* AnalyzerBeans
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.eobjects.analyzer.beans.stringpattern;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * {@link Tokenizer} that splits a string into {@link PredefinedToken}s (the
 * parts that match one of the regex patterns of the configured
 * {@link PredefinedTokenDefinition}s) and {@link UndefinedToken}s (everything
 * in between).
 */
public class PredefinedTokenTokenizer implements Tokenizer {

    private final List<PredefinedTokenDefinition> _predefinedTokenDefinitions;

    /**
     * Creates a tokenizer from the given definitions. The definitions are
     * copied into an internal list, in the order given.
     *
     * @param predefinedTokenDefinitions
     *            the definitions to apply, in order of precedence
     */
    public PredefinedTokenTokenizer(PredefinedTokenDefinition... predefinedTokenDefinitions) {
        _predefinedTokenDefinitions = new LinkedList<PredefinedTokenDefinition>();
        for (PredefinedTokenDefinition predefinedToken : predefinedTokenDefinitions) {
            _predefinedTokenDefinitions.add(predefinedToken);
        }
    }

    /**
     * Creates a tokenizer from the given list of definitions. The list is
     * stored by reference (it is not copied).
     *
     * @param predefinedTokenDefinitions
     *            the definitions to apply, in order of precedence
     */
    public PredefinedTokenTokenizer(List<PredefinedTokenDefinition> predefinedTokenDefinitions) {
        _predefinedTokenDefinitions = predefinedTokenDefinitions;
    }

    /**
     * Tokenizes the string by applying every pattern of every definition, in
     * order, to the parts of the string that are still undefined.
     *
     * Will only return {@link PredefinedToken} or {@link UndefinedToken}
     * instances.
     *
     * @param s
     *            the string to tokenize
     * @return the tokens, in the order in which they occur in the string
     */
    @Override
    public List<Token> tokenize(String s) {
        final List<Token> result = new ArrayList<Token>();
        result.add(new UndefinedToken(s));

        for (PredefinedTokenDefinition predefinedTokenDefinition : _predefinedTokenDefinitions) {
            final Set<Pattern> patterns = predefinedTokenDefinition.getTokenRegexPatterns();
            for (Pattern pattern : patterns) {
                for (ListIterator<Token> it = result.listIterator(); it.hasNext();) {
                    final Token token = it.next();
                    if (!(token instanceof UndefinedToken)) {
                        continue;
                    }
                    final List<Token> replacementTokens = tokenizeInternal(token.getString(),
                            predefinedTokenDefinition, pattern);
                    // Replace whenever the pattern matched anything. Checking
                    // only for "more than one token" would miss the case where
                    // the pattern matches the complete string, which yields
                    // exactly one (predefined) replacement token.
                    if (containsPredefinedToken(replacementTokens)) {
                        it.remove();
                        // The iterator cursor ends up after the added tokens,
                        // so they are not revisited for this same pattern.
                        for (Token newToken : replacementTokens) {
                            it.add(newToken);
                        }
                    }
                }
            }
        }
        return result;
    }

    /**
     * Splits a string into predefined tokens (non-empty matches of the
     * pattern) and undefined tokens (the text before, between and after the
     * matches).
     *
     * After each match the remaining text is matched afresh as a string of
     * its own, so constructs such as {@code ^} apply to the start of each
     * remainder. Zero-length matches are ignored.
     *
     * @param string
     *            the string to split
     * @param predefinedTokenDefinition
     *            the definition to attach to the predefined tokens
     * @param pattern
     *            the pattern to look for
     * @return the resulting tokens; a single {@link UndefinedToken} holding
     *         the complete string if the pattern has no non-empty match
     */
    protected static List<Token> tokenizeInternal(String string, PredefinedTokenDefinition predefinedTokenDefinition,
            Pattern pattern) {
        final LinkedList<Token> result = new LinkedList<Token>();
        String remainder = string;
        while (true) {
            final Matcher matcher = findNonEmptyMatch(pattern, remainder);
            if (matcher == null) {
                // nothing (more) to match: what is left stays undefined
                result.add(new UndefinedToken(remainder));
                break;
            }
            final int start = matcher.start();
            final int end = matcher.end();
            if (start > 0) {
                result.add(new UndefinedToken(remainder.substring(0, start)));
            }
            result.add(new PredefinedToken(predefinedTokenDefinition, remainder.substring(start, end)));
            if (end == remainder.length()) {
                // the match consumed the rest of the text
                break;
            }
            remainder = remainder.substring(end);
        }
        return result;
    }

    /**
     * Finds the first match of the pattern in the input that consumes at least
     * one character. Skipping zero-length matches guarantees that every match
     * shortens the text that remains to be tokenized.
     *
     * @return a matcher positioned at the match, or null if there is none
     */
    private static Matcher findNonEmptyMatch(Pattern pattern, String input) {
        final Matcher matcher = pattern.matcher(input);
        while (matcher.find()) {
            if (matcher.end() > matcher.start()) {
                return matcher;
            }
        }
        return null;
    }

    /**
     * Determines if at least one of the tokens is a {@link PredefinedToken}.
     */
    private static boolean containsPredefinedToken(List<Token> tokens) {
        for (Token token : tokens) {
            if (token instanceof PredefinedToken) {
                return true;
            }
        }
        return false;
    }
}