// Copyright 2014 Thomas Müller // This file is part of MarMoT, which is licensed under GPLv3. package marmot.tokenize.rules; import java.io.Serializable; import java.util.Collection; import java.util.regex.Matcher; public class RulebasedTransformator implements Serializable { private static final long serialVersionUID = 1L; /** * This class can define a number of conversion rules for exceptions which arise in either * direction of tokenization. After finding a match in the corresponding string, a * confirmation match is searched for in the other one. Apart from confirming, little can * be done though, since this code is run strictly before alignment, so we don't know * anything about corresponding positions between the two strings. * After confirmation, the initial match is replaced with the confirmation match. */ private Collection<Rule> rules_; public RulebasedTransformator(Collection<Rule> rules) { rules_ = rules; } public String applyRules(String string) { return applyRules(string, rules_); } private String applyRules(String string, Collection<Rule> rules) { StringBuilder sb = new StringBuilder(); for(Rule rule : rules) { Matcher matcher = rule.pattern.matcher(string); int end = 0; int start = 0; while(matcher.find(start)) { start = matcher.start(1); if(end > start) { start = end; continue; } sb.append(string.substring(end, start)); sb.append(rule.replacement); end = matcher.end(1); } sb.append(string.substring(end)); string = sb.toString(); // Reset string builder. sb.setLength(0); } return string; } }