/*
* dfh.trie -- a library for generating trie regular expressions
*
* Copyright (C) 2012 David F. Houghton
*
* This software is licensed under the LGPL. Please see accompanying NOTICE file
* and lgpl.txt.
*/
package com.gisgraphy.compound;
import java.text.Collator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Creates TRIE regexes out of lists of words. Regexes so created will be fast
* (optimal or near-optimal matching speed so long as non-backtracking) and
* compact. This class is thread safe.
*
* @author David Houghton
*/
public class Trie {
/**
* Flag: the generated pattern matches case-insensitively. When set, the
* input strings are lower-cased and the whole expression is wrapped in
* {@code (?i:...)}.
*/
public static final int CASEINSENSITIVE = 1;
/**
* If set, and whitespace is not being preserved, any whitespace found will
* be replaced with {@code [ \\u00a0]+} (perhaps with the possessive
* modifier {@code +}).
* <p>
* If no special whitespace option is set, whitespace becomes {@code \\s+}.
* See {@link #SPACEANDTAB}; combining the two flags is rejected as
* inconsistent.
*/
public static final int SPACEONLY = 2;
/**
* If set, and whitespace is not being preserved, any whitespace found will
* be replaced with {@code [ \\u00a0\\t]+} (perhaps with the possessive
* modifier {@code +}).
* <p>
* If no special whitespace option is set, whitespace becomes {@code \\s+}.
* See {@link #SPACEONLY}; combining the two flags is rejected as
* inconsistent.
*/
public static final int SPACEANDTAB = 4;
/**
* Treat any whitespace character found like any other character: no
* trimming, no collapsing, and no whitespace-matching sub-pattern. See
* {@link #SPACEONLY} and {@link #SPACEANDTAB}.
*/
public static final int PRESERVE_WHITESPACE = 8;
/**
* Turns off the ordinary behavior of repetitions, which is to make them
* possessive. Groups are then written as {@code (?:...)} rather than as
* atomic {@code (?>...)} groups.
*/
public static final int BACKTRACKING = 16;
/**
* Whether to assume word boundaries at the edges of phrases: a
* {@code \\b} is emitted wherever a phrase begins or ends with a letter or
* digit.
*/
public static final int AUTO_BOUNDARY = 32;
/**
* If set, regexes will not include the "possessive" modifier — e.g.,
* {@code *+}, {@code ++}, {@code ?+} — which is not understood by
* versions of Perl prior to 5.10.
*/
public static final int PERL_SAFE = 64;
/**
* Whether to condense long repeating sequences in the finished expression;
* e.g., {@code abababababab} becomes {@code (?:ab){6}} and a long run of a
* single character {@code a} becomes {@code a{n}}.
*/
public static final int CONDENSE = 128;
/**
* Use {@code (?:a|b|c)} in lieu of {@code [a-c]}.
*/
public static final int NO_CHAR_CLASSES = 256;
/**
* Reverse all strings before compiling the regex.
*/
public static final int REVERSE = 512;
/**
* A character used to represent word boundaries. I use the null character
* since it's unlikely to be passed in as input. It is inserted at phrase
* edges while the expression is built and replaced by {@code \\b} at the
* very end.
*/
private static final char BOUNDARY = '\u0000';
/**
* Placeholder recorded as the "original text" of any segment built from
* more than one string. Its only purpose is to have a length other than
* one, so such a segment is never folded into a character class.
*/
private static final String LONG_TEXT = "..";
/**
* A pattern matching things that might be regex meta-characters: runs of
* punctuation (the {@code Punct} class) other than
* {@code ! ; : , ' " _ -}. This could be more carefully written, but I
* figure it doesn't matter all that much.
*/
private static final Pattern needsQuotes = Pattern
.compile("[\\p{Punct}&&[^!;:,'\"_-]]++");
/**
* A pattern matching characters that need to be escaped in character
* classes: the square brackets, caret, dash, ampersand, curly braces and
* backslash. The matched character is available as group 1.
*/
private static final Pattern classMetaCharacters = Pattern
.compile("([\\[\\]\\^\\-\\&\\{\\}\\\\])");
/**
 * Orders strings in reverse code-unit order, so that strings sharing a
 * prefix end up next to each other with the longer one first. Plain
 * {@code char} arithmetic is used instead of a language specific
 * {@link Collator}, and the comparator is non-generic; both are small
 * optimizations for code that is called recursively.
 */
private static final Comparator<Object> comparator = new Comparator<Object>() {
    public int compare(Object o1, Object o2) {
        final String left = o1.toString();
        final String right = o2.toString();
        final int leftLength = left.length();
        final int rightLength = right.length();
        final int shared = Math.min(leftLength, rightLength);
        int index = 0;
        while (index < shared) {
            // operands are swapped on purpose: larger characters sort first
            final int difference = right.charAt(index) - left.charAt(index);
            if (difference != 0) {
                return difference;
            }
            index++;
        }
        // one string is a prefix of the other: the longer one sorts first
        return rightLength - leftLength;
    }
};
/**
* Convenience method. See {@link #trie(String[], int)}. No flags are set
* (the flags argument passed on is {@code 0}).
*
* @param ar
* array of {@link String} to match
* @return {@link String} parsable as a regular expression
* @throws TrieException
* propagated from {@link #trie(String[], int)}
*/
public static String trie(String[] ar) throws TrieException {
return trie(ar, 0);
}
/**
 * Copies the collection into an array and hands it to
 * {@link #trie(String[])}.
 *
 * @param c
 *            {@link Collection} of phrases to match
 * @return {@link String} parsable as a regular expression
 * @throws TrieException
 *             propagated from {@link #trie(String[])}
 */
public static String trie(Collection<String> c) throws TrieException {
    return trie(c.toArray(new String[c.size()]));
}
/**
 * Copies the collection into an array and hands it, together with the
 * flags, to {@link #trie(String[], int)}.
 *
 * @param c
 *            {@link Collection} of phrases to match
 * @param flags
 *            modifiers for regular expression
 * @return {@link String} parsable as a regular expression
 * @throws TrieException
 *             propagated from {@link #trie(String[], int)}
 */
public static String trie(Collection<String> c, int flags)
        throws TrieException {
    return trie(c.toArray(new String[c.size()]), flags);
}
/**
 * Takes an array of {@link String} and makes a compact regular expression
 * which will match only the expressions in the list. The regex so created
 * is deterministic: there is at most one possible choice at each branch
 * point. This means no backtracking should be necessary. This allows for
 * much more rapid matching. It is possible to cause the regex to be
 * backtracking, however.
 * <p>
 * Empty strings (after whitespace normalization, if any) and duplicates
 * are dropped before the expression is built.
 *
 * @param ar
 *            the array of {@code Strings} to match.
 * @param flags
 *            modifiers for regular expression; e.g.,
 *            {@code CONDENSE | AUTO_BOUNDARY}
 * @return {@link String} parsable as a regular expression
 * @throws TrieException
 *             for inconsistent parameters, i.e. {@link #SPACEONLY}
 *             together with {@link #SPACEANDTAB} while whitespace is not
 *             being preserved
 */
public static String trie(String[] ar, int flags) throws TrieException {
    State state = new State();
    state.caseInsensitive = (flags & CASEINSENSITIVE) == CASEINSENSITIVE;
    state.backtrack = (flags & BACKTRACKING) == BACKTRACKING;
    state.perlSafe = (flags & PERL_SAFE) == PERL_SAFE;
    state.groupMod = state.backtrack ? ':' : '>';
    state.autoBoundary = (flags & AUTO_BOUNDARY) == AUTO_BOUNDARY;
    state.condense = (flags & CONDENSE) == CONDENSE;
    state.charclasses = (flags & NO_CHAR_CLASSES) != NO_CHAR_CLASSES;
    // a null whitespace pattern means "treat whitespace like any other text"
    state.whitespace = (flags & PRESERVE_WHITESPACE) == PRESERVE_WHITESPACE ? null
            : whitespacePattern(flags, state);
    boolean reversed = (flags & REVERSE) == REVERSE;

    // clean out duplicates and estimate a maximum length for the regex
    int charCount = 5;
    Set<String> types = new HashSet<String>(ar.length);
    for (String raw : ar) {
        String s = raw;
        if (state.whitespace != null) {
            s = s.trim().replaceAll("\\s++", " ");
        }
        if (s.equals("")) {
            continue;
        }
        if (state.caseInsensitive) {
            // locale-independent, so the result does not vary with the
            // default locale (e.g. the Turkish dotless i)
            s = s.toLowerCase(java.util.Locale.ROOT);
        }
        if (reversed) {
            s = reverse(s);
        }
        // add boundary characters
        if (state.autoBoundary) {
            if (Character.isLetterOrDigit(s.charAt(0))) {
                s = BOUNDARY + s;
            }
            if (Character.isLetterOrDigit(s.charAt(s.length() - 1))) {
                s += BOUNDARY;
            }
        }
        if (types.add(s)) {
            charCount += s.length() + 1;
        }
    }
    String[] words = types.toArray(new String[types.size()]);

    // put in reversed alphabetical order -- this puts longer words first
    // and groups words with similar prefixes together
    Arrays.sort(words, comparator);

    StringBuilder buffer = new StringBuilder(charCount);
    if (state.caseInsensitive) {
        buffer.append("(?i:");
    }
    PatternAndEncapsulation pe = pattern(words, state);
    if (state.caseInsensitive || pe.encapsulated) {
        buffer.append(pe.pattern);
    } else {
        // make sure the expression as a whole is a single group
        buffer.append("(?").append(state.groupMod).append(pe.pattern).append(')');
    }
    if (state.caseInsensitive) {
        buffer.append(')');
    }
    if (state.condense) {
        buffer = simpleRepeatingPatternCondenser(state, buffer);
    }
    if (state.autoBoundary) {
        buffer = replaceBoundaryMarkers(buffer);
    }
    return buffer.toString();
}

/**
 * Builds the sub-pattern that stands in for a blank in the input.
 * <p>
 * The result is {@code X+} when backtracking is allowed, {@code X++} when
 * it is not, and the atomic group {@code (?>X+)} when backtracking is not
 * allowed but possessive quantifiers are unavailable ({@link #PERL_SAFE});
 * {@code X} is the character set selected by the whitespace flags.
 *
 * @param flags
 *            the flags given to {@link #trie(String[], int)}
 * @param state
 *            settings already derived from the flags
 * @return whitespace-matching sub-pattern
 * @throws TrieException
 *             if both {@link #SPACEONLY} and {@link #SPACEANDTAB} are set
 */
private static String whitespacePattern(int flags, State state)
        throws TrieException {
    boolean spaceOnly = (flags & SPACEONLY) == SPACEONLY;
    boolean spaceAndTab = (flags & SPACEANDTAB) == SPACEANDTAB;
    if (spaceOnly && spaceAndTab) {
        throw new TrieException("SPACEANDTAB | SPACEONLY is inconsistent");
    }
    // the group must be opened under exactly the condition under which it
    // is closed below, otherwise the parentheses are unbalanced
    boolean atomicGroup = !state.backtrack && state.perlSafe;
    StringBuilder b = new StringBuilder();
    if (atomicGroup) {
        b.append("(?").append(state.groupMod);
    }
    if (spaceOnly) {
        b.append("[ \u00a0]");
    } else if (spaceAndTab) {
        b.append("[ \u00a0\\t]");
    } else {
        b.append("\\s");
    }
    b.append('+');
    if (atomicGroup) {
        b.append(')');
    } else if (!state.backtrack) {
        b.append('+');
    }
    return b.toString();
}

/**
 * Replaces every {@link #BOUNDARY} marker in the finished expression with
 * the word-boundary assertion {@code \b}.
 *
 * @param buffer
 *            expression possibly containing boundary markers
 * @return new buffer with the markers replaced
 */
private static StringBuilder replaceBoundaryMarkers(StringBuilder buffer) {
    // grab enough space in one go
    StringBuilder replaced = new StringBuilder((int) (buffer.length() * 1.5));
    for (int i = 0, lim = buffer.length(); i < lim; i++) {
        char c = buffer.charAt(i);
        if (c == BOUNDARY) {
            replaced.append("\\b");
        } else {
            replaced.append(c);
        }
    }
    return replaced;
}
/**
 * Reverses the characters in a string. Surrogate pairs are kept together
 * and in order, so supplementary characters survive the reversal instead
 * of being turned into two unpaired surrogates.
 *
 * @param s
 *            string to reverse
 * @return s reversed
 */
private static String reverse(String s) {
    return new StringBuilder(s).reverse().toString();
}
// some regexes used by simpleRepeatingPatternCondenser
/**
* Just looks for repeating sequences: a unit of 1 to 16 characters (the
* shortest possible unit is tried first), captured as group 1, followed by
* one or more further copies of that unit.
*/
private static final Pattern repetionPattern = Pattern
.compile("(.{1,16}?)\\1++");
/**
* Matches an opening curly brace followed by optional digits at the very
* end of the text, i.e. the start of an iteration counter such as
* {@code {12}} that continues past the end of the text examined.
*/
private static final Pattern precedingGroupPattern = Pattern
.compile("\\{\\d*$");
/**
* Matches optional digits followed by a closing curly brace, i.e. the tail
* of an iteration counter.
*/
private static final Pattern trailingGroupPattern = Pattern
.compile("\\d*\\}");
/**
* Matches a run of one or more digits.
*/
private static final Pattern digitPattern = Pattern.compile("\\d+");
/**
 * Rewrites long runs of a repeated unit in the finished expression as the
 * unit followed by an iteration counter, e.g. {@code (?:ab){6}}. A run is
 * only rewritten when the result is shorter, when the unit is not the
 * digits of an existing iteration counter, and when the unit compiles as a
 * regular expression on its own.
 *
 * @param state
 *            settings for the current call (not consulted here)
 * @param buffer
 *            the expression to condense; it is not modified
 * @return a new buffer holding the condensed expression
 */
private static StringBuilder simpleRepeatingPatternCondenser(State state,
        StringBuilder buffer) {
    StringBuilder text = new StringBuilder(buffer);
    Matcher repeats = repetionPattern.matcher(text);
    while (repeats.find()) {
        final String unit = repeats.group(1);
        final int from = repeats.start();
        final int to = repeats.end();
        final int unitLength = unit.length();
        final int runLength = repeats.group().length();

        // do some math to see whether this is worth our time
        final boolean worthwhile = unitLength == 1 ? runLength > 4
                : unitLength + 7 < runLength;
        if (!worthwhile) {
            continue;
        }

        // make sure we aren't inside an iteration counter such as {1111}
        // (highly unlikely given the previous test)
        if (digitPattern.matcher(unit).matches()
                && trailingGroupPattern.matcher(text.substring(to)).lookingAt()
                && precedingGroupPattern.matcher(text.substring(0, from)).find()) {
            continue;
        }

        try {
            // throws if the unit is not a regex in its own right, in which
            // case the run is left as it is
            Pattern.compile(unit);

            StringBuilder condensed = new StringBuilder(text.substring(0, from));
            if (unitLength == 1) {
                condensed.append(unit);
            } else {
                condensed.append("(?:").append(unit).append(")");
            }
            condensed.append('{').append(runLength / unitLength).append('}');
            condensed.append(text.substring(to));

            text = condensed;
            repeats = repetionPattern.matcher(text);
            // skip the part of the text that has already been examined
            repeats.region(from, text.length());
        } catch (Exception ignored) {
            // best effort: an uncondensable run simply stays uncondensed
        }
    }
    return text;
}
/**
 * Immutable pair used as a return value: a piece of regular expression
 * text plus a flag saying whether that text can take a quantifier
 * directly without first being wrapped in a group.
 *
 * @author David Houghton
 */
private static class PatternAndEncapsulation {
    /** regular expression text */
    final String pattern;
    /** true if {@link #pattern} needs no enclosing group */
    final boolean encapsulated;

    PatternAndEncapsulation(String pattern, boolean encapsulated) {
        this.encapsulated = encapsulated;
        this.pattern = pattern;
    }
}
/**
 * Mutable triple used as a return value while an array of strings is
 * split into groups that share a first character. A fresh instance starts
 * with offset zero and no segment or text.
 *
 * @author David Houghton
 */
private static class OffsetAndPattern {
    /** index in the array at which the next group of strings begins */
    int offset;
    /** sub-pattern matching the current group */
    PatternAndEncapsulation segment;
    /** the text the segment was built from, before any quoting */
    String originalText;
}
/**
 * Settings for one call of {@code trie}. Every call builds its own
 * instance, so nothing is shared between calls. The {@code makeSuffix}
 * switch can be turned off in a copy to suppress suffix extraction
 * temporarily, which is needed to keep non-backtracking expressions valid.
 */
private static class State {
    boolean condense;
    boolean autoBoundary;
    boolean backtrack;
    boolean caseInsensitive;
    boolean perlSafe;
    boolean charclasses;
    /** character following {@code (?} when a group is opened */
    char groupMod;
    /** sub-pattern used for blanks; null if whitespace is preserved */
    String whitespace;
    /** whether common suffixes may be extracted */
    boolean makeSuffix = true;

    /**
     * @return a copy of this state that differs only in having suffix
     *         extraction switched off
     */
    State noSuffixCopy() {
        State copy = new State();
        copy.makeSuffix = false;
        copy.charclasses = charclasses;
        copy.perlSafe = perlSafe;
        copy.caseInsensitive = caseInsensitive;
        copy.backtrack = backtrack;
        copy.whitespace = whitespace;
        copy.groupMod = groupMod;
        copy.autoBoundary = autoBoundary;
        copy.condense = condense;
        return copy;
    }
}
/**
* Reduces a sorted array of {@code Strings} to a regular expression
* matching any {@code String} in that array.
* <p>
* If suffix extraction is enabled in {@code state}, the longest usable
* common suffix is split off, the remainders are turned into a pattern and
* the quoted suffix is appended. Otherwise suffix extraction is switched
* back on in {@code state} (so that deeper levels use it again) and the
* strings are handled by prefix alone.
* <p>
* Note that the array is modified in place when a suffix is extracted: its
* elements are replaced by their remainders and the array is re-sorted.
*
* @param ar
* the set of {@code Strings} that should be converted into a
* pattern
* @param state
* various fields particular to current thread
* @return matching regex
*/
private static PatternAndEncapsulation pattern(String[] ar, State state) {
// check for degenerate case
if (ar.length == 1)
return quotemeta(ar[0], state);
if (ar.length == 0)
return quotemeta("", state);
if (state.makeSuffix) {
// find common suffix; numChars counts the trailing characters
// known to be shared and ends up as the suffix length
int numChars = 1;
int firstLength = ar[0].length();
FIND_SUFFIX_LOOP: while (true) {
int length = firstLength;
// the suffix is never allowed to swallow a whole string
if (length <= numChars) {
numChars--;
break;
}
char c = ar[0].charAt(length - numChars);
for (int i = 1; i < ar.length; i++) {
String s = ar[i];
length = s.length();
if (length <= numChars || s.charAt(length - numChars) != c) {
numChars--;
break FIND_SUFFIX_LOOP;
}
}
numChars++;
}
// if a common suffix was found, subtract it from all strings and
// make regexes for whatever strings remain
if (numChars > 0) {
String suffix = ar[0].substring(ar[0].length() - numChars);
// non-backtracking patterns can have problems with suffixes; we
// shrink the suffix until we have one such that for no string
// in the array is a leading substring of the suffix equal to a
// trailing substring of the remainder of the word
if (!state.backtrack) {
while (true) {
int sublength = 1;
boolean foundMatch = false;
SHRINK_SUFFIX_LOOP: do {
String subSuffix = suffix.substring(0, sublength);
for (String s : ar) {
String substring = s.substring(0, s.length()
- numChars);
if (substring.endsWith(subSuffix)) {
foundMatch = true;
break SHRINK_SUFFIX_LOOP;
}
}
} while (++sublength <= numChars);
if (foundMatch) {
// drop the first character of the suffix and test again
numChars--;
if (numChars == 0)
return patternNoExtractableSuffix(ar, state);
suffix = suffix.substring(1);
continue;
}
break;
}
}
suffix = quotemeta(suffix, state).pattern;
// cut the suffix off every string (in place)
for (int i = 0; i < ar.length; i++) {
int wordLength = ar[i].length();
ar[i] = ar[i].substring(0, wordLength - numChars);
}
// resort members of array put in reversed alphabetical order --
// this puts longer words first and groups words with similar
// prefixes together
Arrays.sort(ar, comparator);
// look for common prefixes; unless backtracking is allowed, suffix
// extraction is switched off for the remainders
PatternAndEncapsulation pe = patternNoExtractableSuffix(ar,
state.backtrack ? state : state.noSuffixCopy());
StringBuilder buffer = new StringBuilder(pe.pattern.length()
+ suffix.length());
buffer.append(pe.pattern);
buffer.append(suffix);
return new PatternAndEncapsulation(buffer.toString(), false);
} else
// look for common prefixes
return patternNoExtractableSuffix(ar, state);
} else {
// resume making suffixes in subpatterns; this changes the state
// object that was passed in
state.makeSuffix = true;
return patternNoExtractableSuffix(ar, state);
}
}
/**
 * Takes a sorted array of {@code Strings} and returns a matching regex.
 * This code assumes the {@code Strings} in the array have no common suffix
 * that can be extracted. (It is possible that the array contains a
 * one-letter {@code String} that is also a common suffix, but such a suffix
 * is not extractable. If it were extracted, the set of Strings matched by
 * the regex would not be the same as the set that produced it.)
 * <p>
 * The strings are split into groups sharing a first character. If every
 * group is a single one-character string (and character classes are
 * enabled) the result is a character class; otherwise it is an
 * alternation of the group patterns.
 *
 * @param ar
 *            {@code Strings} to match
 * @param state
 *            various fields particular to current thread
 * @return matching regex
 */
private static PatternAndEncapsulation patternNoExtractableSuffix(
        String[] ar, State state) {
    OffsetAndPattern op = new OffsetAndPattern();
    List<OffsetAndPattern> subsegments = new ArrayList<OffsetAndPattern>(
            Math.min(ar.length, 52));
    boolean compressible = state.charclasses;
    // collect different prefixes and the associated remaining strings
    do {
        // a fresh copy is taken each time round because the callee may
        // switch suffix extraction back on in the state it is given
        op = patternCommonPrefix(ar, op.offset, state.makeSuffix ? state
                : state.noSuffixCopy());
        if (op == null) {
            break;
        }
        compressible &= isClassMember(op.originalText, state);
        subsegments.add(op);
    } while (op.offset < ar.length);

    if (subsegments.isEmpty()) {
        return new PatternAndEncapsulation("", false);
    }
    if (subsegments.size() == 1) {
        return subsegments.get(0).segment;
    }
    String combined = compressible ? characterClass(subsegments, state)
            : alternation(subsegments, state);
    return new PatternAndEncapsulation(combined, true);
}

/**
 * Says whether a segment's original text may become a member of a
 * character class: it must be a single character, and it must not be a
 * blank that is supposed to be matched by the whitespace sub-pattern.
 */
private static boolean isClassMember(String originalText, State state) {
    if (originalText.length() != 1) {
        return false;
    }
    return state.whitespace == null || originalText.charAt(0) != ' ';
}

/**
 * Joins the segment patterns into a single group of alternatives.
 */
private static String alternation(List<OffsetAndPattern> subsegments,
        State state) {
    StringBuilder segment = new StringBuilder("(?");
    segment.append(state.groupMod);
    String separator = "";
    for (OffsetAndPattern sub : subsegments) {
        segment.append(separator).append(sub.segment.pattern);
        separator = "|";
    }
    return segment.append(')').toString();
}

/**
 * Reduces one-character segments to a character class, using ranges for
 * runs of three or more consecutive characters. A boundary marker cannot
 * live inside a class, so if one is present the result is a group holding
 * the class and the marker as alternatives.
 */
private static String characterClass(List<OffsetAndPattern> subsegments,
        State state) {
    StringBuilder members = new StringBuilder(subsegments.size());
    boolean containsBoundary = false;
    // a dash is held back and written last so that it cannot be
    // misinterpreted as part of a range expression
    boolean containsDash = false;
    char headOfRange = 0, lastChar = 0;
    int charsInRange = 0;
    // the segments are in descending order; walking backwards visits the
    // characters in ascending order, which is what range detection needs
    for (int i = subsegments.size() - 1; i >= 0; i--) {
        char c = subsegments.get(i).originalText.charAt(0);
        if (state.autoBoundary && c == BOUNDARY) {
            containsBoundary = true;
        } else if (charsInRange > 0 && c == lastChar + 1) {
            lastChar = c;
            charsInRange++;
        } else {
            if (charsInRange > 0) {
                containsDash |= appendCharacterRun(members, headOfRange,
                        lastChar, charsInRange);
            }
            headOfRange = lastChar = c;
            charsInRange = 1;
        }
    }
    if (charsInRange > 0) {
        containsDash |= appendCharacterRun(members, headOfRange, lastChar,
                charsInRange);
    }
    if (containsDash) {
        members.append('-');
    }
    StringBuilder segment = new StringBuilder(members.length()
            + (containsBoundary ? 8 : 2));
    if (containsBoundary) {
        segment.append("(?").append(state.groupMod);
    }
    segment.append('[').append(members).append(']');
    if (containsBoundary) {
        segment.append('|').append(BOUNDARY).append(')');
    }
    return segment.toString();
}

/**
 * Appends a run of consecutive characters to the body of a character
 * class: as a range if, dashes aside, it has three or more members, and
 * character by character otherwise. A dash at either end of the run is
 * not written.
 *
 * @param members
 *            body of the character class under construction
 * @param head
 *            first character of the run
 * @param last
 *            last character of the run
 * @param count
 *            number of characters in the run; at least one
 * @return true if a dash was left out and must be written by the caller
 */
private static boolean appendCharacterRun(StringBuilder members,
        char head, char last, int count) {
    boolean dash = false;
    if (count > 2) {
        // peel a dash off the end of the run it sits on
        if (head == '-') {
            dash = true;
            head++;
            count--;
        } else if (last == '-') {
            dash = true;
            last--;
            count--;
        }
    }
    if (count > 2) {
        appendClassCharacter(members, head);
        members.append('-');
        appendClassCharacter(members, last);
        return dash;
    }
    if (head == '-') {
        dash = true;
    } else {
        appendClassCharacter(members, head);
    }
    if (count == 2) {
        if (last == '-') {
            dash = true;
        } else {
            appendClassCharacter(members, last);
        }
    }
    return dash;
}

/**
 * Appends one literal character to the body of a character class,
 * preceded by a backslash if it has a special meaning there.
 */
private static void appendClassCharacter(StringBuilder members, char c) {
    String literal = String.valueOf(c);
    if (classMetaCharacters.matcher(literal).matches()) {
        members.append('\\');
    }
    members.append(literal);
}
/**
 * Handles the group of strings that starts at {@code offset} in a sorted
 * array and shares its first character. A pattern matching exactly that
 * group is built and returned together with the index at which the next
 * group begins, so that the caller can continue from there.
 *
 * @param ar
 *            a set of {@code Strings} to match.
 * @param offset
 *            offset into array where set of strings matching subpattern
 *            begins
 * @param state
 *            various fields particular to current thread
 * @return {@code OffsetAndPattern} containing current pattern and beginning
 *         of next set of {@code Strings} for which a pattern must be
 *         found, or null if the string at {@code offset} is empty
 */
private static OffsetAndPattern patternCommonPrefix(String[] ar,
        int offset, State state) {
    final String first = ar[offset];
    if (first.length() == 0) {
        return null;
    }

    // find the end of the group: the first string that does not begin
    // with the same character
    final char lead = first.charAt(0);
    int end = offset + 1;
    while (end < ar.length && ar[end].length() > 0
            && ar[end].charAt(0) == lead) {
        end++;
    }

    OffsetAndPattern result = new OffsetAndPattern();
    result.offset = end;

    if (end == offset + 1) {
        // the initial character was unique
        result.originalText = first;
        result.segment = quotemeta(first, state);
        return result;
    }

    // figure out how long the prefix shared by the whole group is
    int prefixLength = 1;
    boolean diverged = false;
    while (!diverged) {
        char probe = first.charAt(prefixLength);
        for (int i = offset + 1; i < end && !diverged; i++) {
            String candidate = ar[i];
            diverged = candidate.length() == prefixLength
                    || candidate.charAt(prefixLength) != probe;
        }
        if (!diverged) {
            prefixLength++;
        }
    }

    // the shortest string of the group comes last; if it is the prefix
    // itself, whatever follows the prefix is optional
    final boolean prefixIsMember = ar[end - 1].length() == prefixLength;
    StringBuilder buffer = new StringBuilder();
    buffer.append(quotemeta(first.substring(0, prefixLength), state).pattern);

    // collect what follows the prefix and construct a pattern for it
    final int restEnd = prefixIsMember ? end - 1 : end;
    String[] remainders = new String[restEnd - offset];
    for (int i = 0; i < remainders.length; i++) {
        remainders[i] = ar[offset + i].substring(prefixLength);
    }
    PatternAndEncapsulation rest = pattern(remainders, state);

    if (prefixIsMember) {
        if (rest.encapsulated) {
            buffer.append(rest.pattern);
        } else {
            buffer.append("(?").append(state.groupMod);
            buffer.append(rest.pattern);
            buffer.append(')');
        }
        buffer.append('?');
        if (!state.backtrack && !state.perlSafe) {
            buffer.append('+');
        }
    } else {
        buffer.append(rest.pattern);
    }

    result.segment = new PatternAndEncapsulation(buffer.toString(), false);
    // only one-char strings are of interest
    result.originalText = LONG_TEXT;
    return result;
}
/**
 * Escapes meta characters and, unless whitespace is being preserved,
 * replaces every blank with the whitespace-matching sub-pattern.
 *
 * @param s
 *            {@code String} to quote
 * @param state
 *            settings for the current call; a null {@code whitespace}
 *            field means whitespace is preserved
 * @return the quoted text, flagged as encapsulated if {@code s} is a
 *         single character that was not turned into the whitespace
 *         sub-pattern
 */
private static PatternAndEncapsulation quotemeta(String s, State state) {
    final boolean oneChar = s.length() == 1;
    final String blank = state.whitespace;
    if (blank == null) {
        return new PatternAndEncapsulation(subquote(s, state), oneChar);
    }

    final boolean leadingSpace = s.startsWith(" ");
    String core = leadingSpace ? s.substring(1) : s;
    final boolean trailingSpace = core.endsWith(" ");
    if (trailingSpace) {
        core = core.substring(0, core.length() - 1);
    }

    StringBuilder quoted = new StringBuilder();
    if (leadingSpace) {
        quoted.append(blank);
    }
    String[] words = core.split(" ");
    quoted.append(subquote(words[0], state));
    for (int i = 1; i < words.length; i++) {
        quoted.append(blank);
        quoted.append(subquote(words[i], state));
    }
    if (trailingSpace) {
        quoted.append(blank);
    }
    return new PatternAndEncapsulation(quoted.toString(), oneChar
            && !leadingSpace);
}
/**
 * Quotes the runs of punctuation in a string that might be read as regex
 * meta-characters. Short runs, and all runs when condensing is enabled,
 * are escaped character by character; longer runs are wrapped in
 * {@code \Q...\E}.
 *
 * @param s
 *            {@code String} to quote
 * @param state
 *            settings for the current call
 * @return quoted {@code String}; {@code s} itself if nothing needed
 *         quoting
 */
private static String subquote(String s, State state) {
    Matcher punctuation = needsQuotes.matcher(s);
    if (!punctuation.find()) {
        return s;
    }
    StringBuilder quoted = new StringBuilder(s.length() * 2);
    int copiedTo = 0;
    do {
        quoted.append(s.substring(copiedTo, punctuation.start()));
        copiedTo = punctuation.end();
        final String run = punctuation.group();
        if (!state.condense && run.length() >= 4) {
            quoted.append("\\Q").append(run).append("\\E");
        } else {
            final int runLength = run.length();
            for (int i = 0; i < runLength; i++) {
                final char c = run.charAt(i);
                // all this rigamarole is to sidestep a Java regex parsing
                // bug: you can't simply escape an escape if the following
                // character is 'Q', or perhaps 'E'
                boolean bracketed = false;
                if (c == '\\' && copiedTo < s.length()) {
                    final char next = s.charAt(punctuation.start() + i + 1);
                    bracketed = next == 'Q' || next == 'E';
                }
                if (bracketed) {
                    quoted.append("[\\\\]");
                } else {
                    quoted.append('\\').append(c);
                }
            }
        }
    } while (punctuation.find());
    quoted.append(s.substring(copiedTo));
    return quoted.toString();
}
}