package org.xbib.elasticsearch.index.analysis.autophrase;

import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeImpl;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * Performs "auto phrasing" on a token stream. Auto phrases refer to sequences of tokens that
 * are meant to describe a single thing and should be searched for as such. When these phrases
 * are detected in the token stream, a single token representing the phrase is emitted rather than
 * the individual tokens that make up the phrase. The filter supports overlapping phrases.
 *
 * The auto-phrasing filter can be combined with a synonym filter to handle cases in which prefix or
 * suffix terms in a phrase are synonymous with the phrase, but where other parts of the phrase are
 * not.
 */
public class AutoPhrasingTokenFilter extends TokenFilter {

    private CharArrayMap<CharArraySet> phraseMap;
    private CharArraySet currentSetToCheck;
    private StringBuilder currentPhrase;
    private List<Token> unusedTokens;
    private boolean emitSingleTokens;
    private char[] lastToken;
    private char[] lastEmitted;
    private char[] lastValid;
    private Character replaceWhitespaceWith;
    private int positionIncr;

    public AutoPhrasingTokenFilter(TokenStream input, CharArraySet phraseSet, boolean emitSingleTokens) {
        super(input);
        this.emitSingleTokens = emitSingleTokens;
        this.phraseMap = convertPhraseSet(phraseSet);
        this.currentPhrase = new StringBuilder();
        this.unusedTokens = new ArrayList<>();
        this.positionIncr = 0;
    }

    public void setReplaceWhitespaceWith(Character replaceWhitespaceWith) {
        this.replaceWhitespaceWith = replaceWhitespaceWith;
    }

    @Override
    public void reset() throws IOException {
        currentSetToCheck = null;
        currentPhrase.setLength(0);
        lastToken = null;
        lastEmitted = null;
        unusedTokens.clear();
        positionIncr = 0;
        super.reset();
    }

    @Override
    public final boolean incrementToken() throws IOException {
        if (!emitSingleTokens && !unusedTokens.isEmpty()) {
            Token aToken = unusedTokens.remove(0);
            emit(aToken);
            return true;
        }
        if (lastToken != null) {
            emit(lastToken);
            lastToken = null;
            return true;
        }
        char[] nextToken = nextToken();
        if (nextToken == null) {
            if (lastValid != null) {
                emit(lastValid);
                lastValid = null;
                return true;
            }
            if (emitSingleTokens && currentSetToCheck != null && !currentSetToCheck.isEmpty()) {
                char[] phrase = getFirst(currentSetToCheck);
                char[] lastTok = getCurrentBuffer(new char[0]);
                if (phrase != null && endsWith(lastTok, phrase)) {
                    currentSetToCheck = remove(phrase);
                    emit(phrase);
                    return true;
                }
            } else if (!emitSingleTokens && currentSetToCheck != null && !currentSetToCheck.isEmpty()) {
                char[] currBuff = getCurrentBuffer(new char[0]);
                if (lastEmitted != null && !isEqualTo(fixWhitespace(lastEmitted), currBuff)) {
                    discardCharTokens(currentPhrase, unusedTokens);
                    currentSetToCheck = null;
                    if (!unusedTokens.isEmpty()) {
                        Token aToken = unusedTokens.remove(0);
                        if (!endsWith(lastEmitted, currBuff)) {
                            emit(aToken);
                            return true;
                        }
                    }
                }
            }
            if (lastEmitted == null && (currentPhrase != null && currentPhrase.length() > 0)) {
                char[] lastTok = getCurrentBuffer(new char[0]);
                if (currentSetToCheck.contains(lastTok, 0, lastTok.length)) {
                    emit(lastTok);
                    currentPhrase.setLength(0);
                    return true;
                } else if (!emitSingleTokens) {
                    discardCharTokens(currentPhrase, unusedTokens);
                    currentSetToCheck = null;
                    currentPhrase.setLength(0);
                    if (!unusedTokens.isEmpty()) {
                        Token aToken = unusedTokens.remove(0);
                        emit(aToken);
                        return true;
                    }
                }
            }
            return false;
        }
        if (emitSingleTokens) {
            lastToken = nextToken;
        }
        if (currentSetToCheck == null || currentSetToCheck.isEmpty()) {
            if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
                currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length);
                if (currentPhrase == null) {
                    currentPhrase = new StringBuilder();
                } else {
                    currentPhrase.setLength(0);
                }
                currentPhrase.append(nextToken);
                return incrementToken();
            } else {
                emit(nextToken);
                lastToken = null;
                return true;
            }
        } else {
            char[] currentBuffer = getCurrentBuffer(nextToken);
            if (currentSetToCheck.contains(currentBuffer, 0, currentBuffer.length)) {
                currentSetToCheck = remove(currentBuffer);
                if (currentSetToCheck.isEmpty()) {
                    emit(currentBuffer);
                    lastValid = null;
                    --positionIncr;
                } else {
                    if (emitSingleTokens) {
                        lastToken = currentBuffer;
                        return true;
                    }
                    lastValid = currentBuffer;
                }
                if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
                    currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length);
                    if (currentPhrase == null) {
                        currentPhrase = new StringBuilder();
                    } else {
                        currentPhrase.setLength(0);
                    }
                    currentPhrase.append(nextToken);
                }
                return lastValid == null || incrementToken();
            }
            if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
                CharArraySet newSet = phraseMap.get(nextToken, 0, nextToken.length);
                for (Object aNewSet : newSet) {
                    char[] phrase = (char[]) aNewSet;
                    currentSetToCheck.add(phrase);
                }
            }
            for (Object aCurrentSetToCheck : currentSetToCheck) {
                char[] phrase = (char[]) aCurrentSetToCheck;
                if (startsWith(phrase, currentBuffer)) {
                    return incrementToken();
                }
            }
            if (lastValid != null) {
                emit(lastValid);
                lastValid = null;
                return true;
            }
            if (!emitSingleTokens) {
                discardCharTokens(currentPhrase, unusedTokens);
                currentPhrase.setLength(0);
                currentSetToCheck = null;
                if (!unusedTokens.isEmpty()) {
                    Token aToken = unusedTokens.remove(0);
                    emit(aToken);
                    return true;
                }
            }
            currentSetToCheck = null;
            return incrementToken();
        }
    }

    private char[] nextToken() throws IOException {
        if (input.incrementToken()) {
            CharTermAttribute termAttr = getTermAttribute();
            if (termAttr != null) {
                char[] termBuf = termAttr.buffer();
                char[] nextTok = new char[termAttr.length()];
                System.arraycopy(termBuf, 0, nextTok, 0, termAttr.length());
                return nextTok;
            }
        }
        return null;
    }

    private boolean startsWith(char[] buffer, char[] phrase) {
        if (phrase.length > buffer.length) {
            return false;
        }
        for (int i = 0; i < phrase.length; i++) {
            if (buffer[i] != phrase[i]) {
                return false;
            }
        }
        return true;
    }

    private boolean isEqualTo(char[] buffer, char[] phrase) {
        if (phrase.length != buffer.length) {
            return false;
        }
        for (int i = 0; i < phrase.length; i++) {
            if (buffer[i] != phrase[i]) {
                return false;
            }
        }
        return true;
    }

    private boolean endsWith(char[] buffer, char[] phrase) {
        if (buffer == null || phrase == null) {
            return false;
        }
        if (phrase.length >= buffer.length) {
            return false;
        }
        for (int i = 1; i < phrase.length - 1; ++i) {
            if (buffer[buffer.length - i] != phrase[phrase.length - i]) {
                return false;
            }
        }
        return true;
    }

    private char[] getCurrentBuffer(char[] newToken) {
        if (currentPhrase == null) {
            currentPhrase = new StringBuilder();
        }
        if (newToken != null && newToken.length > 0) {
            if (currentPhrase.length() > 0) {
                currentPhrase.append(' ');
            }
            currentPhrase.append(newToken);
        }
        char[] currentBuff = new char[currentPhrase.length()];
        currentPhrase.getChars(0, currentPhrase.length(), currentBuff, 0);
        return currentBuff;
    }

    private char[] getFirst(CharArraySet charSet) {
        if (charSet.isEmpty()) {
            return null;
        }
        Iterator<Object> phraseIt = charSet.iterator();
        return (char[]) phraseIt.next();
    }

    private void emit(char[] tokenChars) {
        char[] token = tokenChars;
        if (replaceWhitespaceWith != null) {
            token = replaceWhiteSpace(token);
        }
        CharTermAttribute termAttr = getTermAttribute();
        if (termAttr != null) {
            termAttr.setEmpty();
            termAttr.append(new StringBuilder().append(token));
        }
        OffsetAttribute offAttr = getOffsetAttribute();
        if (offAttr != null && offAttr.endOffset() >= token.length) {
            int start = offAttr.endOffset() - token.length;
            offAttr.setOffset(start, offAttr.endOffset());
        }
        PositionIncrementAttribute pia = getPositionIncrementAttribute();
        if (pia != null) {
            pia.setPositionIncrement(++positionIncr);
        }
        lastEmitted = token;
    }

    private void emit(Token token) {
        emit(token.tok);
        OffsetAttribute offAttr = getOffsetAttribute();
        if (offAttr != null && token.endPos > token.startPos && token.startPos >= 0) {
            offAttr.setOffset(token.startPos, token.endPos);
        }
    }

    private char[] replaceWhiteSpace(char[] token) {
        char[] replaced = new char[token.length];
        for (int i = 0; i < token.length; i++) {
            if (token[i] == ' ') {
                replaced[i] = replaceWhitespaceWith;
            } else {
                replaced[i] = token[i];
            }
        }
        return replaced;
    }

    private CharTermAttribute getTermAttribute() {
        Iterator<AttributeImpl> attrIt = getAttributeImplsIterator();
        while (attrIt != null && attrIt.hasNext()) {
            AttributeImpl attrImp = attrIt.next();
            if (attrImp instanceof CharTermAttribute) {
                return (CharTermAttribute) attrImp;
            }
        }
        return null;
    }

    private OffsetAttribute getOffsetAttribute() {
        Iterator<AttributeImpl> attrIt = getAttributeImplsIterator();
        while (attrIt != null && attrIt.hasNext()) {
            AttributeImpl attrImp = attrIt.next();
            if (attrImp instanceof OffsetAttribute) {
                return (OffsetAttribute) attrImp;
            }
        }
        return null;
    }

    private PositionIncrementAttribute getPositionIncrementAttribute() {
        Iterator<AttributeImpl> attrIt = getAttributeImplsIterator();
        while (attrIt != null && attrIt.hasNext()) {
            AttributeImpl attrImp = attrIt.next();
            if (attrImp instanceof PositionIncrementAttribute) {
                return (PositionIncrementAttribute) attrImp;
            }
        }
        return null;
    }

    private CharArrayMap<CharArraySet> convertPhraseSet(CharArraySet phraseSet) {
        CharArrayMap<CharArraySet> phraseMap = new CharArrayMap<>(100, false);
        for (Object aPhraseSet : phraseSet) {
            char[] phrase = (char[]) aPhraseSet;
            char[] firstTerm = getFirstTerm(phrase);
            CharArraySet itsPhrases = phraseMap.get(firstTerm, 0, firstTerm.length);
            if (itsPhrases == null) {
                itsPhrases = new CharArraySet(5, false);
                phraseMap.put(new String(firstTerm), itsPhrases);
            }
            itsPhrases.add(phrase);
        }
        return phraseMap;
    }

    private char[] getFirstTerm(char[] phrase) {
        int spNdx = 0;
        while (spNdx < phrase.length) {
            if (isSpaceChar(phrase[spNdx++])) {
                break;
            }
        }
        char[] firstCh = new char[spNdx - 1];
        System.arraycopy(phrase, 0, firstCh, 0, spNdx - 1);
        return firstCh;
    }

    private boolean isSpaceChar(char ch) {
        return " \t\n\r".indexOf(ch) >= 0;
    }

    private void discardCharTokens(StringBuilder phrase, List<Token> tokenList) {
        OffsetAttribute offAttr = getOffsetAttribute();
        int lastSp = 0;
        int endPos = 0;
        if (offAttr != null) {
            endPos = offAttr.endOffset();
            int startPos = endPos - phrase.length();
            for (int i = 0; i < phrase.length(); i++) {
                char chAt = phrase.charAt(i);
                if (isSpaceChar(chAt) && i > lastSp) {
                    char[] tok = new char[i - lastSp];
                    phrase.getChars(lastSp, i, tok, 0);
                    if (lastEmitted == null || !endsWith(lastEmitted, tok)) {
                        Token token = new Token();
                        token.tok = tok;
                        token.startPos = startPos + lastSp;
                        token.endPos = token.startPos + tok.length;
                        tokenList.add(token);
                    }
                    lastSp = i + 1;
                }
            }
        }
        char[] tok = new char[phrase.length() - lastSp];
        phrase.getChars(lastSp, phrase.length(), tok, 0);
        Token token = new Token();
        token.tok = tok;
        token.endPos = endPos;
        token.startPos = endPos - tok.length;
        tokenList.add(token);
    }

    private CharArraySet remove(char[] charArray) {
        CharArraySet newSet = new CharArraySet(5, false);
        for (Object aCurrentSetToCheck : currentSetToCheck) {
            char[] phrase = (char[]) aCurrentSetToCheck;
            if (!isEqualTo(phrase, charArray) && startsWith(phrase, charArray) || endsWith(charArray, phrase)) {
                newSet.add(phrase);
            }
        }
        return newSet;
    }

    private char[] fixWhitespace(char[] phrase) {
        if (replaceWhitespaceWith == null) {
            return phrase;
        }
        char[] fixed = new char[phrase.length];
        for (int i = 0; i < phrase.length; i++) {
            if (phrase[i] == replaceWhitespaceWith) {
                fixed[i] = ' ';
            } else {
                fixed[i] = phrase[i];
            }
        }
        return fixed;
    }

    @Override
    public boolean equals(Object object) {
        return object instanceof AutoPhrasingTokenFilter
                && phraseMap.equals(((AutoPhrasingTokenFilter) object).phraseMap)
                && emitSingleTokens == ((AutoPhrasingTokenFilter) object).emitSingleTokens;
    }

    @Override
    public int hashCode() {
        return phraseMap.hashCode() ^ Boolean.hashCode(emitSingleTokens);
    }

    private class Token {
        char[] tok;
        int startPos;
        int endPos;
    }
}
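/*
 * Usage sketch (illustrative only, not part of the original source). It assumes the Lucene
 * classes WhitespaceTokenizer, Tokenizer, CharArraySet and CharTermAttribute plus
 * java.util.Arrays and java.io.StringReader are available; the sample phrases are made up.
 *
 *   // Phrases that should be collapsed into single tokens (ignoreCase = false).
 *   CharArraySet phrases = new CharArraySet(Arrays.asList("big apple", "new york city"), false);
 *
 *   // Any upstream tokenizer can feed the filter; a whitespace tokenizer keeps the sketch simple.
 *   Tokenizer source = new WhitespaceTokenizer();
 *   source.setReader(new StringReader("living in new york city"));
 *
 *   AutoPhrasingTokenFilter filter = new AutoPhrasingTokenFilter(source, phrases, false);
 *   filter.setReplaceWhitespaceWith('_');   // phrase tokens come out as e.g. "new_york_city"
 *
 *   CharTermAttribute term = filter.addAttribute(CharTermAttribute.class);
 *   filter.reset();
 *   while (filter.incrementToken()) {
 *       System.out.println(term.toString());   // expected roughly: living, in, new_york_city
 *   }
 *   filter.end();
 *   filter.close();
 */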