/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.synonym;

import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.FlattenGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.RollingBuffer;
import org.apache.lucene.util.fst.FST;

// TODO: maybe we should resolve token -> wordID then run
// FST on wordIDs, for better perf?

// TODO: a more efficient approach would be Aho/Corasick's
// algorithm
// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
// It improves over the current approach here
// because it does not fully re-start matching at every
// token. For example if one pattern is "a b c x"
// and another is "b c d" and the input is "a b c d", on
// trying to parse "a b c x" but failing when you got to x,
// rather than starting over again you really should
// immediately recognize that "b c d" matches at the next
// input. I suspect this won't matter that much in
// practice, but it's possible on some set of synonyms it
// will. We'd have to modify Aho/Corasick to enforce our
// conflict resolving (eg greedy matching) because that algo
// finds all matches. This really amounts to adding a .*
// closure to the FST and then determinizing it.
//
// Another possible solution is described at http://www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps

/** Applies single- or multi-token synonyms from a {@link SynonymMap}
 * to an incoming {@link TokenStream}, producing a fully correct graph
 * output. This is a replacement for {@link SynonymFilter}, which produces
 * incorrect graphs for multi-token synonyms.
 *
 * <p>However, if you use this during indexing, you must follow it with
 * {@link FlattenGraphFilter} to squash tokens on top of one another
 * like {@link SynonymFilter}, because the indexer can't directly
 * consume a graph. To get fully correct positional queries when your
 * synonym replacements are multiple tokens, you should instead apply
 * synonyms using this {@code TokenFilter} at query time and translate
 * the resulting graph to a {@code TermAutomatonQuery} e.g. using
 * {@code TokenStreamToTermAutomatonQuery}.
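 *
 * <p>A typical index-time chain might look like this (a sketch; it assumes
 * a previously built {@link SynonymMap} named {@code map}, and uses
 * {@code WhitespaceTokenizer} purely for illustration):
 *
 * <pre class="prettyprint">
 *   Analyzer analyzer = new Analyzer() {
 *     protected TokenStreamComponents createComponents(String fieldName) {
 *       Tokenizer tokenizer = new WhitespaceTokenizer();
 *       TokenStream stream = new SynonymGraphFilter(tokenizer, map, true);
 *       // The indexer can't consume a graph, so flatten it at index time:
 *       stream = new FlattenGraphFilter(stream);
 *       return new TokenStreamComponents(tokenizer, stream);
 *     }
 *   };
 * </pre>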
 *
 * <p><b>NOTE</b>: this cannot consume an incoming graph; results will
 * be undefined.
 *
 * @lucene.experimental */
public final class SynonymGraphFilter extends TokenFilter {

  public static final String TYPE_SYNONYM = "SYNONYM";

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  private final SynonymMap synonyms;
  private final boolean ignoreCase;

  private final FST<BytesRef> fst;

  private final FST.BytesReader fstReader;
  private final FST.Arc<BytesRef> scratchArc;
  private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
  private final BytesRef scratchBytes = new BytesRef();
  private final CharsRefBuilder scratchChars = new CharsRefBuilder();
  private final LinkedList<BufferedOutputToken> outputBuffer = new LinkedList<>();

  private int nextNodeOut;
  private int lastNodeOut;
  private int maxLookaheadUsed;

  // For testing:
  private int captureCount;

  private boolean liveToken;

  // Start/end offset of the current match:
  private int matchStartOffset;
  private int matchEndOffset;

  // True once the input TokenStream is exhausted:
  private boolean finished;

  private int lookaheadNextRead;
  private int lookaheadNextWrite;

  private RollingBuffer<BufferedInputToken> lookahead = new RollingBuffer<BufferedInputToken>() {
    @Override
    protected BufferedInputToken newInstance() {
      return new BufferedInputToken();
    }
  };

  static class BufferedInputToken implements RollingBuffer.Resettable {
    final CharsRefBuilder term = new CharsRefBuilder();
    AttributeSource.State state;
    int startOffset = -1;
    int endOffset = -1;

    @Override
    public void reset() {
      state = null;
      term.clear();

      // Intentionally invalid to ferret out bugs:
      startOffset = -1;
      endOffset = -1;
    }
  }

  static class BufferedOutputToken {
    final String term;

    // Non-null if this was an incoming token:
    final State state;

    final int startNode;
    final int endNode;

    public BufferedOutputToken(State state, String term, int startNode, int endNode) {
      this.state = state;
      this.term = term;
      this.startNode = startNode;
      this.endNode = endNode;
    }
  }

  /**
   * Apply previously built synonyms to incoming tokens.
   * @param input input token stream
   * @param synonyms synonym map
   * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}.
   *                   Note, if you set this to true, it's your responsibility to lowercase
   *                   the input entries when you create the {@link SynonymMap}.
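   *
   * <p>For example, one way to build such a pre-lowercased map (a sketch;
   * the {@code "usa"} rule is a placeholder, and multi-word outputs must be
   * joined with {@link SynonymMap.Builder#join}, never with plain spaces):
   * <pre class="prettyprint">
   *   SynonymMap.Builder builder = new SynonymMap.Builder(true);
   *   CharsRefBuilder scratch = new CharsRefBuilder();
   *   // Entries are added already lowercased, since ignoreCase=true only
   *   // folds the incoming tokens, never the map entries:
   *   builder.add(new CharsRef("usa"),
   *               SynonymMap.Builder.join(new String[] {"united", "states"}, scratch),
   *               true);
   *   SynonymMap map = builder.build();
   * </pre>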
   */
  public SynonymGraphFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
    super(input);
    this.synonyms = synonyms;
    this.fst = synonyms.fst;
    if (fst == null) {
      throw new IllegalArgumentException("fst must be non-null");
    }
    this.fstReader = fst.getBytesReader();
    scratchArc = new FST.Arc<>();
    this.ignoreCase = ignoreCase;
  }

  @Override
  public boolean incrementToken() throws IOException {
    //System.out.println("\nS: incrToken lastNodeOut=" + lastNodeOut + " nextNodeOut=" + nextNodeOut);

    assert lastNodeOut <= nextNodeOut;

    if (outputBuffer.isEmpty() == false) {
      // We still have pending outputs from a prior synonym match:
      releaseBufferedToken();
      //System.out.println("  syn: ret buffered=" + this);
      assert liveToken == false;
      return true;
    }

    // Try to parse a new synonym match at the current token:
    if (parse()) {
      // A new match was found:
      releaseBufferedToken();
      //System.out.println("  syn: after parse, ret buffered=" + this);
      assert liveToken == false;
      return true;
    }

    if (lookaheadNextRead == lookaheadNextWrite) {

      // Fast path: parse pulled one token, but it didn't match
      // the start for any synonym, so we now return it "live" w/o having
      // cloned all of its atts:
      if (finished) {
        //System.out.println("  syn: ret END");
        return false;
      }

      assert liveToken;
      liveToken = false;

      // NOTE: no need to change posInc since it's relative, i.e. whatever
      // node our output is up to will just increase by the incoming posInc.
      // We also don't need to change posLen, but only because we cannot
      // consume a graph, so the incoming token can never span a future
      // synonym match.

    } else {
      // We still have buffered lookahead tokens from a previous
      // parse attempt that required lookahead; just replay them now:
      //System.out.println("  restore buffer");
      assert lookaheadNextRead < lookaheadNextWrite: "read=" + lookaheadNextRead + " write=" + lookaheadNextWrite;
      BufferedInputToken token = lookahead.get(lookaheadNextRead);
      lookaheadNextRead++;

      restoreState(token.state);

      lookahead.freeBefore(lookaheadNextRead);

      //System.out.println("  after restore offset=" + offsetAtt.startOffset() + "-" + offsetAtt.endOffset());
      assert liveToken == false;
    }

    lastNodeOut += posIncrAtt.getPositionIncrement();
    nextNodeOut = lastNodeOut + posLenAtt.getPositionLength();

    //System.out.println("  syn: ret lookahead=" + this);
    return true;
  }

  private void releaseBufferedToken() throws IOException {
    //System.out.println("  releaseBufferedToken");

    BufferedOutputToken token = outputBuffer.pollFirst();

    if (token.state != null) {
      // This is an original input token (keepOrig=true case):
      //System.out.println("    hasState");
      restoreState(token.state);
      //System.out.println("      startOffset=" + offsetAtt.startOffset() + " endOffset=" + offsetAtt.endOffset());
    } else {
      clearAttributes();
      //System.out.println("    no state");
      termAtt.append(token.term);

      // We better have a match already:
      assert matchStartOffset != -1;

      offsetAtt.setOffset(matchStartOffset, matchEndOffset);
      //System.out.println("      startOffset=" + matchStartOffset + " endOffset=" + matchEndOffset);
      typeAtt.setType(TYPE_SYNONYM);
    }

    //System.out.println("    lastNodeOut=" + lastNodeOut);
    //System.out.println("    term=" + termAtt);

    posIncrAtt.setPositionIncrement(token.startNode - lastNodeOut);
    lastNodeOut = token.startNode;
    posLenAtt.setPositionLength(token.endNode - token.startNode);
  }

  /** Scans the next input token(s) to see if a synonym matches. Returns true
   *  if a match was found.
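   *
   *  <p>Matching is greedy: the scan keeps pulling lookahead tokens for as
   *  long as the FST has a {@link SynonymMap#WORD_SEPARATOR} arc to follow,
   *  and the longest rule that reached a final state wins; any shorter match
   *  seen along the way is overwritten.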
   */
  private boolean parse() throws IOException {
    // System.out.println(Thread.currentThread().getName() + ": S: parse: " + System.identityHashCode(this));

    // Holds the longest match we've seen so far:
    BytesRef matchOutput = null;
    int matchInputLength = 0;

    BytesRef pendingOutput = fst.outputs.getNoOutput();
    fst.getFirstArc(scratchArc);
    assert scratchArc.output == fst.outputs.getNoOutput();

    // How many tokens in the current match
    int matchLength = 0;
    boolean doFinalCapture = false;

    int lookaheadUpto = lookaheadNextRead;
    matchStartOffset = -1;

    byToken:
    while (true) {
      //System.out.println("  cycle lookaheadUpto=" + lookaheadUpto + " maxPos=" + lookahead.getMaxPos());

      // Pull next token's chars:
      final char[] buffer;
      final int bufferLen;
      final int inputEndOffset;

      if (lookaheadUpto <= lookahead.getMaxPos()) {
        // Still in our lookahead buffer
        BufferedInputToken token = lookahead.get(lookaheadUpto);
        lookaheadUpto++;
        buffer = token.term.chars();
        bufferLen = token.term.length();
        inputEndOffset = token.endOffset;
        //System.out.println("    use buffer now max=" + lookahead.getMaxPos());
        if (matchStartOffset == -1) {
          matchStartOffset = token.startOffset;
        }
      } else {

        // We used up our lookahead buffer of input tokens
        // -- pull next real input token:

        assert finished || liveToken == false;

        if (finished) {
          //System.out.println("    break: finished");
          break;
        } else if (input.incrementToken()) {
          //System.out.println("    input.incrToken");
          liveToken = true;
          buffer = termAtt.buffer();
          bufferLen = termAtt.length();
          if (matchStartOffset == -1) {
            matchStartOffset = offsetAtt.startOffset();
          }
          inputEndOffset = offsetAtt.endOffset();

          lookaheadUpto++;
        } else {
          // No more input tokens
          finished = true;
          //System.out.println("    break: now set finished");
          break;
        }
      }

      matchLength++;
      //System.out.println("    cycle term=" + new String(buffer, 0, bufferLen));

      // Run each char in this token through the FST:
      int bufUpto = 0;
      while (bufUpto < bufferLen) {
        final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
        if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) {
          break byToken;
        }

        // Accum the output
        pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
        bufUpto += Character.charCount(codePoint);
      }

      assert bufUpto == bufferLen;

      // OK, entire token matched; now see if this is a final
      // state in the FST (a match):
      if (scratchArc.isFinal()) {
        matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
        matchInputLength = matchLength;
        matchEndOffset = inputEndOffset;
        //System.out.println("    ** match");
      }

      // See if the FST can continue matching (ie, needs to
      // see the next input token):
      if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
        // No further rules can match here; we're done
        // searching for matching rules starting at the
        // current input position.
        break;
      } else {
        // More matching is possible -- accum the output (if
        // any) of the WORD_SEP arc:
        pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
        doFinalCapture = true;
        if (liveToken) {
          capture();
        }
      }
    }

    if (doFinalCapture && liveToken && finished == false) {
      // Must capture the final token if we captured any prior tokens:
      capture();
    }

    if (matchOutput != null) {
      if (liveToken) {
        // Single input token synonym; we must buffer it now:
        capture();
      }

      // There is a match!
      bufferOutputTokens(matchOutput, matchInputLength);
      lookaheadNextRead += matchInputLength;
      //System.out.println("  precmatch; set lookaheadNextRead=" + lookaheadNextRead + " now max=" + lookahead.getMaxPos());
      lookahead.freeBefore(lookaheadNextRead);
      //System.out.println("  match; set lookaheadNextRead=" + lookaheadNextRead + " now max=" + lookahead.getMaxPos());
      return true;
    } else {
      //System.out.println("  no match; lookaheadNextRead=" + lookaheadNextRead);
      return false;
    }

    //System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
  }

  /** Expands the output graph into the necessary tokens, adding
   *  synonyms as side paths parallel to the input tokens, and
   *  buffers them in the output token buffer.
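   *
   *  <p>A worked sketch of the node numbering: if the two-token input "a b"
   *  matches a rule whose output is the single token "x" with keepOrig=true,
   *  and the start node is s, then the end node is s+2; "x" is buffered
   *  spanning s to s+2 (posLength=2), while the originals are buffered as
   *  "a" (s to s+1) and "b" (s+1 to s+2).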
   */
  private void bufferOutputTokens(BytesRef bytes, int matchInputLength) {
    bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);

    final int code = bytesReader.readVInt();
    final boolean keepOrig = (code & 0x1) == 0;
    //System.out.println("  buffer: keepOrig=" + keepOrig + " matchInputLength=" + matchInputLength);

    // How many nodes along all paths; we need this to assign the
    // node ID for the final end node where all paths merge back:
    int totalPathNodes;
    if (keepOrig) {
      assert matchInputLength > 0;
      totalPathNodes = matchInputLength - 1;
    } else {
      totalPathNodes = 0;
    }

    // How many synonyms we will insert over this match:
    final int count = code >>> 1;

    // TODO: we could encode this instead into the FST:

    // 1st pass: count how many new nodes we need
    List<List<String>> paths = new ArrayList<>();
    for (int outputIDX = 0; outputIDX < count; outputIDX++) {
      int wordID = bytesReader.readVInt();
      synonyms.words.get(wordID, scratchBytes);
      scratchChars.copyUTF8Bytes(scratchBytes);
      int lastStart = 0;

      List<String> path = new ArrayList<>();
      paths.add(path);
      int chEnd = scratchChars.length();
      for (int chUpto = 0; chUpto <= chEnd; chUpto++) {
        if (chUpto == chEnd || scratchChars.charAt(chUpto) == SynonymMap.WORD_SEPARATOR) {
          path.add(new String(scratchChars.chars(), lastStart, chUpto - lastStart));
          lastStart = 1 + chUpto;
        }
      }

      assert path.size() > 0;
      totalPathNodes += path.size() - 1;
    }
    //System.out.println("  totalPathNodes=" + totalPathNodes);

    // 2nd pass: buffer tokens for the graph fragment

    // NOTE: totalPathNodes will be 0 in the case where the matched
    // input is a single token and all outputs are also a single token

    // We "spawn" a side-path for each of the outputs for this matched
    // synonym, all ending back at this end node:
    int startNode = nextNodeOut;

    int endNode = startNode + totalPathNodes + 1;
    //System.out.println("  " + paths.size() + " new side-paths");

    // First, fanout all tokens departing start node for these new side paths:
    int newNodeCount = 0;
    for (List<String> path : paths) {
      int pathEndNode;
      //System.out.println("    path size=" + path.size());
      if (path.size() == 1) {
        // Single token output, so there are no intermediate nodes:
        pathEndNode = endNode;
      } else {
        pathEndNode = nextNodeOut + newNodeCount + 1;
        newNodeCount += path.size() - 1;
      }
      outputBuffer.add(new BufferedOutputToken(null, path.get(0), startNode, pathEndNode));
    }

    // We must do the original tokens last, else the offsets "go backwards":
    if (keepOrig) {
      BufferedInputToken token = lookahead.get(lookaheadNextRead);
      int inputEndNode;
      if (matchInputLength == 1) {
        // Single token matched input, so there are no intermediate nodes:
        inputEndNode = endNode;
      } else {
        inputEndNode = nextNodeOut + newNodeCount + 1;
      }

      //System.out.println("    keepOrig first token: " + token.term);
      outputBuffer.add(new BufferedOutputToken(token.state, token.term.toString(), startNode, inputEndNode));
    }
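    // All side paths (synonym outputs and, if kept, the original tokens)
    // merge back at endNode, so subsequent output nodes are numbered from there: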
    nextNodeOut = endNode;

    // Do full side-path for each syn output:
    for (int pathID = 0; pathID < paths.size(); pathID++) {
      List<String> path = paths.get(pathID);
      if (path.size() > 1) {
        int lastNode = outputBuffer.get(pathID).endNode;
        for (int i = 1; i < path.size() - 1; i++) {
          outputBuffer.add(new BufferedOutputToken(null, path.get(i), lastNode, lastNode + 1));
          lastNode++;
        }
        outputBuffer.add(new BufferedOutputToken(null, path.get(path.size() - 1), lastNode, endNode));
      }
    }

    if (keepOrig && matchInputLength > 1) {
      // Do full "side path" with the original tokens:
      int lastNode = outputBuffer.get(paths.size()).endNode;
      for (int i = 1; i < matchInputLength - 1; i++) {
        BufferedInputToken token = lookahead.get(lookaheadNextRead + i);
        outputBuffer.add(new BufferedOutputToken(token.state, token.term.toString(), lastNode, lastNode + 1));
        lastNode++;
      }
      BufferedInputToken token = lookahead.get(lookaheadNextRead + matchInputLength - 1);
      outputBuffer.add(new BufferedOutputToken(token.state, token.term.toString(), lastNode, endNode));
    }

    /*
    System.out.println("  after buffer: " + outputBuffer.size() + " tokens:");
    for(BufferedOutputToken token : outputBuffer) {
      System.out.println("    tok: " + token.term + " startNode=" + token.startNode + " endNode=" + token.endNode);
    }
    */
  }

  /** Buffers the current input token into lookahead buffer. */
  private void capture() {
    assert liveToken;
    liveToken = false;
    BufferedInputToken token = lookahead.get(lookaheadNextWrite);
    lookaheadNextWrite++;

    token.state = captureState();
    token.startOffset = offsetAtt.startOffset();
    token.endOffset = offsetAtt.endOffset();
    assert token.term.length() == 0;
    token.term.append(termAtt);

    captureCount++;
    maxLookaheadUsed = Math.max(maxLookaheadUsed, lookahead.getBufferSize());
    //System.out.println("  maxLookaheadUsed=" + maxLookaheadUsed);
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    lookahead.reset();
    lookaheadNextWrite = 0;
    lookaheadNextRead = 0;
    captureCount = 0;
    lastNodeOut = -1;
    nextNodeOut = 0;
    matchStartOffset = -1;
    matchEndOffset = -1;
    finished = false;
    liveToken = false;
    outputBuffer.clear();
    maxLookaheadUsed = 0;
    //System.out.println("S: reset");
  }

  // for testing
  int getCaptureCount() {
    return captureCount;
  }

  // for testing
  int getMaxLookaheadUsed() {
    return maxLookaheadUsed;
  }
}