/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis.synonym; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.util.AttributeSource; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.LinkedList; /** SynonymFilter handles multi-token synonyms with variable position increment offsets. * <p> * The matched tokens from the input stream may be optionally passed through (includeOrig=true) * or discarded. If the original tokens are included, the position increments may be modified * to retain absolute positions after merging with the synonym tokenstream. * <p> * Generated synonyms will start at the same position as the first matched source token. */ public final class SynonymFilter extends TokenFilter { private final SynonymMap map; // Map<String, SynonymMap> private Iterator<AttributeSource> replacement; // iterator over generated tokens public SynonymFilter(TokenStream in, SynonymMap map) { super(in); this.map = map; // just ensuring these attributes exist... addAttribute(CharTermAttribute.class); addAttribute(PositionIncrementAttribute.class); addAttribute(OffsetAttribute.class); addAttribute(TypeAttribute.class); } /* * Need to worry about multiple scenarios: * - need to go for the longest match * a b => foo #shouldn't match if "a b" is followed by "c d" * a b c d => bar * - need to backtrack - retry matches for tokens already read * a b c d => foo * b c => bar * If the input stream is "a b c x", one will consume "a b c d" * trying to match the first rule... all but "a" should be * pushed back so a match may be made on "b c". * - don't try and match generated tokens (thus need separate queue) * matching is not recursive. * - handle optional generation of original tokens in all these cases, * merging token streams to preserve token positions. * - preserve original positionIncrement of first matched token */ @Override public boolean incrementToken() throws IOException { while (true) { // if there are any generated tokens, return them... don't try any // matches against them, as we specifically don't want recursion. if (replacement!=null && replacement.hasNext()) { copy(this, replacement.next()); return true; } // common case fast-path of first token not matching anything AttributeSource firstTok = nextTok(); if (firstTok == null) return false; CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class); SynonymMap result = map.submap!=null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null; if (result == null) { copy(this, firstTok); return true; } // fast-path failed, clone ourselves if needed if (firstTok == this) firstTok = cloneAttributes(); // OK, we matched a token, so find the longest match. matched = new LinkedList<AttributeSource>(); result = match(result); if (result==null) { // no match, simply return the first token read. copy(this, firstTok); return true; } // reuse, or create new one each time? ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(result.synonyms.length + matched.size() + 1); // // there was a match... let's generate the new tokens, merging // in the matched tokens (position increments need adjusting) // AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast(); boolean includeOrig = result.includeOrig(); AttributeSource origTok = includeOrig ? firstTok : null; PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class); int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream int repPos=0; // curr position in replacement token stream int pos=0; // current position in merged token stream for (int i=0; i<result.synonyms.length; i++) { Token repTok = result.synonyms[i]; AttributeSource newTok = firstTok.cloneAttributes(); CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class); OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class); PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class); OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class); newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset()); newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length()); repPos += repTok.getPositionIncrement(); if (i==0) repPos=origPos; // make position of first token equal to original // if necessary, insert original tokens and adjust position increment while (origTok != null && origPos <= repPos) { PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class); origPosInc.setPositionIncrement(origPos-pos); generated.add(origTok); pos += origPosInc.getPositionIncrement(); origTok = matched.isEmpty() ? null : matched.removeFirst(); if (origTok != null) { origPosInc = origTok.addAttribute(PositionIncrementAttribute.class); origPos += origPosInc.getPositionIncrement(); } } newPosIncAtt.setPositionIncrement(repPos - pos); generated.add(newTok); pos += newPosIncAtt.getPositionIncrement(); } // finish up any leftover original tokens while (origTok!=null) { PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class); origPosInc.setPositionIncrement(origPos-pos); generated.add(origTok); pos += origPosInc.getPositionIncrement(); origTok = matched.isEmpty() ? null : matched.removeFirst(); if (origTok != null) { origPosInc = origTok.addAttribute(PositionIncrementAttribute.class); origPos += origPosInc.getPositionIncrement(); } } // what if we replaced a longer sequence with a shorter one? // a/0 b/5 => foo/0 // should I re-create the gap on the next buffered token? replacement = generated.iterator(); // Now return to the top of the loop to read and return the first // generated token.. The reason this is done is that we may have generated // nothing at all, and may need to continue with more matching logic. } } // // Defer creation of the buffer until the first time it is used to // optimize short fields with no matches. // private LinkedList<AttributeSource> buffer; private LinkedList<AttributeSource> matched; private AttributeSource nextTok() throws IOException { if (buffer!=null && !buffer.isEmpty()) { return buffer.removeFirst(); } else { if (input.incrementToken()) { return this; } else return null; } } private void pushTok(AttributeSource t) { if (buffer==null) buffer=new LinkedList<AttributeSource>(); buffer.addFirst(t); } private SynonymMap match(SynonymMap map) throws IOException { SynonymMap result = null; if (map.submap != null) { AttributeSource tok = nextTok(); if (tok != null) { // clone ourselves. if (tok == this) tok = cloneAttributes(); // check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level? CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class); SynonymMap subMap = map.submap.get(termAtt.buffer(), 0, termAtt.length()); if (subMap != null) { // recurse result = match(subMap); } if (result != null) { matched.addFirst(tok); } else { // push back unmatched token pushTok(tok); } } } // if no longer sequence matched, so if this node has synonyms, it's the match. if (result==null && map.synonyms!=null) { result = map; } return result; } private void copy(AttributeSource target, AttributeSource source) { if (target != source) source.copyTo(target); } @Override public void reset() throws IOException { input.reset(); replacement = null; } }