/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.uhighlight;

import java.io.Closeable;
import java.io.IOException;
import java.util.Collections;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;

/**
 * Analyzes the text, producing a single {@link OffsetsEnum} wrapping the {@link TokenStream} filtered to terms
 * in the query, including wildcards.  It can't handle position-sensitive queries (phrases).  Passage accuracy suffers
 * because the freq() is unknown -- it's always {@link Integer#MAX_VALUE} instead.
 */
public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {

  private static final BytesRef[] ZERO_LEN_BYTES_REF_ARRAY = new BytesRef[0];

  public TokenStreamOffsetStrategy(String field, BytesRef[] terms, PhraseHelper phraseHelper,
                                   CharacterRunAutomaton[] automata, Analyzer indexAnalyzer) {
    super(field, ZERO_LEN_BYTES_REF_ARRAY, phraseHelper, convertTermsToAutomata(terms, automata), indexAnalyzer);
    assert phraseHelper.hasPositionSensitivity() == false;
  }

  private static CharacterRunAutomaton[] convertTermsToAutomata(BytesRef[] terms, CharacterRunAutomaton[] automata) {
    CharacterRunAutomaton[] newAutomata = new CharacterRunAutomaton[terms.length + automata.length];
    for (int i = 0; i < terms.length; i++) {
      String termString = terms[i].utf8ToString();
      newAutomata[i] = new CharacterRunAutomaton(Automata.makeString(termString)) {
        @Override
        public String toString() {
          return termString;
        }
      };
    }
    // Append existing automata (that which is used for MTQs)
    System.arraycopy(automata, 0, newAutomata, terms.length, automata.length);
    return newAutomata;
  }

  @Override
  public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
    TokenStream tokenStream = tokenStream(content);
    PostingsEnum mtqPostingsEnum = new TokenStreamPostingsEnum(tokenStream, automata);
    mtqPostingsEnum.advance(docId);
    return Collections.singletonList(new OffsetsEnum(null, mtqPostingsEnum));
  }

  // See class javadocs.
  // TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl?  See TODOs in OffsetsEnum.
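  // How it works: each call to nextPosition() pulls tokens from the wrapped TokenStream until one is accepted
  // by any of the automata, records that token's start/end offsets, and reports the position itself as 0.
  // freq() and docID() are largely faked (freq() always returns Integer.MAX_VALUE), which is why passage
  // scoring accuracy suffers, as noted in the class javadocs.  The stream is closed once it is exhausted.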
  private static class TokenStreamPostingsEnum extends PostingsEnum implements Closeable {
    TokenStream stream; // becomes null when closed
    final CharacterRunAutomaton[] matchers;
    final CharTermAttribute charTermAtt;
    final OffsetAttribute offsetAtt;

    int currentDoc = -1;
    int currentMatch = -1;
    int currentStartOffset = -1;
    int currentEndOffset = -1;

    final BytesRef[] matchDescriptions;

    TokenStreamPostingsEnum(TokenStream ts, CharacterRunAutomaton[] matchers) throws IOException {
      this.stream = ts;
      this.matchers = matchers;
      matchDescriptions = new BytesRef[matchers.length];
      charTermAtt = ts.addAttribute(CharTermAttribute.class);
      offsetAtt = ts.addAttribute(OffsetAttribute.class);
      ts.reset();
    }

    @Override
    public int nextPosition() throws IOException {
      if (stream != null) {
        while (stream.incrementToken()) {
          for (int i = 0; i < matchers.length; i++) {
            if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
              currentStartOffset = offsetAtt.startOffset();
              currentEndOffset = offsetAtt.endOffset();
              currentMatch = i;
              return 0;
            }
          }
        }
        stream.end();
        close();
      }
      // exhausted
      currentStartOffset = currentEndOffset = Integer.MAX_VALUE;
      return Integer.MAX_VALUE;
    }

    @Override
    public int freq() throws IOException {
      return Integer.MAX_VALUE; // lie
    }

    @Override
    public int startOffset() throws IOException {
      assert currentStartOffset >= 0;
      return currentStartOffset;
    }

    @Override
    public int endOffset() throws IOException {
      assert currentEndOffset >= 0;
      return currentEndOffset;
    }

    // TOTAL HACK; used in OffsetsEnum.getTerm()
    @Override
    public BytesRef getPayload() throws IOException {
      if (matchDescriptions[currentMatch] == null) {
        matchDescriptions[currentMatch] = new BytesRef(matchers[currentMatch].toString());
      }
      return matchDescriptions[currentMatch];
    }

    @Override
    public int docID() {
      return currentDoc;
    }

    @Override
    public int nextDoc() throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public int advance(int target) throws IOException {
      return currentDoc = target;
    }

    @Override
    public long cost() {
      return 0;
    }

    @Override
    public void close() throws IOException {
      if (stream != null) {
        stream.close();
        stream = null;
      }
    }
  }
}
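/*
 * Minimal usage sketch (illustrative only).  In practice this strategy is constructed by the UnifiedHighlighter
 * machinery rather than by user code; the field name "body", the query term, the analyzer choice and the
 * content string below are assumptions made up for the example, and PhraseHelper.NONE stands in for "no
 * position-sensitive (phrase) handling", which this strategy requires anyway (see the constructor assertion).
 *
 *   Analyzer analyzer = new StandardAnalyzer();
 *   BytesRef[] queryTerms = {new BytesRef("lucene")};
 *   CharacterRunAutomaton[] wildcardAutomata = new CharacterRunAutomaton[0]; // automata from multi-term queries, if any
 *   TokenStreamOffsetStrategy strategy =
 *       new TokenStreamOffsetStrategy("body", queryTerms, PhraseHelper.NONE, wildcardAutomata, analyzer);
 *   List<OffsetsEnum> offsetsEnums =
 *       strategy.getOffsetsEnums(indexReader, docId, "Apache Lucene is a search library");
 *   // offsetsEnums holds a single OffsetsEnum whose offsets come from re-analyzing the content string,
 *   // not from the index; indexReader and docId are only carried through for API compatibility here.
 */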