/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.uhighlight;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.UnicodeUtil;

/**
 * TokenStream created from a term vector field. The term vector requires positions and/or offsets (either one).
 * If you want payloads, add PayloadAttributeImpl (as you normally would), but don't assume the attribute is
 * already added just because you know the term vector has payloads: the first call to incrementToken() checks
 * whether you asked for them, and if you didn't, they won't be loaded. This TokenStream supports an efficient
 * {@link #reset()}, so there's no need to wrap it with a caching implementation.
 *
 * @lucene.internal
 */
final class TokenStreamFromTermVector extends TokenStream {
  // note: differs from the similar class in the standard highlighter. This one is optimized for sparse cases.

  /** Content length divided by distinct positions; an average for dense text. */
  private static final double AVG_CHARS_PER_POSITION = 6;

  private static final int INSERTION_SORT_THRESHOLD = 16;

  private final Terms vector;

  private final int filteredDocId;

  private final CharTermAttribute termAttribute;

  private final PositionIncrementAttribute positionIncrementAttribute;

  private final int offsetLength;

  private final float loadFactor;

  private OffsetAttribute offsetAttribute; // maybe null

  private PayloadAttribute payloadAttribute; // maybe null

  private CharsRefBuilder termCharsBuilder; // term data here

  private BytesRefArray payloadsBytesRefArray; // only used when payloadAttribute is non-null
  private BytesRefBuilder spareBytesRefBuilder; // only used when payloadAttribute is non-null

  private TokenLL firstToken = null; // the head of a linked-list

  private TokenLL incrementToken = null;

  private boolean initialized = false; // lazy
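  // A minimal usage sketch, for orientation only. It assumes an IndexReader named "reader" and a field
  // named "body" whose term vectors store positions and/or offsets; those names are illustrative, not part
  // of this class. As with any TokenStream, call reset() before consuming and end()/close() afterwards.
  //
  //   Terms tvTerms = reader.getTermVector(docId, "body");
  //   if (tvTerms != null) {
  //     TokenStream ts = new TokenStreamFromTermVector(tvTerms, -1); // -1: character length unknown
  //     CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
  //     ts.reset();
  //     while (ts.incrementToken()) {
  //       // consume termAtt (and OffsetAttribute / PayloadAttribute, if added) here
  //     }
  //     ts.end();
  //     ts.close();
  //   }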
  public TokenStreamFromTermVector(Terms vector, int offsetLength) throws IOException {
    this(vector, 0, offsetLength, 1f);
  }

  /**
   * Constructor.
   *
   * @param vector        Terms that contains the data for creating the TokenStream. Must have positions
   *                      and/or offsets.
   * @param filteredDocId The docID we will process.
   * @param offsetLength  Supply the character length of the text being uninverted, or a lower value if you
   *                      don't want to invert text beyond an offset (doing so acts as a filter). If you
   *                      don't know the length, pass -1. In conjunction with {@code loadFactor}, it's used
   *                      to determine how many buckets to create during uninversion. It's also used to
   *                      filter out tokens with a start offset exceeding this value.
   * @param loadFactor    The fraction of tokens from the original terms (by position count) that are
   *                      expected to be inverted. If they are filtered (e.g.
   *                      {@link org.apache.lucene.index.FilterLeafReader.FilterTerms}) then consider using
   *                      less than 1.0 to avoid wasting space. 1.0 means all; 1/64 suggests that only
   *                      1/64th of the tokens in the vector are expected to come through.
   */
  TokenStreamFromTermVector(Terms vector, int filteredDocId, int offsetLength, float loadFactor) throws IOException {
    super();
    this.filteredDocId = filteredDocId;
    this.offsetLength = offsetLength == Integer.MAX_VALUE ? -1 : offsetLength;
    if (loadFactor <= 0f || loadFactor > 1f) {
      throw new IllegalArgumentException("loadFactor should be > 0 and <= 1");
    }
    this.loadFactor = loadFactor;
    assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*";
    if (!vector.hasPositions() && !vector.hasOffsets()) {
      throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
    }
    assert vector.hasFreqs();
    this.vector = vector;
    termAttribute = addAttribute(CharTermAttribute.class);
    positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
  }

  public Terms getTermVectorTerms() {
    return vector;
  }

  @Override
  public void reset() throws IOException {
    incrementToken = null;
    super.reset();
  }

  // We delay initialization because we can see which attributes the consumer wants, particularly payloads.
  private void init() throws IOException {
    assert !initialized;
    int dpEnumFlags = 0;
    if (vector.hasOffsets()) {
      offsetAttribute = addAttribute(OffsetAttribute.class);
      dpEnumFlags |= PostingsEnum.OFFSETS;
    }
    if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
      payloadAttribute = getAttribute(PayloadAttribute.class);
      payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
      spareBytesRefBuilder = new BytesRefBuilder();
      dpEnumFlags |= PostingsEnum.PAYLOADS;
    }

    // We put term data here
    termCharsBuilder = new CharsRefBuilder();
    termCharsBuilder.grow(initTotalTermCharLen());

    // Step 1: iterate termsEnum and create a token, placing it into a bucketed array (given a load factor)

    final TokenLL[] tokenBuckets = initTokenBucketsArray();

    final double OFFSET_TO_BUCKET_IDX = loadFactor / AVG_CHARS_PER_POSITION;
    final double POSITION_TO_BUCKET_IDX = loadFactor;

    final TermsEnum termsEnum = vector.iterator();
    BytesRef termBytesRef;
    PostingsEnum dpEnum = null;
    final CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder(); // only for the UTF8->UTF16 call
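    // Bucketing sketch (the numbers below are illustrative, not from the source): each token goes into a
    // bucket keyed by its approximate position. With offsets, bucketIdx = startOffset * (loadFactor /
    // AVG_CHARS_PER_POSITION); e.g. loadFactor=1.0 and startOffset=120 gives bucket (int) (120 / 6.0) = 20.
    // Without offsets it's position * loadFactor. Indices past the end are clamped to the last bucket, and
    // each bucket is a linked list, so collisions are cheap; Step 2 below restores the exact ordering.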
    TERM_LOOP:
    while ((termBytesRef = termsEnum.next()) != null) {
      // Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
      // note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
      tempCharsRefBuilder.grow(termBytesRef.length);
      final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
      final int termCharsOff = termCharsBuilder.length();
      termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);

      dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);
      assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
      int currentDocId = dpEnum.advance(filteredDocId);
      if (currentDocId != filteredDocId) {
        continue; // not expected
      }
      final int freq = dpEnum.freq();
      for (int j = 0; j < freq; j++) {
        TokenLL token = new TokenLL();
        token.position = dpEnum.nextPosition(); // can be -1 if not in the TV
        token.termCharsOff = termCharsOff;
        token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);

        // copy offset (if it's there) and compute bucketIdx
        int bucketIdx;
        if (offsetAttribute != null) {
          token.startOffset = dpEnum.startOffset();
          if (offsetLength >= 0 && token.startOffset > offsetLength) {
            continue TERM_LOOP; // filter this token out; exceeds threshold
          }
          token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
          bucketIdx = (int) (token.startOffset * OFFSET_TO_BUCKET_IDX);
        } else {
          bucketIdx = (int) (token.position * POSITION_TO_BUCKET_IDX);
        }
        if (bucketIdx >= tokenBuckets.length) {
          bucketIdx = tokenBuckets.length - 1;
        }

        if (payloadAttribute != null) {
          final BytesRef payload = dpEnum.getPayload();
          token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
        }

        // add the token to the head of the bucket's linked list
        token.next = tokenBuckets[bucketIdx];
        tokenBuckets[bucketIdx] = token;
      }
    }

    // Step 2: link all tokens into one linked list and sort all tokens at the same position

    firstToken = initLinkAndSortTokens(tokenBuckets);

    // If the term vector didn't have positions, synthesize them
    if (!vector.hasPositions() && firstToken != null) {
      TokenLL prevToken = firstToken;
      prevToken.position = 0;
      for (TokenLL token = prevToken.next; token != null; prevToken = token, token = token.next) {
        if (prevToken.startOffset == token.startOffset) {
          token.position = prevToken.position;
        } else {
          token.position = prevToken.position + 1;
        }
      }
    }

    initialized = true;
  }

  private static TokenLL initLinkAndSortTokens(TokenLL[] tokenBuckets) {
    TokenLL firstToken = null;
    List<TokenLL> scratchTokenArray = new ArrayList<>(); // declare here for re-use.  TODO use native array
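    // Sketch of what follows (a description, not extra behavior): walk the buckets in order; sort the tokens
    // within each bucket (an insertion sort that relinks the list when the bucket holds fewer than
    // INSERTION_SORT_THRESHOLD tokens, Collections.sort otherwise); then append each sorted bucket to the
    // tail of the overall list headed by firstToken.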
    TokenLL prevToken = null;
    for (TokenLL tokenHead : tokenBuckets) {
      if (tokenHead == null) {
        continue;
      }
      // sort the tokens in this bucket and link them; track the first and last
      TokenLL tokenTail;
      // just one token
      if (tokenHead.next == null) {
        tokenTail = tokenHead;
      } else {
        // add the linked list to a temporary array
        for (TokenLL cur = tokenHead; cur != null; cur = cur.next) {
          scratchTokenArray.add(cur);
        }
        // sort; and set tokenHead & tokenTail
        if (scratchTokenArray.size() < INSERTION_SORT_THRESHOLD) {
          // insertion sort by creating a linked list (leave scratchTokenArray alone)
          tokenHead = tokenTail = scratchTokenArray.get(0);
          tokenHead.next = null;
          for (int i = 1; i < scratchTokenArray.size(); i++) {
            TokenLL insertToken = scratchTokenArray.get(i);
            if (insertToken.compareTo(tokenHead) <= 0) {
              // takes the place of tokenHead
              insertToken.next = tokenHead;
              tokenHead = insertToken;
            } else {
              // goes somewhere after tokenHead
              for (TokenLL prev = tokenHead; true; prev = prev.next) {
                if (prev.next == null || insertToken.compareTo(prev.next) <= 0) {
                  if (prev.next == null) {
                    tokenTail = insertToken;
                  }
                  insertToken.next = prev.next;
                  prev.next = insertToken;
                  break;
                }
              }
            }
          }
        } else {
          Collections.sort(scratchTokenArray);
          // take the tokens back out and create a linked list
          TokenLL prev = tokenHead = scratchTokenArray.get(0);
          for (int i = 1; i < scratchTokenArray.size(); i++) {
            prev.next = scratchTokenArray.get(i);
            prev = prev.next;
          }
          tokenTail = prev;
          tokenTail.next = null;
        }
        scratchTokenArray.clear(); // too bad ArrayList nulls it out; we don't actually need that
      }

      // link to the previous bucket's tail
      if (prevToken != null) {
        assert prevToken.next == null;
        prevToken.next = tokenHead; // concatenate linked lists
        assert prevToken.compareTo(tokenHead) < 0 : "wrong offset / position ordering expectations";
      } else {
        assert firstToken == null;
        firstToken = tokenHead;
      }
      prevToken = tokenTail;
    }
    return firstToken;
  }

  private int initTotalTermCharLen() throws IOException {
    int guessNumTerms;
    if (vector.size() != -1) {
      guessNumTerms = (int) vector.size();
    } else if (offsetLength != -1) {
      guessNumTerms = (int) (offsetLength * 0.33); // guess 1/3rd
    } else {
      return 128;
    }
    return Math.max(64, (int) (guessNumTerms * loadFactor * 7.0)); // 7 is an over-estimate of the average term length
  }

  private TokenLL[] initTokenBucketsArray() throws IOException {
    // Estimate the number of non-empty positions (number of tokens, excluding same-position synonyms).
    int positionsEstimate;
    if (offsetLength == -1) { // no clue what the char length is
      // Estimate the number of position slots we need from term stats based on Wikipedia.
      int sumTotalTermFreq = (int) vector.getSumTotalTermFreq();
      if (sumTotalTermFreq == -1) { // unfortunately term vectors seem to not have this stat
        int size = (int) vector.size();
        if (size == -1) { // doesn't happen with term vectors, it seems, but pick a default anyway
          size = 128;
        }
        sumTotalTermFreq = (int) (size * 2.4);
      }
      positionsEstimate = (int) (sumTotalTermFreq * 1.5); // less than 1 in 10 docs exceed this
    } else {
      // guess the number of token positions by this factor
      positionsEstimate = (int) (offsetLength / AVG_CHARS_PER_POSITION);
    }
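    // Worked example (numbers illustrative): offsetLength=3000 chars and AVG_CHARS_PER_POSITION=6 give a
    // positionsEstimate of 500; with loadFactor=1.0 that allocates 500 buckets (never fewer than 1).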
    // apply the load factor
    return new TokenLL[Math.max(1, (int) (positionsEstimate * loadFactor))];
  }

  @Override
  public boolean incrementToken() throws IOException {
    int posInc;
    if (incrementToken == null) {
      if (!initialized) {
        init();
        assert initialized;
      }
      incrementToken = firstToken;
      if (incrementToken == null) {
        return false;
      }
      posInc = incrementToken.position + 1; // first token normally has pos 0; add 1 to get posInc
    } else if (incrementToken.next != null) {
      int lastPosition = incrementToken.position;
      incrementToken = incrementToken.next;
      posInc = incrementToken.position - lastPosition;
    } else {
      return false;
    }
    clearAttributes();
    termAttribute.copyBuffer(termCharsBuilder.chars(), incrementToken.termCharsOff, incrementToken.termCharsLen);
    positionIncrementAttribute.setPositionIncrement(posInc);
    if (offsetAttribute != null) {
      offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.startOffset + incrementToken.endOffsetInc);
    }
    if (payloadAttribute != null && incrementToken.payloadIndex >= 0) {
      payloadAttribute.setPayload(payloadsBytesRefArray.get(spareBytesRefBuilder, incrementToken.payloadIndex));
    }
    return true;
  }

  private static class TokenLL implements Comparable<TokenLL> {
    // This class should weigh 32 bytes, including object header

    int termCharsOff; // see termCharsBuilder
    short termCharsLen;

    int position;
    int startOffset;
    short endOffsetInc; // add to startOffset to get endOffset

    int payloadIndex;

    TokenLL next;

    @Override
    public int compareTo(TokenLL tokenB) {
      int cmp = Integer.compare(this.position, tokenB.position);
      if (cmp == 0) {
        cmp = Integer.compare(this.startOffset, tokenB.startOffset);
        if (cmp == 0) {
          cmp = Short.compare(this.endOffsetInc, tokenB.endOffsetInc);
        }
      }
      return cmp;
    }
  }
}