package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.index.*;
final class ExactPhraseScorer extends Scorer {
private final byte[] norms;
private final float value;
private static final int SCORE_CACHE_SIZE = 32;
private final float[] scoreCache = new float[SCORE_CACHE_SIZE];
private final int endMinus1;
private final static int CHUNK = 4096;
private int gen;
private final int[] counts = new int[CHUNK];
private final int[] gens = new int[CHUNK];
boolean noDocs;
private final static class ChunkState {
final TermPositions posEnum;
final int offset;
final boolean useAdvance;
int posUpto;
int posLimit;
int pos;
int lastPos;
public ChunkState(TermPositions posEnum, int offset, boolean useAdvance) {
this.posEnum = posEnum;
this.offset = offset;
this.useAdvance = useAdvance;
}
}
private final ChunkState[] chunkStates;
private int docID = -1;
private int freq;
ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
Similarity similarity, byte[] norms) throws IOException {
super(similarity, weight);
this.norms = norms;
this.value = weight.getValue();
chunkStates = new ChunkState[postings.length];
endMinus1 = postings.length-1;
for(int i=0;i<postings.length;i++) {
// Coarse optimization: advance(target) is fairly
// costly, so, if the relative freq of the 2nd
// rarest term is not that much (> 1/5th) rarer than
// the first term, then we just use .nextDoc() when
// ANDing. This buys ~15% gain for phrases where
// freq of rarest 2 terms is close:
final boolean useAdvance = postings[i].docFreq > 5*postings[0].docFreq;
chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position, useAdvance);
if (i > 0 && !postings[i].postings.next()) {
noDocs = true;
return;
}
}
for (int i = 0; i < SCORE_CACHE_SIZE; i++) {
scoreCache[i] = getSimilarity().tf((float) i) * value;
}
}
@Override
public int nextDoc() throws IOException {
while(true) {
// first (rarest) term
if (!chunkStates[0].posEnum.next()) {
docID = DocIdSetIterator.NO_MORE_DOCS;
return docID;
}
final int doc = chunkStates[0].posEnum.doc();
// not-first terms
int i = 1;
while(i < chunkStates.length) {
final ChunkState cs = chunkStates[i];
int doc2 = cs.posEnum.doc();
if (cs.useAdvance) {
if (doc2 < doc) {
if (!cs.posEnum.skipTo(doc)) {
docID = DocIdSetIterator.NO_MORE_DOCS;
return docID;
} else {
doc2 = cs.posEnum.doc();
}
}
} else {
int iter = 0;
while(doc2 < doc) {
// safety net -- fallback to .skipTo if we've
// done too many .nextDocs
if (++iter == 50) {
if (!cs.posEnum.skipTo(doc)) {
docID = DocIdSetIterator.NO_MORE_DOCS;
return docID;
} else {
doc2 = cs.posEnum.doc();
}
break;
} else {
if (cs.posEnum.next()) {
doc2 = cs.posEnum.doc();
} else {
docID = DocIdSetIterator.NO_MORE_DOCS;
return docID;
}
}
}
}
if (doc2 > doc) {
break;
}
i++;
}
if (i == chunkStates.length) {
// this doc has all the terms -- now test whether
// phrase occurs
docID = doc;
freq = phraseFreq();
if (freq != 0) {
return docID;
}
}
}
}
@Override
public int advance(int target) throws IOException {
// first term
if (!chunkStates[0].posEnum.skipTo(target)) {
docID = DocIdSetIterator.NO_MORE_DOCS;
return docID;
}
int doc = chunkStates[0].posEnum.doc();
while(true) {
// not-first terms
int i = 1;
while(i < chunkStates.length) {
int doc2 = chunkStates[i].posEnum.doc();
if (doc2 < doc) {
if (!chunkStates[i].posEnum.skipTo(doc)) {
docID = DocIdSetIterator.NO_MORE_DOCS;
return docID;
} else {
doc2 = chunkStates[i].posEnum.doc();
}
}
if (doc2 > doc) {
break;
}
i++;
}
if (i == chunkStates.length) {
// this doc has all the terms -- now test whether
// phrase occurs
docID = doc;
freq = phraseFreq();
if (freq != 0) {
return docID;
}
}
if (!chunkStates[0].posEnum.next()) {
docID = DocIdSetIterator.NO_MORE_DOCS;
return docID;
} else {
doc = chunkStates[0].posEnum.doc();
}
}
}
@Override
public String toString() {
return "ExactPhraseScorer(" + weight + ")";
}
@Override
public float freq() {
return freq;
}
@Override
public int docID() {
return docID;
}
@Override
public float score() throws IOException {
final float raw; // raw score
if (freq < SCORE_CACHE_SIZE) {
raw = scoreCache[freq];
} else {
raw = getSimilarity().tf((float) freq) * value;
}
return norms == null ? raw : raw * getSimilarity().decodeNormValue(norms[docID]); // normalize
}
private int phraseFreq() throws IOException {
freq = 0;
// init chunks
for(int i=0;i<chunkStates.length;i++) {
final ChunkState cs = chunkStates[i];
cs.posLimit = cs.posEnum.freq();
cs.pos = cs.offset + cs.posEnum.nextPosition();
cs.posUpto = 1;
cs.lastPos = -1;
}
int chunkStart = 0;
int chunkEnd = CHUNK;
// process chunk by chunk
boolean end = false;
// TODO: we could fold in chunkStart into offset and
// save one subtract per pos incr
while(!end) {
gen++;
if (gen == 0) {
// wraparound
Arrays.fill(gens, 0);
gen++;
}
// first term
{
final ChunkState cs = chunkStates[0];
while(cs.pos < chunkEnd) {
if (cs.pos > cs.lastPos) {
cs.lastPos = cs.pos;
final int posIndex = cs.pos - chunkStart;
counts[posIndex] = 1;
assert gens[posIndex] != gen;
gens[posIndex] = gen;
}
if (cs.posUpto == cs.posLimit) {
end = true;
break;
}
cs.posUpto++;
cs.pos = cs.offset + cs.posEnum.nextPosition();
}
}
// middle terms
boolean any = true;
for(int t=1;t<endMinus1;t++) {
final ChunkState cs = chunkStates[t];
any = false;
while(cs.pos < chunkEnd) {
if (cs.pos > cs.lastPos) {
cs.lastPos = cs.pos;
final int posIndex = cs.pos - chunkStart;
if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == t) {
// viable
counts[posIndex]++;
any = true;
}
}
if (cs.posUpto == cs.posLimit) {
end = true;
break;
}
cs.posUpto++;
cs.pos = cs.offset + cs.posEnum.nextPosition();
}
if (!any) {
break;
}
}
if (!any) {
// petered out for this chunk
chunkStart += CHUNK;
chunkEnd += CHUNK;
continue;
}
// last term
{
final ChunkState cs = chunkStates[endMinus1];
while(cs.pos < chunkEnd) {
if (cs.pos > cs.lastPos) {
cs.lastPos = cs.pos;
final int posIndex = cs.pos - chunkStart;
if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == endMinus1) {
freq++;
}
}
if (cs.posUpto == cs.posLimit) {
end = true;
break;
}
cs.posUpto++;
cs.pos = cs.offset + cs.posEnum.nextPosition();
}
}
chunkStart += CHUNK;
chunkEnd += CHUNK;
}
return freq;
}
}