package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Comparator;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.util.BytesRef;
/**
 * Implements the freq/prox (term frequency + positions/payloads) portion of the
 * in-RAM indexing chain.  At flush time it gathers the postings accumulated by
 * every {@link FreqProxTermsWriterPerField} across all indexing threads,
 * interleaves fields with the same name into a single stream per field, and
 * writes the result to the segment through the codec's {@link FieldsConsumer}.
 */
final class FreqProxTermsWriter extends TermsHashConsumer {

  @Override
  public TermsHashConsumerPerThread addThread(TermsHashPerThread perThread) {
    return new FreqProxTermsWriterPerThread(perThread);
  }

  // No doc stores are owned by this consumer; nothing to close.
  @Override
  void closeDocStore(SegmentWriteState state) {}

  // Per-field/per-thread state is discarded elsewhere; nothing extra to undo here.
  @Override
  void abort() {}

  // Number of docs in the segment being flushed; used only in assertions to
  // verify merged docIDs stay in range.
  private int flushedDocCount;

  // TODO: would be nice to factor out more of this, eg the
  // FreqProxFieldMergeState, and code to visit all Fields
  // under the same FieldInfo together, up into TermsHash*.
  // Other writers would presumably share a lot of this...

  /**
   * Flushes all buffered postings to the segment.
   *
   * <p>Fields with the same name, seen by different indexing threads, are
   * grouped together and their postings interleaved (see
   * {@link #appendPostings}) so the on-disk segment sees a single ordered
   * stream per field.  After each field group is written, its in-RAM postings
   * hash is reset/shrunk to reclaim memory.
   *
   * @param threadsAndFields per-thread collections of the per-field consumers
   *                         that accumulated postings for this segment
   * @param state            describes the segment being written (codec, doc count, ...)
   * @throws IOException if the codec fails writing the postings
   */
  @Override
  public void flush(Map<TermsHashConsumerPerThread,Collection<TermsHashConsumerPerField>> threadsAndFields, final SegmentWriteState state) throws IOException {

    // Gather all FieldData's that have postings, across all
    // ThreadStates
    List<FreqProxTermsWriterPerField> allFields = new ArrayList<FreqProxTermsWriterPerField>();

    flushedDocCount = state.numDocs;

    for (Map.Entry<TermsHashConsumerPerThread,Collection<TermsHashConsumerPerField>> entry : threadsAndFields.entrySet()) {

      Collection<TermsHashConsumerPerField> fields = entry.getValue();

      for (final TermsHashConsumerPerField i : fields) {
        final FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) i;
        // Skip fields that saw no terms in this segment
        if (perField.termsHashPerField.numPostings > 0)
          allFields.add(perField);
      }
    }

    final int numAllFields = allFields.size();

    // Sort by field name, so that all per-thread instances of the same field
    // become adjacent and can be merged in one pass below
    Collections.sort(allFields);

    // TODO: allow Lucene user to customize this codec:
    final FieldsConsumer consumer = state.codec.fieldsConsumer(state);

    /*
    Current writer chain:
      FieldsConsumer
        -> IMPL: FormatPostingsTermsDictWriter
          -> TermsConsumer
            -> IMPL: FormatPostingsTermsDictWriter.TermsWriter
              -> DocsConsumer
                -> IMPL: FormatPostingsDocsWriter
                  -> PositionsConsumer
                    -> IMPL: FormatPostingsPositionsWriter
     */

    int start = 0;
    while(start < numAllFields) {
      // [start, end) covers all per-thread instances of one field name
      final FieldInfo fieldInfo = allFields.get(start).fieldInfo;
      final String fieldName = fieldInfo.name;

      int end = start+1;
      while(end < numAllFields && allFields.get(end).fieldInfo.name.equals(fieldName))
        end++;

      FreqProxTermsWriterPerField[] fields = new FreqProxTermsWriterPerField[end-start];
      for(int i=start;i<end;i++) {
        fields[i-start] = allFields.get(i);

        // Aggregate the storePayload as seen by the same
        // field across multiple threads
        fieldInfo.storePayloads |= fields[i-start].hasPayloads;
      }

      // If this field has postings then add them to the
      // segment
      appendPostings(fields, consumer);

      // Release the per-field RAM now that its postings are on disk
      for(int i=0;i<fields.length;i++) {
        TermsHashPerField perField = fields[i].termsHashPerField;
        int numPostings = perField.numPostings;
        perField.reset();
        perField.shrinkHash(numPostings);
        fields[i].reset();
      }

      start = end;
    }

    // Finally recycle each thread's term hash (true = also free byte blocks)
    for (Map.Entry<TermsHashConsumerPerThread,Collection<TermsHashConsumerPerField>> entry : threadsAndFields.entrySet()) {
      FreqProxTermsWriterPerThread perThread = (FreqProxTermsWriterPerThread) entry.getKey();
      perThread.termsHashPerThread.reset(true);
    }

    consumer.close();
  }

  // Reusable scratch buffer for decoding payloads; grown lazily and shared
  // across all terms/docs of a flush (safe: flush is single-threaded).
  BytesRef payload;

  /* Walk through all unique text tokens (Posting
   * instances) found in this field and serialize them
   * into a single RAM segment. */
  void appendPostings(FreqProxTermsWriterPerField[] fields,
                      FieldsConsumer consumer)
    throws CorruptIndexException, IOException {

    // All entries of fields share the same field name (grouped by caller);
    // each came from a different indexing thread.
    int numFields = fields.length;

    final BytesRef text = new BytesRef();

    final FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];

    final TermsConsumer termsConsumer = consumer.addField(fields[0].fieldInfo);
    final Comparator<BytesRef> termComp = termsConsumer.getComparator();

    for(int i=0;i<numFields;i++) {
      FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i], termComp);

      assert fms.field.fieldInfo == fields[0].fieldInfo;

      // Should always be true
      boolean result = fms.nextTerm();
      assert result;
    }

    // Scratch array holding the subset of mergeStates currently positioned
    // on the smallest (shared) term
    FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];

    final boolean currentFieldOmitTermFreqAndPositions = fields[0].fieldInfo.omitTermFreqAndPositions;

    //System.out.println("flush terms field=" + fields[0].fieldInfo.name);

    // TODO: really TermsHashPerField should take over most
    // of this loop, including merge sort of terms from
    // multiple threads and interacting with the
    // TermsConsumer, only calling out to us (passing us the
    // DocsConsumer) to handle delivery of docs/positions

    // Outer loop: n-way merge of the per-thread term streams, in termComp order
    while(numFields > 0) {

      // Get the next term to merge: linear scan for the minimum term,
      // collecting every state positioned on that same term
      termStates[0] = mergeStates[0];
      int numToMerge = 1;

      // TODO: pqueue
      for(int i=1;i<numFields;i++) {
        final int cmp = termComp.compare(mergeStates[i].text, termStates[0].text);
        if (cmp < 0) {
          termStates[0] = mergeStates[i];
          numToMerge = 1;
        } else if (cmp == 0) {
          termStates[numToMerge++] = mergeStates[i];
        }
      }

      // Need shallow copy here because termStates[0].text
      // changes by the time we call finishTerm
      text.bytes = termStates[0].text.bytes;
      text.offset = termStates[0].text.offset;
      text.length = termStates[0].text.length;  

      //System.out.println("  term=" + text.toUnicodeString());
      //System.out.println("  term=" + text.toString());

      final PostingsConsumer postingsConsumer = termsConsumer.startTerm(text);

      // Now termStates has numToMerge FieldMergeStates
      // which all share the same term.  Now we must
      // interleave the docID streams.
      int numDocs = 0;
      while(numToMerge > 0) {

        // Pick the state with the smallest current docID (linear scan)
        FreqProxFieldMergeState minState = termStates[0];
        for(int i=1;i<numToMerge;i++) {
          if (termStates[i].docID < minState.docID) {
            minState = termStates[i];
          }
        }

        final int termDocFreq = minState.termFreq;
        numDocs++;

        assert minState.docID < flushedDocCount: "doc=" + minState.docID + " maxDoc=" + flushedDocCount;

        postingsConsumer.startDoc(minState.docID, termDocFreq);

        final ByteSliceReader prox = minState.prox;

        // Carefully copy over the prox + payload info,
        // changing the format to match Lucene's segment
        // format.

        if (!currentFieldOmitTermFreqAndPositions) {
          // omitTermFreqAndPositions == false so we do write positions &
          // payload
          int position = 0;
          for(int j=0;j<termDocFreq;j++) {
            // Positions are delta-coded; low bit flags a payload
            final int code = prox.readVInt();
            position += code >> 1;

            //System.out.println("    pos=" + position);

            final int payloadLength;
            final BytesRef thisPayload;

            if ((code & 1) != 0) {
              // This position has a payload
              payloadLength = prox.readVInt();

              if (payload == null) {
                payload = new BytesRef();
                payload.bytes = new byte[payloadLength];
              } else if (payload.bytes.length < payloadLength) {
                payload.grow(payloadLength);
              }

              prox.readBytes(payload.bytes, 0, payloadLength);
              payload.length = payloadLength;
              thisPayload = payload;

            } else {
              payloadLength = 0;
              thisPayload = null;
            }

            postingsConsumer.addPosition(position, thisPayload);
          } //End for

          // NOTE(review): finishDoc is only invoked when positions are
          // written (omitTermFreqAndPositions == false) — confirm the
          // consumer chain does not require finishDoc in the omit case.
          postingsConsumer.finishDoc();
        }

        if (!minState.nextDoc()) {
          // This thread's stream is exhausted for this term:
          // Remove from termStates

          int upto = 0;
          // TODO: inefficient O(N) where N = number of
          // threads that had seen this term:
          for(int i=0;i<numToMerge;i++) {
            if (termStates[i] != minState) {
              termStates[upto++] = termStates[i];
            }
          }
          numToMerge--;
          assert upto == numToMerge;

          // Advance this state to the next term

          if (!minState.nextTerm()) {
            // OK, no more terms, so remove from mergeStates
            // as well
            upto = 0;
            for(int i=0;i<numFields;i++)
              if (mergeStates[i] != minState)
                mergeStates[upto++] =  mergeStates[i];
            numFields--;
            assert upto == numFields;
          }
        }
      }

      assert numDocs > 0;
      termsConsumer.finishTerm(text, numDocs);
    }

    termsConsumer.finish();
  }
}