package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;

import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.util.BytesRef;

final class FreqProxTermsWriter extends TermsHashConsumer {

  @Override
  public TermsHashConsumerPerThread addThread(TermsHashPerThread perThread) {
    return new FreqProxTermsWriterPerThread(perThread);
  }

  @Override
  void closeDocStore(SegmentWriteState state) {}

  @Override
  void abort() {}

  private int flushedDocCount;

  // TODO: would be nice to factor out more of this, eg the
  // FreqProxFieldMergeState, and code to visit all Fields
  // under the same FieldInfo together, up into TermsHash*.
  // Other writers would presumably share a lot of this...

  @Override
  public void flush(Map<TermsHashConsumerPerThread,Collection<TermsHashConsumerPerField>> threadsAndFields, final SegmentWriteState state) throws IOException {

    // Gather all FieldData's that have postings, across all
    // ThreadStates
    List<FreqProxTermsWriterPerField> allFields = new ArrayList<FreqProxTermsWriterPerField>();

    flushedDocCount = state.numDocs;

    for (Map.Entry<TermsHashConsumerPerThread,Collection<TermsHashConsumerPerField>> entry : threadsAndFields.entrySet()) {

      Collection<TermsHashConsumerPerField> fields = entry.getValue();

      for (final TermsHashConsumerPerField i : fields) {
        final FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) i;
        if (perField.termsHashPerField.numPostings > 0)
          allFields.add(perField);
      }
    }

    final int numAllFields = allFields.size();

    // Sort by field name
    Collections.sort(allFields);

    // TODO: allow Lucene user to customize this codec:
    final FieldsConsumer consumer = state.codec.fieldsConsumer(state);

    /*
    Current writer chain:
      FieldsConsumer
        -> IMPL: FormatPostingsTermsDictWriter
          -> TermsConsumer
            -> IMPL: FormatPostingsTermsDictWriter.TermsWriter
              -> DocsConsumer
                -> IMPL: FormatPostingsDocsWriter
                  -> PositionsConsumer
                    -> IMPL: FormatPostingsPositionsWriter
    */

    int start = 0;
    while(start < numAllFields) {
      final FieldInfo fieldInfo = allFields.get(start).fieldInfo;
      final String fieldName = fieldInfo.name;

      // Scan ahead to group all per-thread instances of the
      // same field together
      int end = start+1;
      while(end < numAllFields && allFields.get(end).fieldInfo.name.equals(fieldName))
        end++;

      FreqProxTermsWriterPerField[] fields = new FreqProxTermsWriterPerField[end-start];
      for(int i=start;i<end;i++) {
        fields[i-start] = allFields.get(i);

        // Aggregate the storePayload as seen by the same
        // field across multiple threads
        fieldInfo.storePayloads |= fields[i-start].hasPayloads;
      }
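
      // All per-thread writers for this field are now
      // gathered in fields[]; their in-RAM postings are
      // merged below and written through the codec's
      // FieldsConsumer.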
      // If this field has postings then add them to the
      // segment
      appendPostings(fields, consumer);

      for(int i=0;i<fields.length;i++) {
        TermsHashPerField perField = fields[i].termsHashPerField;
        int numPostings = perField.numPostings;
        perField.reset();
        perField.shrinkHash(numPostings);
        fields[i].reset();
      }

      start = end;
    }

    for (Map.Entry<TermsHashConsumerPerThread,Collection<TermsHashConsumerPerField>> entry : threadsAndFields.entrySet()) {
      FreqProxTermsWriterPerThread perThread = (FreqProxTermsWriterPerThread) entry.getKey();
      perThread.termsHashPerThread.reset(true);
    }

    consumer.close();
  }

  BytesRef payload;

  /* Walk through all unique text tokens (Posting
   * instances) found in this field and serialize them
   * into a single RAM segment. */
  void appendPostings(FreqProxTermsWriterPerField[] fields,
                      FieldsConsumer consumer)
    throws CorruptIndexException, IOException {

    int numFields = fields.length;

    final BytesRef text = new BytesRef();

    final FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];

    final TermsConsumer termsConsumer = consumer.addField(fields[0].fieldInfo);
    final Comparator<BytesRef> termComp = termsConsumer.getComparator();

    for(int i=0;i<numFields;i++) {
      FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i], termComp);

      assert fms.field.fieldInfo == fields[0].fieldInfo;

      // Should always be true
      boolean result = fms.nextTerm();
      assert result;
    }

    FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];

    final boolean currentFieldOmitTermFreqAndPositions = fields[0].fieldInfo.omitTermFreqAndPositions;
    //System.out.println("flush terms field=" + fields[0].fieldInfo.name);

    // TODO: really TermsHashPerField should take over most
    // of this loop, including merge sort of terms from
    // multiple threads and interacting with the
    // TermsConsumer, only calling out to us (passing us the
    // DocsConsumer) to handle delivery of docs/positions

    while(numFields > 0) {

      // Get the next term to merge
      termStates[0] = mergeStates[0];
      int numToMerge = 1;

      // TODO: pqueue
      for(int i=1;i<numFields;i++) {
        final int cmp = termComp.compare(mergeStates[i].text, termStates[0].text);
        if (cmp < 0) {
          termStates[0] = mergeStates[i];
          numToMerge = 1;
        } else if (cmp == 0) {
          termStates[numToMerge++] = mergeStates[i];
        }
      }

      // Need shallow copy here because termStates[0].text
      // changes by the time we call finishTerm
      text.bytes = termStates[0].text.bytes;
      text.offset = termStates[0].text.offset;
      text.length = termStates[0].text.length;

      //System.out.println("  term=" + text.toUnicodeString());
      //System.out.println("  term=" + text.toString());

      final PostingsConsumer postingsConsumer = termsConsumer.startTerm(text);

      // Now termStates has numToMerge FieldMergeStates
      // which all share the same term.  Now we must
      // interleave the docID streams.
      int numDocs = 0;
      while(numToMerge > 0) {

        // Pick the state with the smallest next docID
        FreqProxFieldMergeState minState = termStates[0];
        for(int i=1;i<numToMerge;i++) {
          if (termStates[i].docID < minState.docID) {
            minState = termStates[i];
          }
        }

        final int termDocFreq = minState.termFreq;
        numDocs++;

        assert minState.docID < flushedDocCount: "doc=" + minState.docID + " maxDoc=" + flushedDocCount;

        postingsConsumer.startDoc(minState.docID, termDocFreq);

        final ByteSliceReader prox = minState.prox;

        // Carefully copy over the prox + payload info,
        // changing the format to match Lucene's segment
        // format.
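
        // Each position in the in-RAM prox stream is a
        // vInt code: the position delta shifted left by
        // one, with the low bit set when a payload follows
        // (as a vInt length plus that many bytes).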
        if (!currentFieldOmitTermFreqAndPositions) {
          // omitTermFreqAndPositions == false so we do write positions &
          // payload
          int position = 0;
          for(int j=0;j<termDocFreq;j++) {
            final int code = prox.readVInt();
            position += code >> 1;
            //System.out.println("    pos=" + position);

            final int payloadLength;
            final BytesRef thisPayload;

            if ((code & 1) != 0) {
              // This position has a payload
              payloadLength = prox.readVInt();

              if (payload == null) {
                payload = new BytesRef();
                payload.bytes = new byte[payloadLength];
              } else if (payload.bytes.length < payloadLength) {
                payload.grow(payloadLength);
              }

              prox.readBytes(payload.bytes, 0, payloadLength);
              payload.length = payloadLength;
              thisPayload = payload;
            } else {
              payloadLength = 0;
              thisPayload = null;
            }

            postingsConsumer.addPosition(position, thisPayload);
          } //End for

          postingsConsumer.finishDoc();
        }

        if (!minState.nextDoc()) {

          // Remove from termStates
          int upto = 0;
          // TODO: inefficient O(N) where N = number of
          // threads that had seen this term:
          for(int i=0;i<numToMerge;i++) {
            if (termStates[i] != minState) {
              termStates[upto++] = termStates[i];
            }
          }
          numToMerge--;
          assert upto == numToMerge;

          // Advance this state to the next term

          if (!minState.nextTerm()) {
            // OK, no more terms, so remove from mergeStates
            // as well
            upto = 0;
            for(int i=0;i<numFields;i++)
              if (mergeStates[i] != minState)
                mergeStates[upto++] = mergeStates[i];
            numFields--;
            assert upto == numFields;
          }
        }
      }

      assert numDocs > 0;
      termsConsumer.finishTerm(text, numDocs);
    }

    termsConsumer.finish();
  }
}