package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.util.UnicodeUtil;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;

final class FreqProxTermsWriter extends TermsHashConsumer {

  @Override
  public TermsHashConsumerPerThread addThread(TermsHashPerThread perThread) {
    return new FreqProxTermsWriterPerThread(perThread);
  }

  @Override
  void createPostings(RawPostingList[] postings, int start, int count) {
    final int end = start + count;
    for (int i = start; i < end; i++)
      postings[i] = new PostingList();
  }

  // Compares two 0xffff-terminated char[] terms with the usual
  // negative/zero/positive contract.  0xffff (an invalid UTF-16
  // code unit) marks end-of-term, so a shorter term sorts before
  // any longer term it prefixes.
  private static int compareText(final char[] text1, int pos1, final char[] text2, int pos2) {
    while (true) {
      final char c1 = text1[pos1++];
      final char c2 = text2[pos2++];
      if (c1 != c2) {
        if (0xffff == c2)
          return 1;
        else if (0xffff == c1)
          return -1;
        else
          return c1 - c2;
      } else if (0xffff == c1)
        return 0;
    }
  }

  @Override
  void closeDocStore(SegmentWriteState state) {}

  @Override
  void abort() {}

  // TODO: would be nice to factor out more of this, eg the
  // FreqProxFieldMergeState, and code to visit all Fields
  // under the same FieldInfo together, up into TermsHash*.
  // Other writers would presumably share a lot of this...
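  /**
   * Flushes all buffered postings to the segment: collects every
   * per-thread, per-field writer that buffered postings, sorts them
   * by field name, and hands each group of same-field writers (one
   * per thread that saw the field) to appendPostings, which writes
   * them through the {@link FormatPostingsFieldsConsumer} chain.
   */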
  @Override
  public void flush(Map<TermsHashConsumerPerThread, Collection<TermsHashConsumerPerField>> threadsAndFields,
                    final SegmentWriteState state) throws IOException {

    // Gather all FieldData's that have postings, across all
    // ThreadStates
    List<FreqProxTermsWriterPerField> allFields = new ArrayList<FreqProxTermsWriterPerField>();

    for (Map.Entry<TermsHashConsumerPerThread, Collection<TermsHashConsumerPerField>> entry : threadsAndFields.entrySet()) {

      Collection<TermsHashConsumerPerField> fields = entry.getValue();

      for (final TermsHashConsumerPerField i : fields) {
        final FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) i;
        if (perField.termsHashPerField.numPostings > 0)
          allFields.add(perField);
      }
    }

    // Sort by field name
    Collections.sort(allFields);
    final int numAllFields = allFields.size();

    // TODO: allow Lucene user to customize this consumer:
    final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos);
    /*
    Current writer chain:
      FormatPostingsFieldsConsumer
        -> IMPL: FormatPostingsFieldsWriter
          -> FormatPostingsTermsConsumer
            -> IMPL: FormatPostingsTermsWriter
              -> FormatPostingsDocsConsumer
                -> IMPL: FormatPostingsDocsWriter
                  -> FormatPostingsPositionsConsumer
                    -> IMPL: FormatPostingsPositionsWriter
    */

    int start = 0;
    while (start < numAllFields) {
      final FieldInfo fieldInfo = allFields.get(start).fieldInfo;
      final String fieldName = fieldInfo.name;

      // allFields is sorted by field name, so all writers for the
      // same field (one per thread) are adjacent
      int end = start + 1;
      while (end < numAllFields && allFields.get(end).fieldInfo.name.equals(fieldName))
        end++;

      FreqProxTermsWriterPerField[] fields = new FreqProxTermsWriterPerField[end - start];
      for (int i = start; i < end; i++) {
        fields[i - start] = allFields.get(i);

        // Aggregate the storePayloads as seen by the same
        // field across multiple threads
        fieldInfo.storePayloads |= fields[i - start].hasPayloads;
      }

      // If this field has postings then add them to the
      // segment
      appendPostings(fields, consumer);

      for (int i = 0; i < fields.length; i++) {
        TermsHashPerField perField = fields[i].termsHashPerField;
        int numPostings = perField.numPostings;
        perField.reset();
        perField.shrinkHash(numPostings);
        fields[i].reset();
      }

      start = end;
    }

    for (Map.Entry<TermsHashConsumerPerThread, Collection<TermsHashConsumerPerField>> entry : threadsAndFields.entrySet()) {
      FreqProxTermsWriterPerThread perThread = (FreqProxTermsWriterPerThread) entry.getKey();
      perThread.termsHashPerThread.reset(true);
    }

    consumer.finish();
  }

  // Reused across terms and docs; grown (never shrunk) to hold the
  // largest payload seen so far
  private byte[] payloadBuffer;

  /* Walk through all unique text tokens (Posting
   * instances) found in this field and serialize them
   * into a single RAM segment. */
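  // Implementation note: each entry in fields is the same field as
  // seen by one indexing thread, so this is an n-way merge.  Each
  // pass of the outer loop selects the smallest pending term (via
  // compareText), then interleaves the docID streams of every
  // thread holding that term by repeatedly taking the smallest
  // docID next.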
  void appendPostings(FreqProxTermsWriterPerField[] fields,
                      FormatPostingsFieldsConsumer consumer)
    throws CorruptIndexException, IOException {

    int numFields = fields.length;

    final FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];

    for (int i = 0; i < numFields; i++) {
      FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]);

      assert fms.field.fieldInfo == fields[0].fieldInfo;

      // Should always be true
      boolean result = fms.nextTerm();
      assert result;
    }

    final FormatPostingsTermsConsumer termsConsumer = consumer.addField(fields[0].fieldInfo);

    FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];

    final boolean currentFieldOmitTermFreqAndPositions = fields[0].fieldInfo.omitTermFreqAndPositions;

    while (numFields > 0) {

      // Get the next term to merge
      termStates[0] = mergeStates[0];
      int numToMerge = 1;

      for (int i = 1; i < numFields; i++) {
        final char[] text = mergeStates[i].text;
        final int textOffset = mergeStates[i].textOffset;
        final int cmp = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset);

        if (cmp < 0) {
          termStates[0] = mergeStates[i];
          numToMerge = 1;
        } else if (cmp == 0)
          termStates[numToMerge++] = mergeStates[i];
      }

      final FormatPostingsDocsConsumer docConsumer = termsConsumer.addTerm(termStates[0].text, termStates[0].textOffset);

      // Now termStates has numToMerge FieldMergeStates
      // which all share the same term.  Now we must
      // interleave the docID streams.
      while (numToMerge > 0) {

        FreqProxFieldMergeState minState = termStates[0];
        for (int i = 1; i < numToMerge; i++)
          if (termStates[i].docID < minState.docID)
            minState = termStates[i];

        final int termDocFreq = minState.termFreq;

        final FormatPostingsPositionsConsumer posConsumer = docConsumer.addDoc(minState.docID, termDocFreq);

        final ByteSliceReader prox = minState.prox;

        // Carefully copy over the prox + payload info,
        // changing the format to match Lucene's segment
        // format.
        if (!currentFieldOmitTermFreqAndPositions) {
          // omitTermFreqAndPositions == false so we do write positions &
          // payload
          int position = 0;
          for (int j = 0; j < termDocFreq; j++) {
            // Position deltas are shifted left by one; the low bit
            // flags whether a payload follows
            final int code = prox.readVInt();
            position += code >> 1;

            final int payloadLength;
            if ((code & 1) != 0) {
              // This position has a payload
              payloadLength = prox.readVInt();

              if (payloadBuffer == null || payloadBuffer.length < payloadLength)
                payloadBuffer = new byte[payloadLength];

              prox.readBytes(payloadBuffer, 0, payloadLength);

            } else
              payloadLength = 0;

            posConsumer.addPosition(position, payloadBuffer, 0, payloadLength);
          } // End for

          posConsumer.finish();
        }

        if (!minState.nextDoc()) {

          // Remove from termStates
          int upto = 0;
          for (int i = 0; i < numToMerge; i++)
            if (termStates[i] != minState)
              termStates[upto++] = termStates[i];
          numToMerge--;
          assert upto == numToMerge;

          // Advance this state to the next term

          if (!minState.nextTerm()) {
            // OK, no more terms, so remove from mergeStates
            // as well
            upto = 0;
            for (int i = 0; i < numFields; i++)
              if (mergeStates[i] != minState)
                mergeStates[upto++] = mergeStates[i];
            numFields--;
            assert upto == numFields;
          }
        }
      }

      docConsumer.finish();
    }

    termsConsumer.finish();
  }

  final UnicodeUtil.UTF8Result termsUTF8 = new UnicodeUtil.UTF8Result();

  static final class PostingList extends RawPostingList {
    int docFreq;          // # times this term occurs in the current doc
    int lastDocID;        // Last docID where this term occurred
    int lastDocCode;      // Code for prior doc
    int lastPosition;     // Last position where this term occurred
  }

  @Override
  int bytesPerPosting() {
    // Base posting size plus the four ints in PostingList above
    return RawPostingList.BYTES_SIZE + 4 * DocumentsWriter.INT_NUM_BYTE;
  }
}