package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;
/**
* Gathers a document's Fieldables, grouping those that share a
* field name, updates FieldInfos, and calls per-field consumers
* to process the fields one by one.
*
* Currently, only a single thread visits the fields,
* sequentially, for processing.
*/
final class DocFieldProcessorPerThread extends DocConsumerPerThread {
float docBoost;
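// Incremented once per document; compared against each field's
// lastGen to detect the first occurrence of that field in a doc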
int fieldGen;
final DocFieldProcessor docFieldProcessor;
final FieldInfos fieldInfos;
final DocFieldConsumerPerThread consumer;
// Holds all fields seen in current doc
DocFieldProcessorPerField[] fields = new DocFieldProcessorPerField[1];
int fieldCount;
// Hash table for all fields ever seen
DocFieldProcessorPerField[] fieldHash = new DocFieldProcessorPerField[2];
int hashMask = 1;    // = fieldHash.length - 1
int totalFieldCount; // number of unique field names in fieldHash
final StoredFieldsWriterPerThread fieldsWriter;
final DocumentsWriter.DocState docState;
public DocFieldProcessorPerThread(DocumentsWriterThreadState threadState, DocFieldProcessor docFieldProcessor) throws IOException {
this.docState = threadState.docState;
this.docFieldProcessor = docFieldProcessor;
this.fieldInfos = docFieldProcessor.fieldInfos;
this.consumer = docFieldProcessor.consumer.addThread(this);
fieldsWriter = docFieldProcessor.fieldsWriter.addThread(docState);
}
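/** Called when an aborting exception is hit: aborts every
 *  per-field processor we have seen, then the stored fields
 *  writer and the downstream consumer. */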
@Override
public void abort() {
for(int i=0;i<fieldHash.length;i++) {
DocFieldProcessorPerField field = fieldHash[i];
while(field != null) {
final DocFieldProcessorPerField next = field.next;
field.abort();
field = next;
}
}
fieldsWriter.abort();
consumer.abort();
}
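/** Returns the consumer for every field we have seen so far. */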
public Collection<DocFieldConsumerPerField> fields() {
Collection<DocFieldConsumerPerField> fields = new HashSet<DocFieldConsumerPerField>();
for(int i=0;i<fieldHash.length;i++) {
DocFieldProcessorPerField field = fieldHash[i];
while(field != null) {
fields.add(field.consumer);
field = field.next;
}
}
assert fields.size() == totalFieldCount;
return fields;
}
/** Frees the resources of any fields that we have seen before
 * but that were not seen again since the previous flush. */
void trimFields(SegmentWriteState state) {
for(int i=0;i<fieldHash.length;i++) {
DocFieldProcessorPerField perField = fieldHash[i];
DocFieldProcessorPerField lastPerField = null;
while (perField != null) {
if (perField.lastGen == -1) {
// This field was not seen since the previous
// flush, so, free up its resources now
// Unhash
if (lastPerField == null)
fieldHash[i] = perField.next;
else
lastPerField.next = perField.next;
if (state.infoStream != null) {
state.infoStream.println(" purge field=" + perField.fieldInfo.name);
}
totalFieldCount--;
} else {
// Reset
perField.lastGen = -1;
lastPerField = perField;
}
perField = perField.next;
}
}
}
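/** Doubles the field hash table and re-links all existing
 *  entries into their new buckets. */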
private void rehash() {
final int newHashSize = (fieldHash.length*2);
assert newHashSize > fieldHash.length;
final DocFieldProcessorPerField[] newHashArray = new DocFieldProcessorPerField[newHashSize];
// Rehash
int newHashMask = newHashSize-1;
for(int j=0;j<fieldHash.length;j++) {
DocFieldProcessorPerField fp0 = fieldHash[j];
while(fp0 != null) {
final int hashPos2 = fp0.fieldInfo.name.hashCode() & newHashMask;
DocFieldProcessorPerField nextFP0 = fp0.next;
fp0.next = newHashArray[hashPos2];
newHashArray[hashPos2] = fp0;
fp0 = nextFP0;
}
}
fieldHash = newHashArray;
hashMask = newHashMask;
}
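/** Processes the current document: groups its Fieldables by
 *  field name, updates FieldInfos, hands stored fields to the
 *  fields writer and all fields to the consumer, and returns a
 *  DocWriter holding whatever per-document state must be
 *  written out later. */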
@Override
public DocumentsWriter.DocWriter processDocument() throws IOException {
consumer.startDocument();
fieldsWriter.startDocument();
final Document doc = docState.doc;
assert docFieldProcessor.docWriter.writer.testPoint("DocumentsWriter.ThreadState.init start");
fieldCount = 0;
final int thisFieldGen = fieldGen++;
final List<Fieldable> docFields = doc.getFields();
final int numDocFields = docFields.size();
// Absorb any new fields first seen in this document.
// Also absorb any changes to fields we had already
// seen before (e.g. suddenly turning on norms or
// vectors, etc.):
for(int i=0;i<numDocFields;i++) {
Fieldable field = docFields.get(i);
final String fieldName = field.name();
// Make sure we have a PerField allocated
final int hashPos = fieldName.hashCode() & hashMask;
DocFieldProcessorPerField fp = fieldHash[hashPos];
while(fp != null && !fp.fieldInfo.name.equals(fieldName))
fp = fp.next;
if (fp == null) {
// TODO FI: we need to genericize the "flags" that a
// field holds, and, how these flags are merged; it
// needs to be more "pluggable" such that if I want
// to have a new "thing" my Fields can do, I can
// easily add it
FieldInfo fi = fieldInfos.add(fieldName, field.isIndexed(), field.isTermVectorStored(),
field.isStorePositionWithTermVector(), field.isStoreOffsetWithTermVector(),
field.getOmitNorms(), false, field.getOmitTermFreqAndPositions());
fp = new DocFieldProcessorPerField(this, fi);
fp.next = fieldHash[hashPos];
fieldHash[hashPos] = fp;
totalFieldCount++;
if (totalFieldCount >= fieldHash.length/2)
rehash();
} else
fp.fieldInfo.update(field.isIndexed(), field.isTermVectorStored(),
field.isStorePositionWithTermVector(), field.isStoreOffsetWithTermVector(),
field.getOmitNorms(), false, field.getOmitTermFreqAndPositions());
if (thisFieldGen != fp.lastGen) {
// First time we're seeing this field for this doc
fp.fieldCount = 0;
if (fieldCount == fields.length) {
final int newSize = fields.length*2;
DocFieldProcessorPerField[] newArray = new DocFieldProcessorPerField[newSize];
System.arraycopy(fields, 0, newArray, 0, fieldCount);
fields = newArray;
}
fields[fieldCount++] = fp;
fp.lastGen = thisFieldGen;
}
if (fp.fieldCount == fp.fields.length) {
Fieldable[] newArray = new Fieldable[fp.fields.length*2];
System.arraycopy(fp.fields, 0, newArray, 0, fp.fieldCount);
fp.fields = newArray;
}
fp.fields[fp.fieldCount++] = field;
if (field.isStored()) {
fieldsWriter.addField(field, fp.fieldInfo);
}
}
// If we are writing term vectors then we must visit
// the fields in sorted (by name) order so the vectors
// are written in sorted order too. TODO: we actually
// only need to sort the subset of fields that have
// vectors enabled; we could save a small amount of
// CPU here.
quickSort(fields, 0, fieldCount-1);
for(int i=0;i<fieldCount;i++)
fields[i].consumer.processFields(fields[i].fields, fields[i].fieldCount);
if (docState.maxTermPrefix != null && docState.infoStream != null) {
docState.infoStream.println("WARNING: document contains at least one immense term (whose UTF8 encoding is longer than the max length " + DocumentsWriter.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'");
docState.maxTermPrefix = null;
}
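// Combine the stored fields writer's and the consumer's
// per-document output into a single DocWriter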
final DocumentsWriter.DocWriter one = fieldsWriter.finishDocument();
final DocumentsWriter.DocWriter two = consumer.finishDocument();
if (one == null) {
return two;
} else if (two == null) {
return one;
} else {
PerDoc both = getPerDoc();
both.docID = docState.docID;
assert one.docID == docState.docID;
assert two.docID == docState.docID;
both.one = one;
both.two = two;
return both;
}
}
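/** Sorts array[lo..hi] in place by field name, so that fields
 *  (and therefore term vectors) are processed in name order. */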
void quickSort(DocFieldProcessorPerField[] array, int lo, int hi) {
if (lo >= hi)
return;
else if (hi == 1+lo) {
if (array[lo].fieldInfo.name.compareTo(array[hi].fieldInfo.name) > 0) {
final DocFieldProcessorPerField tmp = array[lo];
array[lo] = array[hi];
array[hi] = tmp;
}
return;
}
int mid = (lo + hi) >>> 1;
if (array[lo].fieldInfo.name.compareTo(array[mid].fieldInfo.name) > 0) {
DocFieldProcessorPerField tmp = array[lo];
array[lo] = array[mid];
array[mid] = tmp;
}
if (array[mid].fieldInfo.name.compareTo(array[hi].fieldInfo.name) > 0) {
DocFieldProcessorPerField tmp = array[mid];
array[mid] = array[hi];
array[hi] = tmp;
if (array[lo].fieldInfo.name.compareTo(array[mid].fieldInfo.name) > 0) {
DocFieldProcessorPerField tmp2 = array[lo];
array[lo] = array[mid];
array[mid] = tmp2;
}
}
int left = lo + 1;
int right = hi - 1;
if (left >= right)
return;
DocFieldProcessorPerField partition = array[mid];
for (;;) {
while (array[right].fieldInfo.name.compareTo(partition.fieldInfo.name) > 0)
--right;
while (left < right && array[left].fieldInfo.name.compareTo(partition.fieldInfo.name) <= 0)
++left;
if (left < right) {
DocFieldProcessorPerField tmp = array[left];
array[left] = array[right];
array[right] = tmp;
--right;
} else {
break;
}
}
quickSort(array, lo, left);
quickSort(array, left + 1, hi);
}
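// Free list of recycled PerDoc instances, so we avoid allocating
// a new one for every document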
PerDoc[] docFreeList = new PerDoc[1];
int freeCount;
int allocCount;
synchronized PerDoc getPerDoc() {
if (freeCount == 0) {
allocCount++;
if (allocCount > docFreeList.length) {
// Grow our free list up front to make sure we have
// enough space to recycle all outstanding PerDoc
// instances
assert allocCount == 1+docFreeList.length;
docFreeList = new PerDoc[ArrayUtil.oversize(allocCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
}
return new PerDoc();
} else
return docFreeList[--freeCount];
}
synchronized void freePerDoc(PerDoc perDoc) {
assert freeCount < docFreeList.length;
docFreeList[freeCount++] = perDoc;
}
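/** Pairs the stored fields DocWriter with the consumer's
 *  DocWriter for one document so the two are finished or
 *  aborted together. */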
class PerDoc extends DocumentsWriter.DocWriter {
DocumentsWriter.DocWriter one;
DocumentsWriter.DocWriter two;
@Override
public long sizeInBytes() {
return one.sizeInBytes() + two.sizeInBytes();
}
@Override
public void finish() throws IOException {
try {
try {
one.finish();
} finally {
two.finish();
}
} finally {
freePerDoc(this);
}
}
@Override
public void abort() {
try {
try {
one.abort();
} finally {
two.abort();
}
} finally {
freePerDoc(this);
}
}
}
}