package org.apache.lucene.index;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.FieldInfosWriter;
import org.apache.lucene.codecs.PerDocConsumer;
import org.apache.lucene.index.DocumentsWriterPerThread.DocState;
import org.apache.lucene.index.TypePromoter.TypeCompatibility;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IOUtils;

/**
 * This is a DocConsumer that gathers all fields under the
 * same name, and calls per-field consumers to process field
 * by field.  This class doesn't do any "real" work
 * of its own: it just forwards the fields to a
 * DocFieldConsumer.
 */
final class DocFieldProcessor extends DocConsumer {

  final DocFieldConsumer consumer;
  final StoredFieldsConsumer fieldsWriter;
  final Codec codec;

  // Holds all fields seen in current doc
  DocFieldProcessorPerField[] fields = new DocFieldProcessorPerField[1];
  int fieldCount;

  // Hash table for all fields ever seen
  DocFieldProcessorPerField[] fieldHash = new DocFieldProcessorPerField[2];
  int hashMask = 1;
  int totalFieldCount;

  float docBoost;
  int fieldGen;
  final DocumentsWriterPerThread.DocState docState;

  public DocFieldProcessor(DocumentsWriterPerThread docWriter, DocFieldConsumer consumer) {
    this.docState = docWriter.docState;
    this.codec = docWriter.codec;
    this.consumer = consumer;
    fieldsWriter = new StoredFieldsConsumer(docWriter);
  }

  @Override
  public void flush(SegmentWriteState state) throws IOException {

    Map<String,DocFieldConsumerPerField> childFields = new HashMap<String,DocFieldConsumerPerField>();
    Collection<DocFieldConsumerPerField> fields = fields();
    for (DocFieldConsumerPerField f : fields) {
      childFields.put(f.getFieldInfo().name, f);
    }

    fieldsWriter.flush(state);
    consumer.flush(childFields, state);

    for (DocValuesConsumerHolder consumer : docValues.values()) {
      consumer.docValuesConsumer.finish(state.segmentInfo.getDocCount());
    }

    // close perDocConsumer during flush to ensure all files are flushed due to PerCodec CFS
    IOUtils.close(perDocConsumer);

    // Important to save after asking consumer to flush so
    // consumer can alter the FieldInfo* if necessary.  EG,
    // FreqProxTermsWriter does this with
    // FieldInfo.storePayload.
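    // Write the per-segment FieldInfos using the codec's FieldInfosFormat.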
    FieldInfosWriter infosWriter = codec.fieldInfosFormat().getFieldInfosWriter();
    infosWriter.write(state.directory, state.segmentInfo.name, state.fieldInfos, IOContext.DEFAULT);
  }

  @Override
  public void abort() {
    Throwable th = null;

    for (DocFieldProcessorPerField field : fieldHash) {
      while (field != null) {
        final DocFieldProcessorPerField next = field.next;
        try {
          field.abort();
        } catch (Throwable t) {
          if (th == null) {
            th = t;
          }
        }
        field = next;
      }
    }

    IOUtils.closeWhileHandlingException(perDocConsumer);
    // TODO add abort to PerDocConsumer!

    try {
      fieldsWriter.abort();
    } catch (Throwable t) {
      if (th == null) {
        th = t;
      }
    }

    try {
      consumer.abort();
    } catch (Throwable t) {
      if (th == null) {
        th = t;
      }
    }

    try {
      if (perDocConsumer != null) {
        perDocConsumer.abort();
      }
    } catch (Throwable t) {
      if (th == null) {
        th = t;
      }
    }

    // If any errors occurred, rethrow the first one.
    if (th != null) {
      if (th instanceof RuntimeException) throw (RuntimeException) th;
      if (th instanceof Error) throw (Error) th;
      // defensive code - we should not hit unchecked exceptions
      throw new RuntimeException(th);
    }
  }

  @Override
  public boolean freeRAM() {
    return consumer.freeRAM();
  }

  public Collection<DocFieldConsumerPerField> fields() {
    Collection<DocFieldConsumerPerField> fields = new HashSet<DocFieldConsumerPerField>();
    for (int i = 0; i < fieldHash.length; i++) {
      DocFieldProcessorPerField field = fieldHash[i];
      while (field != null) {
        fields.add(field.consumer);
        field = field.next;
      }
    }
    assert fields.size() == totalFieldCount;
    return fields;
  }

  /** In flush we reset the fieldHash to not maintain per-field state
   *  across segments */
  @Override
  void doAfterFlush() {
    fieldHash = new DocFieldProcessorPerField[2];
    hashMask = 1;
    totalFieldCount = 0;
    perDocConsumer = null;
    docValues.clear();
  }

  private void rehash() {
    final int newHashSize = (fieldHash.length*2);
    assert newHashSize > fieldHash.length;

    final DocFieldProcessorPerField newHashArray[] = new DocFieldProcessorPerField[newHashSize];

    // Rehash
    int newHashMask = newHashSize-1;
    for (int j = 0; j < fieldHash.length; j++) {
      DocFieldProcessorPerField fp0 = fieldHash[j];
      while (fp0 != null) {
        final int hashPos2 = fp0.fieldInfo.name.hashCode() & newHashMask;
        DocFieldProcessorPerField nextFP0 = fp0.next;
        fp0.next = newHashArray[hashPos2];
        newHashArray[hashPos2] = fp0;
        fp0 = nextFP0;
      }
    }

    fieldHash = newHashArray;
    hashMask = newHashMask;
  }

  @Override
  public void processDocument(FieldInfos.Builder fieldInfos) throws IOException {

    consumer.startDocument();
    fieldsWriter.startDocument();

    fieldCount = 0;

    final int thisFieldGen = fieldGen++;
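
    // Fields are tracked in a small chained hash table keyed by field name;
    // the table is doubled via rehash() once it becomes half full.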

    // Absorb any new fields first seen in this document.
    // Also absorb any changes to fields we had already
    // seen before (eg suddenly turning on norms or
    // vectors, etc.):

    for (IndexableField field : docState.doc) {
      final String fieldName = field.name();

      // Make sure we have a PerField allocated
      final int hashPos = fieldName.hashCode() & hashMask;
      DocFieldProcessorPerField fp = fieldHash[hashPos];
      while (fp != null && !fp.fieldInfo.name.equals(fieldName)) {
        fp = fp.next;
      }

      if (fp == null) {

        // TODO FI: we need to genericize the "flags" that a
        // field holds, and, how these flags are merged; it
        // needs to be more "pluggable" such that if I want
        // to have a new "thing" my Fields can do, I can
        // easily add it
        FieldInfo fi = fieldInfos.addOrUpdate(fieldName, field.fieldType());

        fp = new DocFieldProcessorPerField(this, fi);
        fp.next = fieldHash[hashPos];
        fieldHash[hashPos] = fp;
        totalFieldCount++;

        if (totalFieldCount >= fieldHash.length/2) {
          rehash();
        }
      } else {
        fieldInfos.addOrUpdate(fp.fieldInfo.name, field.fieldType());
      }

      if (thisFieldGen != fp.lastGen) {

        // First time we're seeing this field for this doc
        fp.fieldCount = 0;

        if (fieldCount == fields.length) {
          final int newSize = fields.length*2;
          DocFieldProcessorPerField newArray[] = new DocFieldProcessorPerField[newSize];
          System.arraycopy(fields, 0, newArray, 0, fieldCount);
          fields = newArray;
        }

        fields[fieldCount++] = fp;
        fp.lastGen = thisFieldGen;
      }

      fp.addField(field);

      if (field.fieldType().stored()) {
        fieldsWriter.addField(field, fp.fieldInfo);
      }

      final DocValues.Type dvType = field.fieldType().docValueType();
      if (dvType != null) {
        DocValuesConsumerHolder docValuesConsumer = docValuesConsumer(dvType, docState, fp.fieldInfo);
        DocValuesConsumer consumer = docValuesConsumer.docValuesConsumer;
        if (docValuesConsumer.compatibility == null) {
          consumer.add(docState.docID, field);
          docValuesConsumer.compatibility = new TypeCompatibility(dvType, consumer.getValueSize());
        } else if (docValuesConsumer.compatibility.isCompatible(dvType, TypePromoter.getValueSize(dvType, field.binaryValue()))) {
          consumer.add(docState.docID, field);
        } else {
          TypeCompatibility compatibility = docValuesConsumer.compatibility;
          throw new IllegalArgumentException("Incompatible DocValues type: " + dvType.name()
              + " size: " + TypePromoter.getValueSize(dvType, field.binaryValue())
              + " expected: type: " + compatibility.getBaseType()
              + " size: " + compatibility.getBaseSize());
        }
      }
    }

    // If we are writing vectors then we must visit
    // fields in sorted order so they are written in
    // sorted order.  TODO: we actually only need to
    // sort the subset of fields that have vectors
    // enabled; we could save [small amount of] CPU
    // here.
    ArrayUtil.quickSort(fields, 0, fieldCount, fieldsComp);

    for (int i = 0; i < fieldCount; i++) {
      final DocFieldProcessorPerField perField = fields[i];
      perField.consumer.processFields(perField.fields, perField.fieldCount);
    }
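
    // If any term's UTF8 encoding exceeded the maximum term length, a prefix
    // of the first such term was recorded in docState.maxTermPrefix during
    // inversion; warn once here and reset it.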
    if (docState.maxTermPrefix != null && docState.infoStream.isEnabled("IW")) {
      docState.infoStream.message("IW", "WARNING: document contains at least one immense term (whose UTF8 encoding is longer than the max length "
          + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped. Please correct the analyzer to not produce such terms. "
          + "The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'");
      docState.maxTermPrefix = null;
    }
  }

  private static final Comparator<DocFieldProcessorPerField> fieldsComp = new Comparator<DocFieldProcessorPerField>() {
    public int compare(DocFieldProcessorPerField o1, DocFieldProcessorPerField o2) {
      return o1.fieldInfo.name.compareTo(o2.fieldInfo.name);
    }
  };

  @Override
  void finishDocument() throws IOException {
    try {
      fieldsWriter.finishDocument();
    } finally {
      consumer.finishDocument();
    }
  }

  private static class DocValuesConsumerHolder {
    // Only used to enforce that same DV field name is never
    // added more than once per doc:
    int docID;
    final DocValuesConsumer docValuesConsumer;
    TypeCompatibility compatibility;

    public DocValuesConsumerHolder(DocValuesConsumer docValuesConsumer) {
      this.docValuesConsumer = docValuesConsumer;
    }
  }

  final private Map<String,DocValuesConsumerHolder> docValues = new HashMap<String,DocValuesConsumerHolder>();
  private PerDocConsumer perDocConsumer;

  DocValuesConsumerHolder docValuesConsumer(DocValues.Type valueType, DocState docState, FieldInfo fieldInfo)
      throws IOException {
    DocValuesConsumerHolder docValuesConsumerAndDocID = docValues.get(fieldInfo.name);
    if (docValuesConsumerAndDocID != null) {
      if (docState.docID == docValuesConsumerAndDocID.docID) {
        throw new IllegalArgumentException("DocValuesField \"" + fieldInfo.name
            + "\" appears more than once in this document (only one value is allowed, per field)");
      }
      assert docValuesConsumerAndDocID.docID < docState.docID;
      docValuesConsumerAndDocID.docID = docState.docID;
      return docValuesConsumerAndDocID;
    }

    if (perDocConsumer == null) {
      PerDocWriteState perDocWriteState = docState.docWriter.newPerDocWriteState("");
      perDocConsumer = docState.docWriter.codec.docValuesFormat().docsConsumer(perDocWriteState);
      if (perDocConsumer == null) {
        throw new IllegalStateException("codec=" + docState.docWriter.codec
            + " does not support docValues: from docValuesFormat().docsConsumer(...) returned null; field=" + fieldInfo.name);
      }
    }
    DocValuesConsumer docValuesConsumer = perDocConsumer.addValuesField(valueType, fieldInfo);
    assert fieldInfo.getDocValuesType() == null || fieldInfo.getDocValuesType() == valueType;
    fieldInfo.setDocValuesType(valueType);

    docValuesConsumerAndDocID = new DocValuesConsumerHolder(docValuesConsumer);
    docValuesConsumerAndDocID.docID = docState.docID;
    docValues.put(fieldInfo.name, docValuesConsumerAndDocID);
    return docValuesConsumerAndDocID;
  }
}