package org.apache.lucene.codecs.lucene40;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.StoredFieldsWriter;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.index.StorableField;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

/**
 * Class responsible for writing stored document fields.
 * <p/>
 * It uses &lt;segment&gt;.fdt and &lt;segment&gt;.fdx files.
 *
 * @see Lucene40StoredFieldsFormat
 * @lucene.experimental
 */
public final class Lucene40StoredFieldsWriter extends StoredFieldsWriter {

  // NOTE: bit 0 is free here! You can steal it!
  static final int FIELD_IS_BINARY = 1 << 1;

  // the old bit 1 << 2 was compressed, is now left out
  private static final int _NUMERIC_BIT_SHIFT = 3;
  static final int FIELD_IS_NUMERIC_MASK = 0x07 << _NUMERIC_BIT_SHIFT;

  static final int FIELD_IS_NUMERIC_INT = 1 << _NUMERIC_BIT_SHIFT;
  static final int FIELD_IS_NUMERIC_LONG = 2 << _NUMERIC_BIT_SHIFT;
  static final int FIELD_IS_NUMERIC_FLOAT = 3 << _NUMERIC_BIT_SHIFT;
  static final int FIELD_IS_NUMERIC_DOUBLE = 4 << _NUMERIC_BIT_SHIFT;

  // the next possible bits are: 1 << 6; 1 << 7
  // currently unused: static final int FIELD_IS_NUMERIC_SHORT = 5 << _NUMERIC_BIT_SHIFT;
  // currently unused: static final int FIELD_IS_NUMERIC_BYTE = 6 << _NUMERIC_BIT_SHIFT;

  static final String CODEC_NAME_IDX = "Lucene40StoredFieldsIndex";
  static final String CODEC_NAME_DAT = "Lucene40StoredFieldsData";
  static final int VERSION_START = 0;
  static final int VERSION_CURRENT = VERSION_START;
  static final long HEADER_LENGTH_IDX = CodecUtil.headerLength(CODEC_NAME_IDX);
  static final long HEADER_LENGTH_DAT = CodecUtil.headerLength(CODEC_NAME_DAT);

  /** Extension of stored fields file */
  public static final String FIELDS_EXTENSION = "fdt";

  /** Extension of stored fields index file */
  public static final String FIELDS_INDEX_EXTENSION = "fdx";

  private final Directory directory;
  private final String segment;
  private IndexOutput fieldsStream;
  private IndexOutput indexStream;
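
  // Rough sketch of the on-disk layout this writer produces, as implied by the
  // code below (informal summary added for orientation; Lucene40StoredFieldsFormat
  // is the authoritative description):
  //
  //   .fdx (index): codec header, then one 8-byte pointer per document giving
  //                 the start of that document's record in the .fdt file
  //   .fdt (data):  codec header, then per document: VInt numStoredFields,
  //                 followed by one (VInt fieldNumber, byte bits, value) entry
  //                 per stored field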

  /** Sole constructor. */
  public Lucene40StoredFieldsWriter(Directory directory, String segment, IOContext context) throws IOException {
    assert directory != null;
    this.directory = directory;
    this.segment = segment;

    boolean success = false;
    try {
      fieldsStream = directory.createOutput(IndexFileNames.segmentFileName(segment, "", FIELDS_EXTENSION), context);
      indexStream = directory.createOutput(IndexFileNames.segmentFileName(segment, "", FIELDS_INDEX_EXTENSION), context);

      CodecUtil.writeHeader(fieldsStream, CODEC_NAME_DAT, VERSION_CURRENT);
      CodecUtil.writeHeader(indexStream, CODEC_NAME_IDX, VERSION_CURRENT);
      assert HEADER_LENGTH_DAT == fieldsStream.getFilePointer();
      assert HEADER_LENGTH_IDX == indexStream.getFilePointer();
      success = true;
    } finally {
      if (!success) {
        abort();
      }
    }
  }

  // Writes the contents of buffer into the fields stream
  // and adds a new entry for this document into the index
  // stream. This assumes the buffer was already written
  // in the correct fields format.
  @Override
  public void startDocument(int numStoredFields) throws IOException {
    indexStream.writeLong(fieldsStream.getFilePointer());
    fieldsStream.writeVInt(numStoredFields);
  }

  @Override
  public void close() throws IOException {
    try {
      IOUtils.close(fieldsStream, indexStream);
    } finally {
      fieldsStream = indexStream = null;
    }
  }

  @Override
  public void abort() {
    try {
      close();
    } catch (Throwable ignored) {}
    IOUtils.deleteFilesIgnoringExceptions(directory,
        IndexFileNames.segmentFileName(segment, "", FIELDS_EXTENSION),
        IndexFileNames.segmentFileName(segment, "", FIELDS_INDEX_EXTENSION));
  }

  @Override
  public void writeField(FieldInfo info, StorableField field) throws IOException {
    fieldsStream.writeVInt(info.number);
    int bits = 0;
    final BytesRef bytes;
    final String string;
    // TODO: maybe a field should serialize itself?
    // this way we don't bake into indexer all these
    // specific encodings for different fields? and apps
    // can customize...

    Number number = field.numericValue();
    if (number != null) {
      if (number instanceof Byte || number instanceof Short || number instanceof Integer) {
        bits |= FIELD_IS_NUMERIC_INT;
      } else if (number instanceof Long) {
        bits |= FIELD_IS_NUMERIC_LONG;
      } else if (number instanceof Float) {
        bits |= FIELD_IS_NUMERIC_FLOAT;
      } else if (number instanceof Double) {
        bits |= FIELD_IS_NUMERIC_DOUBLE;
      } else {
        throw new IllegalArgumentException("cannot store numeric type " + number.getClass());
      }
      string = null;
      bytes = null;
    } else {
      bytes = field.binaryValue();
      if (bytes != null) {
        bits |= FIELD_IS_BINARY;
        string = null;
      } else {
        string = field.stringValue();
        if (string == null) {
          throw new IllegalArgumentException("field " + field.name() + " is stored but does not have binaryValue, stringValue nor numericValue");
        }
      }
    }

    fieldsStream.writeByte((byte) bits);

    if (bytes != null) {
      fieldsStream.writeVInt(bytes.length);
      fieldsStream.writeBytes(bytes.bytes, bytes.offset, bytes.length);
    } else if (string != null) {
      fieldsStream.writeString(field.stringValue());
    } else {
      if (number instanceof Byte || number instanceof Short || number instanceof Integer) {
        fieldsStream.writeInt(number.intValue());
      } else if (number instanceof Long) {
        fieldsStream.writeLong(number.longValue());
      } else if (number instanceof Float) {
        fieldsStream.writeInt(Float.floatToIntBits(number.floatValue()));
      } else if (number instanceof Double) {
        fieldsStream.writeLong(Double.doubleToLongBits(number.doubleValue()));
      } else {
        throw new AssertionError("Cannot get here");
      }
    }
  }
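
  // Worked example of the bits byte written by writeField (illustration added
  // for this writeup, not part of the original source): storing a Long sets
  // bits = FIELD_IS_NUMERIC_LONG = 2 << 3 = 0x10, so the on-disk entry is the
  // field number as a VInt, the byte 0x10, then the value as a fixed 8-byte
  // long. A binary value instead sets bits = FIELD_IS_BINARY = 0x02 and is
  // followed by a VInt length and the raw bytes.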

  /** Bulk write a contiguous series of documents.  The
   *  lengths array is the length (in bytes) of each raw
   *  document.  The stream IndexInput is the
   *  fieldsStream from which we should bulk-copy all
   *  bytes. */
  public void addRawDocuments(IndexInput stream, int[] lengths, int numDocs) throws IOException {
    long position = fieldsStream.getFilePointer();
    long start = position;
    for (int i = 0; i < numDocs; i++) {
      indexStream.writeLong(position);
      position += lengths[i];
    }
    fieldsStream.copyBytes(stream, position - start);
    assert fieldsStream.getFilePointer() == position;
  }
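
  // Worked example of the pointer bookkeeping in addRawDocuments (illustration
  // only): copying 3 raw docs of lengths {10, 7, 25} when the .fdt is at
  // position P appends the pointers P, P+10, and P+17 to the .fdx, then
  // bulk-copies 10+7+25 = 42 bytes from the source stream in a single
  // copyBytes call, avoiding per-field decode/re-encode during merges.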

  @Override
  public void finish(FieldInfos fis, int numDocs) {
    if (HEADER_LENGTH_IDX + ((long) numDocs)*8 != indexStream.getFilePointer())
      // This is most likely a bug in Sun JRE 1.6.0_04/_05;
      // we detect that the bug has struck, here, and
      // throw an exception to prevent the corruption from
      // entering the index.  See LUCENE-1282 for
      // details.
      throw new RuntimeException("fdx size mismatch: docCount is " + numDocs + " but fdx file size is " + indexStream.getFilePointer() + " file=" + indexStream.toString() + "; now aborting this merge to prevent index corruption");
  }

  @Override
  public int merge(MergeState mergeState) throws IOException {
    int docCount = 0;
    // Used for bulk-reading raw bytes for stored fields
    int[] rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
    int idx = 0;

    for (AtomicReader reader : mergeState.readers) {
      final SegmentReader matchingSegmentReader = mergeState.matchingSegmentReaders[idx++];
      Lucene40StoredFieldsReader matchingFieldsReader = null;
      if (matchingSegmentReader != null) {
        final StoredFieldsReader fieldsReader = matchingSegmentReader.getFieldsReader();
        // we can only bulk-copy if the matching reader is also a Lucene40FieldsReader
        if (fieldsReader != null && fieldsReader instanceof Lucene40StoredFieldsReader) {
          matchingFieldsReader = (Lucene40StoredFieldsReader) fieldsReader;
        }
      }

      if (reader.getLiveDocs() != null) {
        docCount += copyFieldsWithDeletions(mergeState, reader, matchingFieldsReader, rawDocLengths);
      } else {
        docCount += copyFieldsNoDeletions(mergeState, reader, matchingFieldsReader, rawDocLengths);
      }
    }
    finish(mergeState.fieldInfos, docCount);
    return docCount;
  }

  /** Maximum number of contiguous documents to bulk-copy when merging stored fields */
  private static final int MAX_RAW_MERGE_DOCS = 4192;

  private int copyFieldsWithDeletions(MergeState mergeState, final AtomicReader reader,
                                      final Lucene40StoredFieldsReader matchingFieldsReader, int[] rawDocLengths)
      throws IOException {
    int docCount = 0;
    final int maxDoc = reader.maxDoc();
    final Bits liveDocs = reader.getLiveDocs();
    assert liveDocs != null;
    if (matchingFieldsReader != null) {
      // We can bulk-copy because the fieldInfos are "congruent"
      for (int j = 0; j < maxDoc;) {
        if (!liveDocs.get(j)) {
          // skip deleted docs
          ++j;
          continue;
        }
        // We can optimize this case (doing a bulk byte copy) since the field
        // numbers are identical
        int start = j, numDocs = 0;
        // find the end of the contiguous run of live docs; note the loop also
        // steps past the deleted doc (if any) that terminates the run
        do {
          j++;
          numDocs++;
          if (j >= maxDoc) break;
          if (!liveDocs.get(j)) {
            j++;
            break;
          }
        } while (numDocs < MAX_RAW_MERGE_DOCS);

        IndexInput stream = matchingFieldsReader.rawDocs(rawDocLengths, start, numDocs);
        addRawDocuments(stream, rawDocLengths, numDocs);
        docCount += numDocs;
        mergeState.checkAbort.work(300 * numDocs);
      }
    } else {
      for (int j = 0; j < maxDoc; j++) {
        if (!liveDocs.get(j)) {
          // skip deleted docs
          continue;
        }
        // TODO: this could be more efficient using
        // FieldVisitor instead of loading/writing entire
        // doc; ie we just have to renumber the field number
        // on the fly?
        // NOTE: it's very important to first assign to doc then pass it to
        // fieldsWriter.addDocument; see LUCENE-1282
        StoredDocument doc = reader.document(j);
        addDocument(doc, mergeState.fieldInfos);
        docCount++;
        mergeState.checkAbort.work(300);
      }
    }
    return docCount;
  }

  private int copyFieldsNoDeletions(MergeState mergeState, final AtomicReader reader,
                                    final Lucene40StoredFieldsReader matchingFieldsReader, int[] rawDocLengths)
      throws IOException {
    final int maxDoc = reader.maxDoc();
    int docCount = 0;
    if (matchingFieldsReader != null) {
      // We can bulk-copy because the fieldInfos are "congruent"
      while (docCount < maxDoc) {
        int len = Math.min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
        IndexInput stream = matchingFieldsReader.rawDocs(rawDocLengths, docCount, len);
        addRawDocuments(stream, rawDocLengths, len);
        docCount += len;
        mergeState.checkAbort.work(300 * len);
      }
    } else {
      for (; docCount < maxDoc; docCount++) {
        // NOTE: it's very important to first assign to doc then pass it to
        // fieldsWriter.addDocument; see LUCENE-1282
        StoredDocument doc = reader.document(docCount);
        addDocument(doc, mergeState.fieldInfos);
        mergeState.checkAbort.work(300);
      }
    }
    return docCount;
  }
}
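
// Minimal usage sketch (illustrative only; assumes dir, fieldInfo, field, and
// fieldInfos come from the indexing chain -- in practice the codec and
// SegmentMerger drive this writer, not application code):
//
//   Lucene40StoredFieldsWriter writer =
//       new Lucene40StoredFieldsWriter(dir, "_0", IOContext.DEFAULT);
//   writer.startDocument(1);              // one stored field in this document
//   writer.writeField(fieldInfo, field);  // field number, bits byte, value
//   writer.finish(fieldInfos, 1);         // verifies .fdx length matches docCount
//   writer.close();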