package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
import java.util.zip.DataFormatException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.AbstractField;
import org.apache.lucene.document.CompressionTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.FieldSelectorResult;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.CloseableThreadLocal;
import org.apache.lucene.util.IOUtils;

import com.alimama.mdrill.fdtBlockCompress.FdtCompressIndexInput;

/**
 * Class responsible for access to stored document fields.
 * <p/>
 * It uses &lt;segment&gt;.fdt and &lt;segment&gt;.fdx files.
 */
final class FieldsReader implements Cloneable, Closeable {
  private static final Log LOG = LogFactory.getLog(FieldsReader.class);

  private final FieldInfos fieldInfos;

  // The main fieldStream, used only for cloning.
  private IndexInput cloneableFieldsStream;

  // This is a clone of cloneableFieldsStream used for reading documents.
  // It should not be cloned outside of a synchronized context.
  private final IndexInput fieldsStream;

  private final IndexInput cloneableIndexStream;
  private final IndexInput indexStream;
  private int numTotalDocs;
  private int size;
  private boolean closed;
  private final int format;
  private final int formatSize;

  // The docID offset where our docs begin in the index
  // file.  This will be 0 if we have our own private file.
  private int docStoreOffset;

  private CloseableThreadLocal<IndexInput> fieldsStreamTL = new CloseableThreadLocal<IndexInput>();
  private boolean isOriginal = false;

  /** Returns a cloned FieldsReader that shares open
   *  IndexInputs with the original one.  It is the caller's
   *  job not to close the original FieldsReader until all
   *  clones are closed (eg, currently SegmentReader manages
   *  this logic). */
  @Override
  public Object clone() {
    ensureOpen();
    return new FieldsReader(fieldInfos, numTotalDocs, size, format, formatSize, docStoreOffset, cloneableFieldsStream, cloneableIndexStream);
  }

  /**
   * Detects the code version this segment was written with.  Returns either
   * "2.x" for all pre-3.0 segments, or "3.0" for 3.0 segments.  This method
   * should not be called for 3.1+ segments since they already record their code
   * version.
   */
  static String detectCodeVersion(Directory dir, String segment) throws IOException {
    IndexInput idxStream = dir.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.FIELDS_INDEX_EXTENSION), 1024);
    try {
      int format = idxStream.readInt();
      if (format == FieldsWriterCompress.FORMAT_CURRENT) {
        return "3.0";
      } else if (format < FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS) {
        return "2.x";
      } else {
        return "3.0";
      }
    } finally {
      idxStream.close();
    }
  }

  // Used only by clone
  private FieldsReader(FieldInfos fieldInfos, int numTotalDocs, int size, int format, int formatSize,
                       int docStoreOffset, IndexInput cloneableFieldsStream, IndexInput cloneableIndexStream) {
    this.fieldInfos = fieldInfos;
    this.numTotalDocs = numTotalDocs;
    this.size = size;
    this.format = format;
    this.formatSize = formatSize;
    this.docStoreOffset = docStoreOffset;
    this.cloneableFieldsStream = cloneableFieldsStream;
    this.cloneableIndexStream = cloneableIndexStream;
    fieldsStream = (IndexInput) cloneableFieldsStream.clone();
    indexStream = (IndexInput) cloneableIndexStream.clone();
  }

  FieldsReader(Directory d, String segment, FieldInfos fn) throws IOException {
    this(d, segment, fn, BufferedIndexInput.BUFFER_SIZE, -1, 0);
  }

  FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize) throws IOException {
    this(d, segment, fn, readBufferSize, -1, 0);
  }

  FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize, int docStoreOffset, int size) throws IOException {
    boolean success = false;
    isOriginal = true;
    try {
      fieldInfos = fn;

      cloneableFieldsStream = d.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.FIELDS_EXTENSION), readBufferSize);
      final String indexStreamFN = IndexFileNames.segmentFileName(segment, IndexFileNames.FIELDS_INDEX_EXTENSION);
      cloneableIndexStream = d.openInput(indexStreamFN, readBufferSize);

      // First version of fdx did not include a format
      // header, but, the first int will always be 0 in that
      // case
      int firstInt = cloneableIndexStream.readInt();
      if (firstInt == 0)
        format = 0;
      else
        format = firstInt;

      if (format == FieldsWriterCompress.FORMAT_CURRENT) {
        formatSize = 4;
      } else if (format > FieldsWriter.FORMAT_CURRENT) {
        throw new IndexFormatTooNewException(cloneableIndexStream, format, 0, FieldsWriter.FORMAT_CURRENT);
      } else if (format > FieldsWriter.FORMAT) {
        formatSize = 4;
      } else {
        formatSize = 0;
      }

      if (format < FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES)
        cloneableFieldsStream.setModifiedUTF8StringsMode();

      if (format == FieldsWriterCompress.FORMAT_CURRENT) {
        // Block-compressed .fdt format (mdrill extension): wrap the stored-fields stream
        // in the block-compress-aware IndexInput before cloning it.
        cloneableFieldsStream = new FdtCompressIndexInput(cloneableFieldsStream);
        fieldsStream = (IndexInput) cloneableFieldsStream.clone();
      } else {
        fieldsStream = (IndexInput) cloneableFieldsStream.clone();
      }

      final long indexSize = cloneableIndexStream.length() - formatSize;

      if (docStoreOffset != -1) {
        // We read only a slice out of this shared fields file
        this.docStoreOffset = docStoreOffset;
        this.size = size;

        // Verify the file is long enough to hold all of our
        // docs
        assert ((int) (indexSize / 8)) >= size + this.docStoreOffset : "indexSize=" + indexSize + " size=" + size + " docStoreOffset=" + docStoreOffset;
      } else {
        this.docStoreOffset = 0;
        this.size = (int) (indexSize >> 3);
      }

      indexStream = (IndexInput) cloneableIndexStream.clone();
      numTotalDocs = (int) (indexSize >> 3);
      success = true;
    } finally {
      // With lock-less commits, it's entirely possible (and
      // fine) to hit a FileNotFound exception above.
      // In this case, we want to explicitly close any subset
      // of things that were opened so that we don't have to
      // wait for a GC to do so.
      if (!success) {
        close();
      }
    }
  }

  /**
   * @throws AlreadyClosedException if this FieldsReader is closed
   */
  private void ensureOpen() throws AlreadyClosedException {
    if (closed) {
      throw new AlreadyClosedException("this FieldsReader is closed");
    }
  }

  /**
   * Closes the underlying {@link org.apache.lucene.store.IndexInput} streams, including any ones associated with a
   * lazy implementation of a Field.  This means that the Fields values will not be accessible.
   *
   * @throws IOException
   */
  public final void close() throws IOException {
    if (!closed) {
      if (isOriginal) {
        IOUtils.close(fieldsStream, indexStream, fieldsStreamTL, cloneableFieldsStream, cloneableIndexStream);
      } else {
        IOUtils.close(fieldsStream, indexStream, fieldsStreamTL);
      }
      closed = true;
    }
  }

  final int size() {
    return size;
  }

  private final void seekIndex(int docID) throws IOException {
    indexStream.seek(formatSize + (docID + docStoreOffset) * 8L);
  }

  boolean canReadRawDocs() {
    // Disable reading raw docs in 2.x format, because of the removal of compressed
    // fields in 3.0. We don't want rawDocs() to decode field bits to figure out
    // if a field was compressed, hence we enforce ordinary (non-raw) stored field merges
    // for <3.0 indexes.
    return format >= FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS;
  }

  final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
    seekIndex(n);
    long position = indexStream.readLong();
    fieldsStream.seek(position);

    Document doc = new Document();
    int numFields = fieldsStream.readVInt();
    // LOG.info("doc seek"+position+","+n+","+numFields);
    out: for (int i = 0; i < numFields; i++) {
      int fieldNumber = fieldsStream.readVInt();
      FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);
      FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.name);

      int bits = fieldsStream.readByte() & 0xFF;
      assert bits <= (FieldsWriter.FIELD_IS_NUMERIC_MASK | FieldsWriter.FIELD_IS_COMPRESSED | FieldsWriter.FIELD_IS_TOKENIZED | FieldsWriter.FIELD_IS_BINARY)
        : "bits=" + Integer.toHexString(bits);

      boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
      assert (compressed ? (format < FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS) : true)
        : "compressed fields are only allowed in indexes of version <= 2.9";
      if (compressed) {
        LOG.info("compressed" + position + "," + n + "," + fieldNumber + "@" + i + "@" + fi.name + ",bits=" + bits);
      }
      boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
      boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;
      final int numeric = bits & FieldsWriter.FIELD_IS_NUMERIC_MASK;

      switch (acceptField) {
        case LOAD: {
          addField(doc, fi, binary, compressed, tokenize, numeric);
          break;
        }
        case LOAD_AND_BREAK: {
          addField(doc, fi, binary, compressed, tokenize, numeric);
          break out; // Get out of this loop
        }
        case LAZY_LOAD: {
          addFieldLazy(doc, fi, binary, compressed, tokenize, true, numeric);
          break;
        }
        case LATENT: {
          addFieldLazy(doc, fi, binary, compressed, tokenize, false, numeric);
          break;
        }
        case SIZE: {
          skipFieldBytes(binary, compressed, addFieldSize(doc, fi, binary, compressed, numeric));
          break;
        }
        case SIZE_AND_BREAK: {
          addFieldSize(doc, fi, binary, compressed, numeric);
          break out; // Get out of this loop
        }
        default: {
          skipField(binary, compressed, numeric);
        }
      }
    }

    return doc;
  }

  /** Returns the length in bytes of each raw document in a
   *  contiguous range of length numDocs starting with
   *  startDocID.
   *  Returns the IndexInput (the fieldStream),
   *  already seeked to the starting point for
   *  startDocID. */
  final IndexInput rawDocs(long[] posStartList, long[] posEndList, int startDocID, int numDocs) throws IOException {
    seekIndex(startDocID);
    long startOffset = indexStream.readLong();
    long posStart = startOffset;
    int count = 0;
    while (count < numDocs) {
      final long posEnd;
      final int docID = docStoreOffset + startDocID + count + 1;
      assert docID <= numTotalDocs;
      if (docID < numTotalDocs) {
        posEnd = indexStream.readLong();
      } else {
        posEnd = -1;
      }
      posStartList[count] = posStart;
      posEndList[count] = posEnd;
      count++;
      posStart = posEnd;
    }

    fieldsStream.seek(startOffset);

    return fieldsStream;
  }

  /**
   * Skip the field.  We still have to read some of the information about the field, but can skip past the actual content.
   * This will have the most payoff on large fields.
   */
  private void skipField(boolean binary, boolean compressed, int numeric) throws IOException {
    final int numBytes;
    switch (numeric) {
      case 0:
        numBytes = fieldsStream.readVInt();
        break;
      case FieldsWriter.FIELD_IS_NUMERIC_INT:
        fieldsStream.readVVInt();
        numBytes = 0;
        break;
      case FieldsWriter.FIELD_IS_NUMERIC_FLOAT:
        fieldsStream.readVVVInt();
        numBytes = 0;
        break;
      case FieldsWriter.FIELD_IS_NUMERIC_LONG:
        fieldsStream.readVVLong();
        numBytes = 0;
        break;
      case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE:
        fieldsStream.readVVVLong();
        numBytes = 0;
        break;
      default:
        throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric));
    }
    skipFieldBytes(binary, compressed, numBytes);
  }

  private void skipFieldBytes(boolean binary, boolean compressed, int toRead) throws IOException {
    if (toRead <= 0) {
      return;
    }
    if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary || compressed) {
      byte[] skip = new byte[toRead];
      fieldsStream.readBytes(skip, 0, toRead);
    } else {
      // We need to skip chars.
      // This will slow us down, but still better
      fieldsStream.skipChars(toRead);
    }
  }

  private NumericField loadNumericField(FieldInfo fi, int numeric) throws IOException {
    assert numeric != 0;
    switch (numeric) {
      case FieldsWriter.FIELD_IS_NUMERIC_INT:
        return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setIntValue(fieldsStream.readVVInt());
      case FieldsWriter.FIELD_IS_NUMERIC_LONG:
        return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setLongValue(fieldsStream.readVVLong());
      case FieldsWriter.FIELD_IS_NUMERIC_FLOAT:
        return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setFloatValue(Float.intBitsToFloat(fieldsStream.readVVVInt()));
      case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE:
        return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setDoubleValue(Double.longBitsToDouble(fieldsStream.readVVVLong()));
      default:
        throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric));
    }
  }

  private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize,
                            boolean cacheResult, int numeric) throws IOException {
    final AbstractField f;
    if (binary) {
      int toRead = fieldsStream.readVInt();
      long pointer = fieldsStream.getFilePointer();
      f = new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary, compressed, cacheResult);
      // Need to move the pointer ahead by toRead positions
      // fieldsStream.seek(pointer + toRead);
      fieldsStream.seek(pointer);
      byte[] skip = new byte[toRead];
      fieldsStream.readBytes(skip, 0, toRead);
    } else if (numeric != 0) {
      f = loadNumericField(fi, numeric);
    } else {
      Field.Store store = Field.Store.YES;
      Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize);
      Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector);

      if (compressed) {
        int toRead = fieldsStream.readVInt();
        long pointer = fieldsStream.getFilePointer();
        f = new LazyField(fi.name, store, toRead, pointer, binary, compressed, cacheResult);
        // skip over the part that we aren't loading
        // fieldsStream.seek(pointer + toRead);
        fieldsStream.seek(pointer);
        byte[] skip = new byte[toRead];
        fieldsStream.readBytes(skip, 0, toRead);
      } else {
        int length = fieldsStream.readVInt();
        long pointer = fieldsStream.getFilePointer();
        // Skip ahead of where we are by the length of what is stored
        if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
          // fieldsStream.seek(pointer+length);
          fieldsStream.seek(pointer);
          byte[] skip = new byte[length];
          fieldsStream.readBytes(skip, 0, length);
        } else {
          fieldsStream.skipChars(length);
        }
        f = new LazyField(fi.name, store, index, termVector, length, pointer, binary, compressed, cacheResult);
      }
    }

    f.setOmitNorms(fi.omitNorms);
    f.setIndexOptions(fi.indexOptions);
    doc.add(f);
  }

  private void addField(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize, int numeric) throws CorruptIndexException, IOException {
    final AbstractField f;

    // we have a binary stored field, and it may be compressed
    if (binary) {
      int toRead = fieldsStream.readVInt();
      final byte[] b = new byte[toRead];
      fieldsStream.readBytes(b, 0, b.length);
      if (compressed) {
        f = new Field(fi.name, uncompress(b));
      } else {
        f = new Field(fi.name, b);
      }
    } else if (numeric != 0) {
      f = loadNumericField(fi, numeric);
    } else {
      Field.Store store = Field.Store.YES;
      Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize);
      Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector,
                                                                   fi.storePositionWithTermVector);
      if (compressed) {
        int toRead = fieldsStream.readVInt();
        final byte[] b = new byte[toRead];
        fieldsStream.readBytes(b, 0, b.length);
        f = new Field(fi.name,      // field name
                      false,
                      new String(uncompress(b), "UTF-8"), // uncompress the value and add as string
                      store,
                      index,
                      termVector);
      } else {
        f = new Field(fi.name,     // name
                      false,
                      fieldsStream.readString(), // read value
                      store,
                      index,
                      termVector);
      }
    }

    f.setIndexOptions(fi.indexOptions);
    f.setOmitNorms(fi.omitNorms);
    doc.add(f);
  }

  // Add the size of field as a byte[] containing the 4 bytes of the integer byte size (high order byte first; char = 2 bytes)
  // Read just the size -- caller must skip the field content to continue reading fields
  // Return the size in bytes or chars, depending on field type
  private int addFieldSize(Document doc, FieldInfo fi, boolean binary, boolean compressed, int numeric) throws IOException {
    final int bytesize, size;
    switch (numeric) {
      case 0:
        size = fieldsStream.readVInt();
        bytesize = (binary || compressed) ? size : 2 * size;
        break;
      case FieldsWriter.FIELD_IS_NUMERIC_INT:
        fieldsStream.readVVInt();
        size = 0;
        bytesize = 4;
        break;
      case FieldsWriter.FIELD_IS_NUMERIC_FLOAT:
        fieldsStream.readVVVInt();
        size = 0;
        bytesize = 4;
        break;
      case FieldsWriter.FIELD_IS_NUMERIC_LONG:
        fieldsStream.readVVLong();
        size = 0;
        bytesize = 8;
        break;
      case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE:
        fieldsStream.readVVVLong();
        size = 0;
        bytesize = 8;
        break;
      default:
        throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric));
    }
    byte[] sizebytes = new byte[4];
    sizebytes[0] = (byte) (bytesize >>> 24);
    sizebytes[1] = (byte) (bytesize >>> 16);
    sizebytes[2] = (byte) (bytesize >>>  8);
    sizebytes[3] = (byte)  bytesize;
    doc.add(new Field(fi.name, sizebytes));
    return size;
  }

  /**
   * A Lazy implementation of Fieldable that defers loading of fields until asked for, instead of when the Document is
   * loaded.
   */
  private class LazyField extends AbstractField implements Fieldable {
    private int toRead;
    private long pointer;
    /** @deprecated Only kept for backward-compatibility with &lt;3.0 indexes. Will be removed in 4.0. */
    @Deprecated
    private boolean isCompressed;
    private boolean cacheResult;

    public LazyField(String name, Field.Store store, int toRead, long pointer, boolean isBinary, boolean isCompressed, boolean cacheResult) {
      super(name, store, Field.Index.NO, Field.TermVector.NO);
      this.toRead = toRead;
      this.pointer = pointer;
      this.isBinary = isBinary;
      this.cacheResult = cacheResult;
      if (isBinary)
        binaryLength = toRead;
      lazy = true;
      this.isCompressed = isCompressed;
    }

    public LazyField(String name, Field.Store store, Field.Index index, Field.TermVector termVector, int toRead, long pointer, boolean isBinary, boolean isCompressed, boolean cacheResult) {
      super(name, store, index, termVector);
      this.toRead = toRead;
      this.pointer = pointer;
      this.isBinary = isBinary;
      this.cacheResult = cacheResult;
      if (isBinary)
        binaryLength = toRead;
      lazy = true;
      this.isCompressed = isCompressed;
    }

    private IndexInput getFieldStream() {
      IndexInput localFieldsStream = fieldsStreamTL.get();
      if (localFieldsStream == null) {
        localFieldsStream = (IndexInput) cloneableFieldsStream.clone();
        fieldsStreamTL.set(localFieldsStream);
      }
      return localFieldsStream;
    }

    /** The value of the field as a Reader, or null.  If null, the String value,
     *  binary value, or TokenStream value is used.  Exactly one of stringValue(),
     *  readerValue(), getBinaryValue(), and tokenStreamValue() must be set.
     */
    public Reader readerValue() {
      ensureOpen();
      return null;
    }

    /** The value of the field as a TokenStream, or null.  If null, the Reader value,
     *  String value, or binary value is used.  Exactly one of stringValue(),
     *  readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
    public TokenStream tokenStreamValue() {
      ensureOpen();
      return null;
    }

    /** The value of the field as a String, or null.  If null, the Reader value,
     *  binary value, or TokenStream value is used.  Exactly one of stringValue(),
     *  readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
    public String stringValue() {
      ensureOpen();
      if (isBinary)
        return null;
      else {
        if (fieldsData == null) {
          IndexInput localFieldsStream = getFieldStream();
          String value;
          try {
            localFieldsStream.seek(pointer);
            if (isCompressed) {
              final byte[] b = new byte[toRead];
              localFieldsStream.readBytes(b, 0, b.length);
              value = new String(uncompress(b), "UTF-8");
            } else {
              if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
                byte[] bytes = new byte[toRead];
                localFieldsStream.readBytes(bytes, 0, toRead);
                value = new String(bytes, "UTF-8");
              } else {
                // read in chars b/c we already know the length we need to read
                char[] chars = new char[toRead];
                localFieldsStream.readChars(chars, 0, toRead);
                value = new String(chars);
              }
            }
          } catch (IOException e) {
            throw new FieldReaderException(e);
          }
          if (cacheResult) {
            fieldsData = value;
          }
          return value;
        } else {
          return (String) fieldsData;
        }
      }
    }

//    public long getPointer() {
//      ensureOpen();
//      return pointer;
//    }
//
//    public void setPointer(long pointer) {
//      ensureOpen();
//      this.pointer = pointer;
//    }
//
//    public int getToRead() {
//      ensureOpen();
//      return toRead;
//    }
//
//    public void setToRead(int toRead) {
//      ensureOpen();
//      this.toRead = toRead;
//    }

    @Override
    public byte[] getBinaryValue(byte[] result) {
      ensureOpen();

      if (isBinary) {
        if (fieldsData == null) {
          // Allocate new buffer if result is null or too small
          final byte[] b;
          byte[] value;
          if (result == null || result.length < toRead)
            b = new byte[toRead];
          else
            b = result;

          IndexInput localFieldsStream = getFieldStream();

          // Throw this IOException since IndexReader.document does so anyway, so probably not that big of a change for people
          // since they are already handling this exception when getting the document
          try {
            localFieldsStream.seek(pointer);
            localFieldsStream.readBytes(b, 0, toRead);
            if (isCompressed == true) {
              value = uncompress(b);
            } else {
              value = b;
            }
          } catch (IOException e) {
            throw new FieldReaderException(e);
          }

          binaryOffset = 0;
          binaryLength = toRead;
          if (cacheResult == true) {
            fieldsData = value;
          }
          return value;
        } else {
          return (byte[]) fieldsData;
        }
      } else {
        return null;
      }
    }
  }

  private byte[] uncompress(byte[] b) throws CorruptIndexException {
    try {
      return CompressionTools.decompress(b);
    } catch (DataFormatException e) {
      // this will happen if the field is not compressed
      CorruptIndexException newException = new CorruptIndexException("field data are in wrong format: " + e.toString());
      newException.initCause(e);
      throw newException;
    }
  }
}
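
/*
 * Usage sketch (illustrative only, not part of this class). FieldsReader is
 * package-private and is normally driven by SegmentReader; from code inside
 * the org.apache.lucene.index package it could be exercised roughly as below.
 * The segment name "_0" and field name "title" are assumed examples.
 *
 *   Directory dir = ...;                                   // directory holding _0.fdt / _0.fdx
 *   FieldInfos infos = new FieldInfos(dir, "_0.fnm");      // field metadata for the segment
 *   FieldsReader reader = new FieldsReader(dir, "_0", infos);
 *   try {
 *     // Load only the "title" field of doc 0 and stop reading further fields.
 *     Document doc = reader.doc(0, new FieldSelector() {
 *       public FieldSelectorResult accept(String fieldName) {
 *         return "title".equals(fieldName) ? FieldSelectorResult.LOAD_AND_BREAK
 *                                          : FieldSelectorResult.NO_LOAD;
 *       }
 *     });
 *   } finally {
 *     reader.close();
 *   }
 */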