package org.apache.lucene.index; import java.io.ByteArrayOutputStream; import java.io.DataOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.ArrayList; import java.util.List; import org.apache.lucene.store.Directory; /** * This class replaces the existing term array, terminfos array, and indexpointer long array * to be much more compact while still maintaining a high level of performance. * @author Aaron McCurry amccurry@nearinfinity.com */ public class TermInfosReaderIndex { public static interface InstantOn { void save(TermInfosReaderIndex small, Directory directory, String segment); boolean load(TermInfosReaderIndex small, Directory directory, String segment); } public static InstantOn instantOn = new InstantOn() { @Override public void save(TermInfosReaderIndex index, Directory directory, String segment) { } @Override public boolean load(TermInfosReaderIndex index, Directory directory, String segment) { return false; } }; private int[] indexToTerms; private byte[] data; private Term[] fields; private String segment; private Directory directory; public TermInfosReaderIndex(Directory directory, String segment) { this.directory = directory; this.segment = segment; } /** * Loads the segment information at segment load time. */ public void build(SegmentTermEnum indexEnum, int indexDivisor, int tiiFileLength) throws IOException { if (instantOn.load(this,directory,segment)) { return; } int indexSize = 1 + ((int) indexEnum.size - 1) / indexDivisor; indexToTerms = new int[indexSize]; // this is only an inital size, it will be GCed once the build is complete int initialSize = (int) (tiiFileLength * 1.5); ByteArrayOutputStream baos = new ByteArrayOutputStream(initialSize); CustomDataOutputStream outputStream = new CustomDataOutputStream(baos); String currentField = null; List<String> fieldStrs = new ArrayList<String>(); int fieldCounter = -1; for (int i = 0; indexEnum.next(); i++) { Term term = indexEnum.term(); if (currentField != term.field) { currentField = term.field; fieldStrs.add(currentField); fieldCounter++; } TermInfo termInfo = indexEnum.termInfo(); indexToTerms[i] = baos.size(); outputStream.writeVInt(fieldCounter); outputStream.writeString(term.text()); outputStream.writeVInt(termInfo.docFreq); outputStream.writeVInt(termInfo.skipOffset); outputStream.writeVLong(termInfo.freqPointer); outputStream.writeVLong(termInfo.proxPointer); outputStream.writeVLong(indexEnum.indexPointer); for (int j = 1; j < indexDivisor; j++) if (!indexEnum.next()) break; } outputStream.close(); fields = new Term[fieldStrs.size()]; for (int i = 0; i < fields.length; i++) { fields[i] = new Term(fieldStrs.get(i)); } this.data = baos.toByteArray(); instantOn.save(this,directory,segment); } public void seekEnum(SegmentTermEnum enumerator, int indexOffset, int totalIndexInterval) throws IOException { int index = indexToTerms[indexOffset]; int[] intResults = new int[2]; long[] longResults = new long[2]; //read the field index and get the field from the index array Term field = getFieldTerm(index,intResults); index += intResults[1]; //increment the length of the vint //read the text length readVInt(data, index, intResults); index += intResults[1]; //increment the length of the vint //create the term from the field term for speed Term term = field.createTerm(getString(data, index, intResults, longResults)); index += intResults[1]; //increment the length of the string IN BYTES //create term info object TermInfo termInfo = new TermInfo(); //read the doc freq readVInt(data, index, intResults); termInfo.docFreq = intResults[0]; index += intResults[1]; //increment the length of the vint //read the skip offset readVInt(data, index, intResults); termInfo.skipOffset = intResults[0]; index += intResults[1]; //increment the length of the vint //read the freq pointer readVLong(data, index, longResults); termInfo.freqPointer = longResults[0]; index += longResults[1]; //increment the length of the vlong //read the prox pointer readVLong(data, index, longResults); termInfo.proxPointer = longResults[0]; index += longResults[1]; //increment the length of the vlong //read the long pointer readVLong(data, index, longResults); long pointer = longResults[0]; //perform the seek enumerator.seek(pointer, ((long) indexOffset * totalIndexInterval) - 1, term, termInfo); } /** * Binary search for the given term. */ public int getIndexOffset(Term term) { int lo = 0; // binary search indexTerms[] int hi = indexToTerms.length - 1; int[] buffer = new int[2]; // getBytes(term); while (hi >= lo) { int mid = (lo + hi) >>> 1; int delta = compare(term, mid, buffer); if (delta < 0) hi = mid - 1; else if (delta > 0) lo = mid + 1; else return mid; } return hi; } public int length() { return indexToTerms.length; } public int compareTo(Term term, int termIndex) { // getBytes(term); int[] buffer = new int[2]; //creating a buffer for return multiple values during vint reads return compare(term, termIndex, buffer); } private int compare(Term term, int termIndex, int[] results) { // if term field does not equal mid's field index, then compare fields // else if they are equal, compare term's string values... int c = compareField(term, termIndex, results); return c == 0 ? compareText(term, termIndex, results[1] /* needed to move the data index position because of the field vint length*/ , results) : c; } private int compareText(Term term, int termIndex, int fieldLengthOffset, int[] buffer) { //this method may need work when it comes to uni-code values //but handling all of them as int[] arrays seems to work byte[] s2b = data; //fetch the position of the term information and add the length of the field pointer vint int indexOfStr2 = indexToTerms[termIndex] + fieldLengthOffset; readVInt(data, indexOfStr2, buffer); char[] s1b = term.text.toCharArray(); int len1 = s1b.length; int len2 = buffer[0]; int n = Math.min(len1, len2); int i = 0; int j = indexOfStr2 + buffer[1]; //index of the binary of the string plus the vint offset //lexicographical compare while (n-- != 0) { char c1 = s1b[i++]; readVInt(s2b, j, buffer); //needed to use ints here (vint for better compaction) because of uni-code char c2 = (char) buffer[0]; j += buffer[1]; if (c1 != c2) { return c1- c2; } } return len1 - len2; } private int compareField(Term term, int termIndex, int[] buffer) { return term.field.compareTo(getFieldTerm(indexToTerms[termIndex], buffer).field); } /** * Fetches the field pointer from the data array given the data index value. * @param dataIndex the data index. * @param results used for calling readvint method. * @return the Term field. */ private Term getFieldTerm(int dataIndex, int[] results) { readVInt(data, dataIndex, results); return fields[results[0]]; } /** * Reads an integer from the data array at offset and populates the results array. * Position 0 is the value read, and position 1 is the amount of bytes read. * @param data the data byte array. * @param offset the offset into the array. * @param results the value and number of bytes read from the array. */ private static void readVInt(byte[] data, int offset, int[] results) { int originalOffset = offset; byte b = data[offset++]; int i = b & 0x7F; for (int shift = 7; (b & 0x80) != 0; shift += 7) { b = data[offset++]; i |= (b & 0x7F) << shift; } results[0] = i; results[1] = offset - originalOffset; } /** * Reads an long from the data array at offset and populates the results array. * Position 0 is the value read, and position 1 is the amount of bytes read. * @param data the data byte array. * @param offset the offset into the array. * @param results the value and number of bytes read from the array. */ private static void readVLong(byte[] data, int offset, long[] results) { int originalOffset = offset; byte b = data[offset++]; long i = b & 0x7F; for (int shift = 7; (b & 0x80) != 0; shift += 7) { b = data[offset++]; i |= (b & 0x7FL) << shift; } results[0] = i; results[1] = offset - originalOffset; } /** * Gets bytes form the text of the term and sets the text data reference for reuse in other segments. * @param term the term. */ // private static void getBytes(Term term) { // if (term.textData != null) { // return; // } // String text = term.text; // int[] data = new int[text.length()]; // for (int i = 0; i < data.length; i++) { // data[i] = text.charAt(i); // } // term.textData = data; // } /** * Gets a string object from the data array at the offset with length provided. * @param data the data byte array. * @param offset the offset to start reading the string. * @param length the string. * @return the string generated. */ private static String getString(byte[] data, int offset, int[] results, long[] buffer) { int length = results[0]; results[1] = 0; char[] chars = new char[length]; for (int i = 0; i < length; i++) { readVLong(data, offset, buffer); chars[i] = (char) buffer[0]; offset += buffer[1]; results[1] += buffer[1]; } return new String(chars); } private static class CustomDataOutputStream extends DataOutputStream { CustomDataOutputStream(OutputStream out) { super(out); } void writeString(String s) throws IOException { int length = s.length(); writeVInt(length); for (int i = 0; i < length; i++) { writeVLong(s.charAt(i)); } } void writeVInt(int i) throws IOException { while ((i & ~0x7F) != 0) { writeByte((byte) ((i & 0x7f) | 0x80)); i >>>= 7; } writeByte((byte) i); } void writeVLong(long i) throws IOException { while ((i & ~0x7F) != 0) { writeByte((byte) ((i & 0x7f) | 0x80)); i >>>= 7; } writeByte((byte) i); } } public int[] getIndexToTerms() { return indexToTerms; } public void setIndexToTerms(int[] indexToTerms) { this.indexToTerms = indexToTerms; } public byte[] getData() { return data; } public void setData(byte[] data) { this.data = data; } public Term[] getFields() { return fields; } public void setFields(Term[] fields) { this.fields = fields; } }