package com.aptana.ide.internal.index.core;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.io.UTFDataFormatException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.aptana.ide.index.core.Index;
import com.aptana.ide.index.core.QueryResult;
import com.aptana.ide.index.core.SearchPattern;

/**
 * Yuck. This needs to be a random access file that stores the index of documents, category names, and the
 * relationships between them. We need to be careful to allow for quick access into the categories when reading, and to
 * keep file size down if possible.
 *
 * @author cwilliams
 */
public class DiskIndex
{
    private static final String SIGNATURE = "INDEX VERSION 0.1";
    private static final int CHUNK_SIZE = 100;
    private static final int RE_INDEXED = -1;
    private static final int DELETED = -2;
    private static final boolean DEBUG = true;

    private File indexFile;

    private int headerInfoOffset;
    private int streamRead;

    private int numberOfChunks;
    private int sizeOfLastChunk;
    private int documentReferenceSize;
    private char separator = Index.DEFAULT_SEPARATOR;
    private int[] chunkOffsets;
    private int startOfCategoryTables;
    private Map<String, Integer> categoryOffsets;
    private Map<String, Integer> categoryEnds;

    // Usually a map from string to map from string to list of integer. But may also be a single integer (to represent
    // a pointer to a long array).
    // FIXME YUCK!! This "usually a list of integers, sometimes one integer that acts as a pointer" stuff is killing me!
    private Map<String, Map<String, Object>> categoryTables;

    private int streamEnd;
    private String cachedCategoryName;
    private String[][] cachedChunks;
    private String[] categoriesToDiscard;

    public DiskIndex(String fileName)
    {
        this.indexFile = new File(fileName);

        // clear cached items
        this.headerInfoOffset = -1;
        this.numberOfChunks = -1;
        this.sizeOfLastChunk = -1;
        this.chunkOffsets = null;
        this.documentReferenceSize = -1;
        this.categoryTables = null;
        this.categoryOffsets = null;
        this.categoryEnds = null;
        this.categoriesToDiscard = null;
    }

    /**
     * Skips over n bytes, falling back to single-byte reads to force progress when the underlying stream's skip()
     * returns early.
     */
    private long skip(InputStream stream, long n) throws IOException
    {
        final long wantToSkip = n;
        long actuallySkipped = 0;
        while (n > 0)
        {
            long skipped = stream.skip(n);
            actuallySkipped += skipped;
            n = n - skipped;
            if (actuallySkipped == wantToSkip)
                return actuallySkipped;
            int read = stream.read();
            if (read == -1)
                return actuallySkipped;
            n--;
            actuallySkipped++;
        }
        return actuallySkipped;
    }
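    /*
     * On-disk layout, in write order (see writeDocumentNames(), writeCategoryTable() and writeHeaderInfo()):
     *   - signature string ("INDEX VERSION 0.1")
     *   - int: offset to the header info (-1 until back-patched by writeOffsetToHeader())
     *   - document name chunks, CHUNK_SIZE names per chunk, delta-compressed
     *   - category tables, each preceded by any large document-number arrays it points into
     *   - header info: chunk count/offsets, sizes, and the category name -> table offset map
     */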
    public void initialize() throws IOException
    {
        if (this.indexFile.exists())
        {
            // read it in!
            InputStream stream = new BufferedInputStream(new FileInputStream(this.indexFile));
            try
            {
                streamRead = 0;
                String signature = readString(stream);
                if (!signature.equals(SIGNATURE))
                {
                    throw new IOException("Wrong index file format");
                }
                this.headerInfoOffset = readStreamInt(stream);
                if (this.headerInfoOffset > 0) // file is empty if it's not set
                {
                    // assume that the header info offset is past the current buffer end
                    skip(stream, this.headerInfoOffset - this.streamRead);
                    readHeaderInfo(stream);
                }
            }
            finally
            {
                stream.close();
            }
            return;
        }
        else
        {
            // create a new empty one!
            if (indexFile.createNewFile())
            {
                FileOutputStream stream = new FileOutputStream(this.indexFile, false);
                try
                {
                    writeString(stream, SIGNATURE);
                    // a single 0xFF byte; readStreamInt() sees 0xFF for each EOF read as well, so this reads back
                    // as -1 and the file is treated as empty
                    stream.write(-1);
                }
                finally
                {
                    stream.close();
                }
            }
            else
                throw new IOException("Failed to create new index file " + indexFile);
        }
    }

    private void readHeaderInfo(InputStream stream) throws IOException
    {
        // must be same order as writeHeaderInfo()
        this.numberOfChunks = readStreamInt(stream);
        this.sizeOfLastChunk = read(stream) & 0xFF;
        this.documentReferenceSize = read(stream) & 0xFF;
        this.separator = (char) (read(stream) & 0xFF);
        this.chunkOffsets = new int[this.numberOfChunks];
        for (int i = 0; i < this.numberOfChunks; i++)
            this.chunkOffsets[i] = readStreamInt(stream);

        this.startOfCategoryTables = readStreamInt(stream);

        int size = readStreamInt(stream);
        this.categoryOffsets = new HashMap<String, Integer>(size);
        this.categoryEnds = new HashMap<String, Integer>(size);
        String previousCategory = null;
        int offset = -1;
        for (int i = 0; i < size; i++)
        {
            String categoryName = readString(stream);
            offset = readStreamInt(stream);
            this.categoryOffsets.put(categoryName, offset); // cache offset to category table
            if (previousCategory != null)
            {
                this.categoryEnds.put(previousCategory, offset); // cache end of the category table
            }
            previousCategory = categoryName;
        }
        if (previousCategory != null)
        {
            this.categoryEnds.put(previousCategory, this.headerInfoOffset); // cache end of the category table
        }
        this.categoryTables = new HashMap<String, Map<String, Object>>(3);
    }

    private int read(InputStream stream) throws IOException
    {
        int val = stream.read();
        streamRead++;
        return val;
    }
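    /*
     * Strings are written with a two-byte char-count prefix followed by the characters in a UTF-8-like variable-length
     * encoding: one byte for chars <= 0x7F, two bytes (110xxxxx 10xxxxxx) for chars <= 0x7FF, and three bytes
     * (1110xxxx 10xxxxxx 10xxxxxx) otherwise. readString() is the mirror of this.
     */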
    private void writeString(OutputStream stream, String value) throws IOException
    {
        char[] array = value.toCharArray();
        int length = array.length;
        stream.write((byte) ((length >>> 8) & 0xFF)); // store the char array length instead of bytes
        stream.write((byte) (length & 0xFF)); // this allows it to be read faster
        this.streamEnd += 2;

        // we're assuming that very few char[] are so large that we need to flush the buffer more than once, if at all
        for (char ch : array)
        {
            if ((ch & 0x007F) == ch)
            {
                stream.write((byte) ch);
                this.streamEnd++;
            }
            else if ((ch & 0x07FF) == ch)
            {
                // first five bits are stored in first byte
                byte b = (byte) (ch >> 6);
                b &= 0x1F;
                b |= 0xC0;
                stream.write(b);
                streamEnd++;
                // last six bits are stored in second byte
                b = (byte) (ch & 0x3F);
                b |= 0x80;
                stream.write(b);
                streamEnd++;
            }
            else
            {
                // first four bits are stored in first byte
                byte b = (byte) (ch >> 12);
                b &= 0x0F;
                b |= 0xE0;
                stream.write(b);
                streamEnd++;
                // six following bits are stored in second byte
                b = (byte) (ch >> 6);
                b &= 0x3F;
                b |= 0x80;
                stream.write(b);
                streamEnd++;
                // last six bits are stored in third byte
                b = (byte) (ch & 0x3F);
                b |= 0x80;
                stream.write(b);
                streamEnd++;
            }
        }
        stream.flush();
    }

    private int readStreamInt(InputStream stream) throws IOException
    {
        int val = (read(stream) & 0xFF) << 24;
        val += (read(stream) & 0xFF) << 16;
        val += (read(stream) & 0xFF) << 8;
        return val + (read(stream) & 0xFF);
    }

    private String readString(InputStream stream) throws IOException
    {
        int length = (read(stream) & 0xFF) << 8;
        length += read(stream) & 0xFF;

        // fill the chars from bytes buffer
        char[] word = new char[length];
        int i = 0;
        while (i < length)
        {
            byte b = (byte) read(stream);
            switch (b & 0xF0)
            {
                case 0x00:
                case 0x10:
                case 0x20:
                case 0x30:
                case 0x40:
                case 0x50:
                case 0x60:
                case 0x70:
                    word[i++] = (char) b;
                    break;
                case 0xC0:
                case 0xD0:
                    char next = (char) read(stream);
                    if ((next & 0xC0) != 0x80)
                    {
                        throw new UTFDataFormatException();
                    }
                    char ch = (char) ((b & 0x1F) << 6);
                    ch |= next & 0x3F;
                    word[i++] = ch;
                    break;
                case 0xE0:
                    char first = (char) read(stream);
                    char second = (char) read(stream);
                    if ((first & second & 0xC0) != 0x80)
                    {
                        throw new UTFDataFormatException();
                    }
                    ch = (char) ((b & 0x0F) << 12);
                    ch |= ((first & 0x3F) << 6);
                    ch |= second & 0x3F;
                    word[i++] = ch;
                    break;
                default:
                    throw new UTFDataFormatException();
            }
        }
        return new String(word);
    }

    private synchronized List<String> readAllDocumentNames() throws IOException
    {
        if (this.numberOfChunks <= 0)
            return Collections.emptyList();

        InputStream stream = new BufferedInputStream(new FileInputStream(this.indexFile));
        try
        {
            int offset = this.chunkOffsets[0];
            skip(stream, offset);
            int lastIndex = this.numberOfChunks - 1;
            String[] docNames = new String[lastIndex * CHUNK_SIZE + sizeOfLastChunk];
            for (int i = 0; i < this.numberOfChunks; i++)
                readChunk(docNames, stream, i * CHUNK_SIZE, i < lastIndex ? CHUNK_SIZE : sizeOfLastChunk);
            return Arrays.asList(docNames);
        }
        finally
        {
            stream.close();
        }
    }
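    /**
     * Merges the in-memory index into this on-disk index, producing a new index file. The new document name list is
     * computed first (so every surviving document gets a new position), then each category table is merged with the
     * remapped document numbers, and finally the header is written and the temp file is renamed over the old one.
     */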
    public DiskIndex mergeWith(MemoryIndex memoryIndex) throws IOException
    {
        // assume write lock is held
        // compute & write out new docNames
        List<String> names = readAllDocumentNames();
        int previousLength = names.size();
        // keeps track of the position of each document in the new sorted docNames
        int[] positions = new int[previousLength];
        // for each new/changed document in the memoryIndex
        Map<String, Integer> indexedDocuments = new HashMap<String, Integer>(3);
        names = computeDocumentNames(names, positions, indexedDocuments, memoryIndex);
        if (names.isEmpty())
        {
            if (previousLength == 0)
                return this; // nothing to do... memory index contained deleted documents that had never been saved

            // index is now empty since all the saved documents were removed
            DiskIndex newDiskIndex = new DiskIndex(this.indexFile.getPath());
            newDiskIndex.initialize();
            return newDiskIndex;
        }

        this.streamEnd = 0;
        DiskIndex newDiskIndex = new DiskIndex(this.indexFile.getPath() + ".tmp"); //$NON-NLS-1$
        try
        {
            newDiskIndex.initializeFrom(this, newDiskIndex.indexFile);
            OutputStream stream = new BufferedOutputStream(new FileOutputStream(newDiskIndex.indexFile, false));
            int offsetToHeader = -1;
            try
            {
                newDiskIndex.writeDocumentNames(stream, names);
                names = null;

                // add each new/changed doc to empty category tables using its new position #
                if (!indexedDocuments.isEmpty())
                {
                    for (Map.Entry<String, Integer> entry : indexedDocuments.entrySet())
                        if (entry.getKey() != null)
                            newDiskIndex.copyQueryResults(memoryIndex.getCategoriesForDocument(entry.getKey()),
                                    entry.getValue());
                }
                indexedDocuments = null; // free up the space

                // TODO Check list of categories we wanted removed and wipe them out of "categoryTables"?

                // merge each category table with the new ones & write them out
                if (previousLength == 0)
                    newDiskIndex.writeCategories(stream);
                else
                    newDiskIndex.mergeCategories(this, positions, stream);
                // write header
                offsetToHeader = newDiskIndex.streamEnd;
                newDiskIndex.writeHeaderInfo(stream);
                positions = null; // free up the space
            }
            finally
            {
                stream.close();
            }
            newDiskIndex.writeOffsetToHeader(offsetToHeader);

            // rename file by deleting previous index file & renaming temp one
            if (this.indexFile.exists() && !this.indexFile.delete())
            {
                throw new IOException("Failed to delete index file " + this.indexFile); //$NON-NLS-1$
            }
            if (!newDiskIndex.indexFile.renameTo(this.indexFile))
            {
                throw new IOException("Failed to rename index file " + this.indexFile); //$NON-NLS-1$
            }
        }
        catch (IOException e)
        {
            if (newDiskIndex.indexFile.exists() && !newDiskIndex.indexFile.delete())
                if (DEBUG)
                    System.out.println("mergeWith - Failed to delete temp index " + newDiskIndex.indexFile); //$NON-NLS-1$
            throw e;
        }
        newDiskIndex.indexFile = this.indexFile;
        return newDiskIndex;
    }

    private void mergeCategories(DiskIndex onDisk, int[] positions, OutputStream stream) throws IOException
    {
        // at this point, this.categoryTables contains the names -> wordsToDocs added in copyQueryResults()
        Set<String> oldNames = onDisk.categoryOffsets.keySet();
        for (String oldName : oldNames)
        {
            if (oldName != null && !this.categoryTables.containsKey(oldName))
                this.categoryTables.put(oldName, null);
        }
        Set<String> categoryNames = this.categoryTables.keySet();
        for (String categoryName : categoryNames)
            if (categoryName != null)
                mergeCategory(categoryName, onDisk, positions, stream);
        this.categoryTables = null;
    }

    @SuppressWarnings("unchecked")
    private void mergeCategory(String categoryName, DiskIndex onDisk, int[] positions, OutputStream stream)
            throws IOException
    {
        Map<String, Object> wordsToDocs = this.categoryTables.get(categoryName);
        if (wordsToDocs == null)
            wordsToDocs = new HashMap<String, Object>(3);

        Map<String, Object> oldWordsToDocs = onDisk.readCategoryTable(categoryName, true);
        if (oldWordsToDocs != null)
        {
            nextWord: for (Map.Entry<String, Object> entry : oldWordsToDocs.entrySet())
            {
                String oldWord = entry.getKey();
                if (oldWord == null)
                    continue;
                List<Integer> oldDocNumbers = (List<Integer>) entry.getValue();
                List<Integer> mappedNumbers = new ArrayList<Integer>(oldDocNumbers.size());
                for (Integer oldDocNumber : oldDocNumbers)
                {
                    int pos = positions[oldDocNumber];
                    if (pos > RE_INDEXED) // forget any reference to a document which was deleted or re-indexed
                        mappedNumbers.add(pos);
                }
                if (mappedNumbers.isEmpty())
                    continue nextWord; // skip words which no longer have any references

                Object o = wordsToDocs.get(oldWord);
                if (o == null)
                {
                    wordsToDocs.put(oldWord, mappedNumbers);
                }
                else
                {
                    List<Integer> list = null;
                    if (o instanceof List<?>)
                    {
                        list = (List<Integer>) o;
                    }
                    else
                    {
                        list = new ArrayList<Integer>();
                        wordsToDocs.put(oldWord, list);
                    }
                    list.addAll(mappedNumbers);
                }
            }
            onDisk.categoryTables.put(categoryName, null); // flush cached table
        }
        writeCategoryTable(categoryName, wordsToDocs, stream);
    }

    private void initializeFrom(DiskIndex diskIndex, File newIndexFile) throws IOException
    {
        if (newIndexFile.exists() && !newIndexFile.delete())
        {
            // delete the temporary index file
            if (DEBUG)
                System.out.println("initializeFrom - Failed to delete temp index " + this.indexFile); //$NON-NLS-1$
        }
        else if (!newIndexFile.createNewFile())
        {
            if (DEBUG)
                System.out.println("initializeFrom - Failed to create temp index " + this.indexFile); //$NON-NLS-1$
            throw new IOException("Failed to create temp index " + this.indexFile); //$NON-NLS-1$
        }

        int size = diskIndex.categoryOffsets == null ? 8 : diskIndex.categoryOffsets.size();
        this.categoryOffsets = new HashMap<String, Integer>(size);
        this.categoryEnds = new HashMap<String, Integer>(size);
        this.categoryTables = new HashMap<String, Map<String, Object>>(size);
        this.separator = diskIndex.separator;
        this.categoriesToDiscard = diskIndex.categoriesToDiscard;
    }
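    /**
     * Adds every word of every category for one new/changed document into this index's in-memory category tables,
     * recording the document under its new position number.
     */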
    @SuppressWarnings("unchecked")
    private void copyQueryResults(Map<String, Set<String>> categoryToWords, int newPosition)
    {
        for (Map.Entry<String, Set<String>> entry : categoryToWords.entrySet())
        {
            String categoryName = entry.getKey();
            if (categoryName == null)
                continue;
            Map<String, Object> wordsToDocs = this.categoryTables.get(categoryName);
            if (wordsToDocs == null)
                this.categoryTables.put(categoryName, wordsToDocs = new HashMap<String, Object>());

            for (String word : entry.getValue())
            {
                if (word == null)
                    continue;
                Object positions = wordsToDocs.get(word);
                if (positions == null)
                {
                    wordsToDocs.put(word, positions = new ArrayList<Integer>());
                }
                ((List<Integer>) positions).add(newPosition);
            }
        }
    }

    private void writeCategories(OutputStream stream) throws IOException
    {
        for (Map.Entry<String, Map<String, Object>> entry : categoryTables.entrySet())
        {
            String categoryName = entry.getKey();
            if (categoryName != null)
                writeCategoryTable(categoryName, entry.getValue(), stream);
        }
        this.categoryTables = null;
    }

    @SuppressWarnings("unchecked")
    private void writeCategoryTable(String categoryName, Map<String, Object> wordsToDocs, OutputStream stream)
            throws IOException
    {
        if (this.categoriesToDiscard != null)
        {
            for (String categoryToDiscard : categoriesToDiscard)
                if (categoryName.equals(categoryToDiscard))
                    return;
        }
        // the format of a category table is as follows:
        // - any document number arrays with >= 256 elements are written before the table (the offset to each array
        //   is remembered)
        // - then the number of word->int[] pairs in the table is written
        // - for each word -> int[] pair, the word is written followed by:
        //   an int <= 0 if the array size == 1 (the value is the negated document number)
        //   an int > 1 & < 256 for the size of the array; the document array follows immediately
        //   256 if the array size >= 256, followed by another int which is the offset to the array (written prior to
        //   the table)

        Map<String, Integer> longArrays = new HashMap<String, Integer>();
        int largeArraySize = 256;
        for (Map.Entry<String, Object> entry : wordsToDocs.entrySet())
        {
            List<Integer> docNumbers = (List<Integer>) entry.getValue();
            if (docNumbers.size() >= largeArraySize)
            {
                longArrays.put(entry.getKey(), Integer.valueOf(this.streamEnd));
                writeDocumentNumbers(docNumbers, stream);
            }
        }

        this.categoryOffsets.put(categoryName, this.streamEnd); // remember the offset to the start of the table
        this.categoryTables.put(categoryName, null); // flush cached table
        writeStreamInt(stream, wordsToDocs.size());
        for (Map.Entry<String, Object> entry : wordsToDocs.entrySet())
        {
            writeString(stream, entry.getKey());
            if (longArrays.containsKey(entry.getKey()))
            {
                writeStreamInt(stream, largeArraySize); // mark to identify that an offset follows
                // offset in the file of the array of document numbers
                writeStreamInt(stream, longArrays.get(entry.getKey()).intValue());
            }
            else
            {
                List<Integer> documentNumbers = (List<Integer>) entry.getValue();
                if (documentNumbers.size() == 1)
                    writeStreamInt(stream, -documentNumbers.get(0));
                else
                    writeDocumentNumbers(documentNumbers, stream);
            }
        }
    }

    private void writeDocumentNumbers(List<Integer> documentNumbers, OutputStream stream) throws IOException
    {
        // must store length as a positive int to detect an in-lined array of 1 element
        int length = documentNumbers.size();
        writeStreamInt(stream, length);
        Collections.sort(documentNumbers);
        for (Integer docNumber : documentNumbers)
        {
            int value = docNumber.intValue();
            switch (this.documentReferenceSize)
            {
                case 1:
                    stream.write((byte) value);
                    this.streamEnd++;
                    break;
                case 2:
                    stream.write((byte) (value >> 8));
                    stream.write((byte) value);
                    this.streamEnd += 2;
                    break;
                default:
                    writeStreamInt(stream, value);
                    break;
            }
        }
        stream.flush();
    }
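    /**
     * Back-patches the int slot written right after the signature with the now-known offset of the header info, so a
     * reader can jump straight to the header.
     */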
    private void writeOffsetToHeader(int offsetToHeader) throws IOException
    {
        if (offsetToHeader > 0)
        {
            RandomAccessFile file = new RandomAccessFile(this.indexFile, "rw"); //$NON-NLS-1$
            try
            {
                file.seek(this.headerInfoOffset); // offset to position in header
                file.writeInt(offsetToHeader);
                this.headerInfoOffset = offsetToHeader; // update to reflect the correct offset
            }
            finally
            {
                file.close();
            }
        }
    }

    private void writeHeaderInfo(OutputStream stream) throws IOException
    {
        writeStreamInt(stream, this.numberOfChunks);
        stream.write((byte) this.sizeOfLastChunk);
        stream.write((byte) this.documentReferenceSize);
        stream.write((byte) this.separator);
        this.streamEnd += 3;

        // append the file with chunk offsets
        for (int i = 0; i < this.numberOfChunks; i++)
        {
            writeStreamInt(stream, this.chunkOffsets[i]);
        }

        writeStreamInt(stream, this.startOfCategoryTables);

        // append the file with the category offsets... # of name -> offset pairs, followed by each name & an offset
        // to its word->doc# table
        writeStreamInt(stream, this.categoryOffsets.size());
        for (Map.Entry<String, Integer> entry : categoryOffsets.entrySet())
        {
            writeString(stream, entry.getKey());
            writeStreamInt(stream, entry.getValue());
        }
        stream.flush();
    }
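    /*
     * Document names are written in chunks of CHUNK_SIZE. Within a chunk, each name after the first is delta-encoded
     * against its predecessor: one byte for the length of the shared prefix, one byte for the shared suffix, then only
     * the differing middle. E.g. after "/a/b/Foo.js", the name "/a/b/Bar.js" is written as 5, 3, "Bar".
     */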
    private void writeDocumentNames(OutputStream stream, List<String> sortedDocNames) throws IOException
    {
        writeString(stream, SIGNATURE);
        this.headerInfoOffset = this.streamEnd;
        writeStreamInt(stream, -1); // replaced by writeOffsetToHeader() once the header position is known

        int size = sortedDocNames.size();
        this.numberOfChunks = (size / CHUNK_SIZE) + 1;
        this.sizeOfLastChunk = size % CHUNK_SIZE;
        if (this.sizeOfLastChunk == 0)
        {
            this.numberOfChunks--;
            this.sizeOfLastChunk = CHUNK_SIZE;
        }
        // number of bytes used to encode a reference
        this.documentReferenceSize = size <= 0x7F ? 1 : (size <= 0x7FFF ? 2 : 4);

        this.chunkOffsets = new int[this.numberOfChunks];
        int lastIndex = this.numberOfChunks - 1;
        for (int i = 0; i < this.numberOfChunks; i++)
        {
            this.chunkOffsets[i] = this.streamEnd;

            int chunkSize = i == lastIndex ? this.sizeOfLastChunk : CHUNK_SIZE;
            int chunkIndex = i * CHUNK_SIZE;
            String current = sortedDocNames.get(chunkIndex);
            writeString(stream, current);
            for (int j = 1; j < chunkSize; j++)
            {
                String next = sortedDocNames.get(chunkIndex + j);
                int len1 = current.length();
                int len2 = next.length();
                int max = len1 < len2 ? len1 : len2;
                // number of identical characters at the beginning (also the index of first character that is
                // different)
                int start = 0;
                while (current.charAt(start) == next.charAt(start))
                {
                    start++;
                    if (max == start)
                        break; // current is 'abba', next is 'abbab'
                }
                if (start > 255)
                    start = 255;

                int end = 0; // number of identical characters at the end
                while (current.charAt(--len1) == next.charAt(--len2))
                {
                    end++;
                    if (len2 == start)
                        break; // current is 'abbba', next is 'abba'
                    if (len1 == 0)
                        break; // current is 'xabc', next is 'xyabc'
                }
                if (end > 255)
                    end = 255;

                stream.write((byte) start);
                stream.write((byte) end);
                this.streamEnd += 2;

                int last = next.length() - end;
                writeString(stream, (start < last ? next.substring(start, last) : ""));
                current = next;
            }
        }
        this.startOfCategoryTables = this.streamEnd + 1;
    }

    private void writeStreamInt(OutputStream stream, int val) throws IOException
    {
        stream.write((byte) (val >> 24));
        stream.write((byte) (val >> 16));
        stream.write((byte) (val >> 8));
        stream.write((byte) val);
        this.streamEnd += 4;
        stream.flush();
    }

    public Map<String, QueryResult> addQueryResults(String[] categories, String key, int matchRule,
            MemoryIndex memoryIndex) throws IOException
    {
        // assumes sender has called startQuery() & will call stopQuery() when finished
        if (this.categoryOffsets == null)
            return null; // file is empty

        Map<String, QueryResult> results = null; // initialized if needed
        if (key == null)
        {
            for (int i = 0, l = categories.length; i < l; i++)
            {
                // cache the table if key is null since it's a definite match
                Map<String, Object> wordsToDocNumbers = readCategoryTable(categories[i], true);
                if (wordsToDocNumbers != null)
                {
                    if (results == null)
                        results = new HashMap<String, QueryResult>(wordsToDocNumbers.size());
                    for (String word : wordsToDocNumbers.keySet())
                        if (word != null)
                            results = addQueryResult(results, word, wordsToDocNumbers, memoryIndex);
                }
            }
            if (results != null && this.cachedChunks == null)
                cacheDocumentNames();
        }
        else
        {
            switch (matchRule)
            {
                case SearchPattern.EXACT_MATCH | SearchPattern.CASE_SENSITIVE:
                    for (int i = 0, l = categories.length; i < l; i++)
                    {
                        Map<String, Object> wordsToDocNumbers = readCategoryTable(categories[i], false);
                        if (wordsToDocNumbers != null && wordsToDocNumbers.containsKey(key))
                            results = addQueryResult(results, key, wordsToDocNumbers, memoryIndex);
                    }
                    break;
                case SearchPattern.PREFIX_MATCH | SearchPattern.CASE_SENSITIVE:
                    for (int i = 0, l = categories.length; i < l; i++)
                    {
                        Map<String, Object> wordsToDocNumbers = readCategoryTable(categories[i], false);
                        if (wordsToDocNumbers != null)
                        {
                            for (String word : wordsToDocNumbers.keySet())
                            {
                                if (word != null && word.startsWith(key))
                                    results = addQueryResult(results, word, wordsToDocNumbers, memoryIndex);
                            }
                        }
                    }
                    break;
                default:
                    for (int i = 0, l = categories.length; i < l; i++)
                    {
                        Map<String, Object> wordsToDocNumbers = readCategoryTable(categories[i], false);
                        if (wordsToDocNumbers != null)
                        {
                            for (String word : wordsToDocNumbers.keySet())
                            {
                                if (word != null && Index.isMatch(key, word, matchRule))
                                    results = addQueryResult(results, word, wordsToDocNumbers, memoryIndex);
                            }
                        }
                    }
            }
        }
        return results;
    }

    private void cacheDocumentNames() throws IOException
    {
        // will need all document names so get them now
        this.cachedChunks = new String[this.numberOfChunks][];
        InputStream stream = new BufferedInputStream(new FileInputStream(this.indexFile));
        try
        {
            int offset = this.chunkOffsets[0];
            skip(stream, offset);
            for (int i = 0; i < this.numberOfChunks; i++)
            {
                int size = i == this.numberOfChunks - 1 ? this.sizeOfLastChunk : CHUNK_SIZE;
                readChunk(this.cachedChunks[i] = new String[size], stream, 0, size);
            }
        }
        catch (IOException e)
        {
            this.cachedChunks = null;
            throw e;
        }
        finally
        {
            stream.close();
        }
    }
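    /**
     * Records the documents that reference the given word. When a memory index is present, any document that also
     * appears there is skipped, since the in-memory state supersedes what is on disk.
     */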
    private Map<String, QueryResult> addQueryResult(Map<String, QueryResult> results, String word,
            Map<String, Object> wordsToDocNumbers, MemoryIndex memoryIndex) throws IOException
    {
        // must skip over documents which have been added/changed/deleted in the memory index
        if (results == null)
            results = new HashMap<String, QueryResult>(13);
        QueryResult result = results.get(word);
        if (memoryIndex == null)
        {
            if (result == null)
                results.put(word, new QueryResult(word, wordsToDocNumbers));
            else
                result.addDocumentTable(wordsToDocNumbers);
        }
        else
        {
            Map<String, Map<String, Set<String>>> docsToRefs = memoryIndex.getDocumentsToReferences();
            if (result == null)
                result = new QueryResult(word, null);
            List<Integer> docNumbers = readDocumentNumbers(wordsToDocNumbers.get(word));
            for (Integer docNumber : docNumbers)
            {
                String docName = readDocumentName(docNumber);
                if (!docsToRefs.containsKey(docName))
                    result.addDocumentName(docName);
            }
            if (!result.isEmpty())
                results.put(word, result);
        }
        return results;
    }

    synchronized String readDocumentName(int docNumber) throws IOException
    {
        if (this.cachedChunks == null)
            this.cachedChunks = new String[this.numberOfChunks][];

        int chunkNumber = docNumber / CHUNK_SIZE;
        String[] chunk = this.cachedChunks[chunkNumber];
        if (chunk == null)
        {
            boolean isLastChunk = chunkNumber == this.numberOfChunks - 1;
            int start = this.chunkOffsets[chunkNumber];
            int numberOfBytes = (isLastChunk ? this.startOfCategoryTables : this.chunkOffsets[chunkNumber + 1]) - start;
            if (numberOfBytes < 0)
                throw new IllegalArgumentException();
            InputStream file = new BufferedInputStream(new FileInputStream(this.indexFile));
            try
            {
                skip(file, start);
                int numberOfNames = isLastChunk ? this.sizeOfLastChunk : CHUNK_SIZE;
                chunk = new String[numberOfNames];
                readChunk(chunk, file, 0, numberOfNames);
            }
            finally
            {
                file.close();
            }
            this.cachedChunks[chunkNumber] = chunk;
        }
        return chunk[docNumber - (chunkNumber * CHUNK_SIZE)];
    }
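    /**
     * Decodes one chunk of document names; the mirror of the delta encoding in writeDocumentNames(). Each name after
     * the first is rebuilt from its predecessor's shared prefix and suffix plus the stored middle.
     */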
    private void readChunk(String[] docNames, InputStream stream, int index, int size) throws IOException
    {
        String current = readString(stream);
        docNames[index++] = current;
        for (int i = 1; i < size; i++)
        {
            int start = read(stream) & 0xFF;
            int end = read(stream) & 0xFF;
            String next = readString(stream);
            if (start > 0)
            {
                if (end > 0)
                {
                    int length = current.length();
                    next = current.substring(0, start) + next + current.substring(length - end, length);
                }
                else
                {
                    next = current.substring(0, start) + next;
                }
            }
            else if (end > 0)
            {
                int length = current.length();
                next = next + current.substring(length - end, length);
            }
            docNames[index++] = next;
            current = next;
        }
    }
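    /**
     * Reads a category's word -> document-number table from disk, caching it for later queries. Large document arrays
     * are initially represented by their Integer file offset; when readDocNumbers is true they are resolved into real
     * lists before the table is returned.
     */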
    private synchronized Map<String, Object> readCategoryTable(String categoryName, boolean readDocNumbers)
            throws IOException
    {
        // result will be null if categoryName is unknown
        Integer offset = this.categoryOffsets.get(categoryName);
        if (offset == null)
        {
            return null;
        }

        if (this.categoryTables == null)
        {
            this.categoryTables = new HashMap<String, Map<String, Object>>(3);
        }
        else
        {
            Map<String, Object> cachedTable = this.categoryTables.get(categoryName);
            if (cachedTable != null)
            {
                if (readDocNumbers)
                {
                    // must cache remaining document number arrays
                    Map<String, Object> copy = new HashMap<String, Object>(cachedTable);
                    for (Map.Entry<String, Object> entry : cachedTable.entrySet())
                    {
                        Object arrayOffset = entry.getValue();
                        if (arrayOffset instanceof Integer)
                            copy.put(entry.getKey(), readDocumentNumbers(arrayOffset));
                    }
                    cachedTable = copy;
                }
                return cachedTable;
            }
        }

        InputStream stream = new BufferedInputStream(new FileInputStream(this.indexFile));
        Map<String, Object> categoryTable = null;
        String[] matchingWords = null;
        int count = 0;
        int firstOffset = -1;
        try
        {
            skip(stream, offset);
            int size = readStreamInt(stream);
            try
            {
                if (size < 0)
                {
                    // DEBUG
                    System.err.println("-------------------- DEBUG --------------------"); //$NON-NLS-1$
                    System.err.println("file = " + this.indexFile); //$NON-NLS-1$
                    System.err.println("offset = " + offset); //$NON-NLS-1$
                    System.err.println("size = " + size); //$NON-NLS-1$
                    System.err.println("-------------------- END --------------------"); //$NON-NLS-1$
                }
                categoryTable = new HashMap<String, Object>(size);
            }
            catch (OutOfMemoryError oom)
            {
                // DEBUG
                oom.printStackTrace();
                System.err.println("-------------------- DEBUG --------------------"); //$NON-NLS-1$
                System.err.println("file = " + this.indexFile); //$NON-NLS-1$
                System.err.println("offset = " + offset); //$NON-NLS-1$
                System.err.println("size = " + size); //$NON-NLS-1$
                System.err.println("-------------------- END --------------------"); //$NON-NLS-1$
                throw oom;
            }
            int largeArraySize = 256;
            for (int i = 0; i < size; i++)
            {
                String word = readString(stream);
                int arrayOffset = readStreamInt(stream);
                // if arrayOffset is:
                // <= 0 then the array size == 1 with the value -> -arrayOffset
                // > 1 & < 256 then the size of the array is > 1 & < 256, the document array follows immediately
                // 256 if the array size >= 256 followed by another int which is the offset to the array (written
                // prior to the table)
                if (arrayOffset <= 0)
                {
                    List<Integer> positions = new ArrayList<Integer>();
                    positions.add(-arrayOffset); // store 1 element array by negating the document number
                    categoryTable.put(word, positions);
                }
                else if (arrayOffset < largeArraySize)
                {
                    // read in-lined array providing size
                    categoryTable.put(word, readStreamDocumentArray(stream, arrayOffset));
                }
                else
                {
                    arrayOffset = readStreamInt(stream); // read actual offset
                    if (readDocNumbers)
                    {
                        if (matchingWords == null)
                            matchingWords = new String[size];
                        if (count == 0)
                            firstOffset = arrayOffset;
                        matchingWords[count++] = word;
                    }
                    categoryTable.put(word, Integer.valueOf(arrayOffset)); // offset to array in the file
                }
            }
            this.categoryTables.put(categoryName, categoryTable);
            // cache the table as long as it's not too big
            // in practice, some tables can be greater than 500K when they contain more than 10K elements
            this.cachedCategoryName = categoryTable.size() < 20000 ? categoryName : null;
        }
        finally
        {
            stream.close();
        }

        if (matchingWords != null && count > 0)
        {
            stream = new BufferedInputStream(new FileInputStream(this.indexFile));
            try
            {
                skip(stream, firstOffset);
                for (int i = 0; i < count; i++)
                {
                    // each array follows the previous one
                    categoryTable.put(matchingWords[i], readStreamDocumentArray(stream, readStreamInt(stream)));
                }
            }
            finally
            {
                stream.close();
            }
        }
        return categoryTable;
    }

    @SuppressWarnings("unchecked")
    synchronized List<Integer> readDocumentNumbers(Object arrayOffset) throws IOException
    {
        // arrayOffset is either a cached array of docNumbers or an Integer offset in the file
        if (arrayOffset instanceof List<?>)
            return (List<Integer>) arrayOffset;

        InputStream stream = new BufferedInputStream(new FileInputStream(this.indexFile));
        try
        {
            int offset = ((Integer) arrayOffset).intValue();
            skip(stream, offset);
            return readStreamDocumentArray(stream, readStreamInt(stream));
        }
        finally
        {
            stream.close();
        }
    }

    private List<Integer> readStreamDocumentArray(InputStream stream, int arraySize) throws IOException
    {
        if (arraySize == 0)
            return Collections.emptyList();

        List<Integer> indexes = new ArrayList<Integer>(arraySize);
        for (int i = 0; i < arraySize; i++)
        {
            int value = 0;
            switch (this.documentReferenceSize)
            {
                case 1:
                    value = read(stream) & 0xFF;
                    break;
                case 2:
                    value = (read(stream) & 0xFF) << 8;
                    value = value + (read(stream) & 0xFF);
                    break;
                default:
                    value = readStreamInt(stream);
                    break;
            }
            indexes.add(value);
        }
        return indexes;
    }
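    /**
     * Computes the sorted list of document names for the merged index. positions[] maps each old on-disk position to
     * its new position, DELETED if the document was removed, or RE_INDEXED if it was replaced by the memory index;
     * indexedDocuments collects the new position of every new/changed document.
     */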
    private List<String> computeDocumentNames(List<String> onDiskNames, int[] positions,
            Map<String, Integer> indexedDocuments, MemoryIndex memoryIndex)
    {
        int onDiskLength = onDiskNames.size();
        Map<String, Map<String, Set<String>>> memIndexDocs = memoryIndex.getDocumentsToReferences();
        if (onDiskLength == 0)
        {
            // disk index was empty, so add every indexed document
            for (Map.Entry<String, Map<String, Set<String>>> entry : memIndexDocs.entrySet())
            {
                Map<String, Set<String>> refTable = entry.getValue();
                if (refTable != null)
                    indexedDocuments.put(entry.getKey(), null); // remember each new document
            }
            List<String> newDocNames = new ArrayList<String>(indexedDocuments.size());
            Set<String> added = indexedDocuments.keySet();
            for (String addedString : added)
                if (addedString != null)
                    newDocNames.add(addedString);
            Collections.sort(newDocNames);
            for (int i = 0, l = newDocNames.size(); i < l; i++)
                indexedDocuments.put(newDocNames.get(i), Integer.valueOf(i));
            return newDocNames;
        }

        // initialize positions as if each document will remain in the same position
        for (int i = 0; i < onDiskLength; i++)
            positions[i] = i;

        // find out if the memory index has any new or deleted documents; if not then the names & positions are the
        // same
        int numDeletedDocNames = 0;
        int numReindexedDocNames = 0;
        nextPath: for (Map.Entry<String, Map<String, Set<String>>> entry : memIndexDocs.entrySet())
        {
            String docName = entry.getKey();
            if (docName == null)
                continue;
            for (int j = 0; j < onDiskLength; j++)
            {
                if (docName.equals(onDiskNames.get(j)))
                {
                    if (entry.getValue() == null)
                    {
                        positions[j] = DELETED;
                        numDeletedDocNames++;
                    }
                    else
                    {
                        positions[j] = RE_INDEXED;
                        numReindexedDocNames++;
                    }
                    continue nextPath;
                }
            }
            // remember each new document, skip deleted documents which were never saved
            if (entry.getValue() != null)
                indexedDocuments.put(docName, null);
        }

        List<String> newDocNames = onDiskNames;
        if (numDeletedDocNames > 0 || indexedDocuments.size() > 0)
        {
            // some new documents have been added or some old ones deleted
            newDocNames = new ArrayList<String>(onDiskLength + indexedDocuments.size() - numDeletedDocNames);
            for (int i = 0; i < onDiskLength; i++)
                if (positions[i] >= RE_INDEXED)
                    newDocNames.add(onDiskNames.get(i)); // keep each unchanged document
            Set<String> added = indexedDocuments.keySet();
            for (String addedString : added)
                if (addedString != null)
                    newDocNames.add(addedString); // add each new document
            Collections.sort(newDocNames);
            for (int i = 0, l = newDocNames.size(); i < l; i++)
                if (indexedDocuments.containsKey(newDocNames.get(i)))
                    indexedDocuments.put(newDocNames.get(i), Integer.valueOf(i)); // remember the position for each
                                                                                  // new document
        }

        // need to be able to look up an old position (ref# from a ref[]) and map it to its new position
        // if its old position == DELETED then it's forgotten
        // if its old position == RE_INDEXED then it's also forgotten but its new position is needed to map references
        int count = -1;
        for (int i = 0; i < onDiskLength;)
        {
            switch (positions[i])
            {
                case DELETED:
                    i++; // skip over deleted... references are forgotten
                    break;
                case RE_INDEXED:
                    String newName = newDocNames.get(++count);
                    if (newName.equals(onDiskNames.get(i)))
                    {
                        // the reindexed docName that was at position i is now at position count
                        indexedDocuments.put(newName, Integer.valueOf(count));
                        i++;
                    }
                    break;
                default:
                    if (newDocNames.get(++count).equals(onDiskNames.get(i)))
                        positions[i++] = count; // the unchanged docName that was at position i is now at position
                                                // count
            }
        }
        return newDocNames;
    }

    public DiskIndex removeCategories(String[] categoryNames, MemoryIndex memoryIndex) throws IOException
    {
        // FIXME We need to wipe out the category from the file somehow! The problem is that we can drop a
        // document/file fairly easily by setting its mapping to null in the memory index and merging, but the way the
        // thing is set up we don't really have an easy way of wiping a category out from memory and disk right now.
        // int offset = this.categoryOffsets.get(categoryName);
        // // "Wipe out" from the category offset to the next category offset. However we need to handle the
        // // "long arrays" written just before the offsets!
        // RandomAccessFile file = new RandomAccessFile(this.indexFile, "rw");
        // file.seek(offset);
        this.categoriesToDiscard = categoryNames;
        DiskIndex newIndex = mergeWith(memoryIndex);
        newIndex.categoriesToDiscard = null;
        return newIndex;
    }
}