/*******************************************************************************
 * Copyright (c) 2000, 2007 IBM Corporation and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 *******************************************************************************/
package org.eclipse.dltk.core.search.index;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.regex.Pattern;

import org.eclipse.dltk.compiler.CharOperation;
import org.eclipse.dltk.compiler.util.HashtableOfIntValues;
import org.eclipse.dltk.compiler.util.HashtableOfObject;
import org.eclipse.dltk.compiler.util.SimpleLookupTable;
import org.eclipse.dltk.compiler.util.SimpleSet;
import org.eclipse.dltk.core.search.SearchPattern;
import org.eclipse.dltk.core.search.indexing.IIndexConstants;
import org.eclipse.dltk.internal.core.util.Messages;
import org.eclipse.dltk.internal.core.util.SimpleWordSet;
import org.eclipse.dltk.internal.core.util.Util;

/**
 * On-disk representation of a search index. The file layout, written by
 * {@link #writeAllDocumentNames}, {@link #writeCategoryTable} and
 * {@link #writeHeaderInfo}, is:
 * <ul>
 * <li>the {@link #SIGNATURE} UTF string plus an int offset to the header info
 * (written last via {@link #writeOffsetToHeader});</li>
 * <li>document names, compressed in chunks of {@link #CHUNK_SIZE} entries,
 * each name encoded as a delta against the previous one (shared prefix/suffix
 * lengths, see {@link #readChunk});</li>
 * <li>one table per category mapping words to document-number arrays;</li>
 * <li>the header info: chunk offsets and category-name -&gt; table-offset
 * pairs.</li>
 * </ul>
 * Queries are bracketed by {@link #startQuery()}/{@link #stopQuery()}, which
 * reference-count the in-memory caches ({@link #cachedChunks},
 * {@link #categoryTables}). Merging with a {@link MemoryIndex} produces a new
 * DiskIndex written to a temp file which then replaces this one.
 */
public class DiskIndex {
	String fileName;

	// offset in the file where the header info starts; -1 until known
	private int headerInfoOffset;
	private int numberOfChunks;
	private int sizeOfLastChunk;
	// file offset of each chunk of compressed document names
	private int[] chunkOffsets;
	private int documentReferenceSize; // 1, 2 or more bytes... depends on # of
										// document names
	private int startOfCategoryTables;
	// category name -> offset of its word table in the file
	private HashtableOfIntValues categoryOffsets;

	// number of active startQuery() callers; -1 when caches are cleared
	private int cacheUserCount;
	private String[][] cachedChunks; // decompressed chunks of document names
	private HashtableOfObject categoryTables; // category name ->
												// HashtableOfObject(words ->
												// int[] of document #'s) or
												// offset if not read yet
	// the one category table retained across queries by stopQuery()
	private char[] cachedCategoryName;

	char separator = Index.DEFAULT_SEPARATOR;

	public static final String SIGNATURE = "DLTK INDEX VERSION 1.014"; //$NON-NLS-1$
	public final static boolean DEBUG = false;

	// sentinel position values used while merging (see computeDocumentNames)
	private static final int RE_INDEXED = -1;
	private static final int DELETED = -2;

	private static final int CHUNK_SIZE = 100;

	/**
	 * Minimal growable int array used while accumulating document numbers for
	 * a word during a merge.
	 */
	static class IntList {
		int size;
		int[] elements;

		IntList(int[] elements) {
			this.elements = elements;
			this.size = elements.length;
		}

		void add(int newElement) {
			if (this.size == this.elements.length) {
				// grow by 3x, with a small floor
				int newSize = this.size * 3;
				if (newSize < 7)
					newSize = 7;
				System.arraycopy(this.elements, 0,
						this.elements = new int[newSize], 0, this.size);
			}
			this.elements[this.size++] = newElement;
		}

		int[] asArray() {
			int[] result = new int[this.size];
			System.arraycopy(this.elements, 0, result, 0, this.size);
			return result;
		}
	}

	DiskIndex(String fileName) {
		this.fileName = fileName;
		// clear cached items
		this.headerInfoOffset = -1;
		this.numberOfChunks = -1;
		this.sizeOfLastChunk = -1;
		this.chunkOffsets = null;
		this.documentReferenceSize = -1;
		this.cacheUserCount = -1;
		this.cachedChunks = null;
		this.categoryTables = null;
		this.cachedCategoryName = null;
		this.categoryOffsets = null;
	}

	/**
	 * Returns the set of document names in this index, optionally restricted
	 * to names starting with <code>substring</code>. Documents that also
	 * appear in <code>memoryIndex</code> (added/changed/deleted since the last
	 * save) are skipped, since the memory index holds their current state.
	 */
	SimpleSet addDocumentNames(String substring, MemoryIndex memoryIndex)
			throws IOException {
		// must skip over documents which have been added/changed/deleted in
		// the memory index
		String[] docNames = readAllDocumentNames();
		SimpleSet results = new SimpleSet(docNames.length);
		if (substring == null) {
			if (memoryIndex == null) {
				for (int i = 0, l = docNames.length; i < l; i++)
					results.add(docNames[i]);
			} else {
				SimpleLookupTable docsToRefs = memoryIndex.docsToReferences;
				for (int i = 0, l = docNames.length; i < l; i++) {
					String docName = docNames[i];
					if (!docsToRefs.containsKey(docName))
						results.add(docName);
				}
			}
		} else {
			if (memoryIndex == null) {
				for (int i = 0, l = docNames.length; i < l; i++)
					if (docNames[i].startsWith(substring, 0))
						results.add(docNames[i]);
			} else {
				SimpleLookupTable docsToRefs = memoryIndex.docsToReferences;
				for (int i = 0, l = docNames.length; i < l; i++) {
					String docName = docNames[i];
					if (docName.startsWith(substring, 0)
							&& !docsToRefs.containsKey(docName))
						results.add(docName);
				}
			}
		}
		return results;
	}

	/**
	 * Records <code>word</code> as a query match in <code>results</code>
	 * (creating the table on first use). When a memory index is present, the
	 * word's document numbers are resolved to names and any document also
	 * present in the memory index is skipped; an {@link EntryResult} with no
	 * surviving documents is not added.
	 */
	private HashtableOfObject addQueryResult(HashtableOfObject results,
			char[] word, HashtableOfObject wordsToDocNumbers,
			MemoryIndex memoryIndex) throws IOException {
		// must skip over documents which have been added/changed/deleted in
		// the memory index
		if (results == null)
			results = new HashtableOfObject(13);
		EntryResult result = (EntryResult) results.get(word);
		if (memoryIndex == null) {
			if (result == null)
				results.put(word, new EntryResult(word, wordsToDocNumbers));
			else
				result.addDocumentTable(wordsToDocNumbers);
		} else {
			SimpleLookupTable docsToRefs = memoryIndex.docsToReferences;
			if (result == null)
				result = new EntryResult(word, null);
			int[] docNumbers = readDocumentNumbers(wordsToDocNumbers.get(word));
			for (int i = 0, l = docNumbers.length; i < l; i++) {
				String docName = readDocumentName(docNumbers[i]);
				if (!docsToRefs.containsKey(docName))
					result.addDocumentName(docName);
			}
			if (!result.isEmpty())
				results.put(word, result);
		}
		return results;
	}

	/**
	 * Runs a query over the given categories. A null <code>key</code> matches
	 * every word; otherwise words are matched per <code>matchRule</code>
	 * (exact, prefix, regexp, or {@link Index#isMatch} for everything else).
	 * Returns null when nothing matched or the index file is empty.
	 */
	HashtableOfObject addQueryResults(char[][] categories, char[] key,
			int matchRule, MemoryIndex memoryIndex) throws IOException {
		// assumes sender has called startQuery() & will call stopQuery() when
		// finished
		if (this.categoryOffsets == null)
			return null; // file is empty
		HashtableOfObject results = null; // initialized if needed
		if (key == null) {
			for (int i = 0, l = categories.length; i < l; i++) {
				// cache if key is null since its a definite match
				HashtableOfObject wordsToDocNumbers = readCategoryTable(
						categories[i], true);
				if (wordsToDocNumbers != null) {
					char[][] words = wordsToDocNumbers.keyTable;
					if (results == null)
						results = new HashtableOfObject(
								wordsToDocNumbers.elementSize);
					for (int j = 0, m = words.length; j < m; j++)
						if (words[j] != null)
							results = addQueryResult(results, words[j],
									wordsToDocNumbers, memoryIndex);
				}
			}
			if (results != null && this.cachedChunks == null)
				cacheDocumentNames();
		} else {
			switch (matchRule) {
			case SearchPattern.R_EXACT_MATCH | SearchPattern.R_CASE_SENSITIVE:
				for (int i = 0, l = categories.length; i < l; i++) {
					HashtableOfObject wordsToDocNumbers = readCategoryTable(
							categories[i], false);
					if (wordsToDocNumbers != null
							&& wordsToDocNumbers.containsKey(key))
						results = addQueryResult(results, key,
								wordsToDocNumbers, memoryIndex);
				}
				break;
			case SearchPattern.R_PREFIX_MATCH | SearchPattern.R_CASE_SENSITIVE:
				for (int i = 0, l = categories.length; i < l; i++) {
					HashtableOfObject wordsToDocNumbers = readCategoryTable(
							categories[i], false);
					if (wordsToDocNumbers != null) {
						char[][] words = wordsToDocNumbers.keyTable;
						for (int j = 0, m = words.length; j < m; j++) {
							char[] word = words[j];
							// cheap first-char check before full prefix test
							if (word != null && key[0] == word[0]
									&& CharOperation.prefixEquals(key, word))
								results = addQueryResult(results, word,
										wordsToDocNumbers, memoryIndex);
						}
					}
				}
				break;
			case SearchPattern.R_REGEXP_MATCH:
			case SearchPattern.R_REGEXP_MATCH | SearchPattern.R_CASE_SENSITIVE:
				Pattern regexpPattern = Pattern
						.compile(
								new String(key),
								(matchRule & SearchPattern.R_CASE_SENSITIVE) == 0 ? Pattern.CASE_INSENSITIVE
										: 0);
				for (int i = 0, l = categories.length; i < l; i++) {
					HashtableOfObject wordsToDocNumbers = readCategoryTable(
							categories[i], false);
					if (wordsToDocNumbers != null) {
						char[][] words = wordsToDocNumbers.keyTable;
						for (int j = 0, m = words.length; j < m; j++) {
							char[] word = words[j];
							if (word != null) {
								// only match the part of the word before the
								// encoded separator, if one is present
								char decodedWord[];
								int sep = CharOperation.indexOf(
										IIndexConstants.SEPARATOR, word);
								if (sep > 0) {
									decodedWord = CharOperation.subarray(word,
											0, sep);
								} else {
									decodedWord = word;
								}
								if (regexpPattern.matcher(
										new String(decodedWord)).matches()) {
									results = addQueryResult(results, word,
											wordsToDocNumbers, memoryIndex);
								}
							}
						}
					}
				}
				break;
			default:
				for (int i = 0, l = categories.length; i < l; i++) {
					HashtableOfObject wordsToDocNumbers = readCategoryTable(
							categories[i], false);
					if (wordsToDocNumbers != null) {
						char[][] words = wordsToDocNumbers.keyTable;
						for (int j = 0, m = words.length; j < m; j++) {
							char[] word = words[j];
							if (word != null
									&& Index.isMatch(key, word, matchRule)) {
								results = addQueryResult(results, word,
										wordsToDocNumbers, memoryIndex);
							}
						}
					}
				}
			}
		}
		if (results == null)
			return null;
		return results;
	}

	/**
	 * Decompresses every document-name chunk from disk into
	 * {@link #cachedChunks}.
	 */
	private void cacheDocumentNames() throws IOException {
		// will need all document names so get them now
		this.cachedChunks = new String[this.numberOfChunks][];
		DataInputStream stream = new DataInputStream(new BufferedInputStream(
				new FileInputStream(getIndexFile()),
				this.numberOfChunks > 5 ? 4096 : 2048));
		try {
			stream.skip(this.chunkOffsets[0]);
			for (int i = 0; i < this.numberOfChunks; i++) {
				// only the last chunk may hold fewer than CHUNK_SIZE names
				int size = i == this.numberOfChunks - 1 ? this.sizeOfLastChunk
						: CHUNK_SIZE;
				readChunk(this.cachedChunks[i] = new String[size], stream, 0,
						size);
			}
		} finally {
			stream.close();
		}
	}

	/**
	 * Computes the sorted document-name list resulting from merging
	 * <code>memoryIndex</code> into the on-disk names. On return,
	 * <code>positions[i]</code> maps on-disk document #i to its new position
	 * (or DELETED / RE_INDEXED), and <code>indexedDocuments</code> maps each
	 * new/changed name to its new position.
	 */
	private String[] computeDocumentNames(String[] onDiskNames,
			int[] positions, SimpleLookupTable indexedDocuments,
			MemoryIndex memoryIndex) {
		int onDiskLength = onDiskNames.length;
		Object[] docNames = memoryIndex.docsToReferences.keyTable;
		Object[] referenceTables = memoryIndex.docsToReferences.valueTable;
		if (onDiskLength == 0) {
			// disk index was empty, so add every indexed document
			for (int i = 0, l = referenceTables.length; i < l; i++)
				if (referenceTables[i] != null)
					indexedDocuments.put(docNames[i], null); // remember each
																// new document
			String[] newDocNames = new String[indexedDocuments.elementSize];
			int count = 0;
			Object[] added = indexedDocuments.keyTable;
			for (int i = 0, l = added.length; i < l; i++)
				if (added[i] != null)
					newDocNames[count++] = (String) added[i];
			Util.sort(newDocNames);
			for (int i = 0, l = newDocNames.length; i < l; i++)
				indexedDocuments.put(newDocNames[i], i);
			return newDocNames;
		}
		// initialize positions as if each document will remain in the same
		// position
		for (int i = 0; i < onDiskLength; i++)
			positions[i] = i;
		// find out if the memory index has any new or deleted documents, if
		// not then the names & positions are the same
		int numDeletedDocNames = 0;
		// counted for diagnostics/symmetry; not otherwise read below
		int numReindexedDocNames = 0;
		nextPath: for (int i = 0, l = docNames.length; i < l; i++) {
			String docName = (String) docNames[i];
			if (docName != null) {
				for (int j = 0; j < onDiskLength; j++) {
					if (docName.equals(onDiskNames[j])) {
						// a null reference table means the document was
						// deleted; otherwise it was re-indexed
						if (referenceTables[i] == null) {
							positions[j] = DELETED;
							numDeletedDocNames++;
						} else {
							positions[j] = RE_INDEXED;
							numReindexedDocNames++;
						}
						continue nextPath;
					}
				}
				if (referenceTables[i] != null)
					indexedDocuments.put(docName, null); // remember each new
															// document, skip
															// deleted documents
															// which were never
															// saved
			}
		}
		String[] newDocNames = onDiskNames;
		if (numDeletedDocNames > 0 || indexedDocuments.elementSize > 0) {
			// some new documents have been added or some old ones deleted
			newDocNames = new String[onDiskLength
					+ indexedDocuments.elementSize - numDeletedDocNames];
			int count = 0;
			for (int i = 0; i < onDiskLength; i++)
				if (positions[i] >= RE_INDEXED)
					newDocNames[count++] = onDiskNames[i]; // keep each
															// unchanged
															// document
			Object[] added = indexedDocuments.keyTable;
			for (int i = 0, l = added.length; i < l; i++)
				if (added[i] != null)
					newDocNames[count++] = (String) added[i]; // add each new
																// document
			Util.sort(newDocNames);
			for (int i = 0, l = newDocNames.length; i < l; i++)
				if (indexedDocuments.containsKey(newDocNames[i]))
					indexedDocuments.put(newDocNames[i], i); // remember the
																// position for
																// each new
																// document
		}
		// need to be able to look up an old position (ref# from a ref[]) and
		// map it to its new position
		// if its old position == DELETED then its forgotten
		// if its old position == RE_INDEXED then its also forgotten but its
		// new position is needed to map references
		int count = -1;
		for (int i = 0; i < onDiskLength;) {
			switch (positions[i]) {
			case DELETED:
				i++; // skip over deleted... references are forgotten
				break;
			case RE_INDEXED:
				String newName = newDocNames[++count];
				if (newName.equals(onDiskNames[i])) {
					// the reindexed docName that was at position i is now at
					// position count
					indexedDocuments.put(newName, count);
					i++;
				}
				break;
			default:
				if (newDocNames[++count].equals(onDiskNames[i]))
					positions[i++] = count; // the unchanged docName that was
											// at position i is now at
											// position count
			}
		}
		return newDocNames;
	}

	/**
	 * Folds the per-document word sets of one new/changed document into
	 * {@link #categoryTables}, appending <code>newPosition</code> to each
	 * word's document-number list.
	 */
	private void copyQueryResults(HashtableOfObject categoryToWords,
			int newPosition) {
		char[][] categoryNames = categoryToWords.keyTable;
		Object[] wordSets = categoryToWords.valueTable;
		for (int i = 0, l = categoryNames.length; i < l; i++) {
			char[] categoryName = categoryNames[i];
			if (categoryName != null) {
				SimpleWordSet wordSet = (SimpleWordSet) wordSets[i];
				HashtableOfObject wordsToDocs = (HashtableOfObject) this.categoryTables
						.get(categoryName);
				if (wordsToDocs == null)
					this.categoryTables.put(categoryName,
							wordsToDocs = new HashtableOfObject(
									wordSet.elementSize));
				char[][] words = wordSet.words;
				for (int j = 0, m = words.length; j < m; j++) {
					char[] word = words[j];
					if (word != null) {
						// value is an int[] until a second document is added,
						// then it becomes a growable IntList
						Object o = wordsToDocs.get(word);
						if (o == null) {
							wordsToDocs.put(word, new int[] { newPosition });
						} else if (o instanceof IntList) {
							((IntList) o).add(newPosition);
						} else {
							IntList list = new IntList((int[]) o);
							list.add(newPosition);
							wordsToDocs.put(word, list);
						}
					}
				}
			}
		}
	}

	File getIndexFile() {
		if (this.fileName == null)
			return null;
		return new File(this.fileName);
	}

	/**
	 * Opens or (re)creates the index file. When reusing an existing file, the
	 * signature is verified and the header info is read; a signature mismatch
	 * raises an IOException. Otherwise any existing file is deleted and an
	 * empty index (signature + -1 header offset) is written.
	 */
	void initialize(boolean reuseExistingFile) throws IOException {
		File indexFile = getIndexFile();
		if (indexFile.exists()) {
			if (reuseExistingFile) {
				RandomAccessFile file = new RandomAccessFile(this.fileName,
						"r"); //$NON-NLS-1$
				try {
					String signature = file.readUTF();
					if (!signature.equals(SIGNATURE))
						throw new IOException(Messages.exception_wrongFormat);
					this.headerInfoOffset = file.readInt();
					if (this.headerInfoOffset > 0) // file is empty if its not
													// set
						readHeaderInfo(file);
				} finally {
					file.close();
				}
				return;
			}
			if (!indexFile.delete()) {
				if (DEBUG)
					System.out
							.println("initialize - Failed to delete index " + this.fileName); //$NON-NLS-1$
				throw new IOException(
						"Failed to delete index " + this.fileName); //$NON-NLS-1$
			}
		}
		if (indexFile.createNewFile()) {
			RandomAccessFile file = new RandomAccessFile(this.fileName, "rw"); //$NON-NLS-1$
			try {
				file.writeUTF(SIGNATURE);
				file.writeInt(-1); // file is empty
			} finally {
				file.close();
			}
		} else {
			if (DEBUG)
				System.out
						.println("initialize - Failed to create new index " + this.fileName); //$NON-NLS-1$
			throw new IOException(
					"Failed to create new index " + this.fileName); //$NON-NLS-1$
		}
	}

	/**
	 * Prepares this (temporary) index to receive the merge of
	 * <code>diskIndex</code>: recreates the temp file and sizes the category
	 * tables to match the source index.
	 */
	private void initializeFrom(DiskIndex diskIndex, File newIndexFile)
			throws IOException {
		// delete the temporary index file
		if (newIndexFile.exists() && !newIndexFile.delete()) {
			if (DEBUG)
				System.out
						.println("initializeFrom - Failed to delete temp index " + this.fileName); //$NON-NLS-1$
		} else if (!newIndexFile.createNewFile()) {
			if (DEBUG)
				System.out
						.println("initializeFrom - Failed to create temp index " + this.fileName); //$NON-NLS-1$
			throw new IOException(
					"Failed to create temp index " + this.fileName); //$NON-NLS-1$
		}
		int size = diskIndex.categoryOffsets == null ? 8
				: diskIndex.categoryOffsets.elementSize;
		this.categoryOffsets = new HashtableOfIntValues(size);
		this.categoryTables = new HashtableOfObject(size);
		this.separator = diskIndex.separator;
	}

	/**
	 * Writes every category table of the merged index: categories already
	 * collected from the memory index plus all categories present only on
	 * disk.
	 */
	private void mergeCategories(DiskIndex onDisk, int[] positions,
			DataOutputStream stream) throws IOException {
		// at this point, this.categoryTables contains the names -> wordsToDocs
		// added in copyQueryResults()
		char[][] oldNames = onDisk.categoryOffsets.keyTable;
		for (int i = 0, l = oldNames.length; i < l; i++) {
			char[] oldName = oldNames[i];
			if (oldName != null && !this.categoryTables.containsKey(oldName))
				this.categoryTables.put(oldName, null);
		}
		char[][] categoryNames = this.categoryTables.keyTable;
		for (int i = 0, l = categoryNames.length; i < l; i++)
			if (categoryNames[i] != null)
				mergeCategory(categoryNames[i], onDisk, positions, stream);
		this.categoryTables = null;
	}

	/**
	 * Merges one category: remaps the old on-disk document numbers through
	 * <code>positions</code> (dropping DELETED/RE_INDEXED references), folds
	 * them into the new table, and writes the result.
	 */
	private void mergeCategory(char[] categoryName, DiskIndex onDisk,
			int[] positions, DataOutputStream stream) throws IOException {
		HashtableOfObject wordsToDocs = (HashtableOfObject) this.categoryTables
				.get(categoryName);
		if (wordsToDocs == null)
			wordsToDocs = new HashtableOfObject(3);
		HashtableOfObject oldWordsToDocs = onDisk.readCategoryTable(
				categoryName, true);
		if (oldWordsToDocs != null) {
			char[][] oldWords = oldWordsToDocs.keyTable;
			Object[] oldArrayOffsets = oldWordsToDocs.valueTable;
			nextWord: for (int i = 0, l = oldWords.length; i < l; i++) {
				char[] oldWord = oldWords[i];
				if (oldWord != null) {
					int[] oldDocNumbers = (int[]) oldArrayOffsets[i];
					int length = oldDocNumbers.length;
					int[] mappedNumbers = new int[length];
					int count = 0;
					for (int j = 0; j < length; j++) {
						int pos = positions[oldDocNumbers[j]];
						if (pos > RE_INDEXED) // forget any reference to a
												// document which was deleted
												// or re_indexed
							mappedNumbers[count++] = pos;
					}
					if (count < length) {
						if (count == 0)
							continue nextWord; // skip words which no longer
												// have any references
						System.arraycopy(mappedNumbers, 0,
								mappedNumbers = new int[count], 0, count);
					}
					Object o = wordsToDocs.get(oldWord);
					if (o == null) {
						wordsToDocs.put(oldWord, mappedNumbers);
					} else {
						// word already has entries from the memory index;
						// append the surviving on-disk numbers
						IntList list = null;
						if (o instanceof IntList) {
							list = (IntList) o;
						} else {
							list = new IntList((int[]) o);
							wordsToDocs.put(oldWord, list);
						}
						for (int j = 0; j < count; j++)
							list.add(mappedNumbers[j]);
					}
				}
			}
			onDisk.categoryTables.put(categoryName, null); // flush cached
															// table
		}
		writeCategoryTable(categoryName, wordsToDocs, stream);
	}

	/**
	 * Merges this on-disk index with <code>memoryIndex</code> and returns the
	 * resulting DiskIndex. The merge is written to a ".tmp" sibling file which
	 * then atomically replaces this index's file (delete + rename). Returns
	 * <code>this</code> unchanged when there is nothing to merge. The caller
	 * is assumed to hold the index write lock.
	 */
	DiskIndex mergeWith(MemoryIndex memoryIndex) throws IOException {
		// assume write lock is held
		// compute & write out new docNames
		String[] docNames = readAllDocumentNames();
		int previousLength = docNames.length;
		// keeps track of the position of each document in the new sorted
		// docNames
		int[] positions = new int[previousLength];
		// for each new/changed document in the memoryIndex
		SimpleLookupTable indexedDocuments = new SimpleLookupTable(3);
		docNames = computeDocumentNames(docNames, positions, indexedDocuments,
				memoryIndex);
		if (docNames.length == 0) {
			if (previousLength == 0)
				return this; // nothing to do... memory index contained
								// deleted documents that had never been saved
			// index is now empty since all the saved documents were removed
			DiskIndex newDiskIndex = new DiskIndex(this.fileName);
			newDiskIndex.initialize(false);
			return newDiskIndex;
		}
		DiskIndex newDiskIndex = new DiskIndex(this.fileName + ".tmp"); //$NON-NLS-1$
		File newIndexFile = newDiskIndex.getIndexFile();
		try {
			newDiskIndex.initializeFrom(this, newIndexFile);
			DataOutputStream stream = new DataOutputStream(
					new BufferedOutputStream(new FileOutputStream(
							newIndexFile, false), 2048));
			int offsetToHeader = -1;
			try {
				newDiskIndex.writeAllDocumentNames(docNames, stream);
				docNames = null; // free up the space
				// add each new/changed doc to empty category tables using its
				// new position #
				if (indexedDocuments.elementSize > 0) {
					Object[] names = indexedDocuments.keyTable;
					Object[] integerPositions = indexedDocuments.valueTable;
					for (int i = 0, l = names.length; i < l; i++)
						if (names[i] != null)
							newDiskIndex
									.copyQueryResults(
											(HashtableOfObject) memoryIndex.docsToReferences
													.get(names[i]),
											((Integer) integerPositions[i])
													.intValue());
				}
				indexedDocuments = null; // free up the space
				// merge each category table with the new ones & write them out
				if (previousLength == 0)
					newDiskIndex.writeCategories(stream);
				else
					newDiskIndex.mergeCategories(this, positions, stream);
				offsetToHeader = stream.size();
				newDiskIndex.writeHeaderInfo(stream);
				positions = null; // free up the space
			} finally {
				stream.close();
			}
			newDiskIndex.writeOffsetToHeader(offsetToHeader);
			// rename file by deleting previous index file & renaming temp one
			File old = getIndexFile();
			if (old.exists() && !old.delete()) {
				if (DEBUG)
					System.out
							.println("mergeWith - Failed to delete " + this.fileName); //$NON-NLS-1$
				throw new IOException(
						"Failed to delete index file " + this.fileName); //$NON-NLS-1$
			}
			if (!newIndexFile.renameTo(old)) {
				if (DEBUG)
					System.out
							.println("mergeWith - Failed to rename " + this.fileName); //$NON-NLS-1$
				throw new IOException(
						"Failed to rename index file " + this.fileName); //$NON-NLS-1$
			}
		} catch (IOException e) {
			// best effort cleanup of the temp file before rethrowing
			if (newIndexFile.exists() && !newIndexFile.delete())
				if (DEBUG)
					System.out
							.println("mergeWith - Failed to delete temp index " + newDiskIndex.fileName); //$NON-NLS-1$
			throw e;
		}
		newDiskIndex.fileName = this.fileName;
		return newDiskIndex;
	}

	/**
	 * Reads and decompresses every document name from disk, in chunk order.
	 * Returns an empty array when the index holds no documents.
	 */
	private synchronized String[] readAllDocumentNames() throws IOException {
		if (this.numberOfChunks <= 0)
			return new String[0];
		DataInputStream stream = new DataInputStream(new BufferedInputStream(
				new FileInputStream(getIndexFile()),
				this.numberOfChunks > 5 ? 4096 : 2048));
		try {
			stream.skip(this.chunkOffsets[0]);
			int lastIndex = this.numberOfChunks - 1;
			String[] docNames = new String[lastIndex * CHUNK_SIZE
					+ sizeOfLastChunk];
			for (int i = 0; i < this.numberOfChunks; i++)
				readChunk(docNames, stream, i * CHUNK_SIZE,
						i < lastIndex ? CHUNK_SIZE : sizeOfLastChunk);
			return docNames;
		} finally {
			stream.close();
		}
	}

	/**
	 * Reads the word table for <code>categoryName</code> from disk (or the
	 * cache). Values are int[] document-number arrays; large arrays (&gt;=
	 * 256 entries) are stored at a separate file offset and are only resolved
	 * when <code>readDocNumbers</code> is true, otherwise the Integer offset
	 * is kept as the value. Returns null for an unknown category.
	 */
	private synchronized HashtableOfObject readCategoryTable(
			char[] categoryName, boolean readDocNumbers) throws IOException {
		// result will be null if categoryName is unknown
		int offset = this.categoryOffsets.get(categoryName);
		if (offset == HashtableOfIntValues.NO_VALUE)
			return null;
		if (this.categoryTables == null) {
			this.categoryTables = new HashtableOfObject(3);
		} else {
			HashtableOfObject cachedTable = (HashtableOfObject) this.categoryTables
					.get(categoryName);
			if (cachedTable != null) {
				if (readDocNumbers) { // must cache remaining document number
										// arrays
					Object[] arrayOffsets = cachedTable.valueTable;
					for (int i = 0, l = arrayOffsets.length; i < l; i++)
						if (arrayOffsets[i] instanceof Integer)
							arrayOffsets[i] = readDocumentNumbers(arrayOffsets[i]);
				}
				return cachedTable;
			}
		}
		DataInputStream stream = new DataInputStream(new BufferedInputStream(
				new FileInputStream(getIndexFile()), 2048));
		HashtableOfObject categoryTable = null;
		char[][] matchingWords = null;
		int count = 0;
		int firstOffset = -1;
		try {
			stream.skip(offset);
			int size = stream.readInt();
			try {
				if (size < 0) { // DEBUG
					System.err
							.println("-------------------- DEBUG --------------------"); //$NON-NLS-1$
					System.err.println("file = " + getIndexFile()); //$NON-NLS-1$
					System.err.println("offset = " + offset); //$NON-NLS-1$
					System.err.println("size = " + size); //$NON-NLS-1$
					System.err
							.println("-------------------- END --------------------"); //$NON-NLS-1$
				}
				categoryTable = new HashtableOfObject(size);
			} catch (OutOfMemoryError oom) {
				// DEBUG
				oom.printStackTrace();
				System.err
						.println("-------------------- DEBUG --------------------"); //$NON-NLS-1$
				System.err.println("file = " + getIndexFile()); //$NON-NLS-1$
				System.err.println("offset = " + offset); //$NON-NLS-1$
				System.err.println("size = " + size); //$NON-NLS-1$
				System.err
						.println("-------------------- END --------------------"); //$NON-NLS-1$
				throw oom;
			}
			int largeArraySize = 256;
			for (int i = 0; i < size; i++) {
				char[] word = Util.readUTF(stream);
				int arrayOffset = stream.readInt();
				// if arrayOffset is:
				// <= 0 then the array size == 1 with the value -> -arrayOffset
				// > 1 & < 256 then the size of the array is > 1 & < 256, the
				// document array follows immediately
				// 256 if the array size >= 256 followed by another int which
				// is the offset to the array (written prior to the table)
				if (arrayOffset <= 0) {
					categoryTable.put(word, new int[] { -arrayOffset }); // store
																			// 1
																			// element
																			// array
																			// by
																			// negating
																			// documentNumber
				} else if (arrayOffset < largeArraySize) {
					categoryTable.put(word, readDocumentArray(stream,
							arrayOffset)); // read in-lined array providing
											// size
				} else {
					arrayOffset = stream.readInt(); // read actual offset
					if (readDocNumbers) {
						if (matchingWords == null)
							matchingWords = new char[size][];
						if (count == 0)
							firstOffset = arrayOffset;
						matchingWords[count++] = word;
					}
					categoryTable.put(word, arrayOffset); // offset to array
															// in the file
				}
			}
			this.categoryTables.put(categoryName, categoryTable);
			// cache the table as long as its not too big
			// in practice, some tables can be greater than 500K when they
			// contain more than 10K elements
			this.cachedCategoryName = categoryTable.elementSize < 10000 ? categoryName
					: null;
		} finally {
			stream.close();
		}
		if (matchingWords != null && count > 0) {
			stream = new DataInputStream(new BufferedInputStream(
					new FileInputStream(getIndexFile()), 2048));
			try {
				stream.skip(firstOffset);
				for (int i = 0; i < count; i++)
					// each array follows the previous one
					categoryTable.put(matchingWords[i], readDocumentArray(
							stream, stream.readInt()));
			} finally {
				stream.close();
			}
		}
		return categoryTable;
	}

	/**
	 * Decompresses <code>size</code> front-coded document names from
	 * <code>stream</code> into <code>docNames</code> starting at
	 * <code>index</code>. Each name after the first is stored as the lengths
	 * of the prefix/suffix it shares with the previous name plus the differing
	 * middle part.
	 */
	private void readChunk(String[] docNames, DataInputStream stream,
			int index, int size) throws IOException {
		String current = stream.readUTF();
		docNames[index++] = current;
		for (int i = 1; i < size; i++) {
			int start = stream.readUnsignedByte(); // number of identical
													// characters at the
													// beginning
			int end = stream.readUnsignedByte(); // number of identical
													// characters at the end
			String next = stream.readUTF();
			if (start > 0) {
				if (end > 0) {
					int length = current.length();
					next = current.substring(0, start) + next
							+ current.substring(length - end, length);
				} else {
					next = current.substring(0, start) + next;
				}
			} else if (end > 0) {
				int length = current.length();
				next = next + current.substring(length - end, length);
			}
			docNames[index++] = next;
			current = next;
		}
	}

	/**
	 * Reads <code>arraySize</code> document numbers, each encoded with
	 * {@link #documentReferenceSize} bytes (1, 2, or 4).
	 */
	private int[] readDocumentArray(DataInputStream stream, int arraySize)
			throws IOException {
		int[] result = new int[arraySize];
		switch (this.documentReferenceSize) {
		case 1:
			for (int i = 0; i < arraySize; i++)
				result[i] = stream.readUnsignedByte();
			break;
		case 2:
			for (int i = 0; i < arraySize; i++)
				result[i] = stream.readUnsignedShort();
			break;
		default:
			for (int i = 0; i < arraySize; i++)
				result[i] = stream.readInt();
			break;
		}
		return result;
	}

	/**
	 * Returns the name of document #<code>docNumber</code>, loading and
	 * caching just the containing chunk on a miss.
	 */
	synchronized String readDocumentName(int docNumber) throws IOException {
		if (this.cachedChunks == null)
			this.cachedChunks = new String[this.numberOfChunks][];
		int chunkNumber = docNumber / CHUNK_SIZE;
		String[] chunk = this.cachedChunks[chunkNumber];
		if (chunk == null) {
			// read the chunk's raw bytes in one shot, then decompress from
			// memory
			boolean isLastChunk = chunkNumber == this.numberOfChunks - 1;
			int start = this.chunkOffsets[chunkNumber];
			int numberOfBytes = (isLastChunk ? this.startOfCategoryTables
					: this.chunkOffsets[chunkNumber + 1])
					- start;
			if (numberOfBytes < 0)
				throw new IllegalArgumentException();
			byte[] bytes = new byte[numberOfBytes];
			FileInputStream file = new FileInputStream(getIndexFile());
			try {
				file.skip(start);
				if (file.read(bytes, 0, numberOfBytes) != numberOfBytes)
					throw new IOException();
			} finally {
				file.close();
			}
			DataInputStream stream = new DataInputStream(
					new ByteArrayInputStream(bytes));
			int numberOfNames = isLastChunk ? this.sizeOfLastChunk
					: CHUNK_SIZE;
			chunk = this.cachedChunks[chunkNumber] = new String[numberOfNames];
			readChunk(chunk, stream, 0, numberOfNames);
		}
		return chunk[docNumber - (chunkNumber * CHUNK_SIZE)];
	}

	/**
	 * Resolves a category-table value to its int[] of document numbers,
	 * reading from the given file offset when the value is still an Integer.
	 */
	synchronized int[] readDocumentNumbers(Object arrayOffset)
			throws IOException {
		// arrayOffset is either a cached array of docNumbers or an Integer
		// offset in the file
		if (arrayOffset instanceof int[])
			return (int[]) arrayOffset;
		DataInputStream stream = new DataInputStream(new BufferedInputStream(
				new FileInputStream(getIndexFile()), 2048));
		try {
			stream.skip(((Integer) arrayOffset).intValue());
			return readDocumentArray(stream, stream.readInt());
		} finally {
			stream.close();
		}
	}

	/**
	 * Reads the header info from {@link #headerInfoOffset}: chunk layout,
	 * reference size, separator, and the category-name -&gt; table-offset map.
	 */
	private void readHeaderInfo(RandomAccessFile file) throws IOException {
		file.seek(this.headerInfoOffset);
		// must be same order as writeHeaderInfo()
		this.numberOfChunks = file.readInt();
		this.sizeOfLastChunk = file.readUnsignedByte();
		this.documentReferenceSize = file.readUnsignedByte();
		this.separator = (char) file.readUnsignedByte();
		this.chunkOffsets = new int[this.numberOfChunks];
		for (int i = 0; i < this.numberOfChunks; i++)
			this.chunkOffsets[i] = file.readInt();
		this.startOfCategoryTables = file.readInt();
		int size = file.readInt();
		this.categoryOffsets = new HashtableOfIntValues(size);
		for (int i = 0; i < size; i++)
			this.categoryOffsets.put(Util.readUTF(file), file.readInt()); // cache
																			// offset
																			// to
																			// category
																			// table
		this.categoryTables = new HashtableOfObject(3);
	}

	/** Increments the query reference count; pairs with stopQuery(). */
	synchronized void startQuery() {
		this.cacheUserCount++;
	}

	/**
	 * Decrements the query reference count; when the last query finishes, the
	 * caches are released except for the single small category table named by
	 * {@link #cachedCategoryName}.
	 */
	synchronized void stopQuery() {
		if (--this.cacheUserCount < 0) {
			// clear cached items
			this.cacheUserCount = -1;
			this.cachedChunks = null;
			if (this.categoryTables != null) {
				if (this.cachedCategoryName == null) {
					this.categoryTables = null;
				} else if (this.categoryTables.elementSize > 1) {
					// keep only the cached category's table
					HashtableOfObject newTables = new HashtableOfObject(3);
					newTables.put(this.cachedCategoryName,
							this.categoryTables.get(this.cachedCategoryName));
					this.categoryTables = newTables;
				}
			}
		}
	}

	/**
	 * Writes the signature, a header-offset placeholder, and all document
	 * names front-coded in chunks of CHUNK_SIZE, recording chunk offsets and
	 * choosing {@link #documentReferenceSize} from the document count.
	 * <code>sortedDocNames</code> must be non-empty and sorted.
	 */
	private void writeAllDocumentNames(String[] sortedDocNames,
			DataOutputStream stream) throws IOException {
		if (sortedDocNames.length == 0)
			throw new IllegalArgumentException();
		// assume the file was just created by initializeFrom()
		// in order, write: SIGNATURE & headerInfoOffset place holder, then
		// each compressed chunk of document names
		stream.writeUTF(SIGNATURE);
		this.headerInfoOffset = stream.size();
		stream.writeInt(-1); // will overwrite with correct value later
		int size = sortedDocNames.length;
		this.numberOfChunks = (size / CHUNK_SIZE) + 1;
		this.sizeOfLastChunk = size % CHUNK_SIZE;
		if (this.sizeOfLastChunk == 0) {
			this.numberOfChunks--;
			this.sizeOfLastChunk = CHUNK_SIZE;
		}
		// number of bytes used to encode a reference
		this.documentReferenceSize = size <= 0x7F ? 1
				: (size <= 0x7FFF ? 2 : 4);
		this.chunkOffsets = new int[this.numberOfChunks];
		int lastIndex = this.numberOfChunks - 1;
		for (int i = 0; i < this.numberOfChunks; i++) {
			this.chunkOffsets[i] = stream.size();
			int chunkSize = i == lastIndex ? this.sizeOfLastChunk
					: CHUNK_SIZE;
			int chunkIndex = i * CHUNK_SIZE;
			String current = sortedDocNames[chunkIndex];
			stream.writeUTF(current);
			for (int j = 1; j < chunkSize; j++) {
				String next = sortedDocNames[chunkIndex + j];
				int len1 = current.length();
				int len2 = next.length();
				int max = len1 < len2 ? len1 : len2;
				int start = 0; // number of identical characters at the
								// beginning (also the index of first
								// character that is different)
				while (current.charAt(start) == next.charAt(start)) {
					start++;
					if (max == start)
						break; // current is 'abba', next is 'abbab'
				}
				if (start > 255)
					start = 255; // prefix length is stored in a single byte
				int end = 0; // number of identical characters at the end
				while (current.charAt(--len1) == next.charAt(--len2)) {
					end++;
					if (len2 == start)
						break; // current is 'abbba', next is 'abba'
					if (len1 == 0)
						break; // current is 'xabc', next is 'xyabc'
				}
				if (end > 255)
					end = 255; // suffix length is stored in a single byte
				stream.writeByte(start);
				stream.writeByte(end);
				int last = next.length() - end;
				stream.writeUTF(start < last ? next.substring(start, last)
						: ""); //$NON-NLS-1$
				current = next;
			}
		}
		this.startOfCategoryTables = stream.size() + 1;
	}

	/**
	 * Writes every cached category table (used when the previous disk index
	 * was empty, so there is nothing to merge).
	 */
	private void writeCategories(DataOutputStream stream) throws IOException {
		char[][] categoryNames = this.categoryTables.keyTable;
		Object[] tables = this.categoryTables.valueTable;
		for (int i = 0, l = categoryNames.length; i < l; i++)
			if (categoryNames[i] != null)
				writeCategoryTable(categoryNames[i],
						(HashtableOfObject) tables[i], stream);
		this.categoryTables = null;
	}

	/**
	 * Writes one category's word table and records its offset in
	 * {@link #categoryOffsets}.
	 */
	private void writeCategoryTable(char[] categoryName,
			HashtableOfObject wordsToDocs, DataOutputStream stream)
			throws IOException {
		// the format of a category table is as follows:
		// any document number arrays with >= 256 elements are written before
		// the table (the offset to each array is remembered)
		// then the number of word->int[] pairs in the table is written
		// for each word -> int[] pair, the word is written followed by:
		// an int <= 0 if the array size == 1
		// an int > 1 & < 256 for the size of the array if its > 1 & < 256,
		// the document array follows immediately
		// 256 if the array size >= 256 followed by another int which is the
		// offset to the array (written prior to the table)
		int largeArraySize = 256;
		Object[] values = wordsToDocs.valueTable;
		for (int i = 0, l = values.length; i < l; i++) {
			Object o = values[i];
			if (o != null) {
				if (o instanceof IntList)
					o = values[i] = ((IntList) values[i]).asArray();
				int[] documentNumbers = (int[]) o;
				if (documentNumbers.length >= largeArraySize) {
					// large array: write it now, keep its offset as the value
					values[i] = stream.size();
					writeDocumentNumbers(documentNumbers, stream);
				}
			}
		}
		this.categoryOffsets.put(categoryName, stream.size()); // remember the
																// offset to
																// the start of
																// the table
		this.categoryTables.put(categoryName, null); // flush cached table
		stream.writeInt(wordsToDocs.elementSize);
		char[][] words = wordsToDocs.keyTable;
		for (int i = 0, l = words.length; i < l; i++) {
			Object o = values[i];
			if (o != null) {
				Util.writeUTF(stream, words[i]);
				if (o instanceof int[]) {
					int[] documentNumbers = (int[]) o;
					if (documentNumbers.length == 1)
						stream.writeInt(-documentNumbers[0]); // store an
																// array of 1
																// element by
																// negating the
																// documentNumber
																// (can be
																// zero)
					else
						writeDocumentNumbers(documentNumbers, stream);
				} else {
					stream.writeInt(largeArraySize); // mark to identify that
														// an offset follows
					stream.writeInt(((Integer) o).intValue()); // offset in
																// the file of
																// the array of
																// document
																// numbers
				}
			}
		}
	}

	/**
	 * Sorts then writes a document-number array, prefixed by its length, each
	 * number encoded with {@link #documentReferenceSize} bytes.
	 */
	private void writeDocumentNumbers(int[] documentNumbers,
			DataOutputStream stream) throws IOException {
		// must store length as a positive int to detect in-lined array of 1
		// element
		int length = documentNumbers.length;
		stream.writeInt(length);
		Util.sort(documentNumbers);
		switch (this.documentReferenceSize) {
		case 1:
			for (int i = 0; i < length; i++)
				stream.writeByte(documentNumbers[i]);
			break;
		case 2:
			for (int i = 0; i < length; i++)
				stream.writeShort(documentNumbers[i]);
			break;
		default:
			for (int i = 0; i < length; i++)
				stream.writeInt(documentNumbers[i]);
			break;
		}
	}

	/**
	 * Writes the header info; must mirror the read order of readHeaderInfo().
	 */
	private void writeHeaderInfo(DataOutputStream stream) throws IOException {
		stream.writeInt(this.numberOfChunks);
		stream.writeByte(this.sizeOfLastChunk);
		stream.writeByte(this.documentReferenceSize);
		stream.writeByte(this.separator);
		// append the file with chunk offsets
		for (int i = 0; i < this.numberOfChunks; i++)
			stream.writeInt(this.chunkOffsets[i]);
		stream.writeInt(this.startOfCategoryTables);
		// append the file with the category offsets... # of name -> offset
		// pairs, followed by each name & an offset to its word->doc# table
		stream.writeInt(this.categoryOffsets.elementSize);
		char[][] categoryNames = this.categoryOffsets.keyTable;
		int[] offsets = this.categoryOffsets.valueTable;
		for (int i = 0, l = categoryNames.length; i < l; i++) {
			if (categoryNames[i] != null) {
				Util.writeUTF(stream, categoryNames[i]);
				stream.writeInt(offsets[i]);
			}
		}
	}

	/**
	 * Overwrites the placeholder written by writeAllDocumentNames() with the
	 * real header offset, making the new file valid for readers.
	 */
	private void writeOffsetToHeader(int offsetToHeader) throws IOException {
		if (offsetToHeader > 0) {
			RandomAccessFile file = new RandomAccessFile(this.fileName, "rw"); //$NON-NLS-1$
			try {
				file.seek(this.headerInfoOffset); // offset to position in
													// header
				file.writeInt(offsetToHeader);
				this.headerInfoOffset = offsetToHeader; // update to reflect
														// the correct offset
			} finally {
				file.close();
			}
		}
	}
}