package org.apache.lucene.spelt;

/*
 * Copyright 2006-2007 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import org.apache.lucene.util.Hash64;
import org.apache.lucene.util.IntList;
import org.apache.lucene.util.LongList;

/**
 * A fast, simple, in-memory data structure for holding frequency data used
 * to produce spelling suggestions.
 *
 * @author Martin Haye
 */
class FreqData
{
  /** List of keys */
  private LongList keys = new LongList();

  /** One count per key */
  private IntList counts = new IntList();

  /** Tracks the section of the data that has been sorted. */
  private int sortTop = 0;

  /** Upper limit on the number of unsorted entries */
  private static final int MAX_UNSORTED = 1000000;

  /** Magic number stored in the file when data is written */
  static final long MAGIC_NUM = ((long)'P') << (7 * 8) |
                                ((long)'a') << (6 * 8) |
                                ((long)'i') << (5 * 8) |
                                ((long)'r') << (4 * 8) |
                                ((long)'F') << (3 * 8) |
                                ((long)'r') << (2 * 8) |
                                ((long)'q') << (1 * 8) |
                                ((long)'1') << (0 * 8);

  /** Add a count for a given word */
  public final void add(String word, int count) {
    add(Hash64.hash(word), count);
  }

  /** Add a count for a given word pair */
  public final void add(String word1, String word2, int count) {
    add(Hash64.hash(word1, word2), count);
  }

  /** Add a count for a given hash code */
  public void add(long hash, int count) {
    // Check within the sorted section to see if we already have this key.
    int pos = searchSorted(hash);
    if (pos >= 0) {
      counts.set(pos, counts.get(pos) + count);
      return;
    }

    // Gotta add a new entry.
    keys.add(hash);
    counts.add(count);

    // Every once in a while, sort and de-dupe to keep our memory footprint
    // reasonable.
    //
    if (keys.size() - sortTop > MAX_UNSORTED)
      sort();
  }

  /** Search within the sorted keys for the given one. */
  private int searchSorted(long hash) {
    int low = 0;
    int high = sortTop - 1;
    while (low <= high) {
      int mid = (low + high) >> 1;
      long probe = keys.get(mid);
      if (probe < hash)
        low = mid + 1;
      else if (probe > hash)
        high = mid - 1;
      else
        return mid;
    }
    return -1;
  }

  /** Get the count for a given word, or zero if not found */
  public final int get(String word) {
    return get(Hash64.hash(word));
  }

  /** Get the count for a given word pair, or zero if not found */
  public final int get(String word1, String word2) {
    return get(Hash64.hash(word1, word2));
  }

  /** Get the count for a given hash code, or zero if not found */
  public final int get(long hash) {
    // Before using binary search, ensure the data is sorted.
    sort();

    // See if we can locate the given hash code.
    int pos = keys.binarySearch(hash);
    if (pos < 0)
      return 0;

    // Got it!
    return counts.get(pos);
  }

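  // Illustrative usage sketch (not part of the original class): counts are
  // accumulated with add() and read back with get(). Word-pair counts are
  // hashed into the same 64-bit key space as single-word counts, so both
  // share the keys/counts lists. The file name below is hypothetical.
  //
  //   FreqData freqs = new FreqData();
  //   freqs.add("hello", 3);                // single-word count
  //   freqs.add("hello", "world", 1);       // word-pair count
  //   int n = freqs.get("hello");           // 3
  //   int m = freqs.get("hello", "world");  // 1
  //   freqs.save(new File("freqs.dat"));    // persist; reload via add(File)
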
  /**
   * Append sorted counts from a file previously written by
   * {@link #save(File)}.
   *
   * @param f File to load from
   * @throws IOException if anything goes wrong
   */
  public void add(File f) throws IOException {
    int prevSize = keys.size();

    // Open the file
    DataInputStream s = new DataInputStream(
        new BufferedInputStream(new FileInputStream(f)));
    try {
      // Check the magic number
      long magic = s.readLong();
      if (magic != MAGIC_NUM)
        throw new IOException("unrecognized format of frequency data");

      // Find out how many counts are stored, and prepare space for them.
      int numCounts = s.readInt();
      keys.ensureCapacity(keys.size() + numCounts);
      counts.ensureCapacity(keys.size() + numCounts);

      // Read each pair, verify ascending key order, and add it to our lists.
      long prevKey = -1;
      for (int i = 0; i < numCounts; i++) {
        // Read the data
        long key = s.readLong();
        int count = s.readInt();

        // Validate it
        if (key <= prevKey)
          throw new IOException(
              "freq data was not sorted correctly on disk, or file is corrupt");
        prevKey = key;
        if (count < 0)
          throw new IOException("frequency data file is corrupted");

        // And record it
        keys.add(key);
        counts.add(count);
      }

      // If we weren't appending, there's no need to re-sort.
      if (prevSize == 0)
        sortTop = keys.size();
    }
    finally {
      s.close();
    }
  }

  /**
   * Save sorted counts to a file. These can later be loaded by
   * {@link #add(File)}.
   *
   * @param f File to write to (existing contents are replaced)
   * @throws IOException if anything goes wrong
   */
  public void save(File f) throws IOException {
    // Make sure the data is in sorted order
    sort();

    // Open the file
    DataOutputStream s = new DataOutputStream(
        new BufferedOutputStream(new FileOutputStream(f)));

    // Write out the data
    try {
      s.writeLong(MAGIC_NUM);
      s.writeInt(keys.size());
      for (int i = 0; i < keys.size(); i++) {
        s.writeLong(keys.get(i));
        s.writeInt(counts.get(i));
      }
    }
    finally {
      s.close();
    }
  }

  /** If not already sorted, re-sort the data and merge duplicate keys. */
  private void sort() {
    // Already sorted, or no data? Forget it.
    if (sortTop == keys.size())
      return;

    // First step: sort both lists.
    final int[] map = keys.calcSortMap();
    keys.remap(map);
    counts.remap(map);

    // Now merge duplicates.
    long key = keys.get(0);
    int count = counts.get(0);
    int dp = 0;
    for (int sp = 1; sp < keys.size(); sp++) {
      final long nextKey = keys.get(sp);
      if (nextKey != key) {
        assert nextKey > key : "calcSortMap didn't work right";
        keys.set(dp, key);
        counts.set(dp, count);
        dp++;
        key = nextKey;
        count = 0;
      }
      count += counts.get(sp);
    }

    // Be sure to do the last one.
    keys.set(dp, key);
    counts.set(dp, count);
    dp++;

    // Chop off any unused space caused by merging.
    keys.resize(dp);
    counts.resize(dp);

    // Lastly, remember that we don't have to sort again.
    sortTop = keys.size();
  }
} // class FreqData