package org.cdlib.xtf.util;
/**
* Copyright (c) 2004, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import org.apache.lucene.util.Prime;
/**
 * Creates a persistent string-to-byte-buffer hash table on disk, optimized
 * for quick access. It can be read back later using a
 * {@link DiskHashReader}.
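 *
 * <p>A minimal usage sketch, assuming an open {@link StructuredFile} named
 * <code>structuredFile</code>; the sub-store name "myHash" and the values
 * written are purely illustrative:</p>
 * <pre>
 *   DiskHashWriter writer = new DiskHashWriter();
 *   PackedByteBuf buf = new PackedByteBuf(20);
 *   buf.writeInt(42);
 *   buf.writeString("hello");
 *   writer.put("someKey", buf);
 *   writer.outputTo(structuredFile.createSubStore("myHash"));
 *   // Later: new DiskHashReader(structuredFile.openSubStore("myHash"))
 * </pre>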
*
* @author Martin Haye
*/
public class DiskHashWriter
{
/** Keeps track of entries in memory until we're ready to write to disk */
private HashMap memMap = new HashMap(100);
  /**
   * Add a new key/value pair to the hash. The value buffer is cloned, so
   * the caller may safely reset and reuse it after this call.
   */
public void put(String key, PackedByteBuf val)
{
    // We don't allow zero-length keys, because the empty string is
    // reserved as the end-of-slot marker when the hash is written out.
    //
if (key.length() == 0)
key = " ";
PackedByteBuf cloned = (PackedByteBuf) val.clone();
cloned.doNotCompress(); // We're going to copy to another buffer, so avoid compressing
memMap.put(key, cloned);
} // put()
  /** Writes out the entire hash to the given sub-store, then closes it. */
public void outputTo(SubStoreWriter out)
throws IOException
{
    // Calculate a good size for the hash table. We want plenty of empty
    // slots to avoid excessive collisions.
    //
int nItems = memMap.size();
int hashSize = Prime.findAfter(nItems * 2);
// Throw all the added entries into the hash.
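    // Each slot accumulates a sequence of (key, value) pairs; an empty
    // string is appended below to mark the end of the slot's data.
    //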
PackedByteBuf[] slots = new PackedByteBuf[hashSize];
int maxSlotSize = 0;
for (Iterator iter = memMap.keySet().iterator(); iter.hasNext();)
{
String key = (String)iter.next();
PackedByteBuf val = (PackedByteBuf)memMap.get(key);
// Add it to the correct slot.
int slotNum = (key.hashCode() & 0xffffff) % hashSize;
if (slots[slotNum] == null)
slots[slotNum] = new PackedByteBuf(val.length() + key.length() + 5);
slots[slotNum].writeString(key);
slots[slotNum].writeBuffer(val);
}
// Finish all the slots.
for (int i = 0; i < hashSize; i++)
{
if (slots[i] == null)
continue;
slots[i].writeString(""); // Marks end of slot
maxSlotSize = Math.max(maxSlotSize, slots[i].length());
}
// Now write the header and the slot offsets.
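    // The on-disk layout (mirrored by DiskHashReader) is: the 4-byte magic
    // string "hash", the number of slots, and the maximum slot size,
    // followed by one 4-byte offset per slot, and finally the packed slot
    // data itself.
    //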
out.write("hash".getBytes());
out.writeInt(hashSize);
out.writeInt(maxSlotSize);
assert DiskHashReader.headerSize == (int)out.length();
int startOffset = (int)out.length() + (hashSize * 4);
int curOffset = startOffset;
for (int i = 0; i < hashSize; i++)
{
if (slots[i] == null) {
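        // A zero offset denotes an empty slot.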
out.writeInt(0);
continue;
}
out.writeInt(curOffset);
curOffset += slots[i].length();
assert slots[i].length() <= maxSlotSize;
} // for i
assert out.length() == startOffset;
// Finally, write all the data.
for (int i = 0; i < hashSize; i++) {
if (slots[i] == null)
continue;
slots[i].output(out);
}
assert out.length() == curOffset;
// To make sure that the hash reader doesn't have to worry about
// accidentally reading past the end of the sub-file, write an extra
// block of bytes.
//
out.write(new byte[maxSlotSize]);
// All done!
out.close();
} // outputTo()
  /**
   * Performs a basic regression test on the DiskHash system. Writes a file
   * in the current directory during the test, but erases it on completion.
   */
public static final Tester tester = new Tester("DiskHash")
{
protected void testImpl()
throws Exception
{
// Since we depend on StructuredFile, make sure it passes.
StructuredFile.tester.test();
File testFile = new File("DiskHashTest.sf");
StructuredFile f = null;
try
{
// Create a structured file to hold the hash.
f = StructuredFile.create(testFile);
// Make a test hash.
DiskHashWriter w = new DiskHashWriter();
PackedByteBuf buf = new PackedByteBuf(20);
buf.writeInt(11);
buf.writeString("hello");
w.put("foo", buf);
buf.reset();
buf.writeInt(22);
buf.writeString("kangaroo");
w.put("bar", buf);
w.outputTo(f.createSubStore("testhash"));
DiskHashReader r = new DiskHashReader(f.openSubStore("testhash"));
buf = r.find("bar");
assert buf != null;
assert buf.readInt() == 22;
assert buf.readString().equals("kangaroo");
buf = r.find("foo");
assert buf != null;
assert buf.readInt() == 11;
assert buf.readString().equals("hello");
assert r.find("xyz") == null;
}
finally {
// All done. Close and clean up our file.
if (f != null)
f.close();
testFile.delete();
}
} // testImpl()
};
} // class DiskHashWriter