/**
* File: $HeadURL: https://hdt-java.googlecode.com/svn/trunk/hdt-java/src/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySectionBig.java $
* Revision: $Rev: 194 $
* Last modified: $Date: 2013-03-04 21:30:01 +0000 (lun, 04 mar 2013) $
* Last modified by: $Author: mario.arias $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
* Contacting the authors:
* Mario Arias: mario.arias@deri.org
* Javier D. Fernandez: jfergar@infor.uva.es
* Miguel A. Martinez-Prieto: migumar2@infor.uva.es
* Alejandro Andres: fuzzy.alej@gmail.com
*/
package org.rdfhdt.hdt.dictionary.impl.section;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Iterator;
import org.rdfhdt.hdt.compact.integer.VByte;
import org.rdfhdt.hdt.compact.sequence.SequenceLog64Big;
import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate;
import org.rdfhdt.hdt.dictionary.TempDictionarySection;
import org.rdfhdt.hdt.exceptions.CRCException;
import org.rdfhdt.hdt.exceptions.IllegalFormatException;
import org.rdfhdt.hdt.listener.ProgressListener;
import org.rdfhdt.hdt.options.HDTOptions;
import org.rdfhdt.hdt.util.BitUtil;
import org.rdfhdt.hdt.util.Mutable;
import org.rdfhdt.hdt.util.crc.CRC32;
import org.rdfhdt.hdt.util.crc.CRC8;
import org.rdfhdt.hdt.util.crc.CRCInputStream;
import org.rdfhdt.hdt.util.crc.CRCOutputStream;
import org.rdfhdt.hdt.util.io.IOUtil;
import org.rdfhdt.hdt.util.string.ByteStringUtil;
import org.rdfhdt.hdt.util.string.CompactString;
import org.rdfhdt.hdt.util.string.ReplazableString;
/**
* Implementation of Plain Front Coding that splits the data across several arrays, thereby
* overcoming Java's 2 GB limit on the size of a single array.
*
* It allows loading much bigger files, but wastes some memory on pointers to the blocks and
* spends some CPU locating the right array at search time.
*
* @author mario.arias, Lyudmila Balakireva
*
*/
public class PFCDictionarySectionBig implements DictionarySectionPrivate {
    /** Type identifier written as the first byte of a serialized section and checked on load. */
    public static final int TYPE_INDEX = 2;
    /** Block size used when the options do not supply a positive {@code pfc.blocksize}. */
    public static final int DEFAULT_BLOCK_SIZE = 16;
    /** Number of consecutive blocks whose bytes are stored together in one array of {@link #data}. */
    public static final int BLOCK_PER_BUFFER = 1000000;

    /** Front-coded strings; {@code data[b]} holds the bytes of blocks {@code [b*BLOCK_PER_BUFFER, (b+1)*BLOCK_PER_BUFFER)}. */
    byte [][] data;
    /** For every buffer in {@link #data}, the global byte offset of its first byte. */
    long [] posFirst;
    /**
     * Global byte offset at which every block starts, followed by one extra entry holding the
     * total number of bytes (an end marker that does not correspond to a block).
     */
    protected SequenceLog64Big blocks;
    /** Number of strings per block; only the first string of a block is stored in full. */
    protected int blocksize;
    /** Number of strings stored in the section. */
    protected int numstrings;
    /** Total number of bytes of front-coded string data. */
    protected long size;
    /**
     * Incremented once per call to {@link #load(Iterator, long, ProgressListener)}.
     * Access is not synchronized.
     */
    static int filecounter = 0;

    /**
     * Creates an empty section.
     *
     * @param spec options; {@code pfc.blocksize} selects the number of strings per block. A value
     *             of zero (or a negative one, which can never be valid) selects
     *             {@link #DEFAULT_BLOCK_SIZE}.
     */
    public PFCDictionarySectionBig(HDTOptions spec) {
        int configured = (int) spec.getInt("pfc.blocksize");
        this.blocksize = (configured <= 0) ? DEFAULT_BLOCK_SIZE : configured;
    }

    /**
     * Builds this section from the sorted entries of a temporary section.
     *
     * @param other    section providing the sorted entries and their count
     * @param listener progress listener, forwarded to {@link #load(Iterator, long, ProgressListener)}
     */
    @Override
    public void load(TempDictionarySection other, ProgressListener listener) {
        // Safe: the iterator is only read from, so widening its element type cannot break it.
        @SuppressWarnings("unchecked")
        Iterator<CharSequence> it = (Iterator<CharSequence>) other.getSortedEntries();
        this.load(it, other.getNumberOfElements(), listener);
    }

    /**
     * Builds this section from an iterator of strings.
     *
     * The strings are front-coded into a temporary file (so the encoded form never has to fit in
     * a single array) and then read back into {@link #data} in buffers of
     * {@link #BLOCK_PER_BUFFER} blocks. The temporary file is created in the directory selected
     * by the {@code java.io.tmpdir} system property and is removed before returning.
     *
     * @param it         strings to store; NOTE(review): {@link #locate(CharSequence)} performs a
     *                   binary search, so the entries are presumably required to be sorted
     * @param numentries number of strings the iterator will return, used to pre-size the block index
     * @param listener   progress listener (currently not notified)
     * @throws RuntimeException wrapping the {@link IOException} if the temporary file cannot be
     *                          written or read back
     */
    public void load(Iterator<CharSequence> it, long numentries, ProgressListener listener) {
        this.blocks = new SequenceLog64Big(64, numentries/blocksize);
        this.numstrings = 0;
        filecounter++;

        File file = null;
        try {
            // createTempFile guarantees a fresh, unique file even across threads and processes.
            file = File.createTempFile("pfcsection", ".tmp");

            long totalBytes = writeFrontCoded(it, file);
            blocks.aggresiveTrimToSize();
            this.size = totalBytes;

            InputStream in = new FileInputStream(file);
            try {
                readBuffers(in);
            } finally {
                in.close();
            }
        } catch (IOException e) {
            // Continuing would leave a half-initialized section that fails later with an
            // unrelated NullPointerException, so report the real cause now.
            throw new RuntimeException("Could not build dictionary section through temporary file " + file, e);
        } finally {
            if (file != null && file.exists() && !file.delete()) {
                file.deleteOnExit();
            }
        }
    }

    /**
     * Front-codes all strings of the iterator into {@code file}, appending the start offset of
     * every block (plus a final end offset) to {@link #blocks} and counting the strings in
     * {@link #numstrings}.
     *
     * @return total number of bytes written
     */
    private long writeFrontCoded(Iterator<CharSequence> it, File file) throws IOException {
        OutputStream out = new FileOutputStream(file);
        try {
            // Holds the bytes of the block currently being built.
            ByteArrayOutputStream byteOut = new ByteArrayOutputStream(16*1024);
            long written = 0;
            CharSequence previousStr = null;

            while (it.hasNext()) {
                CharSequence str = it.next();

                if (numstrings % blocksize == 0) {
                    // A new block starts: flush the previous one and record where this one begins.
                    written += byteOut.size();
                    blocks.append(written);
                    byteOut.writeTo(out);
                    byteOut.reset();

                    // The first string of a block is stored in full.
                    ByteStringUtil.append(byteOut, str, 0);
                } else {
                    // Store the length of the prefix shared with the previous string, then the rest.
                    int delta = ByteStringUtil.longestCommonPrefix(previousStr, str);
                    VByte.encode(byteOut, delta);
                    ByteStringUtil.append(byteOut, str, delta);
                }
                byteOut.write(0); // End of string
                numstrings++;
                previousStr = str;
            }

            // End marker: total size, so that the length of the last block can be computed.
            written += byteOut.size();
            blocks.append(written);
            byteOut.writeTo(out);
            return written;
        } finally {
            out.close();
        }
    }

    /**
     * Reads the front-coded data from {@code in} into {@link #data}, one array per
     * {@link #BLOCK_PER_BUFFER} blocks, and records the starting offset of each array in
     * {@link #posFirst}. {@link #blocks} must already be populated.
     *
     * Exactly as many buffers as needed are allocated, so no entry of {@link #data} is null.
     */
    private void readBuffers(InputStream in) throws IOException {
        // The last pointer is the end marker, so it is also the number of real blocks.
        long lastPointer = blocks.getNumberOfElements() - 1;
        int numBuffers = (lastPointer <= 0) ? 0 : (int) ((lastPointer + BLOCK_PER_BUFFER - 1) / BLOCK_PER_BUFFER);

        byte [][] buffers = new byte[numBuffers][];
        long [] firstPositions = new long[numBuffers];

        long bytePos = 0;
        for (int buffer = 0; buffer < numBuffers; buffer++) {
            int nextBlock = (int) Math.min(lastPointer, (buffer + 1L) * BLOCK_PER_BUFFER);
            long nextBytePos = blocks.get(nextBlock);
            long length = nextBytePos - bytePos;
            if (length < 0 || length > Integer.MAX_VALUE) {
                // A blind cast to int would silently truncate and corrupt every later offset.
                throw new IOException("Buffer " + buffer + " needs " + length + " bytes, which does not fit in a single array");
            }

            buffers[buffer] = IOUtil.readBuffer(in, (int) length, null);
            firstPositions[buffer] = bytePos;
            bytePos = nextBytePos;
        }

        this.data = buffers;
        this.posFirst = firstPositions;
    }

    /**
     * Locates the block of a string by binary search over the first string of every block.
     *
     * @return the block index if {@code str} equals the first string of a block; otherwise
     *         {@code -(insertionPoint + 1)}, where {@code insertionPoint} is the index of the
     *         first block whose first string is greater than {@code str}
     */
    protected int locateBlock(CharSequence str) {
        int low = 0;
        int high = (int)blocks.getNumberOfElements() - 1;
        int max = high;

        while (low <= high) {
            int mid = (low + high) >>> 1;

            int cmp;
            if (mid == max) {
                // The last pointer is the end marker, not a block: everything sorts before it.
                cmp = -1;
            } else {
                int bufferIndex = mid / BLOCK_PER_BUFFER;
                cmp = ByteStringUtil.strcmp(str, data[bufferIndex], (int)(blocks.get(mid) - posFirst[bufferIndex]));
            }

            if (cmp < 0) {
                high = mid - 1;
            } else if (cmp > 0) {
                low = mid + 1;
            } else {
                return mid; // key found
            }
        }
        return -(low + 1); // key not found.
    }

    /**
     * Returns the identifier of a string.
     *
     * @param str string to look up
     * @return the 1-based position of {@code str} in the section, or 0 if it is not present
     */
    @Override
    public int locate(CharSequence str) {
        int blocknum = locateBlock(str);
        if (blocknum >= 0) {
            // Exact hit on the first string of a block.
            return (blocknum*blocksize)+1;
        }

        // Not a block head: the candidate block is the one preceding the insertion point.
        blocknum = -blocknum-2;
        if (blocknum >= 0) {
            int idblock = locateInBlock(blocknum, str);
            if (idblock != 0) {
                return (blocknum*blocksize)+idblock+1;
            }
        }

        // Not found
        return 0;
    }

    /**
     * Searches a string among the non-first strings of a block by decoding them sequentially.
     * The first string of the block is decoded but not compared; callers handle that case
     * through {@link #locateBlock(CharSequence)}.
     *
     * @param blockid index of the block to scan
     * @param str     string to look up
     * @return the 0-based position of {@code str} inside the block (always at least 1), or 0 if
     *         the block does not contain it
     */
    protected int locateInBlock(int blockid, CharSequence str) {
        ReplazableString tempString = new ReplazableString();
        Mutable<Long> delta = new Mutable<Long>(0L);
        int idInBlock = 0;
        int cshared = 0; // Number of leading characters of str matched so far.

        byte [] block = data[blockid/BLOCK_PER_BUFFER];
        int pos = (int) (blocks.get(blockid)-posFirst[blockid/BLOCK_PER_BUFFER]);

        // Read the first string in the block
        int slen = ByteStringUtil.strlen(block, pos);
        tempString.append(block, pos, slen);
        pos += slen+1;
        idInBlock++;

        while ((idInBlock < blocksize) && (pos < block.length)) {
            // Decode the length of the prefix shared with the previous string
            pos += VByte.decode(block, pos, delta);

            // Rebuild the current string: shared prefix + stored suffix
            slen = ByteStringUtil.strlen(block, pos);
            tempString.replace(delta.getValue().intValue(), block, pos, slen);

            if (delta.getValue() >= cshared) {
                // This string keeps at least the prefix matched so far, so it may match further.
                cshared += ByteStringUtil.longestCommonPrefix(tempString, str, cshared);

                if ((cshared == str.length()) && (tempString.length() == str.length())) {
                    break;
                }
            } else {
                // Fewer common characters than before: this string is already bigger than
                // the one we are looking for, i.e. not found.
                idInBlock = 0;
                break;
            }
            pos += slen+1;
            idInBlock++;
        }

        // Ran off the end of the buffer or of the block without a match.
        if (pos == block.length || idInBlock == blocksize) {
            idInBlock = 0;
        }

        return idInBlock;
    }

    /**
     * Returns the string with the given identifier.
     *
     * @param id 1-based position of the string
     * @return the string, or {@code null} if {@code id} is outside {@code [1, getNumberOfElements()]}
     */
    @Override
    public CharSequence extract(int id) {
        if (id < 1 || id > numstrings) {
            return null;
        }

        // Locate block
        int blockid = (id-1)/blocksize;
        int nstring = (id-1)%blocksize;

        byte [] block = data[blockid/BLOCK_PER_BUFFER];
        int pos = (int) (blocks.get(blockid)-posFirst[blockid/BLOCK_PER_BUFFER]);

        // Copy first string
        int len = ByteStringUtil.strlen(block, pos);

        Mutable<Long> delta = new Mutable<Long>(0L);
        ReplazableString tempString = new ReplazableString();
        tempString.append(block, pos, len);

        // Decode the following strings until we reach ours.
        for (int i = 0; i < nstring; i++) {
            pos += len+1;
            pos += VByte.decode(block, pos, delta);
            len = ByteStringUtil.strlen(block, pos);
            tempString.replace(delta.getValue().intValue(), block, pos, len);
        }
        return new CompactString(tempString).getDelayed();
    }

    /**
     * @return the number of bytes of front-coded string data held by this section
     */
    @Override
    public long size() {
        return size;
    }

    /**
     * @return the number of strings stored in this section
     */
    @Override
    public int getNumberOfElements() {
        return numstrings;
    }

    /**
     * Returns an iterator over all strings in identifier order. Removal is not supported.
     */
    @Override
    public Iterator<CharSequence> getSortedEntries() {
        return new Iterator<CharSequence>() {
            int pos;

            @Override
            public boolean hasNext() {
                return pos < getNumberOfElements();
            }

            @Override
            public CharSequence next() {
                if (!hasNext()) {
                    // Honor the Iterator contract instead of handing out null.
                    throw new java.util.NoSuchElementException();
                }
                // FIXME: It is more efficient to go through each block, each entry.
                pos++;
                return extract(pos);
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }
        };
    }

    /**
     * Serializes the section: a CRC8-protected header (type, number of strings, data size, block
     * size), the block index, and the string data protected by a CRC32.
     *
     * @param output   stream to write to; it is not closed
     * @param listener progress listener
     * @throws IOException if writing fails
     */
    @Override
    public void save(OutputStream output, ProgressListener listener) throws IOException {
        CRCOutputStream out = new CRCOutputStream(output, new CRC8());

        out.write(TYPE_INDEX);
        VByte.encode(out, numstrings);

        long datasize = 0;
        for (byte [] buffer : data) {
            datasize += buffer.length;
        }
        VByte.encode(out, datasize);
        VByte.encode(out, blocksize);
        out.writeCRC();

        blocks.save(output, listener); // Write blocks directly to output, they have their own CRC check.

        out.setCRC(new CRC32());
        for (byte [] buffer : data) {
            IOUtil.writeBuffer(out, buffer, 0, buffer.length, listener);
        }
        out.writeCRC();
    }

    /**
     * Loads a section previously written by {@link #save(OutputStream, ProgressListener)}.
     *
     * @param input    stream to read from; it is not closed
     * @param listener progress listener
     * @throws IOException            if reading fails
     * @throws IllegalFormatException if the data does not start with {@link #TYPE_INDEX}
     * @throws CRCException           if the header or data checksum does not match
     */
    @SuppressWarnings("resource") // The CRC wrapper must not close the caller's stream.
    @Override
    public void load(InputStream input, ProgressListener listener) throws IOException {
        CRCInputStream in = new CRCInputStream(input, new CRC8());

        // Read type
        int type = in.read();
        if (type != TYPE_INDEX) {
            throw new IllegalFormatException("Trying to read a DictionarySectionPFC from data that is not of the suitable type");
        }

        // Read header
        numstrings = (int) VByte.decode(in);
        this.size = VByte.decode(in);
        blocksize = (int)VByte.decode(in);

        if (!in.readCRCAndCheck()) {
            throw new CRCException("CRC Error while reading Dictionary Section Plain Front Coding Header.");
        }

        // Load block pointers (they carry their own CRC, so read them from the raw stream)
        blocks = new SequenceLog64Big();
        blocks.load(input, listener);

        // Read packed data, buffer by buffer
        in.setCRC(new CRC32());
        readBuffers(in);

        if (!in.readCRCAndCheck()) {
            throw new CRCException("CRC Error while reading Dictionary Section Plain Front Coding Data.");
        }
    }

    /**
     * Releases the in-memory data. Safe to call more than once or on a section that was never loaded.
     *
     * @throws IOException if closing the block index fails
     */
    @Override
    public void close() throws IOException {
        data = null;
        posFirst = null;
        if (blocks != null) {
            try {
                blocks.close();
            } finally {
                blocks = null;
            }
        }
    }
}