/** * File: $HeadURL: https://hdt-java.googlecode.com/svn/trunk/hdt-java/src/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySection.java $ * Revision: $Rev: 94 $ * Last modified: $Date: 2012-11-20 23:44:36 +0000 (mar, 20 nov 2012) $ * Last modified by: $Author: mario.arias $ * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * * Contacting the authors: * Mario Arias: mario.arias@deri.org * Javier D. Fernandez: jfergar@infor.uva.es * Miguel A. Martinez-Prieto: migumar2@infor.uva.es * Alejandro Andres: fuzzy.alej@gmail.com */ package org.rdfhdt.hdt.dictionary.impl.section; import java.io.BufferedInputStream; import java.io.Closeable; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.channels.FileChannel; import java.nio.channels.FileChannel.MapMode; import java.util.Iterator; import org.rdfhdt.hdt.compact.integer.VByte; import org.rdfhdt.hdt.compact.sequence.Sequence; import org.rdfhdt.hdt.compact.sequence.SequenceFactory; import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; import org.rdfhdt.hdt.dictionary.TempDictionarySection; import org.rdfhdt.hdt.exceptions.CRCException; import org.rdfhdt.hdt.exceptions.IllegalFormatException; import org.rdfhdt.hdt.exceptions.NotImplementedException; import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.util.crc.CRC8; import org.rdfhdt.hdt.util.crc.CRCInputStream; import org.rdfhdt.hdt.util.io.CountInputStream; import org.rdfhdt.hdt.util.io.IOUtil; import org.rdfhdt.hdt.util.string.ByteStringUtil; import org.rdfhdt.hdt.util.string.CompactString; import org.rdfhdt.hdt.util.string.ReplazableString; /** * @author mario.arias * */ public class PFCDictionarySectionMap implements DictionarySectionPrivate,Closeable { public static final int TYPE_INDEX = 2; public static final int DEFAULT_BLOCK_SIZE = 16; private static final int BLOCKS_PER_BYTEBUFFER = 50000; protected FileChannel ch; protected ByteBuffer [] buffers; // Encoded sequence long [] posFirst; // Global byte position of the start of each buffer protected int blocksize; protected int numstrings; protected Sequence blocks; protected long dataSize; private File f; private long startOffset, endOffset; public PFCDictionarySectionMap(CountInputStream input, File f) throws IOException { this.f = f; startOffset=input.getTotalBytes(); CRCInputStream crcin = new CRCInputStream(input, new CRC8()); // Read type int type = crcin.read(); if(type!=TYPE_INDEX) { throw new IllegalFormatException("Trying to read a DictionarySectionPFC from data that is not of the suitable type"); } // Read vars numstrings = (int) VByte.decode(crcin); dataSize = VByte.decode(crcin); blocksize = (int) VByte.decode(crcin); if(!crcin.readCRCAndCheck()) { throw new CRCException("CRC Error while reading Dictionary Section Plain Front Coding Header."); } // Read blocks blocks = SequenceFactory.createStream(input, f); // blocks = SequenceFactory.createStream(input); // blocks.load(input, null); long base = input.getTotalBytes(); IOUtil.skip(crcin, dataSize+4); // Including CRC32 endOffset = input.getTotalBytes(); // Read packed data ch = new FileInputStream(f).getChannel(); int block = 0; int buffer = 0; long numBlocks = blocks.getNumberOfElements(); long bytePos = 0; long numBuffers = 1+numBlocks/BLOCKS_PER_BYTEBUFFER; buffers = new ByteBuffer[(int)numBuffers ]; posFirst = new long[(int)numBuffers]; // System.out.println("Buffers "+buffers.length); while(block<numBlocks-1) { int nextBlock = (int) Math.min(numBlocks-1, block+BLOCKS_PER_BYTEBUFFER); long nextBytePos = blocks.get(nextBlock); // System.out.println("From block "+block+" to "+nextBlock); // System.out.println("From pos "+ bytePos+" to "+nextBytePos); // System.out.println("Total size: "+ (nextBytePos-bytePos)); buffers[buffer] = ch.map(MapMode.READ_ONLY, base+bytePos, nextBytePos-bytePos); buffers[buffer].order(ByteOrder.LITTLE_ENDIAN); posFirst[buffer] = bytePos; bytePos = nextBytePos; block+=BLOCKS_PER_BYTEBUFFER; buffer++; } } protected int locateBlock(CharSequence str) { if(blocks.getNumberOfElements()==0) { return -1; } int low = 0; int high = (int)blocks.getNumberOfElements()-1; int max = high; while (low <= high) { int mid = low + (high - low)/2; int cmp; if(mid==max) { cmp=-1; } else { ByteBuffer buffer = buffers[mid/BLOCKS_PER_BYTEBUFFER]; cmp = ByteStringUtil.strcmp(str, buffer, (int)(blocks.get(mid)-posFirst[mid/BLOCKS_PER_BYTEBUFFER])); } if (cmp<0) { high = mid - 1; } else if (cmp > 0) { low = mid + 1; } else { return mid; // key found } } return -(low + 1); // key not found. } /* (non-Javadoc) * @see hdt.dictionary.DictionarySection#locate(java.lang.CharSequence) */ @Override public int locate(CharSequence str) { if(buffers==null || blocks==null) { return 0; } int blocknum = locateBlock(str); if(blocknum>=0) { // Located exactly return (blocknum*blocksize)+1; } else { // Not located exactly. blocknum = -blocknum-2; if(blocknum>=0) { int idblock = locateInBlock(blocknum, str); if(idblock != 0) { return (blocknum*blocksize)+idblock+1; } } } return 0; } public int locateInBlock(int block, CharSequence str) { if(block>=blocks.getNumberOfElements()) { return 0; } ReplazableString tempString = new ReplazableString(); int idInBlock = 0; int cshared=0; // dumpBlock(block); ByteBuffer buffer = buffers[block/BLOCKS_PER_BYTEBUFFER].duplicate(); buffer.position((int)(blocks.get(block)-posFirst[block/BLOCKS_PER_BYTEBUFFER])); try { if(!buffer.hasRemaining()) { return 0; } // Read the first string in the block tempString.replace(buffer, 0); idInBlock++; while( (idInBlock<blocksize) && buffer.hasRemaining()) { // Decode prefix long delta = VByte.decode(buffer); //Copy suffix tempString.replace(buffer, (int) delta); if(delta>=cshared) { // Current delta value means that this string // has a larger long common prefix than the previous one cshared += ByteStringUtil.longestCommonPrefix(tempString, str, cshared); if((cshared==str.length()) && (tempString.length()==str.length())) { return idInBlock; } } else { // We have less common characters than before, // this string is bigger that what we are looking for. // i.e. Not found. return 0; } idInBlock++; } return 0; } catch (IOException e) { e.printStackTrace(); return 0; } } /* (non-Javadoc) * @see hdt.dictionary.DictionarySection#extract(int) */ @Override public CharSequence extract(int id) { if(buffers==null || blocks==null) { return null; } if(id<1 || id>numstrings) { return null; } int block = (id-1)/blocksize; ByteBuffer buffer = buffers[block/BLOCKS_PER_BYTEBUFFER].duplicate(); buffer.position((int)(blocks.get(block)-posFirst[block/BLOCKS_PER_BYTEBUFFER])); try { ReplazableString tempString = new ReplazableString(); tempString.replace(buffer,0); int stringid = (id-1)%blocksize; for(int i=0;i<stringid;i++) { long delta = VByte.decode(buffer); tempString.replace(buffer, (int) delta); } return new CompactString(tempString).getDelayed(); } catch (IOException e) { e.printStackTrace(); return null; } } /* (non-Javadoc) * @see hdt.dictionary.DictionarySection#size() */ @Override public long size() { return dataSize+blocks.size(); } /* (non-Javadoc) * @see hdt.dictionary.DictionarySection#getNumberOfElements() */ @Override public int getNumberOfElements() { return numstrings; } /* (non-Javadoc) * @see hdt.dictionary.DictionarySection#getEntries() */ @Override public Iterator<CharSequence> getSortedEntries() { return new Iterator<CharSequence>() { int id; ReplazableString tempString = new ReplazableString(); int bytebufferIndex; ByteBuffer buffer = buffers[0].duplicate(); @Override public boolean hasNext() { return id<getNumberOfElements(); } @Override public CharSequence next() { if(!buffer.hasRemaining()) { buffer = buffers[++bytebufferIndex].duplicate(); buffer.rewind(); } try { if((id%blocksize)==0) { tempString.replace(buffer, 0); } else { long delta = VByte.decode(buffer); tempString.replace(buffer, (int) delta); } id++; return new CompactString(tempString).getDelayed(); // return tempString.toString(); } catch (IOException e) { throw new RuntimeException(e); } } @Override public void remove() { throw new UnsupportedOperationException(); } }; } @Override public void close() throws IOException { ch.close(); } @Override public void load(TempDictionarySection other, ProgressListener listener) { throw new NotImplementedException(); } @Override public void save(OutputStream output, ProgressListener listener) throws IOException { InputStream in = new BufferedInputStream(new FileInputStream(f)); IOUtil.skip(in, startOffset); IOUtil.copyStream(in, output, endOffset-startOffset); in.close(); } @Override public void load(InputStream input, ProgressListener listener) throws IOException { throw new NotImplementedException(); } }