PFCDictionarySectionMap.java example

Explorer

hdt-java-master
- hdt-api
  - src
    - main
      - java
        org
        rdfhdt
        hdt
        dictionary
        Dictionary.java
        DictionarySection.java
        DictionaryUtil.java
        enums
        DictionaryRole.java
        DictionarySectionRole.java
        RDFNotation.java
        ResultEstimationType.java
        TripleComponentOrder.java
        TripleComponentRole.java
        exceptions
        CRCException.java
        IllegalFormatException.java
        NotFoundException.java
        NotImplementedException.java
        ParserException.java
        hdt
        HDT.java
        HDTManager.java
        HDTVocabulary.java
        header
        Header.java
        HeaderUtil.java
        listener
        ProgressListener.java
        ProgressOut.java
        options
        ControlInfo.java
        HDTOptions.java
        rdf
        RDFAccess.java
        RDFParserCallback.java
        RDFStorage.java
        triples
        IteratorTripleID.java
        IteratorTripleString.java
        TripleID.java
        TripleString.java
        Triples.java
        util
        UnicodeEscape.java
- hdt-fuseki
  - src
    - main
      - java
        org
        rdfhdt
        hdt
        fuseki
        FusekiHDTCmd.java
        HDTGenerateIndex.java
- hdt-java-cli
  - src
    - main
      - java
        org
        rdfhdt
        hdt
        tools
        HDT2RDF.java
        HDTInfo.java
        HDTVerify.java
        HdtSearch.java
        RDF2HDT.java
- hdt-java-core
  - examples
    - org
      - rdfhdt
        hdt
        examples
        ExampleGenerate.java
        ExampleSearch.java
  - src
    - main
      - java
        org
        rdfhdt
        hdt
        compact
        bitmap
        AdjacencyList.java
        Bitmap.java
        Bitmap375.java
        Bitmap64.java
        BitmapFactory.java
        ModifiableBitmap.java
        integer
        VByte.java
        sequence
        DeflateIntegerIterator.java
        DynamicSequence.java
        Sequence.java
        SequenceFactory.java
        SequenceInt32.java
        SequenceInt64.java
        SequenceLog64.java
        SequenceLog64Big.java
        SequenceLog64Map.java
        dictionary
        DictionaryFactory.java
        DictionaryPrivate.java
        DictionarySectionPrivate.java
        TempDictionary.java
        TempDictionarySection.java
        impl
        BaseDictionary.java
        BaseTempDictionary.java
        DictionaryIDMapping.java
        FourSectionDictionary.java
        FourSectionDictionaryBig.java
        HashDictionary.java
        section
        DictionarySectionCache.java
        DictionarySectionCacheAll.java
        DictionarySectionCachePerThread.java
        DictionarySectionFactory.java
        HashDictionarySection.java
        PFCDictionarySection.java
        PFCDictionarySectionBig.java
        PFCDictionarySectionMap.java
        example
        ExampleGenerate.java
        hdt
        HDTFactory.java
        HDTManagerImpl.java
        HDTPrivate.java
        TempDictTriplesFactory.java
        TempHDT.java
        TempHDTImporter.java
        impl
        HDTImpl.java
        ModeOfLoading.java
        TempHDTImpl.java
        TempHDTImporterOnePass.java
        TempHDTImporterTwoPass.java
        header
        HeaderFactory.java
        HeaderPrivate.java
        PlainHeader.java
        PlainHeaderIterator.java
        iterator
        DictionaryTranslateIterator.java
        RoleIteratorTripleID.java
        SequentialSearchIteratorTripleID.java
        utils
        Filter.java
        Iter.java
        IteratorConcat.java
        MergedIterator.java
        MergedReduceIterator.java
        MultiMerge.java
        PrefetchIterator.java
        Reducer.java
        ReducerLeft.java
        SortedReduceIterator.java
        Transform.java
        options
        ControlInformation.java
        HDTOptionsBase.java
        HDTSpecification.java
        rdf
        RDFParserFactory.java
        parsers
        JenaModelIterator.java
        RDFParserRAR.java
        RDFParserRIOT.java
        RDFParserSimple.java
        RDFParserTar.java
        triples
        TempTriples.java
        TripleIDComparator.java
        TripleIDComparatorSPO.java
        TripleStringComparator.java
        TriplesFactory.java
        TriplesPrivate.java
        impl
        BitmapTriples.java
        BitmapTriplesIterator.java
        BitmapTriplesIteratorY.java
        BitmapTriplesIteratorYFOQ.java
        BitmapTriplesIteratorZ.java
        BitmapTriplesIteratorZFOQ.java
        PredicateIndex.java
        PredicateIndexArray.java
        TripleOrderConvert.java
        TriplesList.java
        util
        BitUtil.java
        Histogram.java
        LRUCache.java
        Mutable.java
        ProfilingUtil.java
        RDFInfo.java
        Statistics.java
        StopWatch.java
        StopWatchCpu.java
        StringUtil.java
        crc
        CRC.java
        CRC16.java
        CRC32.java
        CRC8.java
        CRCInputStream.java
        CRCOutputStream.java
        io
        ByteBufferInputStream.java
        CountInputStream.java
        CountOutputStream.java
        DummyOutputStream.java
        ExternalDecompressStream.java
        IOUtil.java
        NonCloseInputStream.java
        SkipReadInputStream.java
        listener
        IntermediateListener.java
        ListenerUtil.java
        string
        ByteStringUtil.java
        CharSequenceComparator.java
        CompactString.java
        DelayedString.java
        ReplazableString.java
        UnicodeEscape.java
    - test
      - java
        org
        rdfhdt
        hdt
        compact
        array
        IntegerArrayTest.java
        LogArray64Dump.java
        LogArray64Test.java
        LongArrayTest.java
        bitmap
        BitSequence375Test.java
        integer
        VByteTest.java
        iterator
        MergedIteratorTest.java
        MultiMergeTest.java
        rdf
        parsers
        TarParserTest.java
        TarTest.java
        tests
        AllTests.java
        util
        crc
        CRCStreamTest.java
        io
        IOUtilTest.java
        string
        HDTEnsureDictionaryOrder.java
- hdt-java-package
  - examples
    - org
      - rdfhdt
        hdt
        examples
        ExampleGenerate.java
        ExampleSearch.java
- hdt-jena
  - src
    - main
      - java
        org
        apache
        jena
        graph
        JenaNodeCreator.java
        rdfhdt
        hdtjena
        CustomDatatype.java
        HDTCapabilities.java
        HDTGraph.java
        HDTGraphAssembler.java
        HDTJenaConstants.java
        HDTStatistics.java
        NodeDictionary.java
        bindings
        BindingHDTId.java
        BindingHDTNode.java
        HDTId.java
        cache
        DictionaryCache.java
        DictionaryCacheArray.java
        DictionaryCacheArrayWeak.java
        DictionaryCacheHash.java
        DictionaryCacheLRI.java
        DictionaryCacheLRU.java
        DictionaryCacheNone.java
        DummyMap.java
        cmd
        HDTSparql.java
        solver
        BindingOne.java
        HDTJenaIterator.java
        HDTOptimizeddOp.java
        HDTQueryEngine.java
        HDTSolverLib.java
        OpExecutorHDT.java
        OptimizedCount.java
        QueryIterHDT.java
        ReorderTransformationHDT.java
        StageMatchTripleID.java
        util
        Abortable.java
        IterAbortable.java
        VarAppearance.java
    - test
      - java
        JenaHDTTest.java
        org
        rdfhdt
        hdtjena
        NodeDictionaryTest.java
        solver
        OptimizedCountTest.java

/**
 * File: $HeadURL: https://hdt-java.googlecode.com/svn/trunk/hdt-java/src/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySection.java $
 * Revision: $Rev: 94 $
 * Last modified: $Date: 2012-11-20 23:44:36 +0000 (mar, 20 nov 2012) $
 * Last modified by: $Author: mario.arias $
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 * Contacting the authors:
 *   Mario Arias:               mario.arias@deri.org
 *   Javier D. Fernandez:       jfergar@infor.uva.es
 *   Miguel A. Martinez-Prieto: migumar2@infor.uva.es
 *   Alejandro Andres:          fuzzy.alej@gmail.com
 */

package org.rdfhdt.hdt.dictionary.impl.section;

import java.io.BufferedInputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;
import java.util.Iterator;

import org.rdfhdt.hdt.compact.integer.VByte;
import org.rdfhdt.hdt.compact.sequence.Sequence;
import org.rdfhdt.hdt.compact.sequence.SequenceFactory;
import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate;
import org.rdfhdt.hdt.dictionary.TempDictionarySection;
import org.rdfhdt.hdt.exceptions.CRCException;
import org.rdfhdt.hdt.exceptions.IllegalFormatException;
import org.rdfhdt.hdt.exceptions.NotImplementedException;
import org.rdfhdt.hdt.listener.ProgressListener;
import org.rdfhdt.hdt.util.crc.CRC8;
import org.rdfhdt.hdt.util.crc.CRCInputStream;
import org.rdfhdt.hdt.util.io.CountInputStream;
import org.rdfhdt.hdt.util.io.IOUtil;
import org.rdfhdt.hdt.util.string.ByteStringUtil;
import org.rdfhdt.hdt.util.string.CompactString;
import org.rdfhdt.hdt.util.string.ReplazableString;

/**
 * @author mario.arias
 *
 */
/**
 * Read-only dictionary section encoded with Plain Front Coding (PFC) whose
 * packed string data is accessed through memory-mapped regions of the backing
 * file instead of being copied onto the heap.
 *
 * <p>Strings are grouped in blocks of {@link #blocksize} entries. The first
 * string of each block is stored in full; every following string is stored as
 * a VByte-encoded shared-prefix length followed by its suffix. The
 * {@link #blocks} sequence holds the byte offset at which each block starts;
 * its final entry is treated as an end marker rather than as the start of a
 * real block.</p>
 *
 * <p>String identifiers handled by this class are 1-based; {@code 0} is used
 * as the "not found" value.</p>
 *
 * <p>Loading from a temporary section or from a plain stream is not supported
 * (both {@code load} methods throw {@link NotImplementedException}).</p>
 *
 * @author mario.arias
 */
public class PFCDictionarySectionMap implements DictionarySectionPrivate,Closeable {
	/** Type byte expected at the very start of the serialized section. */
	public static final int TYPE_INDEX = 2;
	public static final int DEFAULT_BLOCK_SIZE = 16;

	/** Maximum number of blocks covered by one mapped buffer. */
	private static final int BLOCKS_PER_BYTEBUFFER = 50000;

	/** Channel over the backing file, used to create the mappings. */
	protected FileChannel ch;
	/** Mapped regions holding the encoded strings. A slot may be null if no block falls into it. */
	protected ByteBuffer [] buffers;
	/** Byte position, relative to the start of the packed data, at which each buffer begins. */
	long [] posFirst;
	/** Number of strings stored per block. */
	protected int blocksize;
	/** Total number of strings in the section. */
	protected int numstrings;
	/** Start offset of each block within the packed data, plus a trailing end marker. */
	protected Sequence blocks;
	/** Size in bytes of the packed string data. */
	protected long dataSize;

	/** Backing file; re-read by {@link #save(OutputStream, ProgressListener)}. */
	private File f;
	/** Byte range of this section within the input stream, as reported by the stream's byte counter. */
	private long startOffset, endOffset;

	/**
	 * Reads the section header and block index from {@code input}, skips over
	 * the packed data, and memory-maps that data from {@code f}.
	 *
	 * <p>The stream position reported by {@code input.getTotalBytes()} is used
	 * as the offset into {@code f}, so {@code input} is presumably a stream
	 * over that same file (not verifiable from this class alone).</p>
	 *
	 * @param input counting stream positioned at the start of the section
	 * @param f file from which the packed data is memory-mapped
	 * @throws IllegalFormatException if the type byte is not {@link #TYPE_INDEX}
	 * @throws CRCException if the header checksum does not match
	 * @throws IOException on any I/O failure; the file channel opened by this
	 *         constructor is closed again before the exception propagates
	 */
	public PFCDictionarySectionMap(CountInputStream input, File f) throws IOException {
		this.f = f;
		startOffset=input.getTotalBytes();

		CRCInputStream crcin = new CRCInputStream(input, new CRC8());

		// Read type
		int type = crcin.read();
		if(type!=TYPE_INDEX) {
			throw new IllegalFormatException("Trying to read a DictionarySectionPFC from data that is not of the suitable type");
		}

		// Read vars
		numstrings = (int) VByte.decode(crcin);
		dataSize = VByte.decode(crcin);
		blocksize = (int) VByte.decode(crcin);

		if(!crcin.readCRCAndCheck()) {
			throw new CRCException("CRC Error while reading Dictionary Section Plain Front Coding Header.");
		}

		// Read blocks
		blocks = SequenceFactory.createStream(input, f);

		// The packed data starts right after the block index.
		long base = input.getTotalBytes();
		IOUtil.skip(crcin, dataSize+4); // Including CRC32

		endOffset = input.getTotalBytes();

		// Map packed data. If mapping fails half-way, do not leak the channel.
		ch = new FileInputStream(f).getChannel();
		boolean mapped = false;
		try {
			mapBuffers(base);
			mapped = true;
		} finally {
			if(!mapped) {
				try {
					ch.close();
				} catch (IOException ignored) {
					// The failure that brought us here is the one worth reporting.
				}
			}
		}
	}

	/**
	 * Maps the packed data into buffers covering at most
	 * {@link #BLOCKS_PER_BYTEBUFFER} blocks each and records where each buffer
	 * starts.
	 *
	 * @param base absolute file offset of the first byte of packed data
	 * @throws IOException if a region cannot be mapped
	 */
	private void mapBuffers(long base) throws IOException {
		int block = 0;
		int buffer = 0;
		long numBlocks = blocks.getNumberOfElements();
		long bytePos = 0;
		long numBuffers = 1+numBlocks/BLOCKS_PER_BYTEBUFFER;
		buffers = new ByteBuffer[(int)numBuffers ];
		posFirst = new long[(int)numBuffers];

		// The last entry of 'blocks' is only an end offset, hence numBlocks-1.
		while(block<numBlocks-1) {
			int nextBlock = (int) Math.min(numBlocks-1, block+BLOCKS_PER_BYTEBUFFER);
			long nextBytePos = blocks.get(nextBlock);

			buffers[buffer] = ch.map(MapMode.READ_ONLY, base+bytePos, nextBytePos-bytePos);
			buffers[buffer].order(ByteOrder.LITTLE_ENDIAN);

			posFirst[buffer] = bytePos;

			bytePos = nextBytePos;
			block+=BLOCKS_PER_BYTEBUFFER;
			buffer++;
		}
	}

	/**
	 * Binary search over the first string of every block.
	 *
	 * @param str string to look for
	 * @return the block index if the comparison against a block's first string
	 *         yields equality; otherwise {@code -(insertionPoint + 1)}, or
	 *         {@code -1} if there are no entries in {@link #blocks}
	 */
	protected int locateBlock(CharSequence str) {
		if(blocks.getNumberOfElements()==0) {
			return -1;
		}

		int low = 0;
		int high = (int)blocks.getNumberOfElements()-1;
		int max = high;

		while (low <= high) {
			int mid = low + (high - low)/2;

			int cmp;
			if(mid==max) {
				// Last entry is the end marker: treat it as greater than any string.
				cmp=-1;
			} else {
				int bufferIndex = mid/BLOCKS_PER_BYTEBUFFER;
				ByteBuffer buffer = buffers[bufferIndex];
				cmp = ByteStringUtil.strcmp(str, buffer, (int)(blocks.get(mid)-posFirst[bufferIndex]));
			}
			if (cmp<0) {
				high = mid - 1;
			} else if (cmp > 0) {
				low = mid + 1;
			} else {
				return mid; // key found
			}
		}
		return -(low + 1);  // key not found.
	}

	/**
	 * Finds the identifier of a string.
	 *
	 * @param str string to look for
	 * @return the 1-based identifier of {@code str}, or {@code 0} if it is not
	 *         found or the section has no data
	 */
	@Override
	public int locate(CharSequence str) {
		if(buffers==null || blocks==null) {
			return 0;
		}

		int blocknum = locateBlock(str);
		if(blocknum>=0) {
			// Located exactly: str is the first string of the block.
			return (blocknum*blocksize)+1;
		} else {
			// Not located exactly: search inside the block preceding the insertion point.
			blocknum = -blocknum-2;

			if(blocknum>=0) {
				int idblock = locateInBlock(blocknum, str);

				if(idblock != 0) {
					return (blocknum*blocksize)+idblock+1;
				}
			}
		}

		return 0;
	}

	/**
	 * Sequentially searches one block for a string. The first string of the
	 * block is decoded but not compared; only the strings after it can match.
	 *
	 * @param block index of the block to search
	 * @param str string to look for
	 * @return the 0-based position of {@code str} inside the block (always
	 *         &gt;= 1 for a match), or {@code 0} if it is not found, if
	 *         {@code block} is out of range or has no mapped data, or if
	 *         decoding fails with an {@link IOException}
	 */
	public int locateInBlock(int block, CharSequence str) {
		if(block<0 || block>=blocks.getNumberOfElements()) {
			return 0;
		}

		// The end-marker entry may fall into a slot that was never mapped.
		ByteBuffer mappedRegion = buffers[block/BLOCKS_PER_BYTEBUFFER];
		if(mappedRegion==null) {
			return 0;
		}

		ReplazableString tempString = new ReplazableString();

		int idInBlock = 0;
		int cshared=0;

		// Work on a duplicate so the shared buffer's position is never touched.
		ByteBuffer buffer = mappedRegion.duplicate();
		buffer.position((int)(blocks.get(block)-posFirst[block/BLOCKS_PER_BYTEBUFFER]));

		try {
			if(!buffer.hasRemaining()) {
				return 0;
			}

			// Read the first string in the block
			tempString.replace(buffer, 0);

			idInBlock++;

			while( (idInBlock<blocksize) && buffer.hasRemaining())
			{
				// Decode prefix
				long delta = VByte.decode(buffer);

				//Copy suffix
				tempString.replace(buffer, (int) delta);

				if(delta>=cshared)
				{
					// Current delta value means that this string
					// has a larger long common prefix than the previous one
					cshared += ByteStringUtil.longestCommonPrefix(tempString, str, cshared);

					if((cshared==str.length()) && (tempString.length()==str.length())) {
						return idInBlock;
					}
				} else {
					// We have less common characters than before,
					// this string is bigger that what we are looking for.
					// i.e. Not found.
					return 0;
				}
				idInBlock++;
			}
			return 0;
		} catch (IOException e) {
			// Best effort: report and treat as "not found".
			e.printStackTrace();
			return 0;
		}
	}

	/**
	 * Decodes the string with the given identifier.
	 *
	 * @param id 1-based identifier
	 * @return the string, or {@code null} if {@code id} is outside
	 *         {@code [1, numstrings]}, the section has no data, or decoding
	 *         fails with an {@link IOException}
	 */
	@Override
	public CharSequence extract(int id) {
		if(buffers==null || blocks==null) {
			return null;
		}

		if(id<1 || id>numstrings) {
			return null;
		}

		int block = (id-1)/blocksize;
		ByteBuffer buffer = buffers[block/BLOCKS_PER_BYTEBUFFER].duplicate();
		buffer.position((int)(blocks.get(block)-posFirst[block/BLOCKS_PER_BYTEBUFFER]));

		try {
			// First string of the block is stored in full.
			ReplazableString tempString = new ReplazableString();
			tempString.replace(buffer,0);

			// Apply the following (prefix length, suffix) pairs until the wanted string is rebuilt.
			int stringid = (id-1)%blocksize;
			for(int i=0;i<stringid;i++) {
				long delta = VByte.decode(buffer);
				tempString.replace(buffer, (int) delta);
			}
			return new CompactString(tempString).getDelayed();
		} catch (IOException e) {
			// Best effort: report and signal failure with null.
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * @return the packed data size plus the size reported by the block index
	 */
	@Override
	public long size() {
		return dataSize+blocks.size();
	}

	/**
	 * @return the number of strings in this section
	 */
	@Override
	public int getNumberOfElements() {
		return numstrings;
	}

	/**
	 * Iterates over all strings in storage order by decoding the mapped
	 * buffers sequentially.
	 *
	 * <p>The returned iterator does not support {@code remove()}, and
	 * {@code next()} throws {@link java.util.NoSuchElementException} once all
	 * strings have been returned. An {@link IOException} while decoding is
	 * rethrown wrapped in a {@link RuntimeException}.</p>
	 *
	 * @return iterator over the strings of this section
	 */
	@Override
	public Iterator<CharSequence> getSortedEntries() {
		return new Iterator<CharSequence>() {
			int id;

			ReplazableString tempString = new ReplazableString();
			int bytebufferIndex;
			// Created on first use: an empty section has no mapped buffer at all.
			ByteBuffer buffer;

			@Override
			public boolean hasNext() {
				return id<getNumberOfElements();
			}

			@Override
			public CharSequence next() {
				if(!hasNext()) {
					throw new java.util.NoSuchElementException();
				}
				if(buffer==null) {
					buffer = buffers[0].duplicate();
				}
				if(!buffer.hasRemaining()) {
					// Current region exhausted, continue with the next one.
					buffer = buffers[++bytebufferIndex].duplicate();
					buffer.rewind();
				}
				try {
					if((id%blocksize)==0) {
						// Start of a block: full string.
						tempString.replace(buffer, 0);
					} else {
						// Inside a block: shared prefix length, then suffix.
						long delta = VByte.decode(buffer);
						tempString.replace(buffer, (int) delta);
					}
					id++;
					return new CompactString(tempString).getDelayed();
				} catch (IOException e) {
					throw new RuntimeException(e);
				}
			}

			@Override
			public void remove() {
				throw new UnsupportedOperationException();
			}
		};
	}

	/**
	 * Closes the file channel that was used to create the mappings.
	 *
	 * @throws IOException if closing the channel fails
	 */
	@Override
	public void close() throws IOException {
		ch.close();
	}

	/**
	 * Not supported by this implementation.
	 *
	 * @throws NotImplementedException always
	 */
	@Override
	public void load(TempDictionarySection other, ProgressListener listener) {
		throw new NotImplementedException();
	}

	/**
	 * Writes this section by copying its original bytes (the range recorded
	 * while reading) from the backing file to {@code output}.
	 *
	 * @param output destination stream; it is not closed by this method
	 * @param listener unused
	 * @throws IOException if reading the backing file or writing fails
	 */
	@Override
	public void save(OutputStream output, ProgressListener listener) throws IOException {
		InputStream in = new BufferedInputStream(new FileInputStream(f));
		try {
			IOUtil.skip(in, startOffset);
			IOUtil.copyStream(in, output, endOffset-startOffset);
		} finally {
			// Release the file handle even if skipping or copying fails.
			in.close();
		}
	}

	/**
	 * Not supported by this implementation.
	 *
	 * @throws NotImplementedException always
	 */
	@Override
	public void load(InputStream input, ProgressListener listener)
			throws IOException {
		throw new NotImplementedException();
	}
}