// TokenizeKeysGenerator.java example
//
// Explorer
// blazegraph-master
// - database-master
/*

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Aug 28, 2009
 */

package com.bigdata.btree.raba.codec;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.apache.log4j.Logger;

import com.bigdata.util.BytesUtil;

/**
 * Tokenize an input file, collect the set of distinct keywords, and encode
 * those keywords as unsigned byte[]s.
 * <p>
 * The source is read once, by the constructor, and is closed before the
 * constructor returns. The distinct tokens are then held in memory as
 * {@link #charset}-encoded byte[]s and handed out (as shallow copies of the
 * backing array) by {@link #generateKeys(int)} and
 * {@link #generateValues(int)}.
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public class TokenizeKeysGenerator implements IRabaGenerator {

    protected static final Logger log = Logger
            .getLogger(TokenizeKeysGenerator.class);
    
    /**
     * The encoding used to serialize the term (the value of each tuple). The
     * same encoding is used to decode the source so that the generated data
     * does not depend on the platform default charset.
     */
    public static final transient String charset = "UTF-8";

    /**
     * The encoded distinct tokens, in the iteration order of the set returned
     * by {@link #tokenize(String, Reader)} (i.e., NOT sorted).
     */
    final byte[][] data;

    /**
     * Reads and tokenizes the named source and encodes each distinct token.
     * 
     * @param fileOrResource
     *            The name of a file. If no such file exists, then the name is
     *            resolved as a resource using
     *            {@link Class#getResourceAsStream(String)} on the runtime
     *            class of this object.
     * 
     * @throws RuntimeException
     *             if the file can not be opened, if there is no such
     *             resource, or if the source can not be tokenized or encoded.
     */
    public TokenizeKeysGenerator(String fileOrResource) {
        
        final Reader r = openReader(fileOrResource);
        
        // tokenize.
        final Set<String> tokens;
        try {
            tokens = tokenize(fileOrResource, r);
        } catch (Exception e) {
            throw new RuntimeException(e);
        } finally {
            // Always release the file handle / resource stream.
            closeQuietly(fileOrResource, r);
        }
        
        // encode.
        data = new byte[tokens.size()][];
        int i = 0;
        for(String s : tokens) {
            try {
                data[i++] = s.getBytes(charset);
            } catch (UnsupportedEncodingException e) {
                throw new RuntimeException("Could not encode: " + s
                        + ", charset=" + charset + " : " + e, e);
            }
        }
        
    }

    /**
     * Opens a buffered reader on the file (when it exists) or else on the
     * class path resource. The bytes are decoded using {@link #charset}.
     * <p>
     * Note: This is deliberately an instance method. The resource is resolved
     * against {@link #getClass()}, so a relative resource name is interpreted
     * relative to the runtime class.
     */
    private Reader openReader(final String fileOrResource) {

        final InputStream is;

        if (new File(fileOrResource).exists()) {

            try {
                is = new FileInputStream(fileOrResource);
            } catch (FileNotFoundException e) {
                // e.g., a directory or an unreadable file. Keep the cause.
                throw new RuntimeException("Could not open file: "
                        + fileOrResource, e);
            }

        } else {

            is = getClass().getResourceAsStream(fileOrResource);

            if (is == null) {

                throw new RuntimeException("No such resource: "+fileOrResource);

            }

        }

        try {
            return new BufferedReader(new InputStreamReader(is, charset));
        } catch (UnsupportedEncodingException e) {
            // Do not leak the stream if the reader could not be created.
            closeQuietly(fileOrResource, is);
            throw new RuntimeException("Could not decode: " + fileOrResource
                    + ", charset=" + charset + " : " + e, e);
        }

    }

    /**
     * Closes the source, logging (rather than propagating) any error so that
     * a failure to close never masks the outcome of the read itself.
     */
    private static void closeQuietly(final String fileOrResource,
            final Closeable c) {

        try {
            c.close();
        } catch (IOException e) {
            log.warn("Could not close: src=" + fileOrResource, e);
        }

    }
    
    /**
     * Returns the encoded tokens in unsigned byte[] order.
     * <p>
     * The returned array always has as many elements as there are distinct
     * tokens. Elements at index <code>size</code> and above are set to
     * <code>null</code>; when <code>size</code> is not less than the array
     * length nothing is cleared. The array is a shallow copy: the individual
     * byte[]s are shared with this generator and must not be modified.
     * 
     * @param size
     *            The #of leading (sorted) keys to retain. A negative value
     *            results in an {@link ArrayIndexOutOfBoundsException}.
     * 
     * @return The sorted keys followed by <code>null</code>s.
     * 
     * @todo bias is for the keys which are earliest in the lexical order.
     */
    public byte[][] generateKeys(final int size) {

        // clone so we don't disturb the order when we sort the data.
        final byte[][] a = data.clone();

        // Place the keys into sorted order.
        Arrays.sort(a, BytesUtil.UnsignedByteArrayComparator.INSTANCE);

        // clear unused keys.
        for (int i = size; i < a.length; i++) {

            a[i] = null;

        }
        
        return a;
        
    }

    /**
     * Returns the encoded tokens in their original (unsorted) order.
     * <p>
     * The returned array always has as many elements as there are distinct
     * tokens. Elements at index <code>size</code> and above are set to
     * <code>null</code>; when <code>size</code> is not less than the array
     * length nothing is cleared. The array is a shallow copy: the individual
     * byte[]s are shared with this generator and must not be modified.
     * 
     * @param size
     *            The #of leading values to retain. A negative value results
     *            in an {@link ArrayIndexOutOfBoundsException}.
     * 
     * @return The values followed by <code>null</code>s.
     */
    public byte[][] generateValues(final int size) {

        // clone so we don't disturb the order when we clear unused entries.
        final byte[][] a = data.clone();

        // @todo could also set some percentage of the values to null.
        
        // clear unused values.
        for (int i = size; i < a.length; i++) {

            a[i] = null;

        }
        
        return a;
        
    }

    /**
     * Yes.
     */
    public boolean isKeysGenerator() {
        return true;
    }

    /**
     * Yes.
     */
    public boolean isValuesGenerator() {
        return true;
    }

    /**
     * Collects the distinct word and number tokens reported by a
     * {@link StreamTokenizer} (with its default configuration) over the given
     * reader.
     * <p>
     * Word tokens are added as-is. Number tokens are added using
     * {@link Double#toString(double)} of the parsed value. All other token
     * types reported by the tokenizer are ignored. The reader is consumed to
     * EOF but is NOT closed by this method.
     * <p>
     * Note: This method is invoked from the constructor. An override must not
     * rely on state initialized by a subclass constructor.
     * 
     * @param fileOrResource
     *            The name of the source (used for logging only).
     * @param r
     *            The reader from which the tokens are read.
     * 
     * @return The distinct terms (a new, unordered set).
     * 
     * @throws Exception
     *             if the reader can not be read.
     */
    public Set<String> tokenize(final String fileOrResource, final Reader r)
            throws Exception {

        // the distinct terms.
        final Set<String> terms = new HashSet<String>(10000);

        // the tokenizer.
        final StreamTokenizer tok = new StreamTokenizer(r);

        // #of tokens processed.
        int count = 0;

        int ttype;

        while ((ttype = tok.nextToken()) != StreamTokenizer.TT_EOF) {

            switch (ttype) {

            case StreamTokenizer.TT_NUMBER:

                terms.add(Double.toString(tok.nval));

                count++;

                break;

            case StreamTokenizer.TT_WORD:

                terms.add(tok.sval);

                count++;

                break;

            default:

                // Anything else (EOL, quoted strings, punctuation) is skipped.
                break;

            }

        }

        if (log.isInfoEnabled()) {
            log.info("Tokenized: " + count + " tokens with " + terms.size()
                    + " distinct terms : src=" + fileOrResource);
        }

        return terms;

    }

}