/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Aug 28, 2009
*/
package com.bigdata.btree.raba.codec;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.apache.log4j.Logger;

import com.bigdata.util.BytesUtil;
/**
* Tokenize an input file, collect the set of distinct keywords, and encode
* those keywords as unsigned byte[]s.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
public class TokenizeKeysGenerator implements IRabaGenerator {

    protected static final Logger log = Logger
            .getLogger(TokenizeKeysGenerator.class);

    /**
     * The encoding used to serialize the term (the value of each tuple).
     * <p>
     * Note: the meaningless <code>transient</code> modifier was dropped from
     * this declaration (transient has no effect on a static field).
     */
    public static final String charset = "UTF-8";

    /**
     * The distinct tokens encoded as byte[]s. The order is whatever the
     * backing {@link HashSet} iteration produced (effectively unordered).
     */
    final byte[][] data;

    /**
     * Tokenize the input, collect the set of distinct tokens, and encode each
     * token as a byte[] using {@link #charset}.
     * 
     * @param fileOrResource
     *            The name of a file (checked first) or, if no such file
     *            exists, a resource on the classpath.
     * 
     * @throws RuntimeException
     *             if the file/resource could not be located, opened, read, or
     *             its tokens encoded.
     */
    public TokenizeKeysGenerator(final String fileOrResource) {

        // Open either the file or (failing that) the classpath resource.
        final Reader r;
        if (new File(fileOrResource).exists()) {
            try {
                r = new BufferedReader(new FileReader(fileOrResource));
            } catch (FileNotFoundException e) {
                // Preserve the cause so the underlying error is not lost.
                throw new RuntimeException("Could not open file: "
                        + fileOrResource, e);
            }
        } else {
            final InputStream is = getClass().getResourceAsStream(
                    fileOrResource);
            if (is == null) {
                throw new RuntimeException("No such resource: "+fileOrResource);
            }
            try {
                /*
                 * Use the declared charset rather than the platform default so
                 * decoding is consistent with the encoding applied below.
                 */
                r = new BufferedReader(new InputStreamReader(is, charset));
            } catch (UnsupportedEncodingException e) {
                throw new RuntimeException("Bad charset: " + charset, e);
            }
        }

        // tokenize, making sure the reader is closed when we are done.
        final Set<String> tokens;
        try {
            tokens = tokenize(fileOrResource, r);
        } catch (Exception e) {
            throw new RuntimeException(e);
        } finally {
            try {
                r.close();
            } catch (IOException e) {
                // Close is best effort; log and continue.
                log.warn("Could not close: " + fileOrResource, e);
            }
        }

        // encode each distinct token as an unsigned byte[].
        data = new byte[tokens.size()][];
        int i = 0;
        for (String s : tokens) {
            try {
                data[i++] = s.getBytes(charset);
            } catch (UnsupportedEncodingException e) {
                throw new RuntimeException("Could not encode: " + s
                        + ", charset=" + charset + " : " + e, e);
            }
        }

    }

    /**
     * Return the encoded tokens in sorted (unsigned byte[]) order with all
     * positions GTE <i>size</i> cleared to <code>null</code>.
     * <p>
     * Note: the returned array always has {@link #data}<code>.length</code>
     * elements, not <i>size</i>; callers are expected to consume only the
     * first <i>size</i> positions.
     * 
     * @todo bias is for the keys which are earliest in the lexical order.
     * 
     * @param size
     *            The #of keys to retain (leading positions of the sorted
     *            order).
     */
    public byte[][] generateKeys(final int size) {

        /*
         * Clone so we don't disturb the order when we sort the data. The
         * byte[] elements themselves are shared with [data] but never
         * modified.
         */
        final byte[][] a = data.clone();

        // Place the keys into sorted order.
        Arrays.sort(a, BytesUtil.UnsignedByteArrayComparator.INSTANCE);

        // clear unused keys.
        for (int i = size; i < a.length; i++) {

            a[i] = null;

        }

        return a;

    }

    /**
     * Return the encoded tokens (in their original, unordered sequence) with
     * all positions GTE <i>size</i> cleared to <code>null</code>.
     * <p>
     * Note: the returned array always has {@link #data}<code>.length</code>
     * elements, not <i>size</i>.
     * 
     * @param size
     *            The #of values to retain.
     */
    public byte[][] generateValues(final int size) {

        // clone so we don't disturb the order when we clear unused entries.
        final byte[][] a = data.clone();

        // @todo could also set some percentage of the values to null.

        // clear unused values.
        for (int i = size; i < a.length; i++) {

            a[i] = null;

        }

        return a;

    }

    /**
     * Yes.
     */
    public boolean isKeysGenerator() {

        return true;

    }

    /**
     * Yes.
     */
    public boolean isValuesGenerator() {

        return true;

    }

    /**
     * Tokenize the data read from the {@link Reader}, returning the set of
     * distinct tokens. Numeric tokens are normalized via
     * {@link Double#toString(double)}; word tokens are added as reported by
     * the {@link StreamTokenizer}.
     * <p>
     * Note: the caller is responsible for closing the {@link Reader}.
     * 
     * @param fileOrResource
     *            The source name (used only in the log message).
     * @param r
     *            The source of the data to be tokenized.
     * 
     * @return The distinct tokens.
     * 
     * @throws Exception
     *             if there is a problem reading/tokenizing the data.
     */
    public Set<String> tokenize(final String fileOrResource, final Reader r)
            throws Exception {

        // the distinct terms.
        final Set<String> terms = new HashSet<String>(10000);

        // the tokenizer.
        final StreamTokenizer tok = new StreamTokenizer(r);

        // #of tokens processed (including duplicates).
        int count = 0;

        boolean done = false;
        while (!done) {

            final int ttype = tok.nextToken();

            switch (ttype) {

            case StreamTokenizer.TT_EOF:
                done = true;
                break;

            case StreamTokenizer.TT_NUMBER: {
                final double d = tok.nval;
                terms.add(Double.toString(d));
                count++;
                break;
            }

            case StreamTokenizer.TT_WORD: {
                terms.add(tok.sval);
                count++;
                break;
            }

            }

        }

        if (log.isInfoEnabled()) {

            log.info("Tokenized: " + count + " tokens with " + terms.size()
                    + " distinct terms : src=" + fileOrResource);

        }

        return terms;

    }

}