package ivory.core.tokenize;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

/**
 * Tokenizer that splits text on whitespace and emits all overlapping
 * character bigrams within each whitespace-delimited chunk. Intended for
 * Chinese text, where character bigrams are a common segmentation-free
 * indexing unit; bigrams never span whitespace boundaries.
 */
public class BigramChineseTokenizer extends Tokenizer {
  private static final Logger LOG = Logger.getLogger(BigramChineseTokenizer.class);
  static {
    LOG.setLevel(Level.INFO);
  }

  public BigramChineseTokenizer() {
    super();
  }

  /** No-op: this tokenizer needs no configuration (required by {@link Tokenizer}). */
  @Override
  public void configure(Configuration conf) {
  }

  /** No-op: this tokenizer needs no configuration or filesystem access. */
  @Override
  public void configure(Configuration conf, FileSystem fs) {
  }

  /**
   * Splits {@code text} on runs of whitespace, lower-cases each chunk, and
   * returns every pair of adjacent characters (bigram) within each chunk,
   * in order of appearance. Chunks shorter than two characters contribute
   * no tokens.
   *
   * @param text raw input text
   * @return array of character bigrams
   */
  @Override
  public String[] processContent(String text) {
    List<String> tokens = new ArrayList<String>();
    for (String chunk : text.split("\\s+")) {
      // Locale.ROOT gives locale-independent casing: the default-locale
      // toLowerCase() mangles Latin characters under e.g. a Turkish locale
      // ('I' -> dotless 'i'), which would corrupt bigrams of mixed-in Latin text.
      chunk = chunk.toLowerCase(Locale.ROOT);
      for (int i = 1; i < chunk.length(); i++) {
        tokens.add(chunk.substring(i - 1, i + 1));
      }
    }
    return tokens.toArray(new String[tokens.size()]);
  }

  /** Identity: border stop-word removal does not apply to bigram tokens. */
  @Override
  public String removeBorderStopWords(String tokenizedText) {
    return tokenizedText;
  }
}