package com.transmem.nlp; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.cjk.CJKAnalyzer; import java.io.Reader; import java.io.StringReader; import java.util.ArrayList; import java.util.logging.Logger; /** * Implements the ISegmenter interface to break a Japanese sentence into separate words. * It is based on the lucene Paoding segmentation program. It needs a list of dictionary files * which must be placed in the dic folder under the current package. */ public class JapaneseSegmenter implements ISegmenter { public static final Logger log_ = Logger.getLogger(JapaneseSegmenter.class.getName()); public String[] segment(String sent) throws LanguageException { String[] tokens = null; Reader reader = new StringReader(sent); try { //TODO: CJKAnalyzer cannot break Japanese characters into meaningful words! Analyzer analyzer = new CJKAnalyzer(); TokenStream ts = analyzer.tokenStream("", reader); Token t; ArrayList<String> tlist = new ArrayList<String>(); while ((t = ts.next())!=null) { //System.out.println(t); tlist.add( t.termText() ); } tokens = tlist.toArray(new String[tlist.size()]); } catch (Exception e) { log_.severe(e.toString()); throw new LanguageException(e.toString()); } return tokens; } }