package com.transmem.nlp;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
/**
* Implements the ISegmenter interface to break a Thai language sentence into separate words.
* This class relies on the Lucene analyzer/tokenizer and Snowball stemmer to break the sentence.
*/
public class ThaiSegmenter implements ISegmenter
{
    /** Logger named after this class; used to report tokenization failures. */
    public static final Logger log_ = Logger.getLogger(ThaiSegmenter.class.getName());

    /**
     * Creates a segmenter. No state is kept between calls; a new analyzer is
     * built for every call to {@link #segment(String)}.
     */
    public ThaiSegmenter()
    {
    }

    /**
     * Splits a sentence into tokens by running it through a {@link ThaiAnalyzer}.
     *
     * @param sent the sentence to segment; {@code null} is treated like an empty sentence
     * @return the term text of each token, in the order the analyzer produced them;
     *         an empty array when {@code sent} is {@code null} or yields no tokens
     * @throws LanguageException if the analyzer or its token stream throws; the message
     *         is the string form of the underlying exception
     */
    public String[] segment(String sent) throws LanguageException
    {
        // StringReader rejects null with a NullPointerException, so guard here and
        // return the same result an empty sentence would give.
        if (sent == null)
        {
            return new String[0];
        }

        Reader reader = new StringReader(sent);
        TokenStream ts = null;
        try
        {
            Analyzer analyzer = new ThaiAnalyzer();
            // The text is not tied to any index field, so an empty field name is passed.
            ts = analyzer.tokenStream("", reader);

            ArrayList<String> tlist = new ArrayList<String>();
            Token t;
            // next() returns null once the stream is exhausted.
            while ((t = ts.next()) != null)
            {
                tlist.add(t.termText());
            }
            return tlist.toArray(new String[tlist.size()]);
        }
        catch (Exception e)
        {
            // Attach the throwable so the stack trace reaches the log; the rethrown
            // exception only carries the string form.
            log_.log(Level.SEVERE, e.toString(), e);
            throw new LanguageException(e.toString());
        }
        finally
        {
            if (ts != null)
            {
                try
                {
                    ts.close();
                }
                catch (Exception closeError)
                {
                    // Best effort: a close failure must not mask the result or the
                    // original exception, so it is only logged.
                    log_.log(Level.WARNING, closeError.toString(), closeError);
                }
            }
        }
    }
}