package edu.stanford.nlp.wordseg.demo;
import java.io.*;
import java.util.List;
import java.util.Properties;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;
/** This is a very simple demo of calling the Chinese Word Segmenter
* programmatically. It assumes an input file in UTF8.
* <p>
* <code>
* Usage: java -mx1g -cp seg.jar edu.stanford.nlp.wordseg.demo.SegDemo fileName
* </code>
* This will run correctly in the distribution home directory. To
* run in general, the properties for where to find dictionaries or
* normalizations have to be set.
*
* @author Christopher Manning
*/
public class SegDemo {

  /**
   * Directory that holds the segmenter data files. Read from the
   * {@code SegDemo} system property, falling back to {@code "data"}.
   */
  private static final String DATA_DIR = System.getProperty("SegDemo", "data");

  /**
   * Runs the demo: every command-line argument is passed to the segmenter
   * as a file name to classify, and afterwards a fixed sample sentence is
   * segmented and printed to standard output.
   *
   * @param args names of the files to segment; may be empty
   * @throws Exception if anything goes wrong while setting up the output
   *     stream or running the segmenter
   */
  public static void main(String[] args) throws Exception {
    // Re-wrap stdout so that the output is written as UTF-8 with auto-flush.
    PrintStream utf8Out = new PrintStream(System.out, true, "utf-8");
    System.setOut(utf8Out);

    // The same Properties instance is handed to both the constructor and the loader.
    Properties config = buildProperties(args);
    CRFClassifier<CoreLabel> classifier = new CRFClassifier<>(config);
    String modelPath = DATA_DIR + "/ctb.gz";
    classifier.loadClassifierNoExceptions(modelPath, config);

    for (int i = 0; i < args.length; i++) {
      classifier.classifyAndWriteAnswers(args[i]);
    }

    String sentence = "我住在美国。";
    List<String> words = classifier.segmentString(sentence);
    System.out.println(words);
  }

  /**
   * Assembles the properties that are passed to the segmenter.
   *
   * @param args the command-line arguments; when at least one is present,
   *     the first is recorded under the {@code testFile} property
   * @return a freshly populated {@link Properties} object
   */
  private static Properties buildProperties(String[] args) {
    Properties config = new Properties();
    config.setProperty("sighanCorporaDict", DATA_DIR);
    // Optional settings, left disabled in this demo:
    //   config.setProperty("NormalizationTable", "data/norm.simp.utf8");
    //   config.setProperty("normTableEncoding", "UTF-8");
    // serDictionary is needed because CTBSegDocumentIteratorFactory accesses it.
    config.setProperty("serDictionary", DATA_DIR + "/dict-chris6.ser.gz");
    if (args.length > 0) {
      config.setProperty("testFile", args[0]);
    }
    config.setProperty("inputEncoding", "UTF-8");
    config.setProperty("sighanPostProcessing", "true");
    return config;
  }

}