package edu.stanford.nlp.wordseg;
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
/**
 * Membership check against a {@link CorpusDictionary} loaded from a
 * {@code .non} dictionary file.
 *
 * <p>The dictionary is held in a static field, so it is loaded at most once
 * (by the first constructor call that finds it unset) and is then shared by
 * every instance of this class. Later constructor calls neither reload it nor
 * look at their {@code flags} argument.
 *
 * <p>NOTE(review): the lazy initialization of the shared dictionary is not
 * synchronized; confirm that instances are not first constructed concurrently.
 */
public class NonDict2 {

  /**
   * Base directory under which {@code dict/pku.non} and {@code dict/ctb.non}
   * are looked up. Replaced by {@code flags.sighanCorporaDict} when that flag
   * is non-null and the shared dictionary has not been loaded yet.
   */
  public String corporaDict = "/u/nlp/data/gale/segtool/stanford-seg/data/";

  /** Dictionary shared by all instances; {@code null} until the first load. */
  private static CorpusDictionary cd = null;

  private static final Redwood.RedwoodChannels logger = Redwood.channels(NonDict2.class);

  /**
   * Loads the shared dictionary if it has not been loaded yet; otherwise does
   * nothing.
   *
   * <p>On the first load, {@code dict/pku.non} is used when
   * {@code flags.usePk} is set and {@code dict/ctb.non} otherwise, both
   * resolved against {@link #corporaDict}.
   *
   * @param flags segmenter flags selecting the corpus and, optionally, the
   *     dictionary base directory
   * @throws RuntimeException if the dictionary is not loaded yet and any of
   *     {@code flags.useAs}, {@code flags.useHk} or {@code flags.useMsr} is set
   */
  public NonDict2(SeqClassifierFlags flags) {
    if (cd != null) {
      // Already loaded by an earlier instance; flags are intentionally ignored.
      return;
    }

    if (flags.sighanCorporaDict != null) {
      // Use the same flag as for Sighan 2005, but our list is extracted from CTB.
      corporaDict = flags.sighanCorporaDict;
    }

    if (flags.useAs || flags.useHk || flags.useMsr) {
      throw new RuntimeException("only support settings for CTB and PKU now.");
    }

    // Only PKU and CTB remain at this point; CTB is the default.
    String dictFile = flags.usePk ? "/dict/pku.non" : "/dict/ctb.non";
    String path = corporaDict + dictFile;

    cd = new CorpusDictionary(path);

    // Logged only after a successful load, reporting which corpus was chosen.
    logger.info("INFO: flags.usePk=" + (flags.usePk ? "true" : "false")
        + " | building NonDict2 from " + path);
  }

  /**
   * Reports whether the shared dictionary marks {@code c2} with {@code "1"}.
   *
   * @param c2 the string to look up
   * @param flags unused; kept so existing callers continue to compile
   * @return {@code "1"} if the dictionary lookup for {@code c2} yields
   *     {@code "1"}, otherwise {@code "0"}
   */
  public String checkDic(String c2, SeqClassifierFlags flags) {
    // Constant-first comparison: a null lookup result yields "0" instead of an NPE.
    return "1".equals(cd.getW(c2)) ? "1" : "0";
  }
}