package edu.stanford.nlp.wordseg;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.util.logging.Redwood;
/** @author Huihsin Tseng */
class TagAffixDetector {
private static final Redwood.RedwoodChannels logger = Redwood.channels(TagAffixDetector.class);
private static final boolean VERBOSE = false;
private final CorpusChar cc;
private final AffixDictionary aD;
// String sighanCorporaDict = "/u/nlp/data/chinese-segmenter/";
private static final String DEFAULT_CORPORA_DICT = "/u/nlp/data/gale/segtool/stanford-seg/data";
public TagAffixDetector(SeqClassifierFlags flags) {
String corporaDict;
if (flags.sighanCorporaDict != null) {
corporaDict = flags.sighanCorporaDict;
} else {
corporaDict = DEFAULT_CORPORA_DICT;
}
if ( ! corporaDict.isEmpty() && ! corporaDict.endsWith("/")) {
corporaDict = corporaDict + '/';
}
String ccPath;
String adPath;
if (flags.useChPos || flags.useCTBChar2 || flags.usePKChar2) {
// if we're using POS information, override the ccPath
// For now we only have list for CTB and PK
if (flags.useASBCChar2 || flags.useHKChar2 || flags.useMSRChar2) {
throw new RuntimeException("only support settings for CTB and PK now.");
} else if (flags.useCTBChar2) {
ccPath = corporaDict+"dict/character_list";
adPath = corporaDict+"dict/in.ctb";
} else if (flags.usePKChar2) {
ccPath = corporaDict+"dict/pos_open/character_list.pku.utf8";
adPath = corporaDict+"dict/in.pk";
} else {
throw new RuntimeException("none of flags.useXXXChar2 are on");
}
} else {
ccPath = corporaDict+"dict/pos_close/char.ctb.list";
adPath = corporaDict+"dict/in.ctb";
}
if (VERBOSE) {
logger.info("TagAffixDetector: useChPos=" + flags.useChPos +
" | useCTBChar2=" + flags.useCTBChar2 + " | usePKChar2=" + flags.usePKChar2);
logger.info("TagAffixDetector: building TagAffixDetector from " + ccPath + " and " + adPath);
}
cc = new CorpusChar(ccPath);
aD = new AffixDictionary(adPath);
}
String checkDic(String t2, String c2 ) {
if(cc.getTag(t2, c2).equals("1"))
return "1";
return "0";
}
String checkInDic(String c2 ){
if(aD.getInDict(c2).equals("1"))
return "1";
return "0";
}
}