package edu.stanford.nlp.wordseg;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.Serializable;
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.sequences.FeatureFactory;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.sequences.Clique;
import edu.stanford.nlp.trees.international.pennchinese.RadicalMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PaddedList;
/**
* A Chinese segmenter Feature Factory for GALE project. (modified from Sighan Bakeoff 2005.)
* This is supposed to have all the good closed-track features from Sighan bakeoff 2005,
* and some other "open-track" features
*
* This will also be used to do a character-based chunking!
* <p>
* c is Chinese character ("char"). c means current, n means next and p means previous.
* </p>
*
* <table>
* <tr>
* <th>Feature</th><th>Templates</th>
* </tr>
 * <tr>
* <th></th><th>Current position clique</th>
* </tr>
* <tr>
 * <td>useWord1</td><td>CONSTANT, cc, nc, pc, pc+cc; if (As|Msr|Pk|Hk): cc+nc, pc+nc</td>
* </tr>
* </table>
*
* @author Huihsin Tseng
* @author Pichuan Chang
*/
public class ChineseSegmenterFeatureFactory<IN extends CoreLabel> extends FeatureFactory<IN> implements Serializable {
/**
*
*/
private static final long serialVersionUID = 3387166382968763350L;
private static TagAffixDetector taDetector = null;
private static Redwood.RedwoodChannels logger = Redwood.channels(ChineseSegmenterFeatureFactory.class);
public void init(SeqClassifierFlags flags) {
super.init(flags);
}
/**
* Extracts all the features from the input data at a certain index.
*
* @param cInfo The complete data set as a List of WordInfo
* @param loc The index at which to extract features.
*/
public Collection<String> getCliqueFeatures(PaddedList<IN> cInfo, int loc, Clique clique) {
Collection<String> features = Generics.newHashSet();
if (clique == cliqueC) {
addAllInterningAndSuffixing(features, featuresC(cInfo, loc), "C");
} else if (clique == cliqueCpC) {
addAllInterningAndSuffixing(features, featuresCpC(cInfo, loc), "CpC");
addAllInterningAndSuffixing(features, featuresCnC(cInfo, loc-1), "CnC");
}
// else if (clique == cliqueCpCp2C) {
// addAllInterningAndSuffixing(features, featuresCpCp2C(cInfo, loc), "CpCp2C");
// } else if (clique == cliqueCpCp2Cp3C) {
// addAllInterningAndSuffixing(features, featuresCpCp2Cp3C(cInfo, loc), "CpCp2Cp3C");
// } else if (clique == cliqueCpCp2Cp3Cp4C) {
// addAllInterningAndSuffixing(features, featuresCpCp2Cp3Cp4C(cInfo, loc), "CpCp2Cp3Cp4C");
// } else if (clique == cliqueCpCp2Cp3Cp4Cp5C) {
// addAllInterningAndSuffixing(features, featuresCpCp2Cp3Cp4Cp5C(cInfo, loc), "CpCp2Cp3Cp4Cp5C");
// }
return features;
}
private static Pattern patE = Pattern.compile("[a-z]");
private static Pattern patEC = Pattern.compile("[A-Z]");
private static String isEnglish(String Ep, String Ec) {
String chp = Ep;
String chc = Ec;
Matcher mp = patE.matcher(chp); // previous char is [a-z]
Matcher mc = patE.matcher(chc); // current char is [a-z]
Matcher mpC = patEC.matcher(chp); // previous char is [A-Z]
Matcher mcC = patEC.matcher(chc); // current char is [A-Z]
if (mp.matches() && mcC.matches()){
return "BND"; // [a-z][A-Z]
} else if (mp.matches() && mc.matches()){
return "ENG"; // [a-z][a-z]
} else if (mpC.matches() && mcC.matches()){
return "BCC"; // [A-Z][A-Z]
} else if (mp.matches() && !mc.matches() && !mcC.matches()){
return "e1"; // [a-z][^A-Za-z]
} else if (mc.matches() && !mp.matches() && !mpC.matches()) {
return "e2"; // [^A-Za-z][a-z]
} else if (mpC.matches() && !mc.matches() && !mcC.matches()){
return "e3"; // [A-Z][^A-Za-z]
} else if (mcC.matches() && !mp.matches() && !mpC.matches()) {
return "e4"; // [^A-Za-z][A-Z]
} else {
return "";
}
}//is English
private static Pattern patP = Pattern.compile("[\u00b7\\-\\.]");
private static String isEngPU(String Ep) {
Matcher mp = patP.matcher(Ep);
if (mp.matches()){
return "1:EngPU";
} else {
return "";
}
}//is EnglishPU
public Collection<String> featuresC(PaddedList<IN> cInfo, int loc) {
Collection<String> features = new ArrayList<>();
CoreLabel c = cInfo.get(loc);
CoreLabel c1 = cInfo.get(loc + 1);
CoreLabel c2 = cInfo.get(loc + 2);
CoreLabel c3 = cInfo.get(loc + 3);
CoreLabel p = cInfo.get(loc - 1);
CoreLabel p2 = cInfo.get(loc - 2);
CoreLabel p3 = cInfo.get(loc - 3);
String charc = c.get(CoreAnnotations.CharAnnotation.class);
String charc1 = c1.get(CoreAnnotations.CharAnnotation.class);
String charc2 = c2.get(CoreAnnotations.CharAnnotation.class);
String charc3 = c3.get(CoreAnnotations.CharAnnotation.class);
String charp = p.get(CoreAnnotations.CharAnnotation.class);
String charp2 = p2.get(CoreAnnotations.CharAnnotation.class);
String charp3 = p3.get(CoreAnnotations.CharAnnotation.class);
/**
* N-gram features. N is upto 2.
*/
if (flags.useWord1) {
// features.add(charc +"c");
// features.add(charc1+"c1");
// features.add(charp +"p");
// features.add(charp +charc +"pc");
// if(flags.useAs || flags.useMsr || flags.usePk || flags.useHk){ //msr, as
// features.add(charc +charc1 +"cc1");
// features.add(charp + charc1 +"pc1");
// }
features.add(charc +"::c");
features.add(charc1+"::c1");
features.add(charp +"::p");
features.add(charp2 +"::p2");
// trying to restore the features that Huishin described in SIGHAN 2005 paper
features.add(charc +charc1 +"::cn");
features.add(charp +charc +"::pc");
features.add(charp +charc1 +"::pn");
features.add(charp2 +charp +"::p2p");
features.add(charp2 +charc +"::p2c");
features.add(charc2 +charc +"::n2c");
features.add("|word1");
}
return features;
}
private static CorpusDictionary outDict = null;
public Collection<String> featuresCpC(PaddedList<IN> cInfo, int loc) {
Collection<String> features = new ArrayList<>();
CoreLabel c = cInfo.get(loc);
CoreLabel c1 = cInfo.get(loc + 1);
CoreLabel c2 = cInfo.get(loc + 2);
CoreLabel c3 = cInfo.get(loc + 3);
CoreLabel p = cInfo.get(loc - 1);
CoreLabel p2 = cInfo.get(loc - 2);
CoreLabel p3 = cInfo.get(loc - 3);
String charc = c.get(CoreAnnotations.CharAnnotation.class);
if (charc == null) charc = "";
String charc1 = c1.get(CoreAnnotations.CharAnnotation.class);
if (charc1 == null) charc1 = "";
String charc2 = c2.get(CoreAnnotations.CharAnnotation.class);
if (charc2 == null) charc2 = "";
String charc3 = c3.get(CoreAnnotations.CharAnnotation.class);
if (charc3 == null) charc3 = "";
String charp = p.get(CoreAnnotations.CharAnnotation.class);
if (charp == null) charp = "";
String charp2 = p2.get(CoreAnnotations.CharAnnotation.class);
if (charp2 == null) charp2 = "";
String charp3 = p3.get(CoreAnnotations.CharAnnotation.class);
if (charp3 == null) charp3 = "";
/*
* N-gram features. N is upto 2.
*/
if (flags.useWord2) {
// features.add(charc +"c");
// features.add(charc1+"c1");
// features.add(charp +"p");
// features.add(charp +charc +"pc");
// if( flags.useMsr ){
// features.add(charc +charc1 +"cc1");
// features.add(charp + charc1 +"pc1");
// }
features.add(charc +"::c");
features.add(charc1+"::c1");
features.add(charp +"::p");
features.add(charp2 +"::p2");
// trying to restore the features that Huishin described in SIGHAN 2005 paper
features.add(charc +charc1 +"::cn");
features.add(charp +charc +"::pc");
features.add(charp +charc1 +"::pn");
features.add(charp2 +charp +"::p2p");
features.add(charp2 +charc +"::p2c");
features.add(charc2 +charc +"::n2c");
features.add("|word2");
}
/*
Radical N-gram features. N is upto 4.
Smoothing method of N-gram, because there are too many characters in Chinese.
(It works better than N-gram when they are used individually. less sparse)
*/
char rcharc, rcharc1,rcharc2, rcharc3, rcharp, rcharp1,rcharp2,rcharp3;
if (charc.length()==0) { rcharc='n'; } else { rcharc=RadicalMap.getRadical(charc.charAt(0));}
if (charc1.length()==0) { rcharc1='n'; } else { rcharc1=RadicalMap.getRadical(charc1.charAt(0));}
if (charc2.length()==0) { rcharc2='n'; } else { rcharc2=RadicalMap.getRadical(charc2.charAt(0));}
if (charc3.length()==0) { rcharc3='n'; } else { rcharc3=RadicalMap.getRadical(charc3.charAt(0));}
if (charp.length()==0) { rcharp='n'; } else { rcharp=RadicalMap.getRadical(charp.charAt(0));}
if (charp2.length()==0) { rcharp2='n'; } else { rcharp2=RadicalMap.getRadical(charp2.charAt(0));}
if (charp3.length()==0) { rcharp3='n'; } else { rcharp3=RadicalMap.getRadical(charp3.charAt(0));}
if(flags.useRad2){
features.add(rcharc+"rc");
features.add(rcharc1+"rc1");
features.add(rcharp+"rp");
features.add(rcharp + rcharc+"rpc");
features.add(rcharc +rcharc1 +"rcc1");
features.add(rcharp + rcharc +rcharc1 +"rpcc1");
features.add("|rad2");
}
/* non-word dictionary:SEEM bi-gram marked as non-word */
if (flags.useDict2) {
NonDict2 nd = new NonDict2(flags);
features.add(nd.checkDic(charp+charc, flags)+"nondict");
features.add("|useDict2");
}
if (flags.useOutDict2){
if (outDict == null) {
logger.info("reading "+flags.outDict2+" as a seen lexicon");
outDict = new CorpusDictionary(flags.outDict2, true);
}
features.add(outDict.getW(charp+charc)+"outdict"); // -1 0
features.add(outDict.getW(charc+charc1)+"outdict"); // 0 1
features.add(outDict.getW(charp2+charp)+"outdict"); // -2 -1
features.add(outDict.getW(charp2+charp+charc)+"outdict"); // -2 -1 0
features.add(outDict.getW(charp3+charp2+charp)+"outdict"); // -3 -2 -1
features.add(outDict.getW(charp+charc+charc1)+"outdict"); // -1 0 1
features.add(outDict.getW(charc+charc1+charc2)+"outdict"); // 0 1 2
features.add(outDict.getW(charp+charc+charc1+charc2)+"outdict"); // -1 0 1 2
}
/*
(CTB/ASBC/HK/PK/MSR) POS information of each characters.
If a character falls into some function categories,
it is very likely there is a boundary.
A lot of Chinese function words belong to single characters.
This feature is also good for numbers and punctuations.
DE* are grouped into DE.
*/
if (flags.useCTBChar2 || flags.useASBCChar2 || flags.useHKChar2
|| flags.usePKChar2 || flags.useMSRChar2) {
String[] tagsets;
// the "useChPos" now only works for CTB and PK
if (flags.useChPos) {
if(flags.useCTBChar2) {
tagsets = new String[]{"AD", "AS", "BA", "CC", "CD", "CS", "DE", "DT", "ETC", "IJ", "JJ", "LB", "LC", "M", "NN", "NR", "NT", "OD", "P", "PN", "PU", "SB", "SP", "VA", "VC", "VE", "VV" };
} else if (flags.usePKChar2) {
//tagsets = new String[]{"r", "j", "t", "a", "nz", "l", "vn", "i", "m", "ns", "nr", "v", "n", "q", "Ng", "b", "d", "nt"};
tagsets = new String[]{"2","3","4"};
} else {
throw new RuntimeException("only support settings for CTB and PK now.");
}
} else {
//logger.info("Using Derived features");
tagsets = new String[]{"2","3","4"};
}
if (taDetector == null) {
taDetector = new TagAffixDetector(flags);
}
for (String tagset : tagsets) {
features.add(taDetector.checkDic(tagset + "p", charp) + taDetector.checkDic(tagset + "i", charp) + taDetector.checkDic(tagset + "s", charc) + taDetector.checkInDic(charp) + taDetector.checkInDic(charc) + tagset + "prep-sufc");
// features.add("|ctbchar2"); // Added a constant feature several times!!
}
}
/*
In error analysis, we found English words and numbers are often separated.
Rule 1: isNumber feature: check if the current and previous char is a number.
Rule 2: Disambiguation of time point and time duration.
Rule 3: isEnglish feature: check if the current and previous character is an english letter.
Rule 4: English name feature: check if the current char is a conjunct pu for English first and last name, since there is no space between two names.
Most of PUs are a good indicator for word boundary, but - and . is a strong indicator that there is no boundry within a previous , a follow char and it.
*/
if (flags.useRule2) {
/* Reduplication features */
// previous character == current character
if(charp.equals(charc)){ features.add("11");}
// previous character == next character
if(charp.equals(charc1)){ features.add("22");}
// current character == next next character
// fire only when usePk and useHk are both false.
// Notice: this should be (almost) the same as the "22" feature, but we keep it for now.
if( !flags.usePk && !flags.useHk) {
if(charc.equals(charc2)){features.add("33");}
}
char cur1 = ' ';
char cur2 = ' ';
char cur = ' ';
char pre = ' ';
// actually their length must be either 0 or 1
if (charc1.length() > 0) { cur1 = charc1.charAt(0); }
if (charc2.length() > 0) { cur2 = charc2.charAt(0); }
if (charc.length() > 0) { cur = charc.charAt(0); }
if (charp.length() > 0) { pre = charp.charAt(0); }
String prer= String.valueOf(rcharp); // the radical of previous character
Pattern E = Pattern.compile("[a-zA-Z]");
Pattern N = Pattern.compile("[0-9]");
Matcher m = E.matcher(charp);
Matcher ce = E.matcher(charc);
Matcher pe = E.matcher(charp2);
Matcher cn = N.matcher(charc);
Matcher pn = N.matcher(charp2);
// if current and previous characters are numbers...
if (cur >= '0' && cur <= '9'&& pre >= '0' && pre <= '9'){
if (cur == '9' && pre == '1' && cur1 == '9'&& cur2 >= '0' && cur2 <= '9'){ //199x
features.add("YR");
}else{
features.add("2N");
}
// if current and previous characters are not both numbers
// but previous char is a number
// i.e. patterns like "1N" , "2A", etc
} else if (pre >= '0' && pre <= '9'){
features.add("1N");
// if previous character is an English character
} else if(m.matches()){
features.add("E");
// if the previous character contains no radical (and it exist)
} else if(prer.equals(".") && charp.length() == 1){
// fire only when usePk and useHk are both false. Not sure why. -pichuan
if(!flags.useHk && !flags.usePk ){
if(ce.matches()){
features.add("PU+E");
}
if(pe.matches()){
features.add("E+PU");
}
if(cn.matches()){
features.add("PU+N");
}
if(pn.matches()){
features.add("N+PU");
}
}
features.add("PU");
}
String engType = isEnglish(charp, charc);
String engPU = isEngPU(charp);
if ( ! engType.equals(""))
features.add(engType);
if ( ! engPU.equals("") && ! engType.equals(""))
features.add(engPU + engType);
}//end of use rule
// features using "Character.getType" information!
String origS = c.get(CoreAnnotations.OriginalCharAnnotation.class);
char origC = ' ';
if (origS.length() > 0) { origC = origS.charAt(0); }
int type = Character.getType(origC);
switch (type) {
case Character.UPPERCASE_LETTER: // A-Z and full-width A-Z
case Character.LOWERCASE_LETTER: // a-z and full-width a-z
features.add("CHARTYPE-LETTER");
break;
case Character.DECIMAL_DIGIT_NUMBER:
features.add("CHARTYPE-DECIMAL_DIGIT_NUMBER");
break;
case Character.OTHER_LETTER: // mostly chinese chars
features.add("CHARTYPE-OTHER_LETTER");
break;
default: // other types
features.add("CHARTYPE-MISC");
}
return features;
}
public Collection<String> featuresCnC(PaddedList<IN> cInfo, int loc) {
Collection<String> features = new ArrayList<>();
CoreLabel c = cInfo.get(loc);
CoreLabel c1 = cInfo.get(loc + 1);
CoreLabel p = cInfo.get(loc - 1);
String charc = c.get(CoreAnnotations.CharAnnotation.class);
String charc1 = c1.get(CoreAnnotations.CharAnnotation.class);
String charp = p.get(CoreAnnotations.CharAnnotation.class);
if (flags.useWordn) {
features.add(charc +"c");
features.add(charc1+"c1");
features.add(charp +"p");
features.add(charp +charc +"pc");
if(flags.useAs || flags.useMsr||flags.usePk||flags.useHk){
features.add(charc +charc1 +"cc1");
features.add(charp + charc1 +"pc1");
}
features.add("|wordn");
}
return features;
}//end of CnC
}//end of Class