package edu.stanford.nlp.wordseg;

import java.util.ArrayList;
import java.util.Collection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.sequences.Clique;
import edu.stanford.nlp.sequences.FeatureFactory;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.trees.international.pennchinese.RadicalMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PaddedList;

/**
 * A Chinese segmenter Feature Factory for the GALE project.
 * (Modified from the feature factory for Sighan Bakeoff 2005.)
 * <p>
 * c is Chinese character ("char"). c means current, n means next and p means previous.
 * In the code, {@code c2}/{@code c3} are the characters one and two positions to the
 * right of the current one, and {@code p}/{@code p2}/{@code p3} are one, two and three
 * positions to the left.
 * </p>
 *
 * <table>
 * <tr>
 * <th>Feature</th><th>Templates</th>
 * </tr>
 * <tr>
 * <th></th><th>Current position clique</th>
 * </tr>
 * <tr>
 * <td>useWord1</td><td>CONSTANT, cc, nc, pc, pc+cc, if (As|Msr|Pk|Hk) cc+nc, pc,nc </td>
 * </tr>
 * </table>
 *
 * <p>
 * Every feature is a plain string built by concatenating observed values with a fixed
 * suffix. Trained models are keyed on these exact strings, so the suffixes (and a few
 * historical quirks that are called out inline) must not be "cleaned up".
 * </p>
 *
 * @author Huihsin Tseng
 * @author Pichuan Chang
 * @author Christopher Manning
 */
public class Gale2007ChineseSegmenterFeatureFactory<IN extends CoreLabel> extends FeatureFactory<IN> {

  private static final int DEBUG = 0;

  private static final Redwood.RedwoodChannels logger = Redwood.channels(Gale2007ChineseSegmenterFeatureFactory.class);

  private static final Pattern patE = Pattern.compile("[a-z]");
  private static final Pattern patEC = Pattern.compile("[A-Z]");

  // The pattern used to be [\u00b7\\-\\.] which AFAICS matched only . because - wasn't escaped. CDM Nov 2007
  private static final Pattern patP = Pattern.compile("[-\u00b7.]");

  // Used by the Rule 2 features; compiled once instead of on every featuresCpC call.
  private static final Pattern patLetter = Pattern.compile("[a-zA-Z]");
  private static final Pattern patDigit = Pattern.compile("[0-9]");

  /** Tag prefixes looked up in the affix detector when useChPos and useCTBChar2 are both set. */
  private static final String[] CTB_TAGS = {
      "AD", "AS", "BA", "CC", "CD", "CS", "DE", "DT", "ETC", "IJ", "JJ", "LB", "LC", "M",
      "NN", "NR", "NT", "OD", "P", "PN", "PU", "SB", "SP", "VA", "VC", "VE", "VV" };

  /** Tag prefixes used in every other configuration (derived features, and PK with useChPos). */
  private static final String[] DERIVED_TAGS = { "2", "3", "4" };

  // Both resources are created lazily. They are null-checked outside the synchronized
  // creator methods, so they are volatile to make the created object safely visible to
  // threads that take the unsynchronized fast path.
  private transient volatile TagAffixDetector taDetector; // = null;
  private transient volatile CorpusDictionary outDict; // = null;


  @Override
  public void init(SeqClassifierFlags flags) {
    super.init(flags);
  }

  /** Creates the tag affix detector from the current flags unless it already exists. */
  private synchronized void createTADetector() {
    if (taDetector == null) {
      taDetector = new TagAffixDetector(flags);
    }
  }

  /** Loads the dictionary named by {@code flags.outDict2} unless it is already loaded. */
  private synchronized void createOutDict() {
    if (outDict == null) {
      logger.info("reading "+flags.outDict2+" as a seen lexicon");
      outDict = new CorpusDictionary(flags.outDict2);
    }
  }


  /**
   * Extracts all the features from the input data at a certain index.
   *
   * @param cInfo The complete data set as a padded list of labels, one per character
   * @param loc The index at which to extract features.
   * @param clique The clique whose features are requested; a clique this factory does
   *               not handle yields an empty collection
   * @return The set of feature strings for the clique at {@code loc}
   */
  @Override
  public Collection<String> getCliqueFeatures(PaddedList<IN> cInfo, int loc, Clique clique) {
    Collection<String> features = Generics.newHashSet();

    if (clique == cliqueC) {
      addAllInterningAndSuffixing(features, featuresC(cInfo, loc), "C");
    } else if (clique == cliqueCpC) {
      addAllInterningAndSuffixing(features, featuresCpC(cInfo, loc), "CpC");
      // The "next" clique is evaluated one position to the left.
      addAllInterningAndSuffixing(features, featuresCnC(cInfo, loc-1), "CnC");
    } else if (clique == cliqueCpCp2C) {
      addAllInterningAndSuffixing(features, featuresCpCp2C(cInfo, loc), "CpCp2C");
    } else if (clique == cliqueCpCp2Cp3C) {
      addAllInterningAndSuffixing(features, featuresCpCp2Cp3C(cInfo, loc), "CpCp2Cp3C");
    }

    if (DEBUG > 0) {
      EncodingPrintWriter.err.println("For " + cInfo.get(loc) + ", features: " + features, "UTF-8");
    }
    return features;
  }


  /**
   * Classifies the (previous, current) character pair by ASCII letter case.
   *
   * @param chp The previous character
   * @param chc The current character
   * @return One of "BND", "ENG", "BCC", "e1".."e4", or "" when neither character is an
   *         ASCII letter (or for the upper-then-lower pair, which has no label)
   */
  private static String isEnglish(String chp, String chc) {
    boolean prevLower = patE.matcher(chp).matches();   // previous char is [a-z]
    boolean curLower = patE.matcher(chc).matches();    // current char is [a-z]
    boolean prevUpper = patEC.matcher(chp).matches();  // previous char is [A-Z]
    boolean curUpper = patEC.matcher(chc).matches();   // current char is [A-Z]

    if (prevLower && curUpper) {
      return "BND"; // [a-z][A-Z]
    } else if (prevLower && curLower) {
      return "ENG"; // [a-z][a-z]
    } else if (prevUpper && curUpper) {
      return "BCC"; // [A-Z][A-Z]
    } else if (prevLower && !curLower && !curUpper) {
      return "e1";  // [a-z][^A-Za-z]
    } else if (curLower && !prevLower && !prevUpper) {
      return "e2";  // [^A-Za-z][a-z]
    } else if (prevUpper && !curLower && !curUpper) {
      return "e3";  // [A-Z][^A-Za-z]
    } else if (curUpper && !prevLower && !prevUpper) {
      return "e4";  // [^A-Za-z][A-Z]
    } else {
      return "";
    }
  } // end isEnglish


  /**
   * Tests whether a character is one of the name-joining punctuation marks
   * hyphen, middle dot or period.
   *
   * @param Ep The character to test
   * @return "1:EngPU" if it is such a mark, otherwise ""
   */
  private static String isEngPU(String Ep) {
    Matcher mp = patP.matcher(Ep);
    if (mp.matches()) {
      return "1:EngPU";
    } else {
      return "";
    }
  } // end isEngPU


  /** Returns the value of the char annotation on the label (see CoreLabel.getString for the missing-value case). */
  private static String charOf(CoreLabel label) {
    return label.getString(CoreAnnotations.CharAnnotation.class);
  }

  /** Returns the value of the shape annotation on the label. */
  private static String shapeOf(CoreLabel label) {
    return label.getString(CoreAnnotations.ShapeAnnotation.class);
  }

  /** Returns the label's Unicode type as a decimal string, or "" when the annotation is absent. */
  private static String uTypeOf(CoreLabel label) {
    Integer uType = label.get(CoreAnnotations.UTypeAnnotation.class);
    return (uType != null ? uType.toString() : "");
  }

  /** Returns the radical of the first char of {@code ch}, or 'n' when {@code ch} is empty. */
  private static char radicalOf(String ch) {
    return ch.isEmpty() ? 'n' : RadicalMap.getRadical(ch.charAt(0));
  }

  /** Returns the first char of {@code s}, or a space when {@code s} is empty. */
  private static char firstCharOrSpace(String s) {
    return s.length() > 0 ? s.charAt(0) : ' ';
  }

  /** Returns true for the ASCII digits '0' through '9' only. */
  private static boolean isAsciiDigit(char ch) {
    return ch >= '0' && ch <= '9';
  }


  /** Adds the begin/middle/end dictionary fields of one label, tagged with a position prefix. */
  private static void addDictionaryFields(Class<? extends CoreAnnotation<String>> lbeginFieldName,
                                          Class<? extends CoreAnnotation<String>> lmiddleFieldName,
                                          Class<? extends CoreAnnotation<String>> lendFieldName,
                                          String dictSuffix, String position,
                                          Collection<String> features, CoreLabel label) {
    features.add(label.getString(lbeginFieldName) + dictSuffix + "-" + position + "lb");
    features.add(label.getString(lmiddleFieldName) + dictSuffix + "-" + position + "lm");
    features.add(label.getString(lendFieldName) + dictSuffix + "-" + position + "le");
  }

  /**
   * Adds the dictionary membership features of the current, previous and next character.
   *
   * @param lbeginFieldName Annotation holding the "begins a dictionary word" value
   * @param lmiddleFieldName Annotation holding the "inside a dictionary word" value
   * @param lendFieldName Annotation holding the "ends a dictionary word" value
   * @param dictSuffix Marker identifying which dictionary the fields come from
   * @param features The collection the features are added to
   * @param p The previous label
   * @param c The current label
   * @param c2 The next label
   */
  private static void dictionaryFeaturesC(Class<? extends CoreAnnotation<String>> lbeginFieldName,
                                          Class<? extends CoreAnnotation<String>> lmiddleFieldName,
                                          Class<? extends CoreAnnotation<String>> lendFieldName,
                                          String dictSuffix, Collection<String> features,
                                          CoreLabel p, CoreLabel c, CoreLabel c2) {
    addDictionaryFields(lbeginFieldName, lmiddleFieldName, lendFieldName, dictSuffix, "", features, c);
    addDictionaryFields(lbeginFieldName, lmiddleFieldName, lendFieldName, dictSuffix, "p", features, p);
    addDictionaryFields(lbeginFieldName, lmiddleFieldName, lendFieldName, dictSuffix, "c2", features, c2);
  }

  /**
   * Adds the Unicode type n-gram features shared by the C and CpC cliques.
   * Each argument is a Unicode type already rendered as a string.
   */
  private void addUnicodeTypeFeatures(Collection<String> features, String uTypep2, String uTypep,
                                      String uTypec, String uTypec2, String uTypec3) {
    if (flags.useUnicodeType || flags.useUnicodeType4gram || flags.useUnicodeType5gram) {
      features.add(uTypep + "-" + uTypec + "-" + uTypec2 + "-uType3");
    }
    if (flags.useUnicodeType4gram || flags.useUnicodeType5gram) {
      features.add(uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-uType4");
    }
    if (flags.useUnicodeType5gram) {
      features.add(uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-" + uTypec3 + "-uType5");
    }
  }

  /** Adds the Unicode block trigram over the previous, current and next label, if enabled. */
  private void addUnicodeBlockFeature(Collection<String> features, CoreLabel p, CoreLabel c, CoreLabel c2) {
    if (flags.useUnicodeBlock) {
      features.add(p.getString(CoreAnnotations.UBlockAnnotation.class) + "-"
          + c.getString(CoreAnnotations.UBlockAnnotation.class) + "-"
          + c2.getString(CoreAnnotations.UBlockAnnotation.class) + "-uBlock");
    }
  }

  /**
   * Adds the pure shape n-gram features. The caller is responsible for checking
   * {@code flags.useShapeStrings} first.
   */
  private void addShapeStringFeatures(Collection<String> features, CoreLabel p2, CoreLabel p,
                                      CoreLabel c, CoreLabel c2, CoreLabel c3) {
    if (flags.useShapeStrings1) {
      features.add(shapeOf(p) + "ps");
      features.add(shapeOf(c) + "cs");
      features.add(shapeOf(c2) + "c2s");
    }
    if (flags.useShapeStrings3) {
      features.add(shapeOf(p) + shapeOf(c) + shapeOf(c2) + "pscsc2s");
    }
    if (flags.useShapeStrings4) {
      features.add(shapeOf(p2) + shapeOf(p) + shapeOf(c) + shapeOf(c2) + "p2spscsc2s");
    }
    if (flags.useShapeStrings5) {
      features.add(shapeOf(p2) + shapeOf(p) + shapeOf(c) + shapeOf(c2) + shapeOf(c3) + "p2spscsc2sc3s");
    }
  }

  /**
   * Adds features that conjoin a character with neighbouring shapes. The caller is
   * responsible for checking {@code flags.useShapeStrings} first.
   */
  private void addWordShapeConjunctions(Collection<String> features, CoreLabel p2, CoreLabel p,
                                        CoreLabel c, CoreLabel c2, CoreLabel c3,
                                        String charp, String charc) {
    if (flags.useWordShapeConjunctions2) {
      features.add(shapeOf(p) + charc + "pscc");
      features.add(charp + shapeOf(c) + "pccs");
    }
    if (flags.useWordShapeConjunctions3) {
      features.add(shapeOf(p2) + shapeOf(p) + charc + "p2spscc");
      features.add(shapeOf(p) + charc + shapeOf(c2) + "psccc2s");
      features.add(charc + shapeOf(c2) + shapeOf(c3) + "ccc2sc3s");
    }
  }

  /**
   * Adds the radical n-gram features. Radicals smooth the character n-grams, because
   * there are too many distinct characters in Chinese. (It works better than N-gram
   * when they are used individually. less sparse)
   * <p>
   * NOTE: wherever two or more radicals are combined, the expression starts with
   * {@code char + char}, which Java evaluates as integer addition. Those features
   * therefore contain the numeric sum of the radicals' code points rather than the
   * radicals side by side. This is long-standing behaviour that existing models were
   * trained with, so it is intentionally left as is.
   * </p>
   */
  private void addRadicalFeatures(Collection<String> features, char rcharp2, char rcharp,
                                  char rcharc, char rcharc2) {
    if (flags.useRad2) {
      features.add(rcharc+"rc");
      features.add(rcharc2+"rc2");
      features.add(rcharp+"rp");
      features.add(rcharp + rcharc+"rprc");
      features.add(rcharc +rcharc2 +"rcrc2");
      features.add(rcharp + rcharc +rcharc2 +"rprcrc2");
    }
    if (flags.useRad2b) {
      features.add(rcharc+"rc");
      features.add(rcharc2+"rc2");
      features.add(rcharp+"rp");
      features.add(rcharp + rcharc+"rprc");
      features.add(rcharc +rcharc2 +"rcrc2");
      features.add(rcharp2 +rcharp +"rp2rp");
    }
  }


  /**
   * Features of the clique containing only the current position.
   *
   * @param cInfo The list of characters
   * @param loc Position of c in list
   * @return Collection of String features (sparse set of boolean features)
   */
  protected Collection<String> featuresC(PaddedList<? extends CoreLabel> cInfo, int loc) {
    Collection<String> features = new ArrayList<>();
    CoreLabel c = cInfo.get(loc);
    CoreLabel c2 = cInfo.get(loc + 1);
    CoreLabel c3 = cInfo.get(loc + 2);
    CoreLabel p = cInfo.get(loc - 1);
    CoreLabel p2 = cInfo.get(loc - 2);
    CoreLabel p3 = cInfo.get(loc - 3);
    String charc = charOf(c);
    String charc2 = charOf(c2);
    String charc3 = charOf(c3);
    String charp = charOf(p);
    String charp2 = charOf(p2);
    String charp3 = charOf(p3);
    String uTypec = uTypeOf(c);
    String uTypec2 = uTypeOf(c2);
    String uTypec3 = uTypeOf(c3);
    String uTypep = uTypeOf(p);
    String uTypep2 = uTypeOf(p2);

    /* N-gram features. N is upto 2. */
    if (flags.useWord1) {
      features.add(charc +"::c");
      features.add(charc2+"::c2");
      features.add(charp +"::p");
      features.add(charp2 +"::p2");
      // trying to restore the features that Huihsin described in SIGHAN 2005 paper
      features.add(charc +charc2 +"::cn");
      features.add(charc +charc3 +"::cn2");
      features.add(charp +charc +"::pc");
      features.add(charp +charc2 +"::pn");
      features.add(charp2 +charp +"::p2p");
      features.add(charp2 +charc +"::p2c");
      features.add(charc2 +charc +"::n2c");
    }

    if (flags.dictionary != null || flags.serializedDictionary != null) {
      dictionaryFeaturesC(CoreAnnotations.LBeginAnnotation.class,
          CoreAnnotations.LMiddleAnnotation.class,
          CoreAnnotations.LEndAnnotation.class, "", features, p, c, c2);
    }
    if (flags.dictionary2 != null) {
      dictionaryFeaturesC(CoreAnnotations.D2_LBeginAnnotation.class,
          CoreAnnotations.D2_LMiddleAnnotation.class,
          CoreAnnotations.D2_LEndAnnotation.class, "-D2-", features, p, c, c2);
    }

    if (flags.useFeaturesC4gram || flags.useFeaturesC5gram || flags.useFeaturesC6gram) {
      features.add(charp2 + charp +"p2p");
      features.add(charp2 + "p2");
    }
    if (flags.useFeaturesC5gram || flags.useFeaturesC6gram) {
      features.add(charc3+"c3");
      features.add(charc2 + charc3 + "c2c3");
    }
    if (flags.useFeaturesC6gram) {
      features.add(charp3 + "p3");
      features.add(charp3 + charp2 + "p3p2");
    }

    addUnicodeTypeFeatures(features, uTypep2, uTypep, uTypec, uTypec2, uTypec3);
    addUnicodeBlockFeature(features, p, c, c2);

    if (flags.useShapeStrings) {
      addShapeStringFeatures(features, p2, p, c, c2, c3);
    }

    features.add("cliqueC");
    return features;
  }


  /**
   * Adds the dictionary features of the CpC clique: the same per-character fields as
   * {@link #dictionaryFeaturesC}, plus optional conjunctions across positions.
   *
   * @param lbeginFieldName Annotation holding the "begins a dictionary word" value
   * @param lmiddleFieldName Annotation holding the "inside a dictionary word" value
   * @param lendFieldName Annotation holding the "ends a dictionary word" value
   * @param dictSuffix Marker identifying which dictionary the fields come from
   * @param features The collection the features are added to
   * @param p2 The label two positions back
   * @param p The previous label
   * @param c The current label
   * @param c2 The next label
   */
  private void dictionaryFeaturesCpC(Class<? extends CoreAnnotation<String>> lbeginFieldName,
                                     Class<? extends CoreAnnotation<String>> lmiddleFieldName,
                                     Class<? extends CoreAnnotation<String>> lendFieldName,
                                     String dictSuffix, Collection<String> features,
                                     CoreLabel p2, CoreLabel p, CoreLabel c, CoreLabel c2) {
    // The previous label's end field used to be read with get() instead of getString(),
    // so a missing value was concatenated as the text "null" in the "-ple" feature.
    // Delegating keeps all nine fields on the same accessor as dictionaryFeaturesC.
    dictionaryFeaturesC(lbeginFieldName, lmiddleFieldName, lendFieldName, dictSuffix, features, p, c, c2);

    if (flags.useDictionaryConjunctions) {
      String p2Lend = p2.getString(lendFieldName);
      String pLend = p.getString(lendFieldName);
      String pLbegin = p.getString(lbeginFieldName);
      String cLbegin = c.getString(lbeginFieldName);
      String cLmiddle = c.getString(lmiddleFieldName);
      if (flags.useDictionaryConjunctions3) {
        features.add(pLend + cLbegin + cLmiddle + dictSuffix + "-pcLconj1");
      }
      features.add(p2Lend + pLend + cLbegin + cLmiddle + dictSuffix + "-p2pcLconj1");
      features.add(p2Lend + pLend + pLbegin + cLbegin + cLmiddle + dictSuffix + "-p2pcLconj2");
    }
  }


  /**
   * Adds the (CTB/ASBC/HK/PK/MSR) POS information of each character, if any of the
   * corresponding flags is set.
   * If a character falls into some function categories, it is very likely there is a
   * boundary. A lot of Chinese function words belong to single characters. This feature
   * is also good for numbers and punctuations. DE* are grouped into DE.
   *
   * @throws RuntimeException if useChPos is set without either useCTBChar2 or usePKChar2
   */
  private void addCharPosFeatures(Collection<String> features, String charp, String charc) {
    if ( ! (flags.useCTBChar2 || flags.useASBCChar2 || flags.useHKChar2
            || flags.usePKChar2 || flags.useMSRChar2)) {
      return;
    }

    String[] tagsets;
    // the "useChPos" now only works for CTB and PK
    if (flags.useChPos) {
      if (flags.useCTBChar2) {
        tagsets = CTB_TAGS;
      } else if (flags.usePKChar2) {
        // A PK-specific tag list existed at one point but is disabled; PK uses the derived tags.
        tagsets = DERIVED_TAGS;
      } else {
        throw new RuntimeException("only support settings for CTB and PK now.");
      }
    } else {
      tagsets = DERIVED_TAGS;
    }

    if (taDetector == null) {
      createTADetector();
    }
    for (String tag : tagsets) {
      features.add(taDetector.checkDic(tag+"p", charp) + taDetector.checkDic(tag+"i", charp) + taDetector.checkDic(tag+"s", charc)+ taDetector.checkInDic(charp)+taDetector.checkInDic(charc)+ tag+ "prep-sufc" );
    }
  }


  /**
   * Adds the rule-based features for reduplication, numbers and English text.
   * <p>
   * In error analysis, we found English words and numbers are often separated.
   * Rule 1: isNumber feature: check if the current and previous char is a number.
   * Rule 2: Disambiguation of time point and time duration.
   * Rule 3: isEnglish feature: check if the current and previous character is an english letter.
   * Rule 4: English name feature: check if the current char is a conjunct pu for English
   * first and last name, since there is no space between two names.
   * Most of PUs are a good indicator for word boundary, but - and . is a strong
   * indicator that there is no boundary within a previous, a following char and it.
   * </p>
   *
   * @param rcharp The radical of the previous character
   */
  private void addRule2Features(Collection<String> features, String charp2, String charp,
                                String charc, String charc2, String charc3, char rcharp) {
    /* Reduplication features */
    // previous character == current character
    if (charp.equals(charc)) {
      features.add("11-R2");
    }
    // previous character == next character
    if (charp.equals(charc2)) {
      features.add("22-R2");
    }
    // current character == next character
    // fire only when usePk and useHk are both false.
    // (An older comment described this as "current == next next" and as almost the same
    // as the "22" feature; the test has always compared charc with charc2, so it is kept.)
    if ( ! flags.usePk && ! flags.useHk) {
      if (charc.equals(charc2)) {
        features.add("33-R2");
      }
    }

    // actually their length must be either 0 or 1
    char pre = firstCharOrSpace(charp);
    char cur = firstCharOrSpace(charc);
    char next = firstCharOrSpace(charc2);
    char next2 = firstCharOrSpace(charc3);

    if (isAsciiDigit(cur) && isAsciiDigit(pre)) {
      // current and previous characters are both numbers
      if (cur == '9' && pre == '1' && next == '9' && isAsciiDigit(next2)) { // 199x
        features.add("YR-R2");
      } else {
        features.add("2N-R2");
      }
    } else if (isAsciiDigit(pre)) {
      // not both numbers, but the previous char is: patterns like "1N", "2A", etc
      features.add("1N-R2");
    } else if (patLetter.matcher(charp).matches()) {
      // previous character is an English character
      features.add("E-R2");
    } else if (rcharp == '.' && charp.length() == 1) {
      // the previous character exists and has '.' as its radical, which the original
      // comment describes as "contains no radical"
      if (patLetter.matcher(charc).matches()) {
        features.add("PU+E-R2");
      }
      if (patLetter.matcher(charp2).matches()) {
        features.add("E+PU-R2");
      }
      if (patDigit.matcher(charc).matches()) {
        features.add("PU+N-R2");
      }
      if (patDigit.matcher(charp2).matches()) {
        features.add("N+PU-R2");
      }
      features.add("PU-R2");
    }

    String engType = isEnglish(charp, charc);
    String engPU = isEngPU(charp);
    if ( ! engType.equals("")) {
      features.add(engType);
    }
    if ( ! engPU.equals("") && ! engType.equals("")) {
      features.add(engPU + engType + "R2");
    }
  }


  /**
   * Maps the label's original character to a coarse class using {@link Character#getType(char)}.
   * A label without an original character is classified as if it were a space.
   */
  private static String charTypeFeature(CoreLabel c) {
    char origC = firstCharOrSpace(c.getString(CoreAnnotations.OriginalCharAnnotation.class));
    switch (Character.getType(origC)) {
      case Character.UPPERCASE_LETTER: // A-Z and full-width A-Z
      case Character.LOWERCASE_LETTER: // a-z and full-width a-z
        return "CHARTYPE-LETTER";
      case Character.DECIMAL_DIGIT_NUMBER:
        return "CHARTYPE-DECIMAL_DIGIT_NUMBER";
      case Character.OTHER_LETTER: // mostly chinese chars
        return "CHARTYPE-OTHER_LETTER";
      default: // other types
        return "CHARTYPE-MISC";
    }
  }


  /**
   * Features of the first order clique (previous and current position).
   *
   * @param cInfo The list of characters
   * @param loc Position of c in list
   * @return Collection of String features (sparse set of boolean features)
   * @throws RuntimeException if useChPos is set with a character POS flag other than
   *         useCTBChar2 or usePKChar2
   */
  protected Collection<String> featuresCpC(PaddedList<? extends CoreLabel> cInfo, int loc) {
    Collection<String> features = new ArrayList<>();
    CoreLabel c = cInfo.get(loc);
    CoreLabel c2 = cInfo.get(loc + 1);
    CoreLabel c3 = cInfo.get(loc + 2);
    CoreLabel p = cInfo.get(loc - 1);
    CoreLabel p2 = cInfo.get(loc - 2);
    CoreLabel p3 = cInfo.get(loc - 3);
    String charc = charOf(c);
    String charc2 = charOf(c2);
    String charc3 = charOf(c3);
    String charp = charOf(p);
    String charp2 = charOf(p2);
    String charp3 = charOf(p3);
    String uTypec = uTypeOf(c);
    String uTypec2 = uTypeOf(c2);
    String uTypec3 = uTypeOf(c3);
    String uTypep = uTypeOf(p);
    String uTypep2 = uTypeOf(p2);

    if (flags.dictionary != null || flags.serializedDictionary != null) {
      dictionaryFeaturesCpC(CoreAnnotations.LBeginAnnotation.class,
          CoreAnnotations.LMiddleAnnotation.class,
          CoreAnnotations.LEndAnnotation.class, "", features, p2, p, c, c2);
    }
    if (flags.dictionary2 != null) {
      dictionaryFeaturesCpC(CoreAnnotations.D2_LBeginAnnotation.class,
          CoreAnnotations.D2_LMiddleAnnotation.class,
          CoreAnnotations.D2_LEndAnnotation.class, "-D2-", features, p2, p, c, c2);
    }

    /* N-gram features. N is upto 2. */
    if (flags.useWord2) {
      features.add(charc +"::c");
      features.add(charc2+"::c1");
      features.add(charp +"::p");
      features.add(charp2 +"::p2");
      // trying to restore the features that Huihsin described in SIGHAN 2005 paper
      features.add(charc +charc2 +"::cn"); // (*)
      features.add(charp +charc +"::pc");
      features.add(charp +charc2 +"::pn");
      features.add(charp2 +charp +"::p2p");
      features.add(charp2 +charc +"::p2c");
      features.add(charc2 +charc +"::n2c"); // todo: this is messed up: Same as one above at (*); should be cn2 = charc + charc3 + "::cn2"
    }

    if (flags.useFeaturesCpC4gram || flags.useFeaturesCpC5gram || flags.useFeaturesCpC6gram) {
      // todo: Both these features duplicate ones already in useWord2
      features.add(charp2 + charp +"p2p");
      features.add(charp2 + "p2");
    }
    if (flags.useFeaturesCpC5gram || flags.useFeaturesCpC6gram) {
      features.add(charc3+"c3");
      features.add(charc2 + charc3 + "c2c3");
    }
    if (flags.useFeaturesCpC6gram) {
      features.add(charp3 + "p3");
      features.add(charp3 + charp2 + "p3p2");
    }
    if (flags.useGoodForNamesCpC) {
      // these 2 features should be distinctively good at biasing from
      // picking up a Chinese family name in the p2 or p3 positions:
      // familyName X X startWord AND familyName X startWord
      // But actually they seem to have negative value.
      features.add(charp2 + "p2");
      features.add(charp3 + "p3");
    }

    addUnicodeTypeFeatures(features, uTypep2, uTypep, uTypec, uTypec2, uTypec3);
    if (flags.useWordUTypeConjunctions2) {
      features.add(uTypep + charc + "putcc");
      features.add(charp + uTypec + "pccut");
    }
    if (flags.useWordUTypeConjunctions3) {
      features.add(uTypep2 + uTypep + charc + "p2utputcc");
      features.add(uTypep + charc + uTypec2 + "putccc2ut");
      features.add(charc + uTypec2 + uTypec3 + "ccc2utc3ut");
    }
    addUnicodeBlockFeature(features, p, c, c2);

    if (flags.useShapeStrings) {
      addShapeStringFeatures(features, p2, p, c, c2, c3);
      addWordShapeConjunctions(features, p2, p, c, c2, c3, charp, charc);
    }

    /* Radical N-gram features; see addRadicalFeatures. */
    char rcharc = radicalOf(charc);
    char rcharc2 = radicalOf(charc2);
    char rcharp = radicalOf(charp);
    char rcharp2 = radicalOf(charp2);
    addRadicalFeatures(features, rcharp2, rcharp, rcharc, rcharc2);

    /* Non-word dictionary: SEEN bi-gram marked as non-word.
     * This is frickin' useful. I hadn't realized. CDM Oct 2007.
     */
    if (flags.useDict2) {
      NonDict2 nd = new NonDict2(flags);
      features.add(nd.checkDic(charp+charc, flags)+"nondict");
    }

    if (flags.useOutDict2) {
      if (outDict == null) {
        createOutDict();
      }
      // The trailing comments give the offsets, relative to c, of the characters looked up.
      features.add(outDict.getW(charp+charc)+"outdict");               // -1 0
      features.add(outDict.getW(charc+charc2)+"outdict");              // 0 1
      features.add(outDict.getW(charp2+charp)+"outdict");              // -2 -1
      features.add(outDict.getW(charp2+charp+charc)+"outdict");        // -2 -1 0
      features.add(outDict.getW(charp3+charp2+charp)+"outdict");       // -3 -2 -1
      features.add(outDict.getW(charp+charc+charc2)+"outdict");        // -1 0 1
      features.add(outDict.getW(charc+charc2+charc3)+"outdict");       // 0 1 2
      features.add(outDict.getW(charp+charc+charc2+charc3)+"outdict"); // -1 0 1 2
    }

    addCharPosFeatures(features, charp, charc);

    if (flags.useRule2) {
      addRule2Features(features, charp2, charp, charc, charc2, charc3, rcharp);
    }

    // features using "Character.getType" information!
    features.add(charTypeFeature(c));

    features.add("cliqueCpC");
    return features;
  } // end featuresCpC


  /**
   * For a CRF, this shouldn't be necessary, since the features duplicate
   * those from CpC, but Huihsin found some valuable, presumably because
   * it modified the regularization a bit.
   *
   * @param cInfo The list of characters
   * @param loc Position of c in list
   * @return Collection of String features (sparse set of boolean features);
   *         empty unless {@code flags.useWordn} is set
   */
  protected Collection<String> featuresCnC(PaddedList<? extends CoreLabel> cInfo, int loc) {
    Collection<String> features = new ArrayList<>();
    if (flags.useWordn) {
      String charc = charOf(cInfo.get(loc));
      String charc2 = charOf(cInfo.get(loc + 1));
      String charp = charOf(cInfo.get(loc - 1));
      String charp2 = charOf(cInfo.get(loc - 2));

      features.add(charc +"c");
      features.add(charc2+"c2");
      features.add(charp +"p");
      features.add(charp2 + "p2");
      features.add(charp2 + charp +"p2p");
      features.add(charp + charc +"pc");
      features.add(charc + charc2 +"cc2");
      // cdm: need hyphen so you can see which of charp or charc2 is null....
      features.add(charp + "-" + charc2 + "pc2");
      features.add("cliqueCnC");
    }
    return features;
  } // end of CnC


  /**
   * Second order clique features
   *
   * @param cInfo The list of characters
   * @param loc Position of c in list
   * @return Collection of String features (sparse set of boolean features)
   */
  protected Collection<String> featuresCpCp2C(PaddedList<? extends CoreLabel> cInfo, int loc) {
    Collection<String> features = new ArrayList<>();
    CoreLabel c = cInfo.get(loc);
    CoreLabel c2 = cInfo.get(loc + 1);
    CoreLabel c3 = cInfo.get(loc + 2);
    CoreLabel p = cInfo.get(loc - 1);
    CoreLabel p2 = cInfo.get(loc - 2);
    CoreLabel p3 = cInfo.get(loc - 3);
    String charc = charOf(c);
    String charc2 = charOf(c2);
    String charc3 = charOf(c3);
    String charp = charOf(p);
    String charp2 = charOf(p2);
    String charp3 = charOf(p3);

    // N-gram features. N is up to 3
    if (flags.useWord3) {
      features.add(charc +"::c");
      features.add(charc2+"::n");
      features.add(charp +"::p");
      features.add(charp2 +"::p2");
      // trying to restore the features that Huihsin described in SIGHAN 2005 paper
      features.add(charc + charc2 +"::cn");
      features.add(charc + charc2 + charc3 + "::cnn2");
      features.add(charp + charc +"::pc");
      features.add(charp + charc2 +"::pn");
      features.add(charp2 + charp +"::p2p");
      features.add(charp3 + charp2 + charp + "::p3p2p");
      features.add(charp2 + charc +"::p2c");
      features.add(charc + charc3 +"::cn2");
    }

    if (flags.useShapeStrings) {
      addShapeStringFeatures(features, p2, p, c, c2, c3);
      addWordShapeConjunctions(features, p2, p, c, c2, c3, charp, charc);
    }

    /* Radical N-gram features; see addRadicalFeatures. */
    char rcharc = radicalOf(charc);
    char rcharc2 = radicalOf(charc2);
    char rcharp = radicalOf(charp);
    char rcharp2 = radicalOf(charp2);
    addRadicalFeatures(features, rcharp2, rcharp, rcharc, rcharc2);

    features.add("cliqueCpCp2C");
    return features;
  } // end featuresCpCp2C


  /**
   * Third order clique features. Nothing is produced unless {@code flags.use4Clique}
   * is set and {@code flags.maxLeft} is at least 3.
   *
   * @param cInfo The list of characters
   * @param loc Position of c in list
   * @return Collection of String features (sparse set of boolean features)
   */
  protected Collection<String> featuresCpCp2Cp3C(PaddedList<? extends CoreLabel> cInfo, int loc) {
    Collection<String> features = new ArrayList<>();
    if (flags.use4Clique && flags.maxLeft >= 3) {
      CoreLabel c = cInfo.get(loc);
      CoreLabel c2 = cInfo.get(loc + 1);
      CoreLabel p = cInfo.get(loc - 1);
      CoreLabel p2 = cInfo.get(loc - 2);
      CoreLabel p3 = cInfo.get(loc - 3);
      String charc = charOf(c);
      String charp = charOf(p);
      String charp2 = charOf(p2);
      String charp3 = charOf(p3);
      String uTypec = uTypeOf(c);
      String uTypec2 = uTypeOf(c2);
      String uTypep = uTypeOf(p);
      String uTypep2 = uTypeOf(p2);
      String uTypep3 = uTypeOf(p3);

      if (flags.useLongSequences) {
        features.add(charp3 + charp2 + charp + charc + "p3p2pc");
      }
      // Unlike the lower order cliques, these windows start at p3.
      if (flags.useUnicodeType4gram || flags.useUnicodeType5gram) {
        features.add(uTypep3 + "-" + uTypep2 + "-" + uTypep + "-" + uTypec + "-uType4");
      }
      if (flags.useUnicodeType5gram) {
        features.add(uTypep3 + "-" + uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-uType5");
      }
      features.add("cliqueCpCp2Cp3C");
    }
    return features;
  }

  private static final long serialVersionUID = 8197648719208850960L;

} // end class Gale2007ChineseSegmenterFeatureFactory