package edu.stanford.nlp.wordseg;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.Serializable;
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.sequences.FeatureFactory;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.sequences.Clique;
import edu.stanford.nlp.trees.international.pennchinese.RadicalMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PaddedList;
/**
* A Chinese segmenter Feature Factory for GALE project. (modified from Sighan Bakeoff 2005.)
* This is supposed to have all the good closed-track features from Sighan bakeoff 2005,
* and some other "open-track" features
*
* This will also be used to do a character-based chunking!
* <p>
* c is Chinese character ("char"). c means current, n means next and p means previous.
* </p>
*
* <table>
* <tr>
* <th>Feature</th><th>Templates</th>
* </tr>
 * <tr>
* <th></th><th>Current position clique</th>
* </tr>
* <tr>
 * <td>useWord1</td><td>CONSTANT, cc, nc, pc, pc+cc; if (As|Msr|Pk|Hk): cc+nc, pc+nc</td>
* </tr>
* </table>
*
* @author Huihsin Tseng
* @author Pichuan Chang
*/
public class ChineseSegmenterFeatureFactory<IN extends CoreLabel> extends FeatureFactory<IN> implements Serializable {
/**
*
*/
private static final long serialVersionUID = 3387166382968763350L;
private static TagAffixDetector taDetector = null;
private static Redwood.RedwoodChannels logger = Redwood.channels(ChineseSegmenterFeatureFactory.class);
public void init(SeqClassifierFlags flags) {
super.init(flags);
}
/**
* Extracts all the features from the input data at a certain index.
*
* @param cInfo The complete data set as a List of WordInfo
* @param loc The index at which to extract features.
*/
public Collection<String> getCliqueFeatures(PaddedList<IN> cInfo, int loc, Clique clique) {
Collection<String> features = Generics.newHashSet();
if (clique == cliqueC) {
addAllInterningAndSuffixing(features, featuresC(cInfo, loc), "C");
} else if (clique == cliqueCpC) {
addAllInterningAndSuffixing(features, featuresCpC(cInfo, loc), "CpC");
addAllInterningAndSuffixing(features, featuresCnC(cInfo, loc-1), "CnC");
}
// else if (clique == cliqueCpCp2C) {
// addAllInterningAndSuffixing(features, featuresCpCp2C(cInfo, loc), "CpCp2C");
// } else if (clique == cliqueCpCp2Cp3C) {
// addAllInterningAndSuffixing(features, featuresCpCp2Cp3C(cInfo, loc), "CpCp2Cp3C");
// } else if (clique == cliqueCpCp2Cp3Cp4C) {
// addAllInterningAndSuffixing(features, featuresCpCp2Cp3Cp4C(cInfo, loc), "CpCp2Cp3Cp4C");
// } else if (clique == cliqueCpCp2Cp3Cp4Cp5C) {
// addAllInterningAndSuffixing(features, featuresCpCp2Cp3Cp4Cp5C(cInfo, loc), "CpCp2Cp3Cp4Cp5C");
// }
return features;
}
private static Pattern patE = Pattern.compile("[a-z]");
private static Pattern patEC = Pattern.compile("[A-Z]");
private static String isEnglish(String Ep, String Ec) {
String chp = Ep;
String chc = Ec;
Matcher mp = patE.matcher(chp); // previous char is [a-z]
Matcher mc = patE.matcher(chc); // current char is [a-z]
Matcher mpC = patEC.matcher(chp); // previous char is [A-Z]
Matcher mcC = patEC.matcher(chc); // current char is [A-Z]
if (mp.matches() && mcC.matches()){
return "BND"; // [a-z][A-Z]
} else if (mp.matches() && mc.matches()){
return "ENG"; // [a-z][a-z]
} else if (mpC.matches() && mcC.matches()){
return "BCC"; // [A-Z][A-Z]
} else if (mp.matches() && !mc.matches() && !mcC.matches()){
return "e1"; // [a-z][^A-Za-z]
} else if (mc.matches() && !mp.matches() && !mpC.matches()) {
return "e2"; // [^A-Za-z][a-z]
} else if (mpC.matches() && !mc.matches() && !mcC.matches()){
return "e3"; // [A-Z][^A-Za-z]
} else if (mcC.matches() && !mp.matches() && !mpC.matches()) {
return "e4"; // [^A-Za-z][A-Z]
} else {
return "";
}
}//is English
private static Pattern patP = Pattern.compile("[\u00b7\\-\\.]");
private static String isEngPU(String Ep) {
Matcher mp = patP.matcher(Ep);
if (mp.matches()){
return "1:EngPU";
} else {
return "";
}
}//is EnglishPU
public Collection<String> featuresC(PaddedList<IN> cInfo, int loc) {
Collection<String> features = new ArrayList<>();
CoreLabel c = cInfo.get(loc);
CoreLabel c1 = cInfo.get(loc + 1);
CoreLabel c2 = cInfo.get(loc + 2);
CoreLabel c3 = cInfo.get(loc + 3);
CoreLabel p = cInfo.get(loc - 1);
CoreLabel p2 = cInfo.get(loc - 2);
CoreLabel p3 = cInfo.get(loc - 3);
String charc = c.get(CoreAnnotations.CharAnnotation.class);
String charc1 = c1.get(CoreAnnotations.CharAnnotation.class);
String charc2 = c2.get(CoreAnnotations.CharAnnotation.class);
String charc3 = c3.get(CoreAnnotations.CharAnnotation.class);
String charp = p.get(CoreAnnotations.CharAnnotation.class);
String charp2 = p2.get(CoreAnnotations.CharAnnotation.class);
String charp3 = p3.get(CoreAnnotations.CharAnnotation.class);
/**
* N-gram features. N is upto 2.
*/
if (flags.useWord1) {
// features.add(charc +"c");
// features.add(charc1+"c1");
// features.add(charp +"p");
// features.add(charp +charc +"pc");
// if(flags.useAs || flags.useMsr || flags.usePk || flags.useHk){ //msr, as
// features.add(charc +charc1 +"cc1");
// features.add(charp + charc1 +"pc1");
// }
features.add(charc +"::c");
features.add(charc1+"::c1");
features.add(charp +"::p");
features.add(charp2 +"::p2");
// trying to restore the features that Huishin described in SIGHAN 2005 paper
features.add(charc +charc1 +"::cn");
features.add(charp +charc +"::pc");
features.add(charp +charc1 +"::pn");
features.add(charp2 +charp +"::p2p");
features.add(charp2 +charc +"::p2c");
features.add(charc2 +charc +"::n2c");
features.add("|word1");
}
return features;
}
private static CorpusDictionary outDict = null;
public Collection<String> featuresCpC(PaddedList<IN> cInfo, int loc) {
Collection<String> features = new ArrayList<>();
CoreLabel c = cInfo.get(loc);
CoreLabel c1 = cInfo.get(loc + 1);
CoreLabel c2 = cInfo.get(loc + 2);
CoreLabel c3 = cInfo.get(loc + 3);
CoreLabel p = cInfo.get(loc - 1);
CoreLabel p2 = cInfo.get(loc - 2);
CoreLabel p3 = cInfo.get(loc - 3);
String charc = c.get(CoreAnnotations.CharAnnotation.class);
if (charc == null) charc = "";
String charc1 = c1.get(CoreAnnotations.CharAnnotation.class);
if (charc1 == null) charc1 = "";
String charc2 = c2.get(CoreAnnotations.CharAnnotation.class);
if (charc2 == null) charc2 = "";
String charc3 = c3.get(CoreAnnotations.CharAnnotation.class);
if (charc3 == null) charc3 = "";
String charp = p.get(CoreAnnotations.CharAnnotation.class);
if (charp == null) charp = "";
String charp2 = p2.get(CoreAnnotations.CharAnnotation.class);
if (charp2 == null) charp2 = "";
String charp3 = p3.get(CoreAnnotations.CharAnnotation.class);
if (charp3 == null) charp3 = "";
/*
* N-gram features. N is upto 2.
*/
if (flags.useWord2) {
// features.add(charc +"c");
// features.add(charc1+"c1");
// features.add(charp +"p");
// features.add(charp +charc +"pc");
// if( flags.useMsr ){
// features.add(charc +charc1 +"cc1");
// features.add(charp + charc1 +"pc1");
// }
features.add(charc +"::c");
features.add(charc1+"::c1");
features.add(charp +"::p");
features.add(charp2 +"::p2");
// trying to restore the features that Huishin described in SIGHAN 2005 paper
features.add(charc +charc1 +"::cn");
features.add(charp +charc +"::pc");
features.add(charp +charc1 +"::pn");
features.add(charp2 +charp +"::p2p");
features.add(charp2 +charc +"::p2c");
features.add(charc2 +charc +"::n2c");
features.add("|word2");
}
/*
Radical N-gram features. N is upto 4.
Smoothing method of N-gram, because there are too many characters in Chinese.
(It works better than N-gram when they are used individually. less sparse)
*/
char rcharc, rcharc1,rcharc2, rcharc3, rcharp, rcharp1,rcharp2,rcharp3;
if (charc.length()==0) { rcharc='n'; } else { rcharc=RadicalMap.getRadical(charc.charAt(0));}
if (charc1.length()==0) { rcharc1='n'; } else { rcharc1=RadicalMap.getRadical(charc1.charAt(0));}
if (charc2.length()==0) { rcharc2='n'; } else { rcharc2=RadicalMap.getRadical(charc2.charAt(0));}
if (charc3.length()==0) { rcharc3='n'; } else { rcharc3=RadicalMap.getRadical(charc3.charAt(0));}
if (charp.length()==0) { rcharp='n'; } else { rcharp=RadicalMap.getRadical(charp.charAt(0));}
if (charp2.length()==0) { rcharp2='n'; } else { rcharp2=RadicalMap.getRadical(charp2.charAt(0));}
if (charp3.length()==0) { rcharp3='n'; } else { rcharp3=RadicalMap.getRadical(charp3.charAt(0));}
if(flags.useRad2){
features.add(rcharc+"rc");
features.add(rcharc1+"rc1");
features.add(rcharp+"rp");
features.add(rcharp + rcharc+"rpc");
features.add(rcharc +rcharc1 +"rcc1");
features.add(rcharp + rcharc +rcharc1 +"rpcc1");
features.add("|rad2");
}
/* non-word dictionary:SEEM bi-gram marked as non-word */
if (flags.useDict2) {
NonDict2 nd = new NonDict2(flags);
features.add(nd.checkDic(charp+charc, flags)+"nondict");
features.add("|useDict2");
}
if (flags.useOutDict2){
if (outDict == null) {
logger.info("reading "+flags.outDict2+" as a seen lexicon");
outDict = new CorpusDictionary(flags.outDict2, true);
}
features.add(outDict.getW(charp+charc)+"outdict"); // -1 0
features.add(outDict.getW(charc+charc1)+"outdict"); // 0 1
features.add(outDict.getW(charp2+charp)+"outdict"); // -2 -1
features.add(outDict.getW(charp2+charp+charc)+"outdict"); // -2 -1 0
features.add(outDict.getW(charp3+charp2+charp)+"outdict"); // -3 -2 -1
features.add(outDict.getW(charp+charc+charc1)+"outdict"); // -1 0 1
features.add(outDict.getW(charc+charc1+charc2)+"outdict"); // 0 1 2
features.add(outDict.getW(charp+charc+charc1+charc2)+"outdict"); // -1 0 1 2
}
/*
(CTB/ASBC/HK/PK/MSR) POS information of each characters.
If a character falls into some function categories,
it is very likely there is a boundary.
A lot of Chinese function words belong to single characters.
This feature is also good for numbers and punctuations.
DE* are grouped into DE.
*/
if (flags.useCTBChar2 || flags.useASBCChar2 || flags.useHKChar2
|| flags.usePKChar2 || flags.useMSRChar2) {
String[] tagsets;
// the "useChPos" now only works for CTB and PK
if (flags.useChPos) {
if(flags.useCTBChar2) {
tagsets = new String[]{"AD", "AS", "BA", "CC", "CD", "CS", "DE", "DT", "ETC", "IJ", "JJ", "LB", "LC", "M", "NN", "NR", "NT", "OD", "P", "PN", "PU", "SB", "SP", "VA", "VC", "VE", "VV" };
} else if (flags.usePKChar2) {
//tagsets = new String[]{"r", "j", "t", "a", "nz", "l", "vn", "i", "m", "ns", "nr", "v", "n", "q", "Ng", "b", "d", "nt"};
tagsets = new String[]{"2","3","4"};
} else {
throw new RuntimeException("only support settings for CTB and PK now.");
}
} else {
//logger.info("Using Derived features");
tagsets = new String[]{"2","3","4"};
}
if (taDetector == null) {
taDetector = new TagAffixDetector(flags);
}
for (String tagset : tagsets) {
features.add(taDetector.checkDic(tagset + "p", charp) + taDetector.checkDic(tagset + "i", charp) + taDetector.checkDic(tagset + "s", charc) + taDetector.checkInDic(charp) + taDetector.checkInDic(charc) + tagset + "prep-sufc");
// features.add("|ctbchar2"); // Added a constant feature several times!!
}
}
/*
In error analysis, we found English words and numbers are often separated.
Rule 1: isNumber feature: check if the current and previous char is a number.
Rule 2: Disambiguation of time point and time duration.
Rule 3: isEnglish feature: check if the current and previous character is an english letter.
Rule 4: English name feature: check if the current char is a conjunct pu for English first and last name, since there is no space between two names.
Most of PUs are a good indicator for word boundary, but - and . is a strong indicator that there is no boundry within a previous , a follow char and it.
*/
if (flags.useRule2) {
/* Reduplication features */
// previous character == current character
if(charp.equals(charc)){ features.add("11");}
// previous character == next character
if(charp.equals(charc1)){ features.add("22");}
// current character == next next character
// fire only when usePk and useHk are both false.
// Notice: this should be (almost) the same as the "22" feature, but we keep it for now.
if( !flags.usePk && !flags.useHk) {
if(charc.equals(charc2)){features.add("33");}
}
char cur1 = ' ';
char cur2 = ' ';
char cur = ' ';
char pre = ' ';
// actually their length must be either 0 or 1
if (charc1.length() > 0) { cur1 = charc1.charAt(0); }
if (charc2.length() > 0) { cur2 = charc2.charAt(0); }
if (charc.length() > 0) { cur = charc.charAt(0); }
if (charp.length() > 0) { pre = charp.charAt(0); }
String prer= String.valueOf(rcharp); // the radical of previous character
Pattern E = Pattern.compile("[a-zA-Z]");
Pattern N = Pattern.compile("[0-9]");
Matcher m = E.matcher(charp);
Matcher ce = E.matcher(charc);
Matcher pe = E.matcher(charp2);
Matcher cn = N.matcher(charc);
Matcher pn = N.matcher(charp2);
// if current and previous characters are numbers...
if (cur >= '0' && cur <= '9'&& pre >= '0' && pre <= '9'){
if (cur == '9' && pre == '1' && cur1 == '9'&& cur2 >= '0' && cur2 <= '9'){ //199x
features.add("YR");
}else{
features.add("2N");
}
// if current and previous characters are not both numbers
// but previous char is a number
// i.e. patterns like "1N" , "2A", etc
} else if (pre >= '0' && pre <= '9'){
features.add("1N");
// if previous character is an English character
} else if(m.matches()){
features.add("E");
// if the previous character contains no radical (and it exist)
} else if(prer.equals(".") && charp.length() == 1){
// fire only when usePk and useHk are both false. Not sure why. -pichuan
if(!flags.useHk && !flags.usePk ){
if(ce.matches()){
features.add("PU+E");
}
if(pe.matches()){
features.add("E+PU");
}
if(cn.matches()){
features.add("PU+N");
}
if(pn.matches()){
features.add("N+PU");
}
}
features.add("PU");
}
String engType = isEnglish(charp, charc);
String engPU = isEngPU(charp);
if ( ! engType.equals(""))
features.add(engType);
if ( ! engPU.equals("") && ! engType.equals(""))
features.add(engPU + engType);
}//end of use rule
// features using "Character.getType" information!
String origS = c.get(CoreAnnotations.OriginalCharAnnotation.class);
char origC = ' ';
if (origS.length() > 0) { origC = origS.charAt(0); }
int type = Character.getType(origC);
switch (type) {
case Character.UPPERCASE_LETTER: // A-Z and full-width A-Z
case Character.LOWERCASE_LETTER: // a-z and full-width a-z
features.add("CHARTYPE-LETTER");
break;
case Character.DECIMAL_DIGIT_NUMBER:
features.add("CHARTYPE-DECIMAL_DIGIT_NUMBER");
break;
case Character.OTHER_LETTER: // mostly chinese chars
features.add("CHARTYPE-OTHER_LETTER");
break;
default: // other types
features.add("CHARTYPE-MISC");
}
return features;
}
public Collection<String> featuresCnC(PaddedList<IN> cInfo, int loc) {
Collection<String> features = new ArrayList<>();
CoreLabel c = cInfo.get(loc);
CoreLabel c1 = cInfo.get(loc + 1);
CoreLabel p = cInfo.get(loc - 1);
String charc = c.get(CoreAnnotations.CharAnnotation.class);
String charc1 = c1.get(CoreAnnotations.CharAnnotation.class);
String charp = p.get(CoreAnnotations.CharAnnotation.class);
if (flags.useWordn) {
features.add(charc +"c");
features.add(charc1+"c1");
features.add(charp +"p");
features.add(charp +charc +"pc");
if(flags.useAs || flags.useMsr||flags.usePk||flags.useHk){
features.add(charc +charc1 +"cc1");
features.add(charp + charc1 +"pc1");
}
features.add("|wordn");
}
return features;
}//end of CnC
}//end of Class