ChineseEscaper.java example

Explorer
Stanford-NLP-master
- CoreNLP-master
package edu.stanford.nlp.trees.international.pennchinese; 
import edu.stanford.nlp.util.logging.Redwood;

import java.util.List;
import java.util.ArrayList;
import java.util.regex.*;

import edu.stanford.nlp.ling.HasWord;
import java.util.function.Function;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.UTF8EquivalenceFunction;

/**
 * An Escaper for Chinese normalization to match Treebank.
 * Currently normalizes "ASCII" characters into the full-width
 * range used inside the Penn Chinese Treebank.
 * <p>
 * <i>Notes:</i> Smart quotes appear in CTB, and are left unchanged.
 * I think you get various hyphen types from U+2000 range too - certainly,
 * Roger lists them in LanguagePack.
 *
 * @author Christopher Manning
 */
public class ChineseEscaper implements Function<List<HasWord>, List<HasWord>>  {

  /** A logger for this class; the normalization itself does not log anything. */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ChineseEscaper.class);

  /**
   * IBM entity normalization pattern. Matches markup of the form
   * {@code $name_(X||Y)}, where {@code name} is one or more lowercase ASCII
   * letters, and captures {@code X} as group 1.
   */
  private static final Pattern IBM_ENTITY_PATTERN =
      Pattern.compile("\\$[a-z]+_\\((.*?)\\|\\|.*?\\)");

  /**
   * Normalizes the word of every item in the given list.
   * <p>
   * For each item, every occurrence of the IBM entity markup
   * {@code $name_(X||Y)} in its word is replaced by {@code X}, and the
   * result is then passed through
   * {@link UTF8EquivalenceFunction#replaceAscii(String)}.
   * <p>
   * <i>Note:</i> At present this clobbers the input list items: the returned
   * list is a new {@code ArrayList}, but it holds the <em>same</em>
   * {@code HasWord} objects as {@code arg}, and each of them is modified in
   * place via {@code setWord}. This should be fixed.
   *
   * @param arg the words to normalize; neither the list nor the word of any
   *            item is checked for {@code null}
   * @return a new list containing the (mutated) items of {@code arg}, in the
   *         same order
   */
  @Override
  public List<HasWord> apply(List<HasWord> arg) {
    // Shallow copy: only the list is new, the elements are shared with arg.
    List<HasWord> ans = new ArrayList<>(arg);
    for (HasWord wd : ans) {
      // replaceAll returns the input unchanged when the pattern does not
      // occur, so no separate find() check is needed.
      String stripped = IBM_ENTITY_PATTERN.matcher(wd.word()).replaceAll("$1");
      wd.setWord(UTF8EquivalenceFunction.replaceAscii(stripped));
    }
    return ans;
  }

}