package edu.stanford.nlp.trees.international.pennchinese; import edu.stanford.nlp.util.logging.Redwood; import java.util.List; import java.util.ArrayList; import java.util.regex.*; import edu.stanford.nlp.ling.HasWord; import java.util.function.Function; import edu.stanford.nlp.util.StringUtils; import edu.stanford.nlp.util.UTF8EquivalenceFunction; /** * An Escaper for Chinese normalization to match Treebank. * Currently normalizes "ASCII" characters into the full-width * range used inside the Penn Chinese Treebank. * <p/> * <i>Notes:</i> Smart quotes appear in CTB, and are left unchanged. * I think you get various hyphen types from U+2000 range too - certainly, * Roger lists them in LanguagePack. * * @author Christopher Manning */ public class ChineseEscaper implements Function<List<HasWord>, List<HasWord>> { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(ChineseEscaper.class); /** IBM entity normalization patterns */ private static final Pattern p2 = Pattern.compile("\\$[a-z]+_\\((.*?)\\|\\|.*?\\)"); /** <i>Note:</i> At present this clobbers the input list items. * This should be fixed. */ public List<HasWord> apply(List<HasWord> arg) { List<HasWord> ans = new ArrayList<>(arg); for (HasWord wd : ans) { String w = wd.word(); Matcher m2 = p2.matcher(w); // log.info("Escaper: w is " + w); if (m2.find()) { // log.info(" Found pattern."); w = m2.replaceAll("$1"); // log.info(" Changed it to: " + w); } String newW = UTF8EquivalenceFunction.replaceAscii(w); wd.setWord(newW); } return ans; } }