// ChineseNumberSequenceClassifier.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.ie.regexp;

import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
//import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
import edu.stanford.nlp.time.TimeExpressionExtractor;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PaddedList;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.PrintStream;
import java.util.*;
import java.util.regex.Pattern;

/**
 * A simple rule-based classifier that detects NUMBERs in a sequence of Chinese tokens. This classifier mimics the
 * behavior of {@link edu.stanford.nlp.ie.regexp.NumberSequenceClassifier} (without using SUTime) and works on
 * Chinese token sequences.
 * <p>
 * Every token receives exactly one label in its {@code AnswerAnnotation}: one of {@link #NUMBER_TAG},
 * {@link #DATE_TAG}, {@link #TIME_TAG}, {@link #MONEY_TAG}, {@link #ORDINAL_TAG}, {@link #PERCENT_TAG}, or the
 * background symbol taken from {@code flags}. The rules are driven by each token's part-of-speech tag
 * ({@code OD}, {@code CD}, {@code NT}, ...), so tokens should be POS-tagged before classification.
 *
 * TODO: An interface needs to be used to reuse code for NumberSequenceClassifier
 * TODO: Ideally a Chinese version of SUTime needs to be used to provide more flexibility and accuracy.
 *
 * @author Yuhao Zhang
 * @author Peng Qi
 */
public class ChineseNumberSequenceClassifier extends AbstractSequenceClassifier<CoreLabel> {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ChineseNumberSequenceClassifier.class);

  /** Compile-time switch for the extra logging in {@link #rightScanFindsMoneyWord(List, int)}. */
  private static final boolean DEBUG = false;

  /** Remembered only so that a warning can be logged; SUTime itself is never invoked by this class. */
  private final boolean useSUTime;

  public static final boolean USE_SUTIME_DEFAULT = false;
  public static final String USE_SUTIME_PROPERTY = "ner.useSUTime";
  public static final String USE_SUTIME_PROPERTY_BASE = "useSUTime";
  public static final String SUTIME_PROPERTY = "sutime";

  /** Always {@code null}: there is no Chinese SUTime extractor to plug in yet. */
  private final TimeExpressionExtractor timexExtractor;

  /** Creates a classifier with empty properties and {@link #USE_SUTIME_DEFAULT}. */
  public ChineseNumberSequenceClassifier() {
    this(new Properties(), USE_SUTIME_DEFAULT, new Properties());
  }

  /**
   * Creates a classifier with empty properties.
   *
   * @param useSUTime Requested SUTime setting; if {@code true} a warning is logged and the setting is ignored
   */
  public ChineseNumberSequenceClassifier(boolean useSUTime) {
    this(new Properties(), useSUTime, new Properties());
  }

  /**
   * Creates a classifier.
   *
   * @param props Properties handed to the superclass constructor
   * @param useSUTime Requested SUTime setting; if {@code true} a warning is logged and the setting is ignored
   * @param sutimeProps Currently unused (no SUTime extractor is created)
   */
  public ChineseNumberSequenceClassifier(Properties props, boolean useSUTime, Properties sutimeProps) {
    super(props);
    this.useSUTime = useSUTime;
    if (this.useSUTime) {
      // TODO: Need a Chinese version of SUTime
      log.warn("SUTime currently does not support Chinese. Ignore property ner.useSUTime.");
    }
    this.timexExtractor = null;
  }

  // All the tags we need
  public static final String NUMBER_TAG = "NUMBER";
  public static final String DATE_TAG = "DATE";
  public static final String TIME_TAG = "TIME";
  public static final String MONEY_TAG = "MONEY";
  public static final String ORDINAL_TAG = "ORDINAL";
  public static final String PERCENT_TAG = "PERCENT";

  // Patterns we need. All of them are applied with Matcher.matches(), i.e. against the whole token.
  public static final Pattern CURRENCY_WORD_PATTERN =
      Pattern.compile("元|刀|(?:美|欧|澳|加|日|韩)元|英?镑|法郎|卢比|卢布|马克|先令|克朗|泰?铢|(?:越南)?盾|美分|便士|块钱|毛钱|角钱");
  // In theory 块 钱 should be separated by segmenter, but just in case segmenter fails
  // TODO(yuhao): Need to add support for 块 钱, 毛 钱, 角 钱, 角, 五 块 二
  public static final Pattern PERCENT_WORD_PATTERN1 = Pattern.compile("(?:百分之|千分之).+");
  public static final Pattern PERCENT_WORD_PATTERN2 = Pattern.compile(".+%");
  public static final Pattern DATE_PATTERN1 = Pattern.compile(".+(?:年代?|月份?|日|号|世纪)");
  public static final Pattern DATE_PATTERN2 = Pattern.compile("(?:星期|周|礼拜).+");
  public static final Pattern DATE_PATTERN3 = Pattern.compile("[0-9一二三四五六七八九零〇十]{2,4}");
  public static final Pattern DATE_PATTERN4 = Pattern.compile("(?:[0-9]{2,4}[/\\-\\.][0-9]+[/\\-\\.][0-9]+|[0-9]+[/\\-\\.][0-9]+[/\\-\\.][0-9]{2,4}|[0-9]+[/\\-\\.]?[0-9]+)");
  public static final Pattern DATE_PATTERN5 = Pattern.compile("[昨今明][天晨晚夜早]");
  public static final Pattern TIME_PATTERN1 = Pattern.compile(".+(?::|点|时)(?:过|欠|差)?(?:.+(?::|分)?|整?|钟?|.+刻)?(?:.+秒?)"); // This only works when POS = NT

  private static final Pattern CHINESE_AND_ARABIC_NUMERALS_PATTERN = Pattern.compile("[一二三四五六七八九零十〇\\d]+");
  // This is used to capture a special case of date in Chinese: 70 后 or 七零 后
  private static final String DATE_AGE_LOCALIZER = "后";

  // order it by number of characters DESC for handy one-by-one matching of string suffix
  public static final String[] CURRENCY_WORDS_VALUES = new String[] {"越南盾", "美元", "欧元", "澳元", "加元", "日元", "韩元",
      "英镑", "法郎", "卢比", "卢布", "马克", "先令", "克朗", "泰铢", "盾", "铢", "刀", "镑", "元"};

  public static final String[] DATE_WORDS_VALUES = new String[] {"明天", "后天", "昨天", "前天", "明年", "后年", "去年", "前年",
      "昨日", "明日", "来年", "上月", "本月", "目前", "今后", "未来", "日前", "最近", "当时", "后来", "那时", "这时", "今", "今天",
      "当今", "如今", "之后", "当代", "以前", "现在", "将来", "此时", "此前", "元旦"};
  public static final HashSet<String> DATE_WORDS = new HashSet<>(Arrays.asList(DATE_WORDS_VALUES));

  public static final String[] TIME_WORDS_VALUES = new String[] {"早晨", "清晨", "凌晨", "上午", "中午", "下午", "傍晚", "晚上",
      "夜间", "晨间", "晚间", "午前", "午后", "早", "晚"};
  public static final HashSet<String> TIME_WORDS = new HashSet<>(Arrays.asList(TIME_WORDS_VALUES));

  /**
   * Use a set of heuristic rules to assign NER tags to tokens. The label of every token is written to its
   * {@code AnswerAnnotation}; tokens that match no rule get {@code flags.backgroundSymbol}.
   *
   * @param document The tokens of one sequence; each token's word and {@code PartOfSpeechAnnotation} are read
   * @return The same {@code document} list that was passed in, with every token's {@code AnswerAnnotation} set
   */
  @Override
  public List<CoreLabel> classify(List<CoreLabel> document) {
    // The padded view lets the rules look at i - 1 and i + 1 without bounds checks
    // (presumably PaddedList hands back `pad` for out-of-range indices -- confirm against PaddedList).
    PaddedList<CoreLabel> pl = new PaddedList<>(document, pad);
    for (int i = 0, sz = pl.size(); i < sz; i++) {
      pl.get(i).set(CoreAnnotations.AnswerAnnotation.class, chooseLabel(pl, i));
    }
    return document;
  }

  /**
   * Decide the label of the token at position {@code i}. Rules are tried in order and the first hit wins:
   * ordinal (POS OD), currency word right after a CD, the CD sub-rules (percent, money, "70 后" date, number),
   * the NT sub-rules (date, otherwise time), the 后 that follows a two-character numeral, and finally background.
   *
   * @param pl The padded token list
   * @param i The position of the token to label
   * @return The label for that token
   */
  private String chooseLabel(PaddedList<CoreLabel> pl, int i) {
    CoreLabel me = pl.get(i);
    CoreLabel prev = pl.get(i - 1);
    CoreLabel next = pl.get(i + 1);
    // Read once; every rule below only needs these two values of the current token.
    String word = me.word();
    String pos = me.getString(CoreAnnotations.PartOfSpeechAnnotation.class);

    // If current word is OD, label it as ORDINAL
    if ("OD".equals(pos)) {
      return ORDINAL_TAG;
    }

    // If current word is currency word and prev word is a CD
    if (matches(CURRENCY_WORD_PATTERN, word) &&
        "CD".equals(prev.getString(CoreAnnotations.PartOfSpeechAnnotation.class))) {
      return MONEY_TAG;
    }

    if ("CD".equals(pos)) {
      // TODO(yuhao): Need to support Chinese captial numbers like 叁拾 (This won't be POS-tagged as CD).
      if (matches(PERCENT_WORD_PATTERN1, word) || matches(PERCENT_WORD_PATTERN2, word)) {
        // If current word is a percent
        return PERCENT_TAG;
      }
      if (rightScanFindsMoneyWord(pl, i)) {
        // The run of CD tokens starting here is followed by a currency word
        return MONEY_TAG;
      }
      if (isTwoCharNumeral(word) && DATE_AGE_LOCALIZER.equals(next.word())) {
        // This is to extract a special case of DATE: 70 后 or 七零 后
        return DATE_TAG;
      }
      // Otherwise we should safely label it as NUMBER
      return NUMBER_TAG;
    }

    if ("NT".equals(pos)) {
      // If current word is a NT (temporal noun): anything that does not look like a date is labeled TIME.
      // TIME may have more variants (really?) so TIME is the default; TIME_PATTERN1 and TIME_WORDS therefore
      // cannot change the outcome and are not consulted here.
      return looksLikeDate(word) ? DATE_TAG : TIME_TAG;
    }

    // Label 后 as DATE if the sequence is 70 后 or 七零 后
    if (DATE_AGE_LOCALIZER.equals(word) && isTwoCharNumeral(prev.word())) {
      return DATE_TAG;
    }

    // by default set to be "O"
    return flags.backgroundSymbol;
  }

  /** Whole-string regex match that treats a missing ({@code null}) word as a non-match instead of throwing. */
  private static boolean matches(Pattern pattern, String word) {
    return word != null && pattern.matcher(word).matches();
  }

  /** Whether {@code word} is exactly two characters long and consists only of Chinese/Arabic numerals. */
  private static boolean isTwoCharNumeral(String word) {
    return word != null && word.length() == 2 && CHINESE_AND_ARABIC_NUMERALS_PATTERN.matcher(word).matches();
  }

  /** Whether {@code word} matches one of the DATE patterns or is listed in {@link #DATE_WORDS}. */
  private static boolean looksLikeDate(String word) {
    return matches(DATE_PATTERN1, word) ||
        matches(DATE_PATTERN2, word) ||
        matches(DATE_PATTERN3, word) ||
        matches(DATE_PATTERN4, word) ||
        matches(DATE_PATTERN5, word) ||
        DATE_WORDS.contains(word);
  }

  /**
   * Look along CD words and see if next thing is a money word.
   * Starting at {@code i}, skips every consecutive token whose POS is CD; the first non-CD token counts as a
   * money word if its POS is M, NN or NNS and its word matches {@link #CURRENCY_WORD_PATTERN}.
   *
   * @param pl The list of CoreLabel
   * @param i The position to scan right from
   * @return Whether a money word is found; {@code false} if the CD run extends to the end of the list
   */
  private static boolean rightScanFindsMoneyWord(List<CoreLabel> pl, int i) {
    int j = i;
    if (DEBUG) {
      log.info("rightScan from: " + pl.get(j).word());
    }
    int sz = pl.size();
    while (j < sz && "CD".equals(pl.get(j).getString(CoreAnnotations.PartOfSpeechAnnotation.class))) {
      j++;
    }
    if (j >= sz) {
      return false;
    }
    CoreLabel candidate = pl.get(j);
    String tag = candidate.getString(CoreAnnotations.PartOfSpeechAnnotation.class);
    String word = candidate.word();
    // Computed once so that the debug output always reports exactly what is returned.
    boolean found = (tag.equals("M") || tag.equals("NN") || tag.equals("NNS")) && matches(CURRENCY_WORD_PATTERN, word);
    if (DEBUG) {
      log.info("rightScan testing: " + word + '/' + tag + "; answer is: " + found);
    }
    return found;
  }

  /**
   * Classifies {@code tokenSequence} exactly like {@link #classify(List)}; {@code document} and {@code sentence}
   * are not used. Logs a warning if SUTime was requested at construction time.
   */
  @Override
  public List<CoreLabel> classifyWithGlobalInformation(List<CoreLabel> tokenSequence, CoreMap document, CoreMap sentence) {
    if (useSUTime) {
      log.warn("Warning: ChineseNumberSequenceClassifier does not have SUTime implementation.");
    }
    return classify(tokenSequence);
  }

  /** No-op: training is not needed for this rule-based classifier. */
  @Override
  public void train(Collection<List<CoreLabel>> docs, DocumentReaderAndWriter<CoreLabel> readerAndWriter) {
    // Train is not needed for this rule-based classifier
  }

  /** No-op: nothing is written to {@code serializePath}. */
  @Override
  public void serializeClassifier(String serializePath) {
  }

  /** No-op: nothing is written to {@code oos}. */
  @Override
  public void serializeClassifier(ObjectOutputStream oos) {
  }

  /** No-op: nothing is read from {@code in}. */
  @Override
  public void loadClassifier(ObjectInputStream in, Properties props) throws IOException, ClassCastException, ClassNotFoundException {
  }

  /**
   * Intentionally does nothing. A former ad-hoc demo (run a StanfordCoreNLP pipeline over a text file with one
   * sentence per line and pretty-print the annotations) was disabled here; it depended on files on a
   * developer's machine.
   *
   * @param args Ignored
   * @throws IOException Never thrown by the current body; kept so the signature is unchanged
   */
  public static void main(String[] args) throws IOException {
    // no-op
  }
}