package edu.stanford.nlp.ie.regexp; import edu.stanford.nlp.ie.AbstractSequenceClassifier; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.ling.CoreAnnotation; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.pipeline.Annotation; //import edu.stanford.nlp.pipeline.StanfordCoreNLP; import edu.stanford.nlp.sequences.DocumentReaderAndWriter; import edu.stanford.nlp.time.TimeExpressionExtractor; import edu.stanford.nlp.util.CoreMap; import edu.stanford.nlp.util.PaddedList; import edu.stanford.nlp.util.StringUtils; import edu.stanford.nlp.util.logging.Redwood; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.PrintStream; import java.util.*; import java.util.regex.Pattern; /** * A simple rule-based classifier that detects NUMBERs in a sequence of Chinese tokens. This classifier mimics the * behavior of {@link edu.stanford.nlp.ie.regexp.NumberSequenceClassifier} (without using SUTime) and works on Chinese sequence. * * TODO: An interface needs to be used to reuse code for NumberSequenceClassifier * TODO: Ideally a Chinese version of SUTime needs to be used to provide more flexibility and accuracy. * * @author Yuhao Zhang * @author Peng Qi */ public class ChineseNumberSequenceClassifier extends AbstractSequenceClassifier<CoreLabel> { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(ChineseNumberSequenceClassifier.class); private static final boolean DEBUG = false; private final boolean useSUTime; public static final boolean USE_SUTIME_DEFAULT = false; public static final String USE_SUTIME_PROPERTY = "ner.useSUTime"; public static final String USE_SUTIME_PROPERTY_BASE = "useSUTime"; public static final String SUTIME_PROPERTY = "sutime"; private final TimeExpressionExtractor timexExtractor; public ChineseNumberSequenceClassifier() { this(new Properties(), USE_SUTIME_DEFAULT, new Properties()); } public ChineseNumberSequenceClassifier(boolean useSUTime) { this(new Properties(), useSUTime, new Properties()); } public ChineseNumberSequenceClassifier(Properties props, boolean useSUTime, Properties sutimeProps) { super(props); this.useSUTime = useSUTime; if(this.useSUTime) { // TODO: Need a Chinese version of SUTime log.warn("SUTime currently does not support Chinese. Ignore property ner.useSUTime."); } this.timexExtractor = null; } // All the tags we need public static final String NUMBER_TAG = "NUMBER"; public static final String DATE_TAG = "DATE"; public static final String TIME_TAG = "TIME"; public static final String MONEY_TAG = "MONEY"; public static final String ORDINAL_TAG = "ORDINAL"; public static final String PERCENT_TAG = "PERCENT"; // Patterns we need public static final Pattern CURRENCY_WORD_PATTERN = Pattern.compile("元|刀|(?:美|欧|澳|加|日|韩)元|英?镑|法郎|卢比|卢布|马克|先令|克朗|泰?铢|(?:越南)?盾|美分|便士|块钱|毛钱|角钱"); // In theory 块 钱 should be separated by segmenter, but just in case segmenter fails // TODO(yuhao): Need to add support for 块 钱, 毛 钱, 角 钱, 角, 五 块 二 public static final Pattern PERCENT_WORD_PATTERN1 = Pattern.compile("(?:百分之|千分之).+"); public static final Pattern PERCENT_WORD_PATTERN2 = Pattern.compile(".+%"); public static final Pattern DATE_PATTERN1 = Pattern.compile(".+(?:年代?|月份?|日|号|世纪)"); public static final Pattern DATE_PATTERN2 = Pattern.compile("(?:星期|周|礼拜).+"); public static final Pattern DATE_PATTERN3 = Pattern.compile("[0-9一二三四五六七八九零〇十]{2,4}"); public static final Pattern DATE_PATTERN4 = Pattern.compile("(?:[0-9]{2,4}[/\\-\\.][0-9]+[/\\-\\.][0-9]+|[0-9]+[/\\-\\.][0-9]+[/\\-\\.][0-9]{2,4}|[0-9]+[/\\-\\.]?[0-9]+)"); public static final Pattern DATE_PATTERN5 = Pattern.compile("[昨今明][天晨晚夜早]"); public static final Pattern TIME_PATTERN1 = Pattern.compile(".+(?::|点|时)(?:过|欠|差)?(?:.+(?::|分)?|整?|钟?|.+刻)?(?:.+秒?)"); // This only works when POS = NT private static final Pattern CHINESE_AND_ARABIC_NUMERALS_PATTERN = Pattern.compile("[一二三四五六七八九零十〇\\d]+"); // This is used to capture a special case of date in Chinese: 70 后 or 七零 后 private static final String DATE_AGE_LOCALIZER = "后"; // order it by number of characters DESC for handy one-by-one matching of string suffix public static final String[] CURRENCY_WORDS_VALUES = new String[] {"越南盾", "美元", "欧元", "澳元", "加元", "日元", "韩元", "英镑", "法郎", "卢比", "卢布", "马克", "先令", "克朗", "泰铢", "盾", "铢", "刀", "镑", "元"}; public static final String[] DATE_WORDS_VALUES = new String[] {"明天", "后天", "昨天", "前天", "明年", "后年", "去年", "前年", "昨日", "明日", "来年", "上月", "本月", "目前", "今后", "未来", "日前", "最近", "当时", "后来", "那时", "这时", "今", "今天", "当今", "如今", "之后", "当代", "以前", "现在", "将来", "此时", "此前", "元旦"}; public static final HashSet<String> DATE_WORDS = new HashSet<>(Arrays.asList(DATE_WORDS_VALUES)); public static final String[] TIME_WORDS_VALUES = new String[] {"早晨", "清晨", "凌晨", "上午", "中午", "下午", "傍晚", "晚上", "夜间", "晨间", "晚间", "午前", "午后", "早", "晚"}; public static final HashSet<String> TIME_WORDS = new HashSet<>(Arrays.asList(TIME_WORDS_VALUES)); /** * Use a set of heuristic rules to assign NER tags to tokens. * @param document A {@link List} of something that extends {@link CoreMap}. * @return */ @Override public List<CoreLabel> classify(List<CoreLabel> document) { // The actual implementation of the classifier PaddedList<CoreLabel> pl = new PaddedList<>(document, pad); for (int i = 0, sz = pl.size(); i < sz; i++) { CoreLabel me = pl.get(i); CoreLabel prev = pl.get(i - 1); CoreLabel next = pl.get(i + 1); // by default set to be "O" me.set(CoreAnnotations.AnswerAnnotation.class, flags.backgroundSymbol); // If current word is OD, label it as ORDINAL if(me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("OD")) { me.set(CoreAnnotations.AnswerAnnotation.class, ORDINAL_TAG); } else if(CURRENCY_WORD_PATTERN.matcher(me.word()).matches() && prev.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) { // If current word is currency word and prev word is a CD me.set(CoreAnnotations.AnswerAnnotation.class, MONEY_TAG); } else if(me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) { // TODO(yuhao): Need to support Chinese captial numbers like 叁拾 (This won't be POS-tagged as CD). // If current word is a CD if(PERCENT_WORD_PATTERN1.matcher(me.word()).matches() || PERCENT_WORD_PATTERN2.matcher(me.word()).matches()) { // If current word is a percent me.set(CoreAnnotations.AnswerAnnotation.class, PERCENT_TAG); } else if(rightScanFindsMoneyWord(pl, i)) { // If one the right finds a currency word me.set(CoreAnnotations.AnswerAnnotation.class, MONEY_TAG); } else if(me.word().length() == 2 && CHINESE_AND_ARABIC_NUMERALS_PATTERN.matcher(me.word()).matches() && DATE_AGE_LOCALIZER.equals(next.word())) { // This is to extract a special case of DATE: 70 后 or 七零 后 me.set(CoreAnnotations.AnswerAnnotation.class, DATE_TAG); } else { // Otherwise we should safely label it as NUMBER me.set(CoreAnnotations.AnswerAnnotation.class, NUMBER_TAG); } } else if(me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NT")) { // If current word is a NT (temporal noun) if(DATE_PATTERN1.matcher(me.word()).matches() || DATE_PATTERN2.matcher(me.word()).matches() || DATE_PATTERN3.matcher(me.word()).matches() || DATE_PATTERN4.matcher(me.word()).matches() || DATE_PATTERN5.matcher(me.word()).matches() || DATE_WORDS.contains(me.word())) { me.set(CoreAnnotations.AnswerAnnotation.class, DATE_TAG); } else if(TIME_PATTERN1.matcher(me.word()).matches() || TIME_WORDS.contains(me.word())) { me.set(CoreAnnotations.AnswerAnnotation.class, TIME_TAG); } else { // TIME may have more variants (really?) so always add as TIME by default me.set(CoreAnnotations.AnswerAnnotation.class, TIME_TAG); } } else if(DATE_AGE_LOCALIZER.equals(me.word()) && prev.word() != null && prev.word().length() == 2 && CHINESE_AND_ARABIC_NUMERALS_PATTERN.matcher(prev.word()).matches()) { // Label 后 as DATE if the sequence is 70 后 or 七零 后 me.set(CoreAnnotations.AnswerAnnotation.class, DATE_TAG); } } return document; } /** * Look along CD words and see if next thing is a money word. * * @param pl The list of CoreLabel * @param i The position to scan right from * @return Whether a money word is found */ private static boolean rightScanFindsMoneyWord(List<CoreLabel> pl, int i) { int j = i; if (DEBUG) { log.info("rightScan from: " + pl.get(j).word()); } int sz = pl.size(); while (j < sz && pl.get(j).getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) { j++; } if (j >= sz) { return false; } String tag = pl.get(j).getString(CoreAnnotations.PartOfSpeechAnnotation.class); String word = pl.get(j).word(); if (DEBUG) { log.info("rightScan testing: " + word + '/' + tag + "; answer is: " + Boolean.toString((tag.equals("NN") || tag.equals("NNS")) && CURRENCY_WORD_PATTERN.matcher(word).matches())); } return (tag.equals("M") || tag.equals("NN") || tag.equals("NNS")) && CURRENCY_WORD_PATTERN.matcher(word).matches(); } @Override public List<CoreLabel> classifyWithGlobalInformation(List<CoreLabel> tokenSequence, CoreMap document, CoreMap sentence) { if(useSUTime) { log.warn("Warning: ChineseNumberSequenceClassifier does not have SUTime implementation."); } return classify(tokenSequence); } @Override public void train(Collection<List<CoreLabel>> docs, DocumentReaderAndWriter<CoreLabel> readerAndWriter) { // Train is not needed for this rule-based classifier } @Override public void serializeClassifier(String serializePath) { } @Override public void serializeClassifier(ObjectOutputStream oos) { } @Override public void loadClassifier(ObjectInputStream in, Properties props) throws IOException, ClassCastException, ClassNotFoundException { } public static void main(String[] args) throws IOException { /* Properties props = StringUtils.argsToProperties("-props", "/Users/yuhao/Research/tmp/ChineseNumberClassifierProps.properties"); // Properties props = StringUtils.argsToProperties("-props", "/Users/yuhao/Research/tmp/EnglishNumberClassifierProps.properties"); props.setProperty("outputFormat", "text"); props.setProperty("ssplit.boundaryTokenRegex", "\\n"); // one sentence per line StanfordCoreNLP pipeline = new StanfordCoreNLP(props); String docFileName = "/Users/yuhao/Research/tmp/chinese_number_examples.txt"; // String docFileName = "/Users/yuhao/Research/tmp/english_number_examples.txt"; List<String> docLines = IOUtils.linesFromFile(docFileName); PrintStream out = new PrintStream(docFileName + ".out"); for (String docLine : docLines) { Annotation sentenceAnnotation = new Annotation(docLine); pipeline.annotate(sentenceAnnotation); pipeline.prettyPrint(sentenceAnnotation, out); pipeline.prettyPrint(sentenceAnnotation, System.out); } out.close();*/ } }