package edu.stanford.nlp.ie;
import edu.stanford.nlp.ie.regexp.ChineseNumberSequenceClassifier;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.lang.System.err;
/**
* A Chinese correspondence of the {@link QuantifiableEntityNormalizer} that normalizes NUMBER, DATE, TIME,
* MONEY, PERCENT and ORDINAL amounts expressed in Chinese.
*
* Note that this class is originally designed for the Chinese KBP Challenge, so it only
* supports minimal functionalities. This needs to be completed in the future.
*
* @author Yuhao Zhang
* @author Peng Qi
*/
public class ChineseQuantifiableEntityNormalizer {
private static Redwood.RedwoodChannels log = Redwood.channels(ChineseQuantifiableEntityNormalizer.class);
private static final boolean DEBUG = false;
public static String BACKGROUND_SYMBOL = SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL;
private static final Set<String> quantifiable; //Entity types that are quantifiable
private static final ClassicCounter<String> wordsToValues;
private static final ClassicCounter<String> quantityUnitToValues;
private static final Map<String, Character> multiCharCurrencyWords; // used by money
private static final Map<String, Character> oneCharCurrencyWords; // used by money
private static final Map<String, String> fullDigitToHalfDigit;
private static final Map<String, Integer> yearModifiers;
private static final Map<String, Integer> monthDayModifiers;
private static final String LITERAL_DECIMAL_POINT = "点";
// Patterns we need
// TODO (yuhao): here we are not considering 1) negative numbers, 2) Chinese traditional characters
private static final Pattern ARABIC_NUMBERS_PATTERN = Pattern.compile("[-+]?\\d*\\.?\\d+");
// This is the all-literal-number-characters sequence, excluding unit characters like 十 or 万
private static final Pattern CHINESE_LITERAL_NUMBER_SEQUENCE_PATTERN = Pattern.compile("[一二三四五六七八九零〇]+");
// The decimal part of a float number should be exactly literal number sequence without units
private static final Pattern CHINESE_LITERAL_DECIMAL_PATTERN = CHINESE_LITERAL_NUMBER_SEQUENCE_PATTERN;
// Used by quantity modifiers
private static final String greaterEqualThreeWords = "(?:大|多|高)于或者?等于";
private static final String lessEqualThreeWords = "(?:小|少|低)于或者?等于";
private static final String greaterEqualTwoWords = "(?:大|多)于等于|不(?:少|小|低)于";
private static final String lessEqualTwoWords = "(?:小|少)于等于|不(?:大|少|高)于|不超过";
private static final String approxTwoWords = "大(?:概|约|致)(?:是|为)|大概其";
private static final String greaterThanOneWord = "(?:大|多|高)于|(?:超|高|多)过";;
private static final String lessThanOneWord = "(?:小|少|低)于|不(?:到|够|足)";
private static final String approxOneWord = "大(?:约|概|致)|接?近|差不多|几乎|左右|上下|约(?:为|略)";
// All the tags we need
private static final String NUMBER_TAG = "NUMBER";
private static final String DATE_TAG = "DATE";
private static final String TIME_TAG = "TIME";
private static final String MONEY_TAG = "MONEY";
private static final String ORDINAL_TAG = "ORDINAL";
private static final String PERCENT_TAG = "PERCENT";
// static initialization of useful properties
static {
quantifiable = Generics.newHashSet();
quantifiable.add(NUMBER_TAG);
quantifiable.add(DATE_TAG);
quantifiable.add(TIME_TAG);
quantifiable.add(MONEY_TAG);
quantifiable.add(PERCENT_TAG);
quantifiable.add(ORDINAL_TAG);
quantityUnitToValues = new ClassicCounter<>();
quantityUnitToValues.setCount("十", 10.0);
quantityUnitToValues.setCount("百", 100.0);
quantityUnitToValues.setCount("千", 1000.0);
quantityUnitToValues.setCount("万", 10000.0);
quantityUnitToValues.setCount("亿", 100000000.0);
wordsToValues = new ClassicCounter<>();
wordsToValues.setCount("零", 0.0);
wordsToValues.setCount("〇", 0.0);
wordsToValues.setCount("一", 1.0);
wordsToValues.setCount("二", 2.0);
wordsToValues.setCount("两", 2.0);
wordsToValues.setCount("三", 3.0);
wordsToValues.setCount("四", 4.0);
wordsToValues.setCount("五", 5.0);
wordsToValues.setCount("六", 6.0);
wordsToValues.setCount("七", 7.0);
wordsToValues.setCount("八", 8.0);
wordsToValues.setCount("九", 9.0);
wordsToValues.addAll(quantityUnitToValues); // all units are also quantifiable individually
multiCharCurrencyWords = Generics.newHashMap();
multiCharCurrencyWords.put("美元", '$');
multiCharCurrencyWords.put("美分", '$');
multiCharCurrencyWords.put("英镑", '£');
multiCharCurrencyWords.put("先令", '£');
multiCharCurrencyWords.put("便士", '£');
multiCharCurrencyWords.put("欧元", '€');
multiCharCurrencyWords.put("日元", '¥');
multiCharCurrencyWords.put("韩元", '₩');
oneCharCurrencyWords = Generics.newHashMap();
oneCharCurrencyWords.put("刀", '$');
oneCharCurrencyWords.put("镑", '£');
oneCharCurrencyWords.put("元", '元'); // We follow the tradition in English to use 元 instead of ¥ for RMB
// For all other currency, we use default currency symbol $
yearModifiers = Generics.newHashMap();
yearModifiers.put("前", -2);
yearModifiers.put("去", -1);
yearModifiers.put("上", -1);
yearModifiers.put("今", 0);
yearModifiers.put("同", 0);
yearModifiers.put("此", 0);
yearModifiers.put("该", 0);
yearModifiers.put("本", 0);
yearModifiers.put("明", 1);
yearModifiers.put("来", 1);
yearModifiers.put("下", 1);
yearModifiers.put("后", 2);
monthDayModifiers = Generics.newHashMap();
monthDayModifiers.put("昨", -1);
monthDayModifiers.put("上", -1);
monthDayModifiers.put("今", 0);
monthDayModifiers.put("同", 0);
monthDayModifiers.put("此", 0);
monthDayModifiers.put("该", 0);
monthDayModifiers.put("本", 0);
monthDayModifiers.put("来", 1);
monthDayModifiers.put("明", 1);
monthDayModifiers.put("下", 1);
fullDigitToHalfDigit = Generics.newHashMap();
fullDigitToHalfDigit.put("1", "1");
fullDigitToHalfDigit.put("2", "2");
fullDigitToHalfDigit.put("3", "3");
fullDigitToHalfDigit.put("4", "4");
fullDigitToHalfDigit.put("5", "5");
fullDigitToHalfDigit.put("6", "6");
fullDigitToHalfDigit.put("7", "7");
fullDigitToHalfDigit.put("8", "8");
fullDigitToHalfDigit.put("9", "9");
fullDigitToHalfDigit.put("0", "0");
}
// Patterns used by DATE and TIME (must be after the static initializers to make use of the modifiers)
private static final String CHINESE_DATE_NUMERALS_PATTERN = "[一二三四五六七八九零十〇]";
private static final String CHINESE_AND_ARABIC_NUMERALS_PATTERN = "[一二三四五六七八九零十〇\\d]";
private static final String CHINESE_AND_ARABIC_NUMERALS_PATTERN_WO_TEN = "[一二三四五六七八九零〇\\d]";
private static final String YEAR_MODIFIER_PATTERN = "[" + String.join("", yearModifiers.keySet()) + "]";
private static final String MONTH_DAY_MODIFIER_PATTERN = "[" + String.join("", monthDayModifiers.keySet()) + "]";
private static final String BASIC_DD_PATTERN = "("
+ CHINESE_AND_ARABIC_NUMERALS_PATTERN + "{1,3}|" + MONTH_DAY_MODIFIER_PATTERN + ")[日号&&[^年月]]?";
private static final String BASIC_MMDD_PATTERN = "(" + CHINESE_AND_ARABIC_NUMERALS_PATTERN + "{1,2}|"
+ MONTH_DAY_MODIFIER_PATTERN + ")(?:月份?|\\-|/|\\.)(?:" + BASIC_DD_PATTERN + ")?";
private static final String BASIC_YYYYMMDD_PATTERN = "(" + CHINESE_AND_ARABIC_NUMERALS_PATTERN_WO_TEN + "{2,4}|"
+ YEAR_MODIFIER_PATTERN + ")(?:年[份度]?|\\-|/|\\.)?" + "(?:" + BASIC_MMDD_PATTERN + ")?";
private static final String ENGLISH_MMDDYYYY_PATTERN = "(\\d{1,2})[/\\-\\.](\\d{1,2})(?:[/\\-\\.](\\d{4}))?";
private static final String RELATIVE_TIME_PATTERN = "([昨今明])[天晨晚夜早]";
private static final String BIRTH_DECADE_PATTERN = "(" + CHINESE_AND_ARABIC_NUMERALS_PATTERN + "[0零〇5五])后";
/**
* Identifies contiguous MONEY, TIME, DATE, or PERCENT entities
* and tags each of their constituents with a "normalizedQuantity"
* label which contains the appropriate normalized string corresponding to
* the full quantity.
* Unlike the English normalizer, this method currently does not support
* concatenation or SUTime.
*
* @param list A list of {@link CoreMap}s representing a single document.
* Note: We assume the NERs has been labelled and the labels
* will be updated in place.
* @param document
* @param sentence
* @param <E>
*/
public static <E extends CoreMap> void addNormalizedQuantitiesToEntities(List<E> list, CoreMap document, CoreMap sentence) {
// Fix the NER sequence if necessay
fixupNerBeforeNormalization(list);
// Now that NER tags has been fixed up, we do another pass to add the normalization
String prevNerTag = BACKGROUND_SYMBOL;
int beforeIndex = -1;
ArrayList<E> collector = new ArrayList<>();
for (int i = 0, sz = list.size(); i <= sz; i++) {
// we should always keep list.size() unchanged inside the loop
E wi = null;
String currNerTag = null;
String nextWord = "";
if(i < sz) {
wi = list.get(i);
if(DEBUG) {
log.info("addNormalizedQuantitiesToEntities: wi=" + wi + ", collector=" + collector);
}
if(i+1 < sz) {
nextWord = list.get(i+1).get(CoreAnnotations.TextAnnotation.class);
if(nextWord == null) {
nextWord = "";
}
}
// We assume NERs have been set by previous NER taggers
currNerTag = wi.get(CoreAnnotations.NamedEntityTagAnnotation.class);
// TODO: may need to detect TIME modifier here?
}
E wprev = (i > 0) ? list.get(i-1) : null;
// if the current wi is a non-continuation and the last one was a
// quantity, we close and process the last segment.
// TODO: also need to check compatibility as the English normalizer does
if((currNerTag == null || !currNerTag.equals(prevNerTag)) && quantifiable.contains(prevNerTag)) {
String modifier = null;
// Need different handling for different tags
switch (prevNerTag) {
case TIME_TAG:
// TODO: add TIME
break;
case DATE_TAG:
processEntity(collector, prevNerTag, modifier, nextWord, document);
break;
default:
if(prevNerTag.equals(NUMBER_TAG) || prevNerTag.equals(PERCENT_TAG) ||
prevNerTag.equals(MONEY_TAG)) {
// we are doing for prev tag so afterIndex should really be i
modifier = detectQuantityModifier(list, beforeIndex, i);
}
processEntity(collector, prevNerTag, modifier, nextWord);
break;
}
collector = new ArrayList<>();
}
// If currNerTag is quantifiable, we add it into collector
if(quantifiable.contains(currNerTag)) {
if(collector.isEmpty()) {
beforeIndex = i - 1;
}
collector.add(wi);
}
// move on and update prev pointer
prevNerTag = currNerTag;
}
}
/**
* Detect the quantity modifiers ahead of a numeric string. This method will look at three words ahead
* and one word afterwards at most. Examples of modifiers are "大约", "多于".
*
* @param list
* @param beforeIndex
* @param afterIndex
* @param <E>
* @return
*/
private static <E extends CoreMap> String detectQuantityModifier(List<E> list, int beforeIndex, int afterIndex) {
String prev = (beforeIndex >= 0) ? list.get(beforeIndex).get(CoreAnnotations.TextAnnotation.class).toLowerCase(): "";
String prev2 = (beforeIndex - 1 >= 0) ? list.get(beforeIndex - 1).get(CoreAnnotations.TextAnnotation.class).toLowerCase(): "";
String prev3 = (beforeIndex - 2 >= 0) ? list.get(beforeIndex - 2).get(CoreAnnotations.TextAnnotation.class).toLowerCase(): "";
int sz = list.size();
String next = (afterIndex < sz) ? list.get(afterIndex).get(CoreAnnotations.TextAnnotation.class).toLowerCase(): "";
if (DEBUG) {
// output space for clarity
log.info("Quantifiable modifiers: previous: " + prev3 + ' ' + prev2+ ' ' + prev);
log.info("Quantifiable modifiers: next: " + next);
}
// Actually spaces won't be used for Chinese
String longPrev = prev3 + prev2 + prev;
if (longPrev.matches(lessEqualThreeWords)) { return "<="; }
if (longPrev.matches(greaterEqualThreeWords)) { return ">="; }
longPrev = prev2 + prev;
if (longPrev.matches(greaterEqualTwoWords)) { return ">="; }
if (longPrev.matches(lessEqualTwoWords)) { return "<="; }
if (longPrev.matches(approxTwoWords)) { return "~"; }
if (prev.matches(greaterThanOneWord)) { return ">"; }
if (prev.matches(lessThanOneWord)) { return "<"; }
if (prev.matches(approxOneWord)) { return "~"; }
if (next.matches(approxOneWord)) { return "~"; }
// As backup, we also check whether prev matches a two-word pattern, just in case the segmenter fails
// This happens to <= or >= patterns sometime as observed.
if (prev.matches(greaterEqualTwoWords)) { return ">="; }
if (prev.matches(lessEqualTwoWords)) { return "<="; }
// otherwise, not modifier detected and return null
if (DEBUG) { err.println("Quantifiable: not a quantity modifier"); }
return null;
}
private static <E extends CoreMap> List<E> processEntity(List<E> l,
String entityType, String compModifier, String nextWord) {
return processEntity(l, entityType, compModifier, nextWord, null);
}
/**
* Process an entity given the NER tag, extracted modifier and the next word in the document.
* The normalized quantity will be written in place.
*
* @param l A collector that collects annotations for the entity.
* @param entityType Quantifiable NER tag.
* @param compModifier The extracted modifier around the entity of interest. Different NER tags should
* have different extraction rules.
* @param nextWord Next word in the document.
* @param document Reference to the document.
* @param <E>
* @return
*/
private static <E extends CoreMap> List<E> processEntity(List<E> l,
String entityType, String compModifier, String nextWord, CoreMap document) {
if(DEBUG) {
log.info("ChineseQuantifiableEntityNormalizer.processEntity: " + l);
}
// convert the entity annotations into a string
String s = singleEntityToString(l);
StringBuilder sb = new StringBuilder();
// convert all full digits to half digits
for (int i = 0, sz = s.length(); i < sz; i++) {
String ch = s.substring(i, i+1);
if (fullDigitToHalfDigit.containsKey(ch)) {
ch = fullDigitToHalfDigit.get(ch);
}
sb.append(ch);
}
s = sb.toString();
if(DEBUG) {
log.info("Quantifiable: Processing entity string " + s);
}
String p = null;
switch (entityType) {
case NUMBER_TAG:
p = "";
if (compModifier != null) {
p = compModifier;
}
String q = normalizedNumberString(s, nextWord, 1.0);
if (q != null) {
p = p.concat(q);
} else {
p = null;
}
break;
case ORDINAL_TAG:
// ordinal won't have modifier
p = normalizedOrdinalString(s, nextWord);
break;
case PERCENT_TAG:
p = normalizedPercentString(s, nextWord);
break;
case MONEY_TAG:
p = "";
if (compModifier != null) {
p = compModifier;
}
q = normalizedMoneyString(s, nextWord);
if (q != null) {
p = p.concat(q);
} else {
p = null;
}
break;
case DATE_TAG:
if (s.matches(BASIC_YYYYMMDD_PATTERN) || s.matches(BASIC_MMDD_PATTERN)
|| s.matches(ENGLISH_MMDDYYYY_PATTERN) || s.matches(BASIC_DD_PATTERN)
|| s.matches(RELATIVE_TIME_PATTERN) || s.matches(BIRTH_DECADE_PATTERN)) {
String docdate = document.get(CoreAnnotations.DocDateAnnotation.class);
p = normalizeDateString(s, docdate);
}
break;
case TIME_TAG:
break;
}
if (DEBUG) {
err.println("Quantifiable: Processed '" + s + "' as '" + p + '\'');
}
// Write the normalized NER values in place
for (E wi : l) {
if (p != null) {
if (DEBUG) {
log.info("Changing normalized NER from " + wi.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class) + " to " + p);
}
wi.set(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class, p);
}
}
// This return value is not necessarily useful as the labelling is done in place.
return l;
}
/**
* Normalize a money string. A currency symbol will be added accordingly.
* The assumption is that the money string will be clean enough: either lead by a currency sign (like $),
* or trailed by a currency word. Otherwise we give up normalization.
*
* @param s
* @param nextWord
* @return
*/
private static String normalizedMoneyString(String s, String nextWord) {
if (DEBUG) {
log.info("normalizedMoneyString: Normalizing " + s);
}
// default multiplier is 1
double multiplier = 1.0;
char currencySign = '$'; // by default we use $, following English
boolean notMatched = true;
// We check multiCharCurrencyWords first
for (String currencyWord : multiCharCurrencyWords.keySet()) {
if(notMatched && StringUtils.find(s, currencyWord)) {
if(currencyWord.equals("美分")) {
multiplier = 0.01;
} else if(currencyWord.equals("先令")) {
multiplier = 0.05;
} else if(currencyWord.equals("便士")) {
multiplier = 1.0/240;
}
s = s.replaceAll(currencyWord, "");
currencySign = multiCharCurrencyWords.get(currencyWord);
notMatched = false;
}
}
// Then we check oneCharCurrencyWords
if(notMatched) {
for(String currencyWord : oneCharCurrencyWords.keySet()) {
if(notMatched && StringUtils.find(s, currencyWord)) {
// TODO: change multiplier
s = s.replaceAll(currencyWord, "");
currencySign = oneCharCurrencyWords.get(currencyWord);
notMatched = false;
}
}
}
// We check all other currency cases if we miss both dictionaries above
if(notMatched) {
for(String currencyWord : ChineseNumberSequenceClassifier.CURRENCY_WORDS_VALUES) {
if(notMatched && StringUtils.find(s, currencyWord)) {
s = s.replaceAll(currencyWord, "");
break;
}
}
}
// Now we assert the string should be all numbers
String value = normalizedNumberString(s, nextWord, multiplier);
if(value == null) {
if(DEBUG) {
log.info("normalizedMoneyString: Failed to parse number " + s);
}
return null;
} else {
return currencySign + value;
}
}
/**
* Normalize a percent string. We handle both % and ‰.
*
* @param s
* @param nextWord
* @return
*/
private static String normalizedPercentString(String s, String nextWord) {
String ns = "";
if(s.startsWith("百分之")) {
ns = normalizedNumberString(s.substring(3), nextWord, 1.0);
if(ns != null) {
ns += "%";
}
} else if (s.startsWith("千分之")) {
ns = normalizedNumberString(s.substring(3), nextWord, 1.0);
if(ns != null) {
ns += "‰";
}
} else if (s.endsWith("%")) {
// we also handle the case where the percent ends with a % character
ns = normalizedNumberString(s.substring(0, s.length()-1), nextWord, 1.0);
if(ns != null) {
ns += "%";
}
} else if (s.endsWith("‰")) {
ns = normalizedNumberString(s.substring(0, s.length()-1), nextWord, 1.0);
ns += "‰";
} else {
// otherwise we assume the entire percent is a number
ns = normalizedNumberString(s, nextWord, 1.0);
if(ns != null) {
ns += "%";
}
}
return ns;
}
/**
* Normalize an ordinal string.
* If the string starts with "第", we assume the number is followed; otherwise
* we assume the entire body is a number.
*
* @param s
* @param nextWord
* @return
*/
private static String normalizedOrdinalString(String s, String nextWord) {
if(s.startsWith("第")) {
return normalizedNumberString(s.substring(1), nextWord, 1.0);
} else {
return normalizedNumberString(s, nextWord, 1.0);
}
}
/**
* Normalize a string into the corresponding standard numerical values (in String form).
* Note that this can only handle a string of pure numerical expressions, like
* "两万三千零七十二点五六" or "23072.56". Other NERs like MONEY or DATE needs to be handled
* in their own methods.
* In any case we fail, this method will just return a null.
*
* @param s The string input.
* @param nextWord The next word in sequence. This is likely to be useless for Chinese.
* @param multiplier A multiplier to make things simple for callers
* @return
*/
private static String normalizedNumberString(String s, String nextWord, double multiplier) {
// First remove unnecessary characters in the string
s = s.trim();
s = s.replaceAll("[ \t\n\0\f\r,]", ""); // remove all unnecessary characters
// In case of pure arabic numbers, return the straight value of it
if(ARABIC_NUMBERS_PATTERN.matcher(s).matches()) {
return prettyNumber(String.format("%f", multiplier * Double.valueOf(s)));
}
// If this is not all arabic, we assume it to be either Chinese literal or mix of Chinese literal and arabic
// We handle decimal point first
int decimalIndex = s.indexOf(LITERAL_DECIMAL_POINT);
Double decimalValue = Double.valueOf(0);
if(decimalIndex != -1) {
// handle decimal part
if(DEBUG) {
log.info("Normalizing decimal part: " + s.substring(decimalIndex+1));
}
decimalValue = normalizeLiteralDecimalString(s.substring(decimalIndex+1));
// if fails at parsing decimal value, return null
if(decimalValue == null) {
return null;
}
// update s to be the integer part
s = s.substring(0, decimalIndex);
}
if(DEBUG) {
log.info("Normalizing integer part: " + s);
}
Double integerValue = recurNormalizeLiteralIntegerString(s);
if(integerValue == null) {
return null;
}
// both decimal and integer part are parsable, we combine them to form the final result
// the formatting of numbers in Java is really annoying
return prettyNumber(String.format("%f", multiplier * Double.valueOf(integerValue.doubleValue() + decimalValue.doubleValue())));
}
/**
* Recursively parse a integer String expressed in either Chinese or a mix of Chinese and arabic numbers.
*
* @param s
* @return
*/
private static Double recurNormalizeLiteralIntegerString(String s) {
// If empty, return 0
if(s.isEmpty()) {
return Double.valueOf(0);
}
// TODO: check if it is valid. It is possible that this is a vague number like "五六十" which cannot be parsed by current implementation.
// In case of pure arabic numbers, return the straight value of it
if(ARABIC_NUMBERS_PATTERN.matcher(s).matches()) {
return Double.valueOf(s);
}
//If s has more than 1 char and first char is 零 or 〇, it is likely
// to be useless
if(s.length() > 1 && (s.startsWith("零") || s.startsWith("〇"))) {
s = s.substring(1);
}
//If there is only one char left and we can quantify it, we return the value of it
if(s.length() == 1 && wordsToValues.containsKey(s)) {
return Double.valueOf(wordsToValues.getCount(s));
}
// Now parse the integer, making use of the compositionality of Chinese literal numbers
Double value;
value = compositeAtUnitIfExists(s, "亿");
if(value != null) {
return value;
} else {
value = compositeAtUnitIfExists(s, "万");
}
if(value != null) {
return value;
} else {
value = compositeAtUnitIfExists(s, "千");
}
if(value != null) {
return value;
} else {
value = compositeAtUnitIfExists(s, "百");
}
if(value != null) {
return value;
} else {
value = compositeAtUnitIfExists(s, "十");
}
if(value != null) {
return value;
}
// otherwise we fail to parse and just return null
return null;
}
/**
* Check if a unit exists in the literal string. If so, parse it by making use of
* the compositionality; otherwise return null.
*
* @param s
* @param unit
* @return
*/
private static Double compositeAtUnitIfExists(String s, String unit) {
// invalid unit
if(!quantityUnitToValues.containsKey(unit)) {
return null;
}
int idx = s.indexOf(unit);
if(idx != -1) {
Double first = Double.valueOf(1.0);
// Here we need special handling for 十 and 百 when they occur as the first char
// As in Chinese 十二 is very common, 百二十 is sometimes valid as well.
if(("十".equals(unit) || "百".equals(unit)) && idx == 0) {
// do nothing
} else {
// otherwise we try to parse the value before the unit
first = recurNormalizeLiteralIntegerString(s.substring(0,idx));
}
Double second = recurNormalizeLiteralIntegerString(s.substring(idx+1));
if(first != null && second != null) {
return Double.valueOf(first.doubleValue() * quantityUnitToValues.getCount(unit) + second.doubleValue());
}
}
// return null if unit is not present or fails to parse
return null;
}
/**
* Normalize decimal part of the string. Note that this only handles Chinese literal expressions.
* @param s
* @return
*/
private static Double normalizeLiteralDecimalString(String s) {
// if s is empty return 0
if(s.isEmpty()) {
return Double.valueOf(0);
}
// if s is not valid Chinese literal decimal expressions, return null
if(!CHINESE_LITERAL_DECIMAL_PATTERN.matcher(s).matches()) {
return null;
}
// after checking we assume the decimal part should be correct
double decimalValue = 0;
double base = 1;
for(int i=0, sz=s.length(); i<sz; i++) {
// update base
base *= 0.1;
String c = Character.toString(s.charAt(i));
if(!wordsToValues.containsKey(c)) {
// some uncatchable character is present, return null
return null;
}
double v = wordsToValues.getCount(c);
decimalValue += v * base;
}
return Double.valueOf(decimalValue);
}
private static String normalizeMonthOrDay(String s, String context) {
int ctx = -1;
if (!context.equals("XX"))
ctx = Integer.valueOf(context);
if (monthDayModifiers.containsKey(s)) {
if (ctx >= 0)
// todo: this is unsafe as it's not bound-checked for validity
return String.format("%02d", ctx + monthDayModifiers.get(s));
else
return "XX";
} else {
String candidate;
if (s == null) {
return "XX";
} else {
if (s.matches(CHINESE_DATE_NUMERALS_PATTERN + "+"))
candidate = prettyNumber(String.format("%f", recurNormalizeLiteralIntegerString(s)));
else
candidate = s;
}
if (candidate.length() < 2)
candidate = "0" + candidate;
return candidate;
}
}
private static String normalizeYear(String s, String contextYear) {
return normalizeYear(s, contextYear, false);
}
private static String normalizeYear(String s, String contextYear, boolean strict) {
int ctx = -1;
if (!contextYear.equals("XXXX"))
ctx = Integer.valueOf(contextYear);
if (yearModifiers.containsKey(s)) {
if (ctx >= 0)
return String.format("%d", ctx + yearModifiers.get(s));
else
return "XXXX";
} else {
String candidate;
StringBuilder yearcandidate = new StringBuilder();
for (int i = 0; i < s.length(); i++) {
String t = "" + s.charAt(i);
if (CHINESE_LITERAL_DECIMAL_PATTERN.matcher(t).matches()) {
if (wordsToValues.containsKey(t))
yearcandidate.append((int) wordsToValues.getCount(t));
else
// something unexpected happened
return null;
} else
yearcandidate.append(t);
}
candidate = yearcandidate.toString();
if (candidate.length() != 2) {
return candidate;
}
if (ctx < 0) {
// use the current year as reference point for two digit year normalization by default
ctx = Integer.valueOf(new SimpleDateFormat("yyyy").format(new Date()));
}
// note: this is a very crude heuristic for determining actual year from two digit expressions
int cand = Integer.valueOf(candidate);
if ((strict && cand >= (ctx % 100)) || cand > (ctx % 100 + 10)) {
// referring to the previous century
cand += (ctx / 100 - 1) * 100;
} else {
// referring to the same century
cand += (ctx / 100) * 100;
}
return String.format("%d", cand);
}
}
/**
* Normalizes date strings.
* @param s Input date string
* @param ctxdate Context date (usually doc_date)
* @return Normalized Timex expression of the input date string
*/
public static String normalizeDateString(String s, String ctxdate) {
// TODO [pengqi]: need to handle basic localization ("在七月二日到[八日]间")
// TODO [pengqi]: need to handle literal numeral dates (usually used in events, e.g. "三一五" for 03-15)
// TODO [pengqi]: might need to add a pattern for centuries ("上世纪90年代")?
Pattern p;
Matcher m;
String ctxyear = "XXXX", ctxmonth = "XX", ctxday = "XX";
// set up context date
if (ctxdate != null) {
p = Pattern.compile("^" + BASIC_YYYYMMDD_PATTERN + "$");
m = p.matcher(ctxdate);
if (m.find() && m.groupCount() == 3) {
ctxyear = m.group(1);
ctxmonth = m.group(2);
ctxday = m.group(3);
}
}
p = Pattern.compile("^" + BIRTH_DECADE_PATTERN + "$");
m = p.matcher(s);
if (m.find() && m.groupCount() == 1) {
StringBuilder res = new StringBuilder();
res.append(normalizeYear(m.group(1), ctxyear, true).substring(0, 3) + "X");
res.append("-XX-XX");
return res.toString();
}
p = Pattern.compile("^" + RELATIVE_TIME_PATTERN + "$");
m = p.matcher(s);
if (m.find() && m.groupCount() == 1) {
StringBuilder res = new StringBuilder();
res.append(ctxyear);
res.append("-");
res.append(ctxmonth);
res.append("-");
res.append(normalizeMonthOrDay(m.group(1), ctxday));
return res.toString();
}
p = Pattern.compile("^" + BASIC_YYYYMMDD_PATTERN + "$");
m = p.matcher(s);
if (m.find() && m.groupCount() == 3) {
StringBuilder res = new StringBuilder();
res.append(normalizeYear(m.group(1), ctxyear));
res.append("-");
res.append(normalizeMonthOrDay(m.group(2), ctxmonth));
res.append("-");
res.append(normalizeMonthOrDay(m.group(3), ctxday));
return res.toString();
}
p = Pattern.compile("^" + BASIC_MMDD_PATTERN + "$");
m = p.matcher(s);
if (m.find() && m.groupCount() == 2) {
StringBuilder res = new StringBuilder();
res.append(ctxyear);
res.append("-");
res.append(normalizeMonthOrDay(m.group(1), ctxmonth));
res.append("-");
res.append(normalizeMonthOrDay(m.group(2), ctxday));
return res.toString();
}
p = Pattern.compile("^" + BASIC_DD_PATTERN + "$");
m = p.matcher(s);
if (m.find() && m.groupCount() == 1) {
StringBuilder res = new StringBuilder();
res.append(ctxyear);
res.append("-");
res.append(ctxmonth);
res.append("-");
res.append(normalizeMonthOrDay(m.group(1), ctxday));
return res.toString();
}
p = Pattern.compile("^" + ENGLISH_MMDDYYYY_PATTERN + "$");
m = p.matcher(s);
if (m.find() && m.groupCount() == 3) {
StringBuilder res = new StringBuilder();
if (m.group(3) == null)
res.append(ctxyear);
else
res.append(normalizeYear(m.group(3), ctxyear));
res.append("-");
res.append(normalizeMonthOrDay(m.group(1), ctxmonth));
res.append("-");
res.append(normalizeMonthOrDay(m.group(2), ctxday));
return res.toString();
}
return s;
}
/**
* Concatenate entity annotations to a String. Note that Chinese does not use space to separate
* tokens so we will follow this convention here.
*
* @param l
* @param <E>
* @return
*/
public static <E extends CoreMap> String singleEntityToString(List<E> l) {
String entityType = l.get(0).get(CoreAnnotations.NamedEntityTagAnnotation.class);
StringBuilder sb = new StringBuilder();
for (E w : l) {
if(!w.get(CoreAnnotations.NamedEntityTagAnnotation.class).equals(entityType)) {
log.error("differing NER tags detected in entity: " + l);
throw new Error("Error with entity construction, two tokens had inconsistent NER tags");
}
sb.append(w.get(CoreAnnotations.TextAnnotation.class));
}
return sb.toString();
}
public static String prettyNumber(String s) {
if(s == null) {
return null;
}
s = s.indexOf(".") < 0 ? s : s.replaceAll("0*$", "").replaceAll("\\.$", "");
return s;
}
/**
* Fix up the NER sequence in case this is necessary.
*
* @param list
* @param <E>
*/
public static <E extends CoreMap> void fixupNerBeforeNormalization(List<E> list) {
}
}