//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.regex.internals;

import java.util.Collections;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.uima.jcas.JCas;

import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSet;

import uk.gov.dstl.baleen.annotators.helpers.QuantityUtils;
import uk.gov.dstl.baleen.annotators.regex.helpers.AbstractRegexAnnotator;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.exceptions.BaleenException;
import uk.gov.dstl.baleen.types.common.Money;

/**
 * Identifies money quantities using regular expressions.
 *
 * <p>Where a currency symbol is used for multiple different currencies, e.g. $ for USD and AUD,
 * the most common is selected (e.g. USD). Symbols for the top 10 most traded currencies are
 * supported, as are all ISO 4217 codes. Where symbols are text (e.g. Fr for Swiss Francs), they
 * are case sensitive.
 *
 * <p>Capture groups of {@link #MONEY_REGEX} used by this class:
 * <ul>
 *   <li>1 - currency code or symbol preceding the number</li>
 *   <li>3 - the number itself (digits with optional thousands/decimal separators)</li>
 *   <li>7 - multiplier directly following the number</li>
 *   <li>9 - currency code, symbol or fractional symbol following the number</li>
 *   <li>11 - multiplier following the trailing currency</li>
 * </ul>
 */
public class MoneyRegex extends AbstractRegexAnnotator<Money> {
  private static final String CURRENCY_CODES =
      "AED|AFN|ALL|AMD|ANG|AOA|ARS|AUD|AWG|AZN|"
          + "BAM|BBD|BDT|BGN|BHD|BIF|BMD|BND|BOB|BOV|BRL|BSD|BTN|BWP|BYR|BZD|"
          + "CAD|CDF|CHE|CHF|CHW|CLF|CLP|CNY|COP|COU|CRC|CUC|CUP|CVE|CZK|"
          + "DJF|DKK|DOP|DZD|"
          + "EGP|ERN|ETB|EUR|"
          + "FJD|FKP|"
          + "GBP|GEL|GHS|GIP|GMD|GNF|GTQ|GYD|"
          + "HKD|HNL|HRK|HTG|HUF|"
          + "IDR|ILS|INR|IQD|IRR|ISK|"
          + "JMD|JOD|JPY|"
          + "KES|KGS|KHR|KMF|KPW|KRW|KWD|KYD|KZT|"
          + "LAK|LBP|LKR|LRD|LSL|LYD|"
          + "MAD|MDL|MGA|MKD|MMK|MNT|MOP|MRO|MUR|MVR|MWK|MXN|MXV|MYR|MZN|"
          + "NAD|NGN|NIO|NOK|NPR|NZD|"
          + "OMR|"
          + "PAB|PEN|PGK|PHP|PKR|PLN|PYG|"
          + "QAR|"
          + "RON|RSD|RUB|RWF|"
          + "SAR|SBD|SCR|SDG|SEK|SGD|SHP|SLL|SOS|SRD|SSP|STD|SYP|SZL|"
          + "THB|TJS|TMT|TND|TOP|TRY|TTD|TWD|TZS|"
          + "UAH|UGX|USD|USN|USS|UYI|UYU|UZS|"
          + "VEF|VND|VUV|"
          + "WST|"
          + "XAF|XAG|XAU|XBA|XBB|XBC|XBD|XCD|XDR|XFU|XOF|XPD|XPF|XPT|XSU|XTS|XUA|XXX|"
          + "YER|"
          + "ZAR|ZMW";

  /**
   * The individual ISO 4217 codes from {@link #CURRENCY_CODES}, used for exact membership tests.
   * A substring test against the pipe-joined string would wrongly accept fragments such as "P".
   */
  private static final ImmutableSet<String> CURRENCY_CODE_SET =
      ImmutableSet.copyOf(CURRENCY_CODES.split("\\|"));

  private static final String CURRENCY_SYMBOLS = "£|\\$|€|¥|Fr";
  private static final String CURRENCY_SYMBOLS_FRACTIONS = "p|¢|c";
  private static final String MULTIPLIERS = "k|thousand|million|m|billion|b|trillion|t";

  private static final String WHITESPACE = "\\h*";
  private static final String START = "(?<=^|\\(|\\s)";
  private static final String END = "(?=$|\\)|\\?|\\!|\\s|[\\.,](\\s|$))";

  private static final String MONEY_REGEX =
      START
          + "(" + CURRENCY_CODES + "|" + CURRENCY_SYMBOLS + ")?"
          + "(" + WHITESPACE + "([0-9]+([,\\. ][0-9]{3})*([,.][0-9]+)?))"
          + "(" + WHITESPACE + "(" + MULTIPLIERS + "))?"
          + "(" + WHITESPACE + "(" + CURRENCY_CODES + "|" + CURRENCY_SYMBOLS + "|" + CURRENCY_SYMBOLS_FRACTIONS + "))?"
          + "(" + WHITESPACE + "(" + MULTIPLIERS + "))?"
          + END;

  /**
   * Matches clock times such as "3.30pm", which the money pattern would otherwise read as a
   * number followed by the fractional symbol "p" and the multiplier "m". Compiled once rather
   * than on every match.
   */
  private static final Pattern TIME_PATTERN =
      Pattern.compile("([0-1]?[0-9]|2[0-4])\\.[0-5][0-9]\\s*pm");

  /** New instance. */
  public MoneyRegex() {
    // NOTE(review): the second argument is presumably the case-sensitivity flag and the third the
    // confidence assigned to matches - confirm against AbstractRegexAnnotator.
    super(MONEY_REGEX, false, 1.0f);
  }

  /**
   * Builds a {@link Money} annotation from a match of {@link #MONEY_REGEX}.
   *
   * @param jCas the CAS in which to create the annotation
   * @param matcher a matcher positioned on a successful match
   * @return the populated annotation, or {@code null} if the match has no currency token, looks
   *     like a clock time, or contains a number whose separators cannot be interpreted
   */
  @Override
  protected Money create(JCas jCas, Matcher matcher) {
    String leadingCurrency = matcher.group(1);
    String trailingCurrency = matcher.group(9);

    if (Strings.isNullOrEmpty(leadingCurrency) && Strings.isNullOrEmpty(trailingCurrency)) {
      // Must find at least one currency token
      return null;
    }

    // Edge case to remove times being detected
    if (TIME_PATTERN.matcher(matcher.group().toLowerCase(Locale.ROOT)).matches()) {
      return null;
    }

    // First, work out the number and parse it to a Double
    String numbers = correctCommasAndDecimals(matcher.group(3).replaceAll("\\s", ""));
    if (numbers == null) {
      return null;
    }

    // Then apply any multipliers
    Double value = applyMultipliers(matcher, Double.parseDouble(numbers));

    // A currency before the number takes precedence; otherwise we've already checked that one
    // exists, so it must be in group 9
    String currency = Strings.isNullOrEmpty(leadingCurrency) ? trailingCurrency : leadingCurrency;

    return createMoney(jCas, currency, value);
  }

  /**
   * Decides whether the comma is the decimal separator in a number containing both commas and
   * periods. That is the case when there is exactly one comma and it comes after every period.
   */
  private boolean isCommaDecimal(int commaCount, int periodCount, String numbers) {
    return commaCount == 1
        && periodCount >= 1
        && numbers.indexOf(',') > numbers.lastIndexOf('.');
  }

  /**
   * Decides whether the period is the decimal separator in a number containing both commas and
   * periods. That is the case when there is exactly one period and it comes after every comma.
   */
  private boolean isPeriodDecimal(int commaCount, int periodCount, String numbers) {
    return periodCount == 1
        && commaCount >= 1
        && numbers.indexOf('.') > numbers.lastIndexOf(',');
  }

  /**
   * Normalises a number that contains both commas and periods.
   *
   * @param text the number, without whitespace
   * @param commaCount number of commas in {@code text}
   * @param periodCount number of periods in {@code text}
   * @return the number with thousands separators removed and a period as the decimal separator
   * @throws BaleenException if the roles of the two separators cannot be determined
   */
  private String processCommasAndDecimals(String text, Integer commaCount, Integer periodCount)
      throws BaleenException {
    // Check we don't have alternating periods and commas
    if (text.matches(".*,.*\\..*,.*") || text.matches(".*\\..*,.*\\..*")) {
      getMonitor()
          .warn(
              "Unable to parse monetary value '{}', as it contains alternating commas and periods",
              text);
      throw new BaleenException("Value contains alternating commas and periods");
    }

    // We have commas and periods, so work out which is being used for what
    if (isCommaDecimal(commaCount, periodCount, text)) {
      // Using comma as the decimal, and period as thousands separator
      return text.replace(".", "").replace(',', '.');
    }
    if (isPeriodDecimal(commaCount, periodCount, text)) {
      // Using period as the decimal, and commas as thousands separator
      return text.replace(",", "");
    }

    // Neither separator can be identified as a single, trailing decimal separator
    getMonitor()
        .warn(
            "Unable to parse monetary value '{}', as it contains multiple commas and periods",
            text);
    throw new BaleenException("Value contains multiple commas and periods");
  }

  /**
   * Take a number (as a string), and try to work out whether commas/periods are being used as
   * decimals or thousand separators.
   *
   * <p>With a single separator, exactly three digits after it is taken to mean a thousands
   * separator; any other number of digits is taken to mean a decimal separator.
   *
   * @param number the number to correct, without whitespace
   * @return a corrected string, with no thousand separators and a period for the decimal, or
   *     {@code null} if the separators cannot be interpreted
   */
  public String correctCommasAndDecimals(String number) {
    int commaCount = StringUtils.countMatches(number, ',');
    int periodCount = StringUtils.countMatches(number, '.');

    if (commaCount > 0 && periodCount > 0) {
      try {
        return processCommasAndDecimals(number, commaCount, periodCount);
      } catch (BaleenException be) {
        // Pass the value for the placeholder and the exception last so both are reported
        getMonitor().warn("Unable to parse monetary value '{}'", number, be);
        return null;
      }
    }

    if (commaCount > 1) {
      // Using comma as a thousands separator
      return number.replace(",", "");
    }
    if (commaCount == 1) {
      // Three trailing digits: probably a thousands separator; otherwise probably a decimal
      return hasThreeDigitsAfter(number, ',') ? number.replace(",", "") : number.replace(',', '.');
    }
    if (periodCount > 1) {
      // Using period as a thousands separator
      return number.replace(".", "");
    }
    if (periodCount == 1 && hasThreeDigitsAfter(number, '.')) {
      // Probably using period as a thousands separator
      return number.replace(".", "");
    }

    // Either no separators, or a single period acting as the decimal: nothing to do
    return number;
  }

  /**
   * Returns whether exactly three characters follow the (single) occurrence of {@code separator}.
   * Uses index arithmetic so that a trailing separator such as "5," cannot cause an out-of-bounds
   * access.
   */
  private static boolean hasThreeDigitsAfter(String number, char separator) {
    return number.length() - number.indexOf(separator) - 1 == 3;
  }

  /** Scales the value by the multipliers captured in groups 7 and 11, where present. */
  private Double applyMultipliers(Matcher matcher, Double value) {
    Double val = value;

    if (!Strings.isNullOrEmpty(matcher.group(7))) {
      val = QuantityUtils.scaleByMultipler(val, matcher.group(7));
    }

    if (!Strings.isNullOrEmpty(matcher.group(11))) {
      val = QuantityUtils.scaleByMultipler(val, matcher.group(11));
    }

    return val;
  }

  /**
   * Creates the annotation, mapping symbols onto ISO 4217 codes and converting fractional units
   * (p, ¢, c) to the main unit.
   *
   * <p>If the token is neither a known symbol nor an exact ISO 4217 code, the currency is left
   * unset and only the amount is populated.
   */
  private Money createMoney(JCas jCas, String currency, Double value) {
    Money m = new Money(jCas);
    double val = value;

    switch (currency) {
      case "p":
        val /= 100;
        m.setCurrency("GBP");
        break;
      case "£":
        m.setCurrency("GBP");
        break;
      case "¢":
        val /= 100;
        m.setCurrency("USD");
        break;
      case "$":
        m.setCurrency("USD");
        break;
      case "c":
        val /= 100;
        m.setCurrency("EUR");
        break;
      case "€":
        m.setCurrency("EUR");
        break;
      case "¥":
        m.setCurrency("JPY");
        break;
      case "Fr":
        m.setCurrency("CHF");
        break;
      default:
        // Locale.ROOT keeps the case mapping stable regardless of the JVM's default locale
        String code = currency.toUpperCase(Locale.ROOT);
        if (CURRENCY_CODE_SET.contains(code)) {
          m.setCurrency(code);
        }
        break;
    }

    m.setAmount(val);

    return m;
  }

  /** Declares that this annotator requires no input types and produces {@link Money}. */
  @Override
  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(Money.class));
  }
}