package marytts.language.en;

import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import marytts.datatypes.MaryData;
import marytts.datatypes.MaryDataType;
import marytts.datatypes.MaryXML;
import marytts.exceptions.MaryConfigurationException;
import marytts.modules.InternalModule;
import marytts.util.MaryRuntimeUtils;
import marytts.util.dom.MaryDomUtils;
import marytts.util.dom.NameNodeFilter;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.traversal.DocumentTraversal;
import org.w3c.dom.traversal.NodeFilter;
import org.w3c.dom.traversal.TreeWalker;

import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.ULocale.Category;
import com.ibm.icu.text.DateFormat;
import com.ibm.icu.text.RuleBasedNumberFormat;

import com.google.common.base.Joiner;
import com.google.common.collect.Lists;

/**
 * Text normalisation module for English: rewrites TOKENS into WORDS that can be phonemised.
 *
 * @author Tristan Hamilton
 *
 *         <p>
 *         Can process the following formats:
 *         <ul>
 *         <li>cardinal (handled by real number)
 *         <li>ordinal
 *         <li>year (as a 4 digit number or any number followed by an AD/BC variation)
 *         <li>currency
 *         <li>number and word together
 *         <li>dashes (read each number singly) or (split into two words)
 *         <li>underscores
 *         <li>decimal point, minus symbol (real numbers); also handles %, however Jtokeniser splits % into separate
 *         tokens
 *         <li>time
 *         <li>dates (in format mm/dd/yyyy)
 *         <li>acronyms (only split into single characters, never expanded)
 *         <li>abbreviations (list of known expansions in resource preprocess/abbrev.dat, a properties file separated by
 *         whitespace. If an abbreviation has two different expansions then the capitalized version comes first,
 *         followed by a comma)
 *         <li>contractions -&gt; first check the lexicon; if not found, check whether the contractions map contains
 *         the suffix and, if it does not, just remove the apostrophe (splitting into two tokens is currently
 *         disabled, see the FIXME in {@code expand})
 *         <li>ampersand &amp;, "at" symbol -&gt; symbols
 *         <li>urls -&gt; note that Jtokeniser splits off http[s]?://
 *         <li>number ranges "18-35"
 *         <li>words without vowels -&gt; first check lexicon, if not found then separate into single character tokens
 *         <li>#hashtags
 *         <li>single "A/a" character -&gt; if there is no next token or the next token is punctuation or the next
 *         token has length 1
 *         <li>as a last processing attempt, split by punctuation, symbols, etc. and attempt to process these tokens
 *         separately
 *         <li>durations hours:minutes:seconds(:milliseconds)
 *         <li>numbers followed by an s
 *         <li>punctuation -&gt; mark the token so that it is not phonemised
 *         </ul>
 *         <p>
 *         May include:
 *         <ul>
 *         <li>roman numerals
 *         </ul>
 */
public class Preprocess extends InternalModule {

    /** Classpath resource (relative to this class) holding the abbreviation expansions. */
    private static final String ABBREV_RESOURCE = "preprocess/abbrev.dat";

    // abbreviations map
    private Map<Object, Object> abbrevMap;

    // symbols map
    private static final Map<String, String> symbols;

    // contractions map
    private static final Map<String, String[]> contractions;

    // icu4j stuff
    private RuleBasedNumberFormat rbnf;
    protected final String cardinalRule;
    protected final String ordinalRule;
    protected final String yearRule;
    private DateFormat df;

    // Regex matching patterns
    private static final Pattern moneyPattern;
    private static final Pattern timePattern;
    private static final Pattern durationPattern;
    private static final Pattern abbrevPattern;
    private static final Pattern acronymPattern;
    private static final Pattern realNumPattern;
    private static final Pattern numberWordPattern;
    private static final Pattern datePattern;
    private static final Pattern yearPattern;
    private static final Pattern contractPattern;
    private static final Pattern symbolsPattern;
    private static final Pattern URLPattern;
    private static final Pattern rangePattern;
    private static final Pattern consonantPattern;
    private static final Pattern punctuationPattern;
    private static final Pattern myPunctPattern;
    private static final Pattern hashtagPattern;
    private static final Pattern ordinalPattern;
    private static final Pattern currencySymbPattern;
    private static final Pattern numberSPattern;

    // Patterns that were previously recompiled inline for every token
    private static final Pattern commaNumberPattern = Pattern.compile("[\\$|£|€]?\\d+,[\\d,]+");
    private static final Pattern fourDigitPattern = Pattern.compile("\\d{4}");
    private static final Pattern digitsPattern = Pattern.compile("\\d+");
    private static final Pattern singleAPattern = Pattern.compile("[aA]");
    private static final Pattern ordinalSuffixPattern = Pattern.compile("(?i)st|nd|rd|th");
    private static final Pattern nextTokenTimePattern = Pattern.compile("a\\.m\\.|AM|PM|am|pm|p\\.m\\.");
    private static final Pattern dashUnderscorePattern = Pattern.compile("[-_]");
    private static final Pattern aroundPunctPattern = Pattern.compile("((?<=\\p{Punct})|(?=\\p{Punct}))");

    // Regex initialization
    static {
        moneyPattern = Pattern.compile("(\\d+)(\\.\\d+)?");
        currencySymbPattern = Pattern.compile("[$£€]");
        timePattern = Pattern.compile(
                "((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3])):([0-5][0-9])(a\\.m\\.|am|pm|p\\.m\\.|a\\.m|p\\.m)?",
                Pattern.CASE_INSENSITIVE);
        yearPattern = Pattern.compile("(\\d+)(bc|ad|b\\.c\\.|b\\.c|a\\.d\\.|a\\.d)", Pattern.CASE_INSENSITIVE);
        ordinalPattern = Pattern.compile("\\d+(st|nd|rd|th)", Pattern.CASE_INSENSITIVE);
        durationPattern = Pattern.compile("(\\d+):([0-5][0-9]):([0-5][0-9])(:([0-5][0-9]))?");
        abbrevPattern = Pattern.compile("[a-zA-Z]{2,}\\.");
        acronymPattern = Pattern.compile("([a-zA-Z]\\.[a-zA-Z](\\.)?)+([a-zA-Z](\\.)?)?");
        realNumPattern = Pattern.compile("(-)?(\\d+)?(\\.(\\d+)(%)?)?");
        numberWordPattern = Pattern.compile("([a-zA-Z]+[0-9]+|[0-9]+[a-zA-Z]+)\\w*");
        datePattern = Pattern.compile("(\\d{2})[\\/\\.](\\d{2})[\\/\\.]\\d{4}");
        contractPattern = Pattern.compile("[a-zA-Z]+('[a-zA-Z]+)");
        symbolsPattern = Pattern.compile("[@%#\\/\\+=&><-]");
        rangePattern = Pattern.compile("([0-9]+)-([0-9]+)");
        consonantPattern = Pattern.compile("[b-df-hj-np-tv-z]+", Pattern.CASE_INSENSITIVE);
        punctuationPattern = Pattern.compile("\\p{Punct}");
        numberSPattern = Pattern.compile("([0-9]+)([sS])");
        // Must be a character class: the bare sequence ,.:;?'" would only match that exact 7-character string.
        myPunctPattern = Pattern.compile("[,\\.:;?'\"]");
        hashtagPattern = Pattern.compile("(#)(\\w+)");
        URLPattern = Pattern
                .compile("(https?:\\/\\/)?((www\\.)?([-a-zA-Z0-9@:%._\\\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\\\+.~#?&\\/=]*)))");
    }

    // HashMap initialization
    static {
        // value[0] is the default phone; value[1] (if present) is used after c, f, k, p or t
        contractions = new HashMap<String, String[]>();
        contractions.put("'s", new String[] { "z", "s" });
        contractions.put("'ll", new String[] { "l" });
        contractions.put("'ve", new String[] { "v" });
        contractions.put("'d", new String[] { "d" });
        contractions.put("'m", new String[] { "m" });
        contractions.put("'re", new String[] { "r" });

        symbols = new HashMap<String, String>();
        symbols.put("@", "at");
        symbols.put("#", "hashtag");
        symbols.put("/", "forward slash");
        symbols.put("%", "per cent");
        symbols.put("+", "plus");
        symbols.put("-", "minus");
        symbols.put("=", "equals");
        symbols.put(">", "greater than");
        symbols.put("<", "less than");
        symbols.put("&", "and");
    }

    /**
     * Creates the module (TOKENS to WORDS, English), sets up the ICU spell-out formatter and date format, and loads
     * the abbreviation map. If the abbreviation resource cannot be loaded the error is logged and an empty map is
     * used, so that the module stays usable (abbreviations are then left unexpanded).
     */
    public Preprocess() {
        super("Preprocess", MaryDataType.TOKENS, MaryDataType.WORDS, Locale.ENGLISH);
        this.rbnf = new RuleBasedNumberFormat(ULocale.ENGLISH, RuleBasedNumberFormat.SPELLOUT);
        this.cardinalRule = "%spellout-numbering";
        this.ordinalRule = getOrdinalRuleName(rbnf);
        this.yearRule = getYearRuleName(rbnf);
        this.df = DateFormat.getDateInstance(DateFormat.LONG, ULocale.ENGLISH);

        Map<Object, Object> loaded;
        try {
            loaded = loadAbbrevMap();
        } catch (IOException e) {
            // Never leave abbrevMap null: expand() dereferences it for every token.
            logger.error("Could not load abbreviation map, abbreviations will not be expanded", e);
            loaded = new Properties();
        }
        this.abbrevMap = loaded;
    }

    /**
     * Expands the tokens of the input document in place and wraps the document in a new MaryData of this module's
     * output type.
     *
     * @param d
     *            input data holding the token document
     * @return a new MaryData carrying the (modified) document
     * @throws Exception
     *             if expansion fails
     */
    @Override
    public MaryData process(MaryData d) throws Exception {
        Document doc = d.getDocument();
        expand(doc);
        MaryData result = new MaryData(getOutputType(), d.getLocale());
        result.setDocument(doc);
        return result;
    }

    /***
     * processes a document in mary xml format, from Tokens to Words which can be phonemised.
     * <p>
     * Each token is tested against an ordered chain of rules; the first matching rule rewrites the token text. When
     * the text changed, the token is enclosed in an MTU carrying the original text and the expansion is split on
     * whitespace (and leftover dashes) into separate tokens.
     *
     * @param doc
     *            doc
     * @throws ParseException
     *             parse exception
     * @throws IOException
     *             IO Exception
     * @throws MaryConfigurationException
     *             mary configuration exception
     */
    protected void expand(Document doc) throws ParseException, IOException, MaryConfigurationException {
        // NOTE: whichCurrency stays set until a real number consumes it
        String whichCurrency = "";
        boolean URLFirst = false;
        boolean isYear;
        boolean isURL = false;
        boolean puncSplit = false;
        boolean dashSplit = false;
        String webEmailTemp = "";
        boolean splitContraction;

        TreeWalker tw = ((DocumentTraversal) doc).createTreeWalker(doc, NodeFilter.SHOW_ELEMENT,
                new NameNodeFilter(MaryXML.TOKEN), false);
        Element t = null;

        // loop through each node in dom tree
        while ((t = (Element) tw.nextNode()) != null) {

            /*
             * PRELIM FOR EACH NODE
             */

            // to accommodate the first token being a url
            if (URLFirst) {
                t = (Element) tw.previousNode();
                URLFirst = false;
            }
            isYear = true;
            splitContraction = false;

            if (MaryDomUtils.hasAncestor(t, MaryXML.SAYAS) || t.hasAttribute("ph") || t.hasAttribute("sounds_like")) {
                // if token already has any of these attributes then ignore
                continue;
            }

            // save the original token text
            String origText = MaryDomUtils.tokenText(t);
            String text = origText;

            // remove commas
            if (commaNumberPattern.matcher(text).matches()) {
                MaryDomUtils.setTokenText(t, text.replaceAll(",", ""));
                text = MaryDomUtils.tokenText(t);
                // presume that a 4 digit number which had commas is not a year
                if (fourDigitPattern.matcher(text).matches()) {
                    isYear = false;
                }
            }

            // isYear extra check: a 4 digit number following a currency symbol is an amount
            if (fourDigitPattern.matcher(text).matches() && !whichCurrency.equals("")) {
                isYear = false;
            }

            // check if currency
            if (currencySymbPattern.matcher(text).matches()) {
                whichCurrency = text;
            }

            /*
             * ACTUAL PROCESSING (order matters: first matching rule wins)
             */

            // ordinal
            if (ordinalPattern.matcher(text).matches()) {
                String matched = ordinalSuffixPattern.split(text)[0];
                MaryDomUtils.setTokenText(t, expandOrdinal(Double.parseDouble(matched)));
                // single a or A character
            } else if (singleAPattern.matcher(text).matches()) {
                Element checkNextNode = MaryDomUtils.getNextSiblingElement(t);
                if (checkNextNode == null || myPunctPattern.matcher(MaryDomUtils.tokenText(checkNextNode)).matches()
                        || MaryDomUtils.tokenText(checkNextNode).length() == 1) {
                    MaryDomUtils.setTokenText(t, "_a");
                }
                // date
            } else if (datePattern.matcher(text).matches()) {
                MaryDomUtils.setTokenText(t, expandDate(text));
                // number followed by s
            } else if (numberSPattern.matcher(text).matches()) {
                MaryDomUtils.setTokenText(t, expandNumberS(text));
                // year with bc or ad
            } else if (yearPattern.matcher(text).matches()) {
                MaryDomUtils.setTokenText(t, expandYearBCAD(text));
                // year as just 4 digits -> this should always be checked BEFORE real number
            } else if (isYear && fourDigitPattern.matcher(text).matches()) {
                MaryDomUtils.setTokenText(t, expandYear(Double.parseDouble(text)));
                // wordAndNumber -> must come AFTER year
            } else if (numberWordPattern.matcher(text).matches()) {
                MaryDomUtils.setTokenText(t, expandWordNumber(text));
                // real number & currency
            } else if (realNumPattern.matcher(text).matches()) {
                if (!whichCurrency.equals("")) {
                    MaryDomUtils.setTokenText(t, expandMoney(text, whichCurrency));
                    whichCurrency = "";
                } else {
                    MaryDomUtils.setTokenText(t, expandRealNumber(text));
                }
                // contractions
            } else if (contractPattern.matcher(text).matches()) {
                // first check lexicon
                if (MaryRuntimeUtils.checkLexicon("en_US", text).length == 0) {
                    Matcher contractionMatch = contractPattern.matcher(text);
                    contractionMatch.find();
                    // if no contraction we allow g2p rules to handle
                    if (!contractions.containsKey(contractionMatch.group(1))) {
                        MaryDomUtils.setTokenText(t, text.replaceAll("'", ""));
                    }
                    // FIXME: we do not want to have to phonological word => for now we do not split !
                    // (splitting would set splitContraction = true and use splitContraction(text))
                }
                // acronym
            } else if (acronymPattern.matcher(text).matches()) {
                MaryDomUtils.setTokenText(t, expandAcronym(text));
                // abbreviation
            } else if (!isURL && (abbrevPattern.matcher(text).matches()
                    || this.abbrevMap.containsKey(text.toLowerCase(Locale.ENGLISH)))) {
                Element testAbbNode = MaryDomUtils.getNextSiblingElement(t);
                boolean nextTokenIsCapital = false;
                if (testAbbNode != null) {
                    String nextText = MaryDomUtils.tokenText(testAbbNode);
                    // guard against an empty following token before looking at its first character
                    nextTokenIsCapital = !nextText.isEmpty() && Character.isUpperCase(nextText.charAt(0));
                }
                MaryDomUtils.setTokenText(t, expandAbbreviation(text, nextTokenIsCapital));
                // time
            } else if (timePattern.matcher(text).matches()) {
                Element testTimeNode = MaryDomUtils.getNextSiblingElement(t);
                boolean nextTokenIsTime = testTimeNode != null
                        && nextTokenTimePattern.matcher(MaryDomUtils.tokenText(testTimeNode)).matches();
                MaryDomUtils.setTokenText(t, expandTime(text, nextTokenIsTime));
                // duration
            } else if (durationPattern.matcher(text).matches()) {
                MaryDomUtils.setTokenText(t, expandDuration(text));
                // hashtags
            } else if (hashtagPattern.matcher(text).matches()) {
                MaryDomUtils.setTokenText(t, expandHashtag(text));
                // URLs
            } else if (URLPattern.matcher(text).matches()) {
                // matching group 2 contains the chunk we want
                Matcher urlMatcher = URLPattern.matcher(text);
                urlMatcher.find();
                webEmailTemp = text;
                isURL = true;
                MaryDomUtils.setTokenText(t, expandURL(urlMatcher.group(2)));
                // dot . for web and email addresses
            } else if (text.equals(".") && isURL) {
                MaryDomUtils.setTokenText(t, "dot");
                webEmailTemp = webEmailTemp.replaceFirst("\\.", "dot");
                if (!webEmailTemp.contains(".")) {
                    isURL = false;
                }
                // symbols
            } else if (symbolsPattern.matcher(text).matches()) {
                MaryDomUtils.setTokenText(t, symbols.get(text));
                // number ranges -> before checking for dashes
            } else if (rangePattern.matcher(text).matches()) {
                MaryDomUtils.setTokenText(t, expandRange(text));
                // dashes and underscores
            } else if (text.contains("-") || text.contains("_")) {
                dashSplit = true;
                String[] tokens = dashUnderscorePattern.split(text);
                for (int i = 0; i < tokens.length; i++) {
                    if (digitsPattern.matcher(tokens[i]).matches()) {
                        tokens[i] = spellDigits(tokens[i]);
                    }
                }
                MaryDomUtils.setTokenText(t, joinWithSpaces(tokens));
                // words containing only consonants
            } else if (consonantPattern.matcher(text).matches()) {
                // first check lexicon
                if (MaryRuntimeUtils.checkLexicon("en_US", text).length == 0) {
                    MaryDomUtils.setTokenText(t, expandConsonants(text));
                }
                // a final attempt to split by punctuation
            } else if (punctuationPattern.matcher(text).find() && text.length() > 1) {
                puncSplit = true;
                String[] puncTokens = aroundPunctPattern.split(text);
                MaryDomUtils.setTokenText(t, joinWithSpaces(puncTokens));
            } else if (text.equals("\"")) {
                // FIXME: skip quotes for now as we don't have any clever management of the POS for the prosodic
                // feature
            } else if (punctuationPattern.matcher(text).matches()) {
                t.setAttribute("pos", ".");
            }

            // if token isn't ignored but there is no handling rule don't add MTU
            if (!origText.equals(MaryDomUtils.tokenText(t))) {
                MaryDomUtils.encloseWithMTU(t, origText, null);
                // finally, split new expanded token separated by spaces into separate tokens (also catch any
                // leftover dashes); trim first so that a leading separator does not create an empty token
                String[] newTokens = MaryDomUtils.tokenText(t).replaceAll("-", " ").trim().split("\\s+");
                MaryDomUtils.setTokenText(t, newTokens[0]);
                for (int i = 1; i < newTokens.length; i++) {
                    MaryDomUtils.appendToken(t, newTokens[i]);
                    t = MaryDomUtils.getNextSiblingElement(t);
                    // if tokens are an expanded contraction
                    if (splitContraction && newTokens.length == 2) {
                        String[] phones = contractions.get(newTokens[i]);
                        String lastChar = newTokens[0].substring(newTokens[0].length() - 1);
                        if (lastChar.matches("[cfkpt]") && phones.length > 1) {
                            t.setAttribute("ph", phones[1]);
                        } else {
                            t.setAttribute("ph", phones[0]);
                        }
                    }
                }
                // if expanded url or punctuation go over each node, otherwise let TreeWalker catch up
                if (!isURL && !puncSplit && !dashSplit) {
                    tw.setCurrentNode((Node) t);
                } else {
                    Node n = tw.previousNode();
                    // if the first node in doc is an email or web address, account for this
                    if (n == null) {
                        URLFirst = true;
                    }
                    puncSplit = false;
                    dashSplit = false;
                }
            }
        }
    }

    /**
     * Joins the parts with single spaces and drops every comma and square bracket, exactly like
     * {@code Arrays.toString(parts).replaceAll("[,\\]\\[]", "")}.
     */
    private static String joinWithSpaces(String[] parts) {
        return Arrays.toString(parts).replaceAll("[,\\]\\[]", "");
    }

    /** Spells out every digit of the given digit string separately, each followed by a space. */
    private String spellDigits(String digits) {
        StringBuilder spelled = new StringBuilder();
        for (char c : digits.toCharArray()) {
            spelled.append(expandNumber(Double.parseDouble(String.valueOf(c)))).append(' ');
        }
        return spelled.toString();
    }

    /**
     * Spells out a number as a cardinal.
     *
     * @param number
     *            the number to spell out
     * @return the spelled-out number
     */
    protected String expandNumber(double number) {
        // Pass the rule set per call instead of mutating the formatter's default rule set.
        return this.rbnf.format(number, cardinalRule);
    }

    /**
     * Spells out a number as an ordinal.
     *
     * @param number
     *            the number to spell out
     * @return the spelled-out ordinal
     */
    protected String expandOrdinal(double number) {
        return this.rbnf.format(number, ordinalRule);
    }

    /**
     * Spells out a number as a year.
     *
     * @param number
     *            the year to spell out
     * @return the spelled-out year
     */
    protected String expandYear(double number) {
        return this.rbnf.format(number, yearRule);
    }

    /**
     * Expands a duration of the form hours:minutes:seconds(:milliseconds).
     *
     * @param duration
     *            the token to be expanded; must contain a match of the duration pattern
     * @return the spoken form, with "and" before the last component and a trailing space
     */
    protected String expandDuration(String duration) {
        Matcher durMatcher = durationPattern.matcher(duration);
        durMatcher.find();
        String hrs = expandNumber(Double.parseDouble(durMatcher.group(1))) + " hours ";
        String mins = expandNumber(Double.parseDouble(durMatcher.group(2))) + " minutes ";
        String secs = expandNumber(Double.parseDouble(durMatcher.group(3))) + " seconds ";
        String ms = "";
        if (durMatcher.group(4) != null) {
            ms = "and " + expandNumber(Double.parseDouble(durMatcher.group(5))) + " milliseconds ";
        } else {
            secs = "and " + secs;
        }
        return hrs + mins + secs + ms;
    }

    /**
     * Expands an acronym by replacing every dot with a space.
     *
     * @param acronym
     *            the token to be expanded
     * @return the acronym with dots replaced by spaces
     */
    protected String expandAcronym(String acronym) {
        return acronym.replaceAll("\\.", " ");
    }

    /***
     * expand a URL string partially by splitting by @, / and . symbols (but retaining them)
     *
     * @param email
     *            the web or email address chunk to split
     * @return the pieces and the retained separators, joined by spaces
     */
    protected String expandURL(String email) {
        String[] tokens = email.split("((?<=[\\.@\\/])|(?=[\\.@\\/]))");
        return joinWithSpaces(tokens);
    }

    /**
     * Expands a year followed by an AD/BC variation.
     *
     * @param year
     *            the token to be expanded; must contain a match of the year pattern
     * @return the spelled-out year followed by the era letters separated by spaces
     */
    protected String expandYearBCAD(String year) {
        Matcher yearMatcher = yearPattern.matcher(year);
        yearMatcher.find();
        String era = yearMatcher.group(2);
        String abbrev;
        if (era.contains(".")) {
            abbrev = joinWithSpaces(era.split("\\."));
        } else {
            abbrev = expandConsonants(era);
        }
        return expandYear(Double.parseDouble(yearMatcher.group(1))) + " " + abbrev;
    }

    /***
     * add a space between each char of a string
     *
     * @param consonants
     *            consonants
     * @return the characters of the input joined by single spaces
     */
    protected String expandConsonants(String consonants) {
        return Joiner.on(" ").join(Lists.charactersOf(consonants));
    }

    /**
     * Expands a hashtag into "hashtag" plus the tag, inserting spaces at case changes and between letters and
     * digits (e.g. camel case tags). Tags that are entirely lower case or entirely upper case are kept as one word.
     *
     * @param hashtag
     *            the token to be expanded; must contain a match of the hashtag pattern
     * @return the spoken form of the hashtag
     */
    protected String expandHashtag(String hashtag) {
        Matcher hashTagMatcher = hashtagPattern.matcher(hashtag);
        hashTagMatcher.find();
        String tag = hashTagMatcher.group(2);
        String expandedTag = tag;
        // was "||", which is always true; only mixed tags need to be split
        if (!tag.matches("[a-z]+") && !tag.matches("[A-Z]+")) {
            StringBuilder spaced = new StringBuilder();
            for (char c : tag.toCharArray()) {
                boolean first = spaced.length() == 0;
                char last = first ? '\0' : spaced.charAt(spaced.length() - 1);
                if (Character.isDigit(c)) {
                    // start a new word when a digit follows a non-digit
                    if (!first && !Character.isDigit(last)) {
                        spaced.append(' ');
                    }
                } else if (!first && Character.isUpperCase(c)) {
                    // keep runs of capitals together, otherwise a capital starts a new word
                    if (!Character.isUpperCase(last)) {
                        spaced.append(' ');
                    }
                } else if (!first && Character.isAlphabetic(c)) {
                    // a letter directly after a digit starts a new word
                    if (Character.isDigit(last)) {
                        spaced.append(' ');
                    }
                }
                spaced.append(c);
            }
            expandedTag = spaced.toString();
        }
        return symbols.get(hashTagMatcher.group(1)) + " " + expandedTag;
    }

    /**
     * Expands a number range such as "18-35" into "eighteen to thirty-five".
     *
     * @param range
     *            the token to be expanded; must contain a match of the range pattern
     * @return both numbers spelled out, joined by " to "
     */
    protected String expandRange(String range) {
        Matcher rangeMatcher = rangePattern.matcher(range);
        rangeMatcher.find();
        return expandNumber(Double.parseDouble(rangeMatcher.group(1))) + " to "
                + expandNumber(Double.parseDouble(rangeMatcher.group(2)));
    }

    /***
     * expands a digit followed by an s. e.g. 7s and 8s and the 60s
     *
     * @param numberS
     *            numberS
     * @return the spelled-out number in plural form
     */
    protected String expandNumberS(String numberS) {
        Matcher numberSMatcher = numberSPattern.matcher(numberS);
        numberSMatcher.find();
        String number = expandNumber(Double.parseDouble(numberSMatcher.group(1)));
        if (number.endsWith("x")) {
            number += "es";
        } else if (number.endsWith("y")) {
            // only the final y becomes "ies"; earlier ones ("twenty thousand twenty") must stay untouched
            number = number.substring(0, number.length() - 1) + "ies";
        } else {
            number += "s";
        }
        return number;
    }

    /**
     * Splits a contraction in front of its apostrophe.
     *
     * @param contraction
     *            a word containing an apostrophe
     * @return the part before the apostrophe, a space, and the rest starting with the apostrophe
     */
    protected String splitContraction(String contraction) {
        int aposIndex = contraction.indexOf("'");
        String lemma = contraction.substring(0, aposIndex);
        String end = contraction.substring(aposIndex);
        return lemma + " " + end;
    }

    /***
     *
     * @param abbrev
     *            the token to be expanded
     * @param isCapital
     *            whether the following token begins with a capital letter
     * @return the expansion from the abbreviation map, or the unchanged token if it is not a known abbreviation
     */
    protected String expandAbbreviation(String abbrev, boolean isCapital) {
        String key = abbrev.replaceAll("\\.", "").toLowerCase(Locale.ENGLISH);
        if (!abbrevMap.containsKey(key)) {
            logger.warn(String.format("Could not expand unknown abbreviation \"%s\", ignoring", abbrev));
            return abbrev;
        }
        String expAbb = (String) this.abbrevMap.get(key);
        // two expansions: the first one is used before a capitalised token, the second one otherwise
        String[] multiExp = expAbb.split(",");
        if (multiExp.length > 1) {
            expAbb = isCapital ? multiExp[0] : multiExp[1];
        }
        return expAbb;
    }

    /**
     * Expands a date into month name, spelled-out ordinal day and spelled-out year.
     *
     * @param date
     *            the token to be expanded, in "month/day/year" order
     * @return the spoken form of the date
     * @throws ParseException
     *             if the date cannot be parsed
     */
    protected String expandDate(String date) throws ParseException {
        // date format is "month/day/year"
        Date humanDate = DateFormat.getPatternInstance("MM.dd.yyyy", ULocale.ENGLISH).parse(date);
        String[] dateParts = df.format(humanDate).replaceAll(",", "").split("\\s");
        dateParts[1] = expandOrdinal(Double.parseDouble(dateParts[1]));
        dateParts[2] = expandYear(Double.parseDouble(dateParts[2]));
        return joinWithSpaces(dateParts);
    }

    /***
     *
     * @param time
     *            the token to be expanded
     * @param isNextTokenTime
     *            whether the following token contains am or pm
     * @return the spoken form of the time; an "a m"/"p m" marker is only added when the following token does not
     *         already provide one
     */
    protected String expandTime(String time, boolean isNextTokenTime) {
        Matcher timeMatch = timePattern.matcher(time);
        timeMatch.find();
        StringBuilder theTime = new StringBuilder();
        boolean pastNoon = false;

        // hour: groups 2 and 3 cover 0-11, groups 4 and 5 cover 12-23
        if (timeMatch.group(2) != null || timeMatch.group(3) != null) {
            String hour = (timeMatch.group(2) != null) ? timeMatch.group(2) : timeMatch.group(3);
            double hourValue = Double.parseDouble(hour);
            // midnight is read as twelve, whether written "00" or "0"
            theTime.append(expandNumber(hourValue == 0 ? 12 : hourValue));
        } else {
            pastNoon = true;
            String hour = (timeMatch.group(4) != null) ? timeMatch.group(4) : timeMatch.group(5);
            double pmHour = Double.parseDouble(hour) - 12;
            theTime.append(expandNumber(pmHour == 0 ? 12 : pmHour));
        }

        // minutes: nothing for "00", leading zero is read as "oh"
        String minutes = timeMatch.group(6);
        if (!minutes.equals("00")) {
            theTime.append(minutes.startsWith("0") ? " oh " : " ");
            theTime.append(expandNumber(Double.parseDouble(minutes)));
        }

        // am/pm marker, unless the next token supplies it
        if (!isNextTokenTime) {
            String suffix = timeMatch.group(7);
            if (suffix != null) {
                for (char c : suffix.replaceAll("\\.", "").toCharArray()) {
                    theTime.append(' ').append(c);
                }
            } else {
                theTime.append(!pastNoon ? " a m" : " p m");
            }
        }
        return theTime.toString();
    }

    /**
     * Expands a real number: optional minus sign, integer part, and a decimal part that is read digit by digit,
     * optionally followed by "per cent".
     *
     * @param number
     *            the token to be expanded
     * @return the spoken form of the number
     */
    protected String expandRealNumber(String number) {
        Matcher realNumMatch = realNumPattern.matcher(number);
        realNumMatch.find();
        StringBuilder newTok = new StringBuilder();
        if (realNumMatch.group(1) != null) {
            newTok.append("minus ");
        }
        if (realNumMatch.group(2) != null) {
            newTok.append(expandNumber(Double.parseDouble(realNumMatch.group(2)))).append(' ');
        }
        if (realNumMatch.group(3) != null) {
            newTok.append("point ").append(spellDigits(realNumMatch.group(4)));
            if (realNumMatch.group(5) != null) {
                newTok.append("per cent");
            }
        }
        return newTok.toString().trim();
    }

    /**
     * Expands a token mixing letters and digits: the token is split wherever letters and digits meet and each digit
     * group is read digit by digit.
     *
     * @param wordnumseq
     *            the token to be expanded
     * @return the groups joined by spaces
     */
    protected String expandWordNumber(String wordnumseq) {
        String[] groups = wordnumseq.split("(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)");
        for (int i = 0; i < groups.length; i++) {
            if (digitsPattern.matcher(groups[i]).matches()) {
                groups[i] = spellDigits(groups[i]);
            }
        }
        return joinWithSpaces(groups);
    }

    /**
     * Expands an amount of money for the currencies $, £ and €.
     *
     * @param money
     *            the amount token (digits with an optional decimal part)
     * @param currency
     *            the currency symbol that preceded the amount
     * @return the spoken form, or the unchanged amount (with a logged warning) if the amount or currency cannot be
     *         handled
     */
    protected String expandMoney(String money, String currency) {
        Matcher currencyMatch = moneyPattern.matcher(money);
        if (!currencyMatch.find()) {
            // e.g. a lone "-" following a currency symbol: nothing to expand
            logger.warn(String.format("Could not expand amount [%s] for currency [%s]", money, currency));
            return money;
        }
        double wholeValue = Double.parseDouble(currencyMatch.group(1));
        String whole = expandNumber(wholeValue);
        // digits after the decimal point, or null if there is no decimal part
        String fraction = currencyMatch.group(2) == null ? null : currencyMatch.group(2).substring(1);

        switch (currency) {
        case "$":
            // only exactly one is singular ("zero dollars", "one dollar", "two dollars")
            return whole + (wholeValue != 1 ? " dollars" : " dollar") + expandFraction(fraction, "cents");
        case "£":
            return whole + " pound sterling" + expandFraction(fraction, "pence");
        case "€":
            return whole + " euro" + expandFraction(fraction, "cents");
        default:
            logger.warn(String.format("Could not expand amount [%s] for currency [%s]", money, currency));
            return money;
        }
    }

    /** Returns " <spelled-out fraction> <unit>", or the empty string if there is no fractional part. */
    private String expandFraction(String fraction, String unit) {
        if (fraction == null) {
            return "";
        }
        return " " + expandNumber(Double.parseDouble(fraction)) + " " + unit;
    }

    /**
     * Try to extract the rule name for "expand ordinal" from the given RuleBasedNumberFormat.
     * <p>
     * The rule name is locale sensitive, but usually starts with "%spellout-ordinal".
     *
     * @param rbnf
     *            The RuleBasedNumberFormat from where we will try to extract the rule name.
     * @return The rule name for "ordinal spell out".
     * @throws UnsupportedOperationException
     *             if no matching rule set exists
     */
    protected static String getOrdinalRuleName(final RuleBasedNumberFormat rbnf) {
        List<String> l = Arrays.asList(rbnf.getRuleSetNames());
        if (l.contains("%spellout-ordinal")) {
            return "%spellout-ordinal";
        } else if (l.contains("%spellout-ordinal-masculine")) {
            return "%spellout-ordinal-masculine";
        } else {
            for (String string : l) {
                if (string.startsWith("%spellout-ordinal")) {
                    return string;
                }
            }
        }
        throw new UnsupportedOperationException("The locale " + rbnf.getLocale(ULocale.ACTUAL_LOCALE)
                + " doesn't support ordinal spelling.");
    }

    /**
     * Try to extract the rule name for "expand year" from the given RuleBasedNumberFormat.
     * <p>
     * The rule name is locale sensitive, but usually starts with "%spellout-numbering-year".
     *
     * @param rbnf
     *            The RuleBasedNumberFormat from where we will try to extract the rule name.
     * @return The rule name for "year spell out".
     * @throws UnsupportedOperationException
     *             if no matching rule set exists
     */
    protected static String getYearRuleName(final RuleBasedNumberFormat rbnf) {
        List<String> l = Arrays.asList(rbnf.getRuleSetNames());
        if (l.contains("%spellout-numbering-year")) {
            return "%spellout-numbering-year";
        } else {
            for (String string : l) {
                if (string.startsWith("%spellout-numbering-year")) {
                    return string;
                }
            }
        }
        throw new UnsupportedOperationException("The locale " + rbnf.getLocale(ULocale.ACTUAL_LOCALE)
                + " doesn't support year spelling.");
    }

    /**
     * Loads the abbreviation expansions from the properties resource {@code preprocess/abbrev.dat}, resolved
     * relative to this class.
     *
     * @return the abbreviation map (a {@link Properties} instance)
     * @throws IOException
     *             if the resource is missing or cannot be read
     */
    public static Map<Object, Object> loadAbbrevMap() throws IOException {
        Properties abbMap = new Properties();
        try (InputStream in = Preprocess.class.getResourceAsStream(ABBREV_RESOURCE)) {
            if (in == null) {
                throw new IOException("Abbreviation resource " + ABBREV_RESOURCE + " not found");
            }
            abbMap.load(in);
        }
        return abbMap;
    }
}