/**
 * Portions Copyright 2001-2003 Sun Microsystems, Inc.
 * Portions Copyright 1999-2001 Language Technologies Institute,
 * Carnegie Mellon University.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * See the file "license.terms" for information on usage and
 * redistribution of this file, and for a DISCLAIMER OF ALL
 * WARRANTIES.
 */
package edu.cmu.sphinx.alignment;

import java.io.IOException;
import java.math.BigInteger;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.cmu.sphinx.alignment.tokenizer.CharTokenizer;
import edu.cmu.sphinx.alignment.tokenizer.DecisionTree;
import edu.cmu.sphinx.alignment.tokenizer.FeatureSet;
import edu.cmu.sphinx.alignment.tokenizer.Item;
import edu.cmu.sphinx.alignment.tokenizer.NumberExpander;
import edu.cmu.sphinx.alignment.tokenizer.PrefixFSM;
import edu.cmu.sphinx.alignment.tokenizer.PronounceableFSM;
import edu.cmu.sphinx.alignment.tokenizer.Relation;
import edu.cmu.sphinx.alignment.tokenizer.SuffixFSM;
import edu.cmu.sphinx.alignment.tokenizer.Utterance;
import edu.cmu.sphinx.alignment.tokenizer.WordRelation;

/**
 * Provides the definitions for US English whitespace, punctuations,
 * prepunctuation, and postpunctuation symbols. It also contains a set of
 * regular expressions for the US English language: whitespace, letters in
 * the alphabet, uppercase and lowercase letters, alphanumeric characters,
 * identifiers, integers, doubles, digits, and 'comma and int'.
 *
 * <p>
 * It translates the following code from flite: src/regex/cst_regex.c
 * lang/usenglish/us_text.c
 */
final class UsEnglish {

    /** default whitespace regular expression pattern */
    public static final String RX_DEFAULT_US_EN_WHITESPACE = "[ \n\t\r]+";
    /** default letter regular expression pattern */
    public static final String RX_DEFAULT_US_EN_ALPHABET = "[A-Za-z]+";
    /** default uppercase regular expression pattern */
    public static final String RX_DEFAULT_US_EN_UPPERCASE = "[A-Z]+";
    /** default lowercase regular expression pattern */
    public static final String RX_DEFAULT_US_EN_LOWERCASE = "[a-z]+";
    /** default alpha-numeric regular expression pattern */
    public static final String RX_DEFAULT_US_EN_ALPHANUMERIC = "[0-9A-Za-z]+";
    /** default identifier regular expression pattern */
    public static final String RX_DEFAULT_US_EN_IDENTIFIER =
            "[A-Za-z_][0-9A-Za-z_]+";
    /** default integer regular expression pattern */
    public static final String RX_DEFAULT_US_EN_INT = "-?[0-9]+";
    /** default double regular expression pattern */
    public static final String RX_DEFAULT_US_EN_DOUBLE =
            "-?(([0-9]+\\.[0-9]*)|([0-9]+)|(\\.[0-9]+))([eE][---+]?[0-9]+)?";
    /** default integer with commas regular expression pattern */
    public static final String RX_DEFAULT_US_EN_COMMAINT =
            "[0-9][0-9]?[0-9]?[,']([0-9][0-9][0-9][,'])*[0-9][0-9][0-9](\\.[0-9]+)?";
    /** default digits regular expression pattern */
    public static final String RX_DEFAULT_US_EN_DIGITS = "[0-9][0-9]*";
    /** default dotted abbreviation regular expression pattern */
    public static final String RX_DEFAULT_US_EN_DOTTED_ABBREV =
            "([A-Za-z]\\.)*[A-Za-z]";
    /** default ordinal number regular expression pattern */
    public static final String RX_DEFAULT_US_EN_ORDINAL_NUMBER =
            "[0-9][0-9,]*(th|TH|st|ST|nd|ND|rd|RD)";
    /** default has-vowel regular expression */
    public static final String RX_DEFAULT_HAS_VOWEL = ".*[aeiouAEIOU].*";
    /** default US money regular expression */
    public static final String RX_DEFAULT_US_MONEY = "\\$[0-9,]+(\\.[0-9]+)?";
    /** default -illion regular expression */
    public static final String RX_DEFAULT_ILLION = ".*illion";
    /** default digits2dash (e.g. 999-999-999) regular expression */
    public static final String RX_DEFAULT_DIGITS2DASH =
            "[0-9]+(-[0-9]+)(-[0-9]+)+";
    /** default digits/digits (e.g. 999/999) regular expression */
    public static final String RX_DEFAULT_DIGITSSLASHDIGITS = "[0-9]+/[0-9]+";
    /**
     * default number time regular expression.
     * NOTE(review): the hour part only accepts 00-02 and 10-19; this is kept
     * as-is because it mirrors the translated source, but it looks
     * unintended - confirm before relying on it for general clock times.
     */
    public static final String RX_DEFAULT_NUMBER_TIME =
            "((0[0-2])|(1[0-9])):([0-5][0-9])";
    /** default Roman numerals regular expression */
    public static final String RX_DEFAULT_ROMAN_NUMBER =
            "(II?I?|IV|VI?I?I?|IX|X[VIX]*)";
    /** default drst "Dr. St" regular expression */
    public static final String RX_DEFAULT_DRST = "([dD][Rr]|[Ss][Tt])";
    /** default numess */
    public static final String RX_DEFAULT_NUMESS = "[0-9]+s";
    /** default 7-digit phone number */
    public static final String RX_DEFAULT_SEVEN_DIGIT_PHONE_NUMBER =
            "[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]";
    /** default 4-digit number */
    public static final String RX_DEFAULT_FOUR_DIGIT = "[0-9][0-9][0-9][0-9]";
    /** default 3-digit number */
    public static final String RX_DEFAULT_THREE_DIGIT = "[0-9][0-9][0-9]";

    // The following twelve fields are intentionally left non-final so that
    // existing code assigning to them keeps compiling. Note that
    // USEnglishTokenizer compiles its patterns once, when that class is
    // initialized, so later assignments do not affect it.

    /** whitespace regular expression pattern */
    public static String RX_WHITESPACE = RX_DEFAULT_US_EN_WHITESPACE;
    /** letter regular expression pattern */
    public static String RX_ALPHABET = RX_DEFAULT_US_EN_ALPHABET;
    /** uppercase regular expression pattern */
    public static String RX_UPPERCASE = RX_DEFAULT_US_EN_UPPERCASE;
    /** lowercase regular expression pattern */
    public static String RX_LOWERCASE = RX_DEFAULT_US_EN_LOWERCASE;
    /** alphanumeric regular expression pattern */
    public static String RX_ALPHANUMERIC = RX_DEFAULT_US_EN_ALPHANUMERIC;
    /** identifier regular expression pattern */
    public static String RX_IDENTIFIER = RX_DEFAULT_US_EN_IDENTIFIER;
    /** integer regular expression pattern */
    public static String RX_INT = RX_DEFAULT_US_EN_INT;
    /** double regular expression pattern */
    public static String RX_DOUBLE = RX_DEFAULT_US_EN_DOUBLE;
    /** comma separated integer regular expression pattern */
    public static String RX_COMMAINT = RX_DEFAULT_US_EN_COMMAINT;
    /** digits regular expression pattern */
    public static String RX_DIGITS = RX_DEFAULT_US_EN_DIGITS;
    /** dotted abbreviation regular expression pattern */
    public static String RX_DOTTED_ABBREV = RX_DEFAULT_US_EN_DOTTED_ABBREV;
    /** ordinal number regular expression pattern */
    public static String RX_ORDINAL_NUMBER = RX_DEFAULT_US_EN_ORDINAL_NUMBER;

    /** has-vowel regular expression */
    public static final String RX_HAS_VOWEL = RX_DEFAULT_HAS_VOWEL;
    /** US money regular expression */
    public static final String RX_US_MONEY = RX_DEFAULT_US_MONEY;
    /** -illion regular expression */
    public static final String RX_ILLION = RX_DEFAULT_ILLION;
    /** digits2dash (e.g. 999-999-999) regular expression */
    public static final String RX_DIGITS2DASH = RX_DEFAULT_DIGITS2DASH;
    /** digits/digits (e.g. 999/999) regular expression */
    public static final String RX_DIGITSSLASHDIGITS =
            RX_DEFAULT_DIGITSSLASHDIGITS;
    /** number time regular expression */
    public static final String RX_NUMBER_TIME = RX_DEFAULT_NUMBER_TIME;
    /** Roman numerals regular expression */
    public static final String RX_ROMAN_NUMBER = RX_DEFAULT_ROMAN_NUMBER;
    /** drst "Dr. St" regular expression */
    public static final String RX_DRST = RX_DEFAULT_DRST;
    /** numess regular expression */
    public static final String RX_NUMESS = RX_DEFAULT_NUMESS;
    /** 7-digit phone number */
    public static final String RX_SEVEN_DIGIT_PHONE_NUMBER =
            RX_DEFAULT_SEVEN_DIGIT_PHONE_NUMBER;
    /** 4-digit number */
    public static final String RX_FOUR_DIGIT = RX_DEFAULT_FOUR_DIGIT;
    /** 3-digit number */
    public static final String RX_THREE_DIGIT = RX_DEFAULT_THREE_DIGIT;

    // the following symbols are from lang/usenglish/us_text.c

    /** punctuation symbols */
    public static final String PUNCTUATION_SYMBOLS = "\"'`.,:;!?(){}[]";
    /** pre-punctuation symbols */
    public static final String PREPUNCTUATION_SYMBOLS = "\"'`({[";
    /** single char symbols */
    public static final String SINGLE_CHAR_SYMBOLS = "";
    /** whitespace symbols */
    public static final String WHITESPACE_SYMBOLS = " \t\n\r";

    /**
     * Not constructable
     */
    private UsEnglish() {}
}

/**
 * Converts the Tokens (in US English words) in an Utterance into a list of
 * words. It puts the produced list back into the Utterance. Usually, the
 * tokens that gets expanded are numbers like "23" (to "twenty" "three").
 * <p>
 * It translates the following code from flite: <br>
 * <code>
 * lang/usenglish/us_text.c
 * </code>
 * <p>
 * An instance keeps the token and word relation currently being processed
 * in fields, so a single instance must not be used to expand several texts
 * at the same time.
 */
public class USEnglishTokenizer implements TextTokenizer {

    // Patterns for regular expression matching, compiled once per class load.
    private static final Pattern alphabetPattern =
            Pattern.compile(UsEnglish.RX_ALPHABET);
    private static final Pattern commaIntPattern =
            Pattern.compile(UsEnglish.RX_COMMAINT);
    private static final Pattern digits2DashPattern =
            Pattern.compile(UsEnglish.RX_DIGITS2DASH);
    private static final Pattern digitsPattern =
            Pattern.compile(UsEnglish.RX_DIGITS);
    private static final Pattern digitsSlashDigitsPattern =
            Pattern.compile(UsEnglish.RX_DIGITSSLASHDIGITS);
    private static final Pattern dottedAbbrevPattern =
            Pattern.compile(UsEnglish.RX_DOTTED_ABBREV);
    private static final Pattern doublePattern =
            Pattern.compile(UsEnglish.RX_DOUBLE);
    private static final Pattern drStPattern =
            Pattern.compile(UsEnglish.RX_DRST);
    private static final Pattern fourDigitsPattern =
            Pattern.compile(UsEnglish.RX_FOUR_DIGIT);
    private static final Pattern illionPattern =
            Pattern.compile(UsEnglish.RX_ILLION);
    private static final Pattern numberTimePattern =
            Pattern.compile(UsEnglish.RX_NUMBER_TIME);
    private static final Pattern numessPattern =
            Pattern.compile(UsEnglish.RX_NUMESS);
    private static final Pattern ordinalPattern =
            Pattern.compile(UsEnglish.RX_ORDINAL_NUMBER);
    private static final Pattern romanNumbersPattern =
            Pattern.compile(UsEnglish.RX_ROMAN_NUMBER);
    private static final Pattern sevenPhoneNumberPattern =
            Pattern.compile(UsEnglish.RX_SEVEN_DIGIT_PHONE_NUMBER);
    private static final Pattern threeDigitsPattern =
            Pattern.compile(UsEnglish.RX_THREE_DIGIT);
    private static final Pattern usMoneyPattern =
            Pattern.compile(UsEnglish.RX_US_MONEY);

    // King-like words
    private static final String[] kingNames = {"louis", "henry", "charles",
            "philip", "george", "edward", "pius", "william", "richard",
            "ptolemy", "john", "paul", "peter", "nicholas", "frederick",
            "james", "alfonso", "ivan", "napoleon", "leo", "gregory",
            "catherine", "alexandria", "pierre", "elizabeth", "mary", "elmo",
            "erasmus"};

    private static final String[] kingTitles = {"king", "queen", "pope",
            "duke", "tsar", "emperor", "shah", "caesar", "duchess", "tsarina",
            "empress", "baron", "baroness", "sultan", "count", "countess"};

    // Section-like words
    private static final String[] sectionTypes = {"section", "chapter",
            "part", "phrase", "verse", "scene", "act", "book", "volume",
            "chap", "war", "apollo", "trek", "fortran"};

    private static final String KING_NAMES = "kingNames";
    private static final String KING_TITLES = "kingTitles";
    private static final String SECTION_TYPES = "sectionTypes";

    /**
     * Maps a lower-case word to the category it belongs to (one of
     * {@link #KING_NAMES}, {@link #KING_TITLES} or {@link #SECTION_TYPES}).
     * A map gives constant time matching, instead of chaining
     * A.equals(B) || A.equals(C) || ... to match Strings.
     */
    private static final Map<String, String> kingSectionLikeMap =
            buildKingSectionLikeMap();

    // List of US states abbreviations and their full names. Each row is
    // { abbreviation, "ambiguous" or "", word(s) of the full name... }.
    private static final String[][] usStates = {
            {"AL", "ambiguous", "alabama"},
            {"Al", "ambiguous", "alabama"},
            {"Ala", "", "alabama"},
            {"AK", "", "alaska"},
            {"Ak", "", "alaska"},
            {"AZ", "", "arizona"},
            {"Az", "", "arizona"},
            {"CA", "", "california"},
            {"Ca", "", "california"},
            {"Cal", "ambiguous", "california"},
            {"Calif", "", "california"},
            {"CO", "ambiguous", "colorado"},
            {"Co", "ambiguous", "colorado"},
            {"Colo", "", "colorado"},
            {"DC", "", "d", "c"},
            {"DE", "", "delaware"},
            {"De", "ambiguous", "delaware"},
            {"Del", "ambiguous", "delaware"},
            {"FL", "", "florida"},
            {"Fl", "ambiguous", "florida"},
            {"Fla", "", "florida"},
            {"GA", "", "georgia"},
            {"Ga", "", "georgia"},
            {"HI", "ambiguous", "hawaii"},
            {"Hi", "ambiguous", "hawaii"},
            {"IA", "", "iowa"},
            {"Ia", "ambiguous", "iowa"},
            {"IN", "ambiguous", "indiana"},
            {"In", "ambiguous", "indiana"},
            {"Ind", "ambiguous", "indiana"},
            {"ID", "ambiguous", "idaho"},
            {"IL", "ambiguous", "illinois"},
            {"Il", "ambiguous", "illinois"},
            {"ILL", "ambiguous", "illinois"},
            {"KS", "", "kansas"},
            {"Ks", "", "kansas"},
            {"Kans", "", "kansas"},
            {"KY", "ambiguous", "kentucky"},
            {"Ky", "ambiguous", "kentucky"},
            {"LA", "ambiguous", "louisiana"},
            {"La", "ambiguous", "louisiana"},
            {"Lou", "ambiguous", "louisiana"},
            {"Lous", "ambiguous", "louisiana"},
            {"MA", "ambiguous", "massachusetts"},
            {"Mass", "ambiguous", "massachusetts"},
            {"Ma", "ambiguous", "massachusetts"},
            {"MD", "ambiguous", "maryland"},
            {"Md", "ambiguous", "maryland"},
            {"ME", "ambiguous", "maine"},
            {"Me", "ambiguous", "maine"},
            {"MI", "", "michigan"},
            {"Mi", "ambiguous", "michigan"},
            {"Mich", "ambiguous", "michigan"},
            {"MN", "ambiguous", "minnesota"},
            {"Minn", "ambiguous", "minnesota"},
            {"MS", "ambiguous", "mississippi"},
            {"Miss", "ambiguous", "mississippi"},
            {"MT", "ambiguous", "montana"},
            {"Mt", "ambiguous", "montana"},
            {"MO", "ambiguous", "missouri"},
            {"Mo", "ambiguous", "missouri"},
            {"NC", "ambiguous", "north", "carolina"},
            {"ND", "ambiguous", "north", "dakota"},
            {"NE", "ambiguous", "nebraska"},
            {"Ne", "ambiguous", "nebraska"},
            {"Neb", "ambiguous", "nebraska"},
            {"NH", "ambiguous", "new", "hampshire"},
            {"NV", "", "nevada"},
            {"Nev", "", "nevada"},
            {"NY", "", "new", "york"},
            {"OH", "ambiguous", "ohio"},
            {"OK", "ambiguous", "oklahoma"},
            {"Okla", "", "oklahoma"},
            {"OR", "ambiguous", "oregon"},
            {"Or", "ambiguous", "oregon"},
            {"Ore", "ambiguous", "oregon"},
            {"PA", "ambiguous", "pennsylvania"},
            {"Pa", "ambiguous", "pennsylvania"},
            {"Penn", "ambiguous", "pennsylvania"},
            {"RI", "ambiguous", "rhode", "island"},
            {"SC", "ambiguous", "south", "carolina"},
            {"SD", "ambiguous", "south", "dakota"},
            {"TN", "ambiguous", "tennessee"},
            {"Tn", "ambiguous", "tennessee"},
            {"Tenn", "ambiguous", "tennessee"},
            {"TX", "ambiguous", "texas"},
            {"Tx", "ambiguous", "texas"},
            {"Tex", "ambiguous", "texas"},
            {"UT", "ambiguous", "utah"},
            {"VA", "ambiguous", "virginia"},
            {"WA", "ambiguous", "washington"},
            {"Wa", "ambiguous", "washington"},
            {"Wash", "ambiguous", "washington"},
            {"WI", "ambiguous", "wisconsin"},
            {"Wi", "ambiguous", "wisconsin"},
            {"WV", "ambiguous", "west", "virginia"},
            {"WY", "ambiguous", "wyoming"},
            {"Wy", "ambiguous", "wyoming"},
            {"Wyo", "", "wyoming"},
            {"PR", "ambiguous", "puerto", "rico"}};

    /** Maps a state abbreviation to its row in {@link #usStates}. */
    private static final Map<String, String[]> usStatesMap =
            buildUsStatesMap();

    // Finite state machines to check if a Token is pronounceable
    private final PronounceableFSM prefixFSM;
    private final PronounceableFSM suffixFSM;

    // a CART for classifying numbers
    private final DecisionTree cart;

    // the word relation that we are building
    private WordRelation wordRelation;

    // the current token Item
    private Item tokenItem;

    /**
     * Constructs a default USEnglishTokenizer, loading the number
     * classification tree and the prefix/suffix state machines from the
     * resources "nums_cart.txt", "prefix_fsm.txt" and "suffix_fsm.txt"
     * located next to this class.
     *
     * @throws IllegalStateException if a resource is missing or cannot be
     *         read
     */
    public USEnglishTokenizer() {
        try {
            cart = new DecisionTree(resource("nums_cart.txt"));
            prefixFSM = new PrefixFSM(resource("prefix_fsm.txt"));
            suffixFSM = new SuffixFSM(resource("suffix_fsm.txt"));
        } catch (IOException e) {
            throw new IllegalStateException("resources not found", e);
        }
    }

    /**
     * Looks up a resource relative to this class (not relative to a possible
     * subclass, which may live in another package).
     *
     * @param name the resource file name
     * @return the resource location, never null
     * @throws IllegalStateException if the resource does not exist
     */
    private static URL resource(String name) {
        URL url = USEnglishTokenizer.class.getResource(name);
        if (url == null) {
            throw new IllegalStateException("resources not found: " + name);
        }
        return url;
    }

    /** Builds the word-to-category lookup used by the king/section checks. */
    private static Map<String, String> buildKingSectionLikeMap() {
        Map<String, String> map = new HashMap<String, String>();
        for (String name : kingNames) {
            map.put(name, KING_NAMES);
        }
        for (String title : kingTitles) {
            map.put(title, KING_TITLES);
        }
        for (String type : sectionTypes) {
            map.put(type, SECTION_TYPES);
        }
        return map;
    }

    /** Builds the abbreviation-to-row lookup over {@link #usStates}. */
    private static Map<String, String[]> buildUsStatesMap() {
        Map<String, String[]> map = new HashMap<String, String[]>();
        for (String[] state : usStates) {
            map.put(state[0], state);
        }
        return map;
    }

    /**
     * Lower-cases independently of the default locale, so that lookups
     * against the lower-case English tables above behave the same on every
     * machine (e.g. "I" must become "i" even under a Turkish locale).
     */
    private static String lower(String text) {
        return text.toLowerCase(Locale.ROOT);
    }

    /**
     * Returns the first character of the given string, or the NUL character
     * if the string is null or empty.
     */
    private static char firstChar(String text) {
        return (text == null || text.isEmpty()) ? '\0' : text.charAt(0);
    }

    /**
     * Returns the currently processing token Item.
     *
     * @return the current token Item; null if no item
     */
    public Item getTokenItem() {
        return tokenItem;
    }

    /**
     * Tokenizes the given text and expands every token into words.
     *
     * @param text the text to process
     * @return the list of words, without empty words and without words
     *         containing the '#' character
     * @throws IllegalStateException if the utterance has no token relation
     */
    public List<String> expand(String text) {
        CharTokenizer tokenizer = new CharTokenizer();
        tokenizer.setWhitespaceSymbols(UsEnglish.WHITESPACE_SYMBOLS);
        tokenizer.setSingleCharSymbols(UsEnglish.SINGLE_CHAR_SYMBOLS);
        tokenizer.setPrepunctuationSymbols(UsEnglish.PREPUNCTUATION_SYMBOLS);
        tokenizer.setPostpunctuationSymbols(UsEnglish.PUNCTUATION_SYMBOLS);
        tokenizer.setInputText(simplifyChars(text));

        Utterance utterance = new Utterance(tokenizer);
        Relation tokenRelation = utterance.getRelation(Relation.TOKEN);
        if (tokenRelation == null) {
            throw new IllegalStateException("token relation does not exist");
        }

        wordRelation = WordRelation.createWordRelation(utterance, this);

        for (tokenItem = tokenRelation.getHead(); tokenItem != null;
                tokenItem = tokenItem.getNext()) {
            // convert the token into a list of words
            tokenToWords(tokenItem.getFeatures().getString("name"));
        }

        List<String> words = new ArrayList<String>();
        for (Item item = utterance.getRelation(Relation.WORD).getHead();
                item != null; item = item.getNext()) {
            String word = item.toString();
            if (!word.isEmpty() && !word.contains("#")) {
                words.add(word);
            }
        }
        return words;
    }

    /**
     * Replaces typographic characters by plain equivalents: curly single
     * quotes become an apostrophe, curly double quotes and guillemets become
     * a double quote, the en dash becomes a hyphen, and the em dash,
     * ellipsis and form feed become a space. The characters are written as
     * escapes so the result does not depend on the source file encoding.
     *
     * @param text the text to simplify
     * @return the simplified text
     */
    private String simplifyChars(String text) {
        return text
                .replace('\u2019', '\'')   // right single quotation mark
                .replace('\u2018', '\'')   // left single quotation mark
                .replace('\u201D', '"')    // right double quotation mark
                .replace('\u201C', '"')    // left double quotation mark
                .replace('\u00BB', '"')    // right-pointing guillemet
                .replace('\u00AB', '"')    // left-pointing guillemet
                .replace('\u2013', '-')    // en dash
                .replace('\u2014', ' ')    // em dash
                .replace('\u2026', ' ')    // horizontal ellipsis
                .replace('\f', ' ');       // form feed
    }

    /**
     * Returns true if the given token value, together with the names of the
     * tokens around the current token, looks like part of a phone number
     * written as separate 3- and 4-digit groups.
     *
     * @param tokenVal the string value of the token
     *
     * @return true or false
     */
    private boolean matchesPartPhoneNumber(String tokenVal) {
        String nextName = (String) tokenItem.findFeature("n.name");
        String nextNextName = (String) tokenItem.findFeature("n.n.name");
        String prevName = (String) tokenItem.findFeature("p.name");
        String prevPrevName = (String) tokenItem.findFeature("p.p.name");

        boolean prevIsThreeDigits = matches(threeDigitsPattern, prevName);

        if (matches(threeDigitsPattern, tokenVal)) {
            // first group: <this> 999 9999, not preceded by digits
            boolean leadingGroup = !matches(digitsPattern, prevName)
                    && matches(threeDigitsPattern, nextName)
                    && matches(fourDigitsPattern, nextNextName);
            // group followed by a 999-9999 token
            boolean beforeSevenDigits =
                    matches(sevenPhoneNumberPattern, nextName);
            // middle group: 999 <this> 9999
            boolean middleGroup = !matches(digitsPattern, prevPrevName)
                    && prevIsThreeDigits
                    && matches(fourDigitsPattern, nextName);
            return leadingGroup || beforeSevenDigits || middleGroup;
        }

        if (matches(fourDigitsPattern, tokenVal)) {
            // last group: 999 999 <this>, not followed by digits
            return !matches(digitsPattern, nextName)
                    && prevIsThreeDigits
                    && matches(threeDigitsPattern, prevPrevName);
        }

        return false;
    }

    /**
     * Returns true if the given token value is a single upper-case letter
     * that is followed by exactly one space and a token starting with an
     * upper-case letter (e.g. the initial in "John F Kennedy").
     *
     * @param tokenVal the string value of the token
     *
     * @return true or false
     */
    private boolean isInitialBeforeCapitalizedWord(String tokenVal) {
        if (tokenVal.length() != 1
                || !Character.isUpperCase(tokenVal.charAt(0))) {
            return false;
        }
        if (!" ".equals(tokenItem.findFeature("n.whitespace"))) {
            return false;
        }
        String nextName = (String) tokenItem.findFeature("n.name");
        return Character.isUpperCase(firstChar(nextName));
    }

    /**
     * Adds a single-letter initial as a word and clears the punctuation of
     * the current token. The letter "a" is added as "_a".
     *
     * @param tokenVal the single letter
     * @param tokenFeatures the features of the current token
     */
    private void addInitial(String tokenVal, FeatureSet tokenFeatures) {
        tokenFeatures.setString("punc", "");
        String letter = lower(tokenVal);
        wordRelation.addWord(letter.equals("a") ? "_a" : letter);
    }

    /**
     * Converts the given Token into (word) Items in the WordRelation.
     *
     * @param tokenVal the String value of the token, which may or may not be
     *        same as the one in called "name" in flite
     *
     */
    private void tokenToWords(String tokenVal) {
        FeatureSet tokenFeatures = tokenItem.getFeatures();
        String itemName = tokenFeatures.getString("name");
        int tokenLength = tokenVal.length();

        if (tokenFeatures.isPresent("phones")) {
            wordRelation.addWord(tokenVal);

        } else if ((tokenVal.equals("a") || tokenVal.equals("A"))
                && ((tokenItem.getNext() == null)
                        || !(tokenVal.equals(itemName))
                        || !"".equals(tokenItem.findFeature("punc")))) {
            /* if A is a sub part of a token, then its ey not ah */
            wordRelation.addWord("_a");

        } else if (matches(alphabetPattern, tokenVal)) {
            alphabeticTokenToWords(tokenVal, tokenFeatures);

        } else if (matches(dottedAbbrevPattern, tokenVal)) {
            /* U.S.A. */
            // remove all dots
            NumberExpander.expandLetters(tokenVal.replace(".", ""),
                    wordRelation);

        } else if (matches(commaIntPattern, tokenVal)) {
            /* 99,999,999 */
            NumberExpander.expandReal(
                    tokenVal.replace(",", "").replace("'", ""), wordRelation);

        } else if (matches(sevenPhoneNumberPattern, tokenVal)) {
            /* 234-3434 telephone numbers */
            int dashIndex = tokenVal.indexOf('-');
            String exchange = tokenVal.substring(0, dashIndex);
            String subscriber = tokenVal.substring(dashIndex + 1);

            NumberExpander.expandDigits(exchange, wordRelation);
            wordRelation.addBreak();
            NumberExpander.expandDigits(subscriber, wordRelation);

        } else if (matchesPartPhoneNumber(tokenVal)) {
            /* part of a telephone number */
            if ("".equals(tokenItem.findFeature("punc"))) {
                tokenItem.getFeatures().setString("punc", ",");
            }
            NumberExpander.expandDigits(tokenVal, wordRelation);
            wordRelation.addBreak();

        } else if (matches(numberTimePattern, tokenVal)) {
            /* 12:35 */
            int colonIndex = tokenVal.indexOf(':');
            String hours = tokenVal.substring(0, colonIndex);
            String minutes = tokenVal.substring(colonIndex + 1);

            NumberExpander.expandNumber(hours, wordRelation);
            if (!minutes.equals("00")) {
                NumberExpander.expandID(minutes, wordRelation);
            }

        } else if (matches(digits2DashPattern, tokenVal)) {
            /* 999-999-999 */
            digitsDashToWords(tokenVal);

        } else if (matches(digitsPattern, tokenVal)) {
            digitsToWords(tokenVal);

        } else if (isInitialBeforeCapitalizedWord(tokenVal)) {
            // only reachable for upper-case letters outside A-Z, the ASCII
            // ones are handled by the alphabetic branch above
            addInitial(tokenVal, tokenFeatures);

        } else if (matches(doublePattern, tokenVal)) {
            NumberExpander.expandReal(tokenVal, wordRelation);

        } else if (matches(ordinalPattern, tokenVal)) {
            /* explicit ordinals */
            // drop the two-letter suffix and the thousands separators the
            // ordinal pattern allows (e.g. "1,000th")
            String number =
                    tokenVal.substring(0, tokenLength - 2).replace(",", "");
            NumberExpander.expandOrdinal(number, wordRelation);

        } else if (matches(usMoneyPattern, tokenVal)) {
            /* US money */
            usMoneyToWords(tokenVal);

        } else if (tokenLength > 0
                && tokenVal.charAt(tokenLength - 1) == '%') {
            /* Y% */
            tokenToWords(tokenVal.substring(0, tokenLength - 1));
            wordRelation.addWord("percent");

        } else if (matches(numessPattern, tokenVal)) {
            NumberExpander.expandNumess(
                    tokenVal.substring(0, tokenLength - 1), wordRelation);

        } else if (matches(digitsSlashDigitsPattern, tokenVal)
                && tokenVal.equals(itemName)) {
            digitsSlashDigitsToWords(tokenVal);

        } else if (tokenVal.equals("-")) {
            // Skip it. This must be tested before the general dash case
            // below, which would otherwise always take a lone dash.

        } else if (tokenVal.indexOf('-') != -1) {
            dashToWords(tokenVal);

        } else if (tokenLength > 1 && !matches(alphabetPattern, tokenVal)) {
            notJustAlphasToWords(tokenVal);

        } else if (tokenVal.equals("&")) {
            // &
            wordRelation.addWord("and");

        } else {
            // Just a word.
            wordRelation.addWord(lower(tokenVal));
        }
    }

    /**
     * Converts a token consisting only of the letters A-Z and a-z into
     * (word) Items in the WordRelation.
     *
     * @param tokenVal the String value of the token
     * @param tokenFeatures the features of the current token
     */
    private void alphabeticTokenToWords(String tokenVal,
            FeatureSet tokenFeatures) {
        if (matches(romanNumbersPattern, tokenVal)) {
            /* XVIII */
            romanToWords(tokenVal);

        } else if (matches(illionPattern, tokenVal)
                && matches(usMoneyPattern,
                        (String) tokenItem.findFeature("p.name"))) {
            /* $ X -illion */
            wordRelation.addWord(tokenVal);
            wordRelation.addWord("dollars");

        } else if (matches(drStPattern, tokenVal)) {
            /* St Andrew's St, Dr King Dr */
            drStToWords(tokenVal);

        } else if (tokenVal.equals("Mr")) {
            tokenItem.getFeatures().setString("punc", "");
            wordRelation.addWord("mister");

        } else if (tokenVal.equals("Mrs")) {
            tokenItem.getFeatures().setString("punc", "");
            wordRelation.addWord("missus");

        } else if (isInitialBeforeCapitalizedWord(tokenVal)) {
            addInitial(tokenVal, tokenFeatures);

        } else if (isStateName(tokenVal)) {
            /*
             * The name of a US state isStateName() has already added the
             * full name of the state, so we're all set.
             */
        } else if (tokenVal.length() > 1 && !isPronounceable(tokenVal)) {
            /* Need common exception list */
            /* unpronouncable list of alphas */
            NumberExpander.expandLetters(tokenVal, wordRelation);

        } else {
            /* just a word */
            wordRelation.addWord(lower(tokenVal));
        }
    }

    /**
     * Convert the given digit token with dashes (e.g. 999-999-999) into (word)
     * Items in the WordRelation. Each group of digits is read digit by digit
     * and followed by a break.
     *
     * @param tokenVal the digit string
     */
    private void digitsDashToWords(String tokenVal) {
        // limit -1 keeps empty groups, like the character scan this replaces
        for (String group : tokenVal.split("-", -1)) {
            NumberExpander.expandDigits(group, wordRelation);
            wordRelation.addBreak();
        }
    }

    /**
     * Convert the given digit token into (word) Items in the WordRelation.
     * If the current token has the feature "nsw" set to "nide" the digits
     * are expanded as an identifier; otherwise the decision tree decides
     * between ordinal, digits, year and plain number.
     *
     * @param tokenVal the digit string
     */
    private void digitsToWords(String tokenVal) {
        FeatureSet featureSet = tokenItem.getFeatures();
        String nsw =
                featureSet.isPresent("nsw") ? featureSet.getString("nsw") : "";

        if ("nide".equals(nsw)) {
            NumberExpander.expandID(tokenVal, wordRelation);
            return;
        }

        String digitsType = classifyDigits(featureSet, tokenVal);

        if ("ordinal".equals(digitsType)) {
            NumberExpander.expandOrdinal(tokenVal, wordRelation);
        } else if ("digits".equals(digitsType)) {
            NumberExpander.expandDigits(tokenVal, wordRelation);
        } else if ("year".equals(digitsType)) {
            NumberExpander.expandID(tokenVal, wordRelation);
        } else {
            NumberExpander.expandNumber(tokenVal, wordRelation);
        }
    }

    /**
     * Runs the decision tree on the current token as if its name were the
     * given digit string. The real name is put back afterwards, even if the
     * classification fails.
     *
     * @param featureSet the features of the current token
     * @param tokenVal the digit string to classify
     *
     * @return the value produced by the decision tree
     */
    private String classifyDigits(FeatureSet featureSet, String tokenVal) {
        String realName = featureSet.getString("name");
        if (tokenVal.equals(realName)) {
            return (String) cart.interpret(tokenItem);
        }
        featureSet.setString("name", tokenVal);
        try {
            return (String) cart.interpret(tokenItem);
        } finally {
            featureSet.setString("name", realName);
        }
    }

    /**
     * Converts the given Roman numeral string into (word) Items in the
     * WordRelation. Without preceding punctuation the numeral is read as
     * "the" plus an ordinal in a king-like context and as a number in a
     * section-like context; in all other cases it is spelled out.
     *
     * @param romanString the roman numeral string
     */
    private void romanToWords(String romanString) {
        boolean noPrecedingPunctuation =
                "".equals(tokenItem.findFeature("p.punc"));

        if (noPrecedingPunctuation && kingLike(tokenItem)) {
            String n = String.valueOf(NumberExpander.expandRoman(romanString));
            wordRelation.addWord("the");
            NumberExpander.expandOrdinal(n, wordRelation);
        } else if (noPrecedingPunctuation && sectionLike(tokenItem)) {
            String n = String.valueOf(NumberExpander.expandRoman(romanString));
            NumberExpander.expandNumber(n, wordRelation);
        } else {
            NumberExpander.expandLetters(romanString, wordRelation);
        }
    }

    /**
     * Returns true if the given key is in the {@link #kingSectionLikeMap} map,
     * and the value is the same as the given value.
     *
     * @param key key to look for in the map
     * @param value the value to match
     *
     * @return true if it matches, or false if it does not or if the key is not
     *         mapped to any value in the map.
     */
    private static boolean inKingSectionLikeMap(String key, String value) {
        return value.equals(kingSectionLikeMap.get(key));
    }

    /**
     * Returns true if the given token item contains a token that is in a
     * king-like context, e.g., "King" or "Louis": either the previous token
     * is a known name or the token before that is a known title.
     *
     * @param tokenItem the token item to check
     *
     * @return true or false
     */
    public static boolean kingLike(Item tokenItem) {
        String kingName = lower((String) tokenItem.findFeature("p.name"));
        if (inKingSectionLikeMap(kingName, KING_NAMES)) {
            return true;
        }
        String kingTitle = lower((String) tokenItem.findFeature("p.p.name"));
        return inKingSectionLikeMap(kingTitle, KING_TITLES);
    }

    /**
     * Returns true if the given token item contains a token that is in a
     * section-like context, e.g., "chapter" or "act".
     *
     * @param tokenItem the token item to check
     *
     * @return true or false
     */
    public static boolean sectionLike(Item tokenItem) {
        String sectionType = lower((String) tokenItem.findFeature("p.name"));
        return inKingSectionLikeMap(sectionType, SECTION_TYPES);
    }

    /**
     * Converts the given string containing "St" and "Dr" to (word) Items in
     * the WordRelation: "street"/"saint" for St and "drive"/"doctor" for Dr,
     * chosen from the surrounding tokens. A single "." as punctuation of the
     * token is removed afterwards.
     *
     * @param drStString the string with "St" and "Dr"
     */
    private void drStToWords(String drStString) {
        String street;
        String saint;
        char c0 = drStString.charAt(0);

        if (c0 == 's' || c0 == 'S') {
            street = "street";
            saint = "saint";
        } else {
            street = "drive";
            saint = "doctor";
        }

        FeatureSet featureSet = tokenItem.getFeatures();
        String punctuation = featureSet.getString("punc");
        if (punctuation == null) {
            punctuation = "";
        }

        if (tokenItem.getNext() == null || punctuation.indexOf(',') != -1) {
            wordRelation.addWord(street);
        } else if (",".equals(tokenItem.findFeature("punc"))) {
            // NOTE(review): this looks at the same "punc" feature as the
            // test above and therefore seems unable to succeed; possibly
            // the punctuation of the previous token was meant - confirm.
            wordRelation.addWord(saint);
        } else {
            char p0 = firstChar((String) tokenItem.findFeature("p.name"));
            char n0 = firstChar((String) tokenItem.findFeature("n.name"));

            if (Character.isUpperCase(p0) && Character.isLowerCase(n0)) {
                wordRelation.addWord(street);
            } else if (Character.isDigit(p0) && Character.isLowerCase(n0)) {
                wordRelation.addWord(street);
            } else if (Character.isLowerCase(p0) && Character.isUpperCase(n0)) {
                wordRelation.addWord(saint);
            } else if (" ".equals(tokenItem.findFeature("n.whitespace"))) {
                wordRelation.addWord(saint);
            } else {
                wordRelation.addWord(street);
            }
        }

        if (punctuation.equals(".")) {
            featureSet.setString("punc", "");
        }
    }

    /**
     * Converts US money string into (word) Items in the WordRelation.
     *
     * @param tokenVal the US money string, starting with the dollar sign
     */
    private void usMoneyToWords(String tokenVal) {
        int dotIndex = tokenVal.indexOf('.');
        String amount = tokenVal.substring(1);

        if (matches(illionPattern, (String) tokenItem.findFeature("n.name"))) {
            // "$3.5 million": the following -illion token adds "dollars"
            NumberExpander.expandReal(amount, wordRelation);
        } else if (dotIndex == -1) {
            tokenToWords(amount);
            wordRelation.addWord(amount.equals("1") ? "dollar" : "dollars");
        } else if (dotIndex == (tokenVal.length() - 1)
                || (tokenVal.length() - dotIndex) > 3) {
            // Simply read as mumble point mumble.
            NumberExpander.expandReal(amount, wordRelation);
            wordRelation.addWord("dollars");
        } else {
            String dollars = tokenVal.substring(1, dotIndex).replace(",", "");
            String cents = tokenVal.substring(dotIndex + 1);

            NumberExpander.expandNumber(dollars, wordRelation);
            wordRelation.addWord(dollars.equals("1") ? "dollar" : "dollars");

            // "00" cents add nothing to the word list
            if (!cents.equals("00")) {
                NumberExpander.expandNumber(cents, wordRelation);
                wordRelation.addWord(cents.equals("01") ? "cent" : "cents");
            }
        }
    }

    /**
     * Convert the given digits/digits string into word (Items) in the
     * WordRelation. "1/2" becomes "a half", a proper fraction becomes
     * number plus ordinal, anything else is read as "x slash y".
     *
     * @param tokenVal the digits/digits string
     */
    private void digitsSlashDigitsToWords(String tokenVal) {
        /* might be fraction, or not */
        int index = tokenVal.indexOf('/');
        String aaa = tokenVal.substring(0, index);
        String bbb = tokenVal.substring(index + 1);

        // if the previous token is a number, add an "and"
        if (tokenItem.getPrevious() != null
                && matches(digitsPattern,
                        (String) tokenItem.findFeature("p.name"))) {
            wordRelation.addWord("and");
        }

        if (aaa.equals("1") && bbb.equals("2")) {
            wordRelation.addWord("a");
            wordRelation.addWord("half");
            return;
        }

        // BigInteger, because the digit runs may exceed the int range
        BigInteger numerator = new BigInteger(aaa);
        BigInteger denominator = new BigInteger(bbb);

        if (numerator.compareTo(denominator) < 0) {
            NumberExpander.expandNumber(aaa, wordRelation);
            NumberExpander.expandOrdinal(bbb, wordRelation);
            if (numerator.compareTo(BigInteger.ONE) > 0) {
                wordRelation.addWord("'s");
            }
        } else {
            NumberExpander.expandNumber(aaa, wordRelation);
            wordRelation.addWord("slash");
            NumberExpander.expandNumber(bbb, wordRelation);
        }
    }

    /**
     * Convert the given dashed string (e.g. "aaa-bbb") into (word) Items in
     * the WordRelation. The string is split at the first dash; two digit
     * strings are read as a range ("aaa to bbb").
     *
     * @param tokenVal the dashed string
     */
    private void dashToWords(String tokenVal) {
        int index = tokenVal.indexOf('-');
        String aaa = tokenVal.substring(0, index);
        String bbb = tokenVal.substring(index + 1);

        if (matches(digitsPattern, aaa) && matches(digitsPattern, bbb)) {
            FeatureSet featureSet = tokenItem.getFeatures();
            String realName = featureSet.getString("name");
            try {
                featureSet.setString("name", aaa);
                tokenToWords(aaa);
                wordRelation.addWord("to");
                featureSet.setString("name", bbb);
                tokenToWords(bbb);
            } finally {
                // put the real name back: the following tokens inspect it
                // through "p.name" and "p.p.name" and take its first
                // character
                featureSet.setString("name", realName);
            }
        } else {
            tokenToWords(aaa);
            tokenToWords(bbb);
        }
    }

    /**
     * Convert the given string (which does not only consist of alphabet) into
     * (word) Items in the WordRelation. The string is split after the first
     * splittable position and both halves are converted separately; if
     * there is no such position the lower-cased string is added as a word.
     *
     * @param tokenVal the string
     */
    private void notJustAlphasToWords(String tokenVal) {
        /* its not just alphas */
        int lastIndex = tokenVal.length() - 1;
        int index = 0;
        while (index < lastIndex && !isTextSplitable(tokenVal, index)) {
            index++;
        }

        if (index == lastIndex) {
            wordRelation.addWord(lower(tokenVal));
            return;
        }

        String head = tokenVal.substring(0, index + 1);
        String tail = tokenVal.substring(index + 1);

        // digit runs of the parts are then expanded as identifiers
        tokenItem.getFeatures().setString("nsw", "nide");
        tokenToWords(head);
        tokenToWords(tail);
    }

    /**
     * Returns true if the given word is pronounceable. This method is
     * originally called us_aswd() in Flite 1.1.
     *
     * @param word the word to test
     *
     * @return true if the word is pronounceable, false otherwise
     */
    public boolean isPronounceable(String word) {
        String lcWord = lower(word);
        return prefixFSM.accept(lcWord) && suffixFSM.accept(lcWord);
    }

    /**
     * Returns true if the given token is the name of a US state. If it is, it
     * will add the name of the state to (word) Items in the WordRelation.
     * Abbreviations marked as ambiguous are only expanded when the
     * surrounding tokens make a state name likely.
     *
     * @param tokenVal the token string
     *
     * @return true if the state name was added, false otherwise
     */
    private boolean isStateName(String tokenVal) {
        String[] state = usStatesMap.get(tokenVal);
        if (state == null) {
            return false;
        }

        // check to see if the state initials are ambiguous
        // in the English language
        if (state[1].equals("ambiguous") && !isInStateContext()) {
            return false;
        }

        for (int j = 2; j < state.length; j++) {
            wordRelation.addWord(state[j]);
        }
        return true;
    }

    /**
     * Returns true if the tokens around the current token suggest that the
     * current token is a state abbreviation, as in "Austin, TX 78701".
     */
    private boolean isInStateContext() {
        String previous = (String) tokenItem.findFeature("p.name");
        String next = (String) tokenItem.findFeature("n.name");

        // check if the previous word starts with a capital letter,
        // is at least 3 letters long, is an alphabet sequence,
        // and has a comma.
        boolean previousIsCity = Character.isUpperCase(firstChar(previous))
                && previous.length() > 2
                && matches(alphabetPattern, previous)
                && ",".equals(tokenItem.findFeature("p.punc"));
        if (!previousIsCity) {
            return false;
        }

        // check if next token starts with a lower case, or
        // this is the end of sentence, or if next token
        // is a period (".") or a zip code (5 or 10 digits).
        int nextLength = next == null ? 0 : next.length();
        return Character.isLowerCase(firstChar(next))
                || tokenItem.getNext() == null
                || ".".equals(tokenItem.getFeatures().getString("punc"))
                || ((nextLength == 5 || nextLength == 10)
                        && matches(digitsPattern, next));
    }

    /**
     * Determines if the given input matches the given Pattern.
     *
     * @param pattern the pattern to match
     * @param input the string to test, may be null
     *
     * @return <code>true</code> if the input string matches the given Pattern;
     *         <code>false</code> otherwise, including for a null input
     */
    private static boolean matches(Pattern pattern, String input) {
        if (input == null) {
            return false;
        }
        Matcher m = pattern.matcher(input);
        return m.matches();
    }

    /**
     * Determines if the given input text may be split between the character
     * at the given position and the following character. This is the case
     * only if:
     * <p>
     * 1) neither of the two characters is a letter, and
     * <p>
     * 2) neither of the two characters is an apostrophe, and
     * <p>
     * 3) the two characters are not both digits.
     *
     * @param text the text containing the character of interest
     * @param index the index of the character of interest; index + 1 must
     *        be a valid position as well
     *
     * @return true if the position of the given text is splittable false
     *         otherwise
     */
    private static boolean isTextSplitable(String text, int index) {
        char c0 = text.charAt(index);
        char c1 = text.charAt(index + 1);

        if (Character.isLetter(c0) || Character.isLetter(c1)) {
            return false;
        }
        if (c0 == '\'' || c1 == '\'') {
            return false;
        }
        return !(Character.isDigit(c0) && Character.isDigit(c1));
    }
}