Preprocess.java example

Explorer
marytts-master
package marytts.language.en;

import java.io.IOException;
import java.text.ParseException;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import marytts.datatypes.MaryData;
import marytts.datatypes.MaryDataType;
import marytts.datatypes.MaryXML;
import marytts.exceptions.MaryConfigurationException;
import marytts.modules.InternalModule;
import marytts.util.MaryRuntimeUtils;
import marytts.util.dom.MaryDomUtils;
import marytts.util.dom.NameNodeFilter;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.traversal.DocumentTraversal;
import org.w3c.dom.traversal.NodeFilter;
import org.w3c.dom.traversal.TreeWalker;

import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.ULocale.Category;
import com.ibm.icu.text.DateFormat;
import com.ibm.icu.text.RuleBasedNumberFormat;
import com.google.common.base.Joiner;
import com.google.common.collect.Lists;

/**
 * @author Tristan Hamilton
 *
 *         <p>
 *         Can process following formats:
 *         <ul>
 *         <li>cardinal (handled by real number)
 *         <li>ordinal
 *         <li>year (as a 4 digit number or any number followed by AD/BC variation)
 *         <li>currency
 *         <li>numberandword together
 *         <li>dashes (read each number singly) or (split into two words)
 *         <li>underscores
 *         <li>decimal point, minus symbol (real numbers) also handles %, however Jtokeniser splits % into separate
 *         tokens
 *         <li>time
 *         <li>dates (in format mm/dd/yyyy)
 *         <li>acronyms (only split into single characters, never expanded)
 *         <li>abbreviations (list of known expansions in resource preprocess/abbrev.dat, a properties file separated by
 *         whitespace. If an abbrev has two different expansions then the capitalized version comes first, followed by a comma)
 *         <li>contractions → first check lexicon, if not then → split and check if map contains contraction, if not
 *         then just remove apostrophe else → split before apostrophe into two tokens, use map to manually add ph → for
 *         's if word ends in c,f,k,p,t then add ph = s otherwise ph = z
 *         <li>ampersand &, "at" @ symbol, → symbols
 *         <li>urls → note that jtokeniser splits off http[s]?://
 *         <li>number ranges "18-35"
 *         <li>words without vowels → first check lexicon, if not then separate into single character tokens
 *         <li>#hashtags
 *         <li>single "A/a" character → if there is no next token or the next token is punctuation or next token
 *         string.length == 1
 *         <li>as a last processing attempt, tokens are also split by punctuation, symbols, etc. and the resulting pieces are
 *         processed separately
 *         <li>durations hours:minutes:seconds(:milliseconds)
 *         <li>numbers followed by an s
 *         <li>punctuation → add ph attribute to tag to prevent phonemisation
 *         </ul>
 *         <p>
 *         May include:
 *         <ul>
 *         <li>roman numerals
 *         </ul>
 */
public class Preprocess extends InternalModule {

	// abbreviations map: lower-cased abbreviation -> expansion; filled in the constructor via loadAbbrevMap()
	private Map<Object, Object> abbrevMap;

	// symbols map: single symbol character -> spoken replacement (populated in the static initializer)
	private static final Map<String, String> symbols;

	// contractions map: contraction suffix including the apostrophe -> candidate "ph" values
	// (populated in the static initializer)
	private static final Map<String, String[]> contractions;

	// icu4j stuff
	// number speller shared by expandNumber/expandOrdinal/expandYear; each call switches its default rule set
	private RuleBasedNumberFormat rbnf;
	// rule set names handed to rbnf.setDefaultRuleSet(...) for cardinals, ordinals and years
	protected final String cardinalRule;
	protected final String ordinalRule;
	protected final String yearRule;
	// long English date format used by expandDate
	private DateFormat df;

	// Regex matching patterns (all compiled once in the static initializer)
	private static final Pattern moneyPattern;
	private static final Pattern timePattern;
	private static final Pattern durationPattern;
	private static final Pattern abbrevPattern;
	private static final Pattern acronymPattern;
	private static final Pattern realNumPattern;
	private static final Pattern numberWordPattern;
	private static final Pattern datePattern;
	private static final Pattern yearPattern;
	private static final Pattern contractPattern;
	private static final Pattern symbolsPattern;
	private static final Pattern URLPattern;
	private static final Pattern rangePattern;
	private static final Pattern consonantPattern;
	private static final Pattern punctuationPattern;
	private static final Pattern myPunctPattern;
	private static final Pattern hashtagPattern;
	private static final Pattern ordinalPattern;
	private static final Pattern currencySymbPattern;
	private static final Pattern numberSPattern;

	// Regex initialization: every pattern is compiled exactly once when the class is loaded.
	static {
		// integer part (group 1) and optional fractional part including the dot (group 2)
		moneyPattern = Pattern.compile("(\\d+)(\\.\\d+)?");
		currencySymbPattern = Pattern.compile("[$£€]");
		// groups 2-5 are the hour alternatives (0-9, 10-11, 12-19, 20-23), group 6 the minutes,
		// group 7 the optional am/pm suffix
		timePattern = Pattern.compile(
				"((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3])):([0-5][0-9])(a\\.m\\.|am|pm|p\\.m\\.|a\\.m|p\\.m)?",
				Pattern.CASE_INSENSITIVE);
		yearPattern = Pattern.compile("(\\d+)(bc|ad|b\\.c\\.|b\\.c|a\\.d\\.|a\\.d)", Pattern.CASE_INSENSITIVE);
		ordinalPattern = Pattern.compile("\\d+(st|nd|rd|th)", Pattern.CASE_INSENSITIVE);
		durationPattern = Pattern.compile("(\\d+):([0-5][0-9]):([0-5][0-9])(:([0-5][0-9]))?");
		abbrevPattern = Pattern.compile("[a-zA-Z]{2,}\\.");
		acronymPattern = Pattern.compile("([a-zA-Z]\\.[a-zA-Z](\\.)?)+([a-zA-Z](\\.)?)?");
		realNumPattern = Pattern.compile("(-)?(\\d+)?(\\.(\\d+)(%)?)?");
		numberWordPattern = Pattern.compile("([a-zA-Z]+[0-9]+|[0-9]+[a-zA-Z]+)\\w*");
		datePattern = Pattern.compile("(\\d{2})[\\/\\.](\\d{2})[\\/\\.]\\d{4}");
		contractPattern = Pattern.compile("[a-zA-Z]+('[a-zA-Z]+)");
		symbolsPattern = Pattern.compile("[@%#\\/\\+=&><-]");
		rangePattern = Pattern.compile("([0-9]+)-([0-9]+)");
		consonantPattern = Pattern.compile("[b-df-hj-np-tv-z]+", Pattern.CASE_INSENSITIVE);
		punctuationPattern = Pattern.compile("\\p{Punct}");
		numberSPattern = Pattern.compile("([0-9]+)([sS])");
		// Character class of sentence punctuation. Without the surrounding brackets the expression only
		// matched the literal character sequence ,.:;'" and therefore never matched a punctuation token.
		myPunctPattern = Pattern.compile("[,\\.:;?'\"]");
		hashtagPattern = Pattern.compile("(#)(\\w+)");
		// group 1 is the optional scheme, group 2 the remainder that gets spoken
		URLPattern = Pattern
				.compile("(https?:\\/\\/)?((www\\.)?([-a-zA-Z0-9@:%._\\\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\\\+.~#?&\\/=]*)))");
	}

	// HashMap initialization
	static {
		contractions = new HashMap<String, String[]>();
		contractions.put("'s", new String[] { "z", "s" });
		contractions.put("'ll", new String[] { "l" });
		contractions.put("'ve", new String[] { "v" });
		contractions.put("'d", new String[] { "d" });
		contractions.put("'m", new String[] { "m" });
		contractions.put("'re", new String[] { "r" });

		symbols = new HashMap<String, String>();
		symbols.put("@", "at");
		symbols.put("#", "hashtag");
		symbols.put("/", "forward slash");
		symbols.put("%", "per cent");
		symbols.put("+", "plus");
		symbols.put("-", "minus");
		symbols.put("=", "equals");
		symbols.put(">", "greater than");
		symbols.put("<", "less than");
		symbols.put("&", "and");
	}

	public Preprocess() {
		super("Preprocess", MaryDataType.TOKENS, MaryDataType.WORDS, Locale.ENGLISH);
		this.rbnf = new RuleBasedNumberFormat(ULocale.ENGLISH, RuleBasedNumberFormat.SPELLOUT);
		this.cardinalRule = "%spellout-numbering";
		this.ordinalRule = getOrdinalRuleName(rbnf);
		this.yearRule = getYearRuleName(rbnf);
		this.df = DateFormat.getDateInstance(DateFormat.LONG, ULocale.ENGLISH);
		try {
			this.abbrevMap = loadAbbrevMap();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Expands the tokens of the input document in place and wraps that same document in a new MaryData of
	 * this module's output type.
	 *
	 * @param d
	 *            input data whose document holds the tokens
	 * @return new MaryData carrying the expanded document and the locale of {@code d}
	 * @throws Exception
	 *             if the expansion fails
	 */
	public MaryData process(MaryData d) throws Exception {
		final Document tokenDocument = d.getDocument();
		expand(tokenDocument);
		final MaryData expanded = new MaryData(getOutputType(), d.getLocale());
		expanded.setDocument(tokenDocument);
		return expanded;
	}

	/**
	 * processes a document in mary xml format, from Tokens to Words which can be phonemised.
	 * <p>
	 * Walks over every token element, skips tokens that sit below a say-as element or already carry a
	 * "ph" or "sounds_like" attribute, and applies the first matching expansion rule of the if/else chain
	 * below. When the text of a token was changed, the token is enclosed in an MTU carrying the original text
	 * and the expanded text is split at whitespace and dashes into separate tokens.
	 *
	 * @param doc
	 *            doc
	 * @throws ParseException
	 *             parse exception
	 * @throws IOException
	 *             IO Exception
	 * @throws MaryConfigurationException
	 *             mary configuration exception
	 */
	protected void expand(Document doc) throws ParseException, IOException, MaryConfigurationException {
		// last currency symbol token seen; cleared again once a real number token has been expanded as money
		String whichCurrency = "";
		// set when stepping the walker back returned null; makes the next iteration step back once more
		boolean URLFirst = false;
		boolean isYear;
		// true from a URL match until every '.' of the remembered URL text has been replaced by "dot"
		boolean isURL = false;
		// set when the current token was split at punctuation / at dashes or underscores
		boolean puncSplit = false;
		boolean dashSplit = false;
		// copy of the URL token text, used to count down its remaining dots
		String webEmailTemp = "";
		// never set to true at the moment: the only assignment is inside the commented-out block below
		boolean splitContraction;
		TreeWalker tw = ((DocumentTraversal) doc).createTreeWalker(doc, NodeFilter.SHOW_ELEMENT,
				new NameNodeFilter(MaryXML.TOKEN), false);
		Element t = null;

		// loop through each node in dom tree
		while ((t = (Element) tw.nextNode()) != null) {

			/*
			 * PRELIM FOR EACH NODE
			 */

			// to accommodate the first token being a url
			if (URLFirst) {
				t = (Element) tw.previousNode();
				URLFirst = false;
			}
			isYear = true;
			splitContraction = false;

			if (MaryDomUtils.hasAncestor(t, MaryXML.SAYAS) || t.hasAttribute("ph") || t.hasAttribute("sounds_like")) {
				// if token already has any of these attributes then ignore
				continue;
			}

			// save the original token text
			String origText = MaryDomUtils.tokenText(t);

			// remove commas
			// NOTE(review): inside the character class the '|' is a literal, so a leading '|' is accepted too
			if (MaryDomUtils.tokenText(t).matches("[\\$|£|€]?\\d+,[\\d,]+")) {
				MaryDomUtils.setTokenText(t, MaryDomUtils.tokenText(t).replaceAll(",", ""));
				// presume that a 4 digit number which had commas is not a year
				if (MaryDomUtils.tokenText(t).matches("\\d{4}")) {
					isYear = false;
				}
			}
			// isYear extra check: a 4 digit number following a currency symbol is an amount, not a year
			if (MaryDomUtils.tokenText(t).matches("\\d{4}") && !whichCurrency.equals("")) {
				isYear = false;
			}

			// check if currency
			if (MaryDomUtils.tokenText(t).matches(currencySymbPattern.pattern())) {
				whichCurrency = MaryDomUtils.tokenText(t);
			}

			/*
			 * ACTUAL PROCESSING
			 *
			 * NOTE(review): String.matches(pattern.pattern()) recompiles each expression for every token; the
			 * precompiled Pattern objects could be used through matcher(...).matches() instead.
			 */

			// ordinal
			if (MaryDomUtils.tokenText(t).matches("(?i)" + ordinalPattern.pattern())) {
				String matched = MaryDomUtils.tokenText(t).split("(?i)st|nd|rd|th")[0];
				MaryDomUtils.setTokenText(t, expandOrdinal(Double.parseDouble(matched)));
				// single a or A character
			} else if (MaryDomUtils.tokenText(t).matches("[aA]")) {
				Element checkNextNode = MaryDomUtils.getNextSiblingElement((Element) t);
				if (checkNextNode == null || MaryDomUtils.tokenText(checkNextNode).matches(myPunctPattern.pattern())
						|| MaryDomUtils.tokenText(checkNextNode).length() == 1) {
					MaryDomUtils.setTokenText(t, "_a");
				}
				// date
			} else if (MaryDomUtils.tokenText(t).matches(datePattern.pattern())) {
				MaryDomUtils.setTokenText(t, expandDate(MaryDomUtils.tokenText(t)));
				// number followed by s
			} else if (MaryDomUtils.tokenText(t).matches(numberSPattern.pattern())) {
				MaryDomUtils.setTokenText(t, expandNumberS(MaryDomUtils.tokenText(t)));
				// year with bc or ad
			} else if (MaryDomUtils.tokenText(t).matches("(?i)" + yearPattern.pattern())) {
				MaryDomUtils.setTokenText(t, expandYearBCAD(MaryDomUtils.tokenText(t)));
				// year as just 4 digits -> this should always be checked BEFORE real number
			} else if (MaryDomUtils.tokenText(t).matches("\\d{4}") && isYear == true) {
				MaryDomUtils.setTokenText(t, expandYear(Double.parseDouble(MaryDomUtils.tokenText(t))));
				// wordAndNumber -> must come AFTER year
			} else if (MaryDomUtils.tokenText(t).matches(numberWordPattern.pattern())) {
				MaryDomUtils.setTokenText(t, expandWordNumber(MaryDomUtils.tokenText(t)));
				// real number & currency
			} else if (MaryDomUtils.tokenText(t).matches(realNumPattern.pattern())) {
				if (!whichCurrency.equals("")) {
					MaryDomUtils.setTokenText(t, expandMoney(MaryDomUtils.tokenText(t), whichCurrency));
					whichCurrency = "";
				} else {
					MaryDomUtils.setTokenText(t, expandRealNumber(MaryDomUtils.tokenText(t)));
				}
				// contractions
			} else if (MaryDomUtils.tokenText(t).matches(contractPattern.pattern())) {
				// first check lexicon
				if (MaryRuntimeUtils.checkLexicon("en_US", MaryDomUtils.tokenText(t)).length == 0) {
					Matcher contractionMatch = contractPattern.matcher(MaryDomUtils.tokenText(t));
					contractionMatch.find();
					// if no contraction we allow g2p rules to handle
					if (!contractions.containsKey(contractionMatch.group(1))) {
						MaryDomUtils.setTokenText(t, MaryDomUtils.tokenText(t).replaceAll("'", ""));
					}

					// FIXME: we do not want to have to phonological word => for now we do not split !
					// // if not in lexicon and we have a contraction expansion then split into two tokens
					// else
					// {
					// splitContraction = true;
					// MaryDomUtils.setTokenText(t, splitContraction(MaryDomUtils.tokenText(t)));
					// }
				}
				// acronym
			} else if (MaryDomUtils.tokenText(t).matches(acronymPattern.pattern())) {
				MaryDomUtils.setTokenText(t, expandAcronym(MaryDomUtils.tokenText(t)));
				// abbreviation
				// NOTE(review): abbrevMap is only assigned when loadAbbrevMap() succeeded in the constructor;
				// if loading failed this dereference throws a NullPointerException
			} else if ((MaryDomUtils.tokenText(t).matches(abbrevPattern.pattern()) || this.abbrevMap.containsKey(MaryDomUtils
					.tokenText(t).toLowerCase())) && !isURL) {
				Element testAbbNode = MaryDomUtils.getNextSiblingElement((Element) t);
				boolean nextTokenIsCapital = false;
				// NOTE(review): charAt(0) throws if the following token has empty text
				if (testAbbNode != null && Character.isUpperCase(MaryDomUtils.tokenText(testAbbNode).charAt(0))) {
					nextTokenIsCapital = true;
				}
				MaryDomUtils.setTokenText(t, expandAbbreviation(MaryDomUtils.tokenText(t), nextTokenIsCapital));
				// time
			} else if (MaryDomUtils.tokenText(t).matches("(?i)" + timePattern.pattern())) {
				Element testTimeNode = MaryDomUtils.getNextSiblingElement((Element) t);
				boolean nextTokenIsTime = false;
				// when the am/pm marker is its own following token, expandTime must not add one itself
				if (testTimeNode != null && MaryDomUtils.tokenText(testTimeNode).matches("a\\.m\\.|AM|PM|am|pm|p\\.m\\.")) {
					nextTokenIsTime = true;
				}
				MaryDomUtils.setTokenText(t, expandTime(MaryDomUtils.tokenText(t), nextTokenIsTime));
				// duration
			} else if (MaryDomUtils.tokenText(t).matches(durationPattern.pattern())) {
				MaryDomUtils.setTokenText(t, expandDuration(MaryDomUtils.tokenText(t)));
				// hashtags
			} else if (MaryDomUtils.tokenText(t).matches(hashtagPattern.pattern())) {
				MaryDomUtils.setTokenText(t, expandHashtag(MaryDomUtils.tokenText(t)));
				// URLs
			} else if (MaryDomUtils.tokenText(t).matches(URLPattern.pattern())) {
				// matching group 2 contains the chunk we want
				Matcher urlMatcher = URLPattern.matcher(MaryDomUtils.tokenText(t));
				urlMatcher.find();
				webEmailTemp = MaryDomUtils.tokenText(t);
				isURL = true;
				MaryDomUtils.setTokenText(t, expandURL(urlMatcher.group(2)));
				// dot . for web and email addresses
			} else if (MaryDomUtils.tokenText(t).equals(".") && isURL) {
				MaryDomUtils.setTokenText(t, "dot");
				// consume one dot of the remembered URL; once none is left the URL has been fully spoken
				webEmailTemp = webEmailTemp.replaceFirst("\\.", "dot");
				if (!webEmailTemp.contains(".")) {
					isURL = false;
				}
				// symbols
			} else if (MaryDomUtils.tokenText(t).matches(symbolsPattern.pattern())) {
				MaryDomUtils.setTokenText(t, symbols.get(MaryDomUtils.tokenText(t)));
				// number ranges -> before checking for dashes
			} else if (MaryDomUtils.tokenText(t).matches(rangePattern.pattern())) {
				MaryDomUtils.setTokenText(t, expandRange(MaryDomUtils.tokenText(t)));
				// dashes and underscores
			} else if (MaryDomUtils.tokenText(t).contains("-") || MaryDomUtils.tokenText(t).contains("_")) {
				dashSplit = true;
				String[] tokens = MaryDomUtils.tokenText(t).split("[-_]");
				int i = 0;
				for (String tok : tokens) {
					// purely numeric parts are read digit by digit
					if (tok.matches("\\d+")) {
						String newTok = "";
						for (char c : tok.toCharArray()) {
							newTok += expandNumber(Double.parseDouble(String.valueOf(c))) + " ";
						}
						tokens[i] = newTok;
					}
					i++;
				}
				// Arrays.toString + removal of ",[]" joins the parts with single spaces
				MaryDomUtils.setTokenText(t, Arrays.toString(tokens).replaceAll("[,\\]\\[]", ""));
				// words containing only consonants
			} else if (MaryDomUtils.tokenText(t).matches("(?i)" + consonantPattern.pattern())) {
				// first check lexicon
				if (MaryRuntimeUtils.checkLexicon("en_US", MaryDomUtils.tokenText(t)).length == 0) {
					MaryDomUtils.setTokenText(t, expandConsonants(MaryDomUtils.tokenText(t)));
				}
				// a final attempt to split by punctuation
			} else if (punctuationPattern.matcher(MaryDomUtils.tokenText(t)).find() && MaryDomUtils.tokenText(t).length() > 1) {
				puncSplit = true;
				String[] puncTokens = MaryDomUtils.tokenText(t).split("((?<=\\p{Punct})|(?=\\p{Punct}))");
				MaryDomUtils.setTokenText(t, Arrays.toString(puncTokens).replaceAll("[,\\]\\[]", ""));
				// FIXME: skip quotes for now as we don't have any clever management of the POS for the prosodic feature
			} else if (MaryDomUtils.tokenText(t).equals("\"")) {
			} else if (MaryDomUtils.tokenText(t).matches(punctuationPattern.pattern())) {
				// single punctuation character: only tag it with pos="."
				t.setAttribute("pos", ".");
			}
			// if token isn't ignored but there is no handling rule don't add MTU
			if (!origText.equals(MaryDomUtils.tokenText(t))) {
				MaryDomUtils.encloseWithMTU(t, origText, null);
				// finally, split new expanded token separated by spaces into separate tokens (also catch any leftover dashes)
				String[] newTokens = MaryDomUtils.tokenText(t).replaceAll("-", " ").split("\\s+");
				MaryDomUtils.setTokenText(t, newTokens[0]);
				for (int i = 1; i < newTokens.length; i++) {
					MaryDomUtils.appendToken(t, newTokens[i]);
					t = MaryDomUtils.getNextSiblingElement((Element) t);
					// if tokens are an expanded contraction
					// (currently unreachable because splitContraction is never set to true)
					if (splitContraction && newTokens.length == 2) {
						if (newTokens[0].substring(newTokens[0].length() - 1).matches("[cfkpt]")
								&& contractions.get(newTokens[i]).length > 1) {
							t.setAttribute("ph", contractions.get(newTokens[i])[1]);
						} else {
							t.setAttribute("ph", contractions.get(newTokens[i])[0]);
						}
					}
				}
				// if expanded url or punctuation go over each node, otherwise let TreeWalker catch up
				if (!isURL && !puncSplit && !dashSplit) {
					// continue after the last token that was just appended
					tw.setCurrentNode((Node) t);
				} else {
					// step back so that the newly created tokens are visited by the following iterations
					Node n = tw.previousNode();
					// if the first node in doc is an email or web address, account for this
					if (n == null) {
						URLFirst = true;
					}
					puncSplit = false;
					dashSplit = false;
				}
			}
		}
	}

	/**
	 * Spells out a number using the cardinal rule set.
	 *
	 * @param number
	 *            the value to spell out
	 * @return the text produced by the number formatter
	 */
	protected String expandNumber(double number) {
		rbnf.setDefaultRuleSet(cardinalRule);
		final String spoken = rbnf.format(number);
		return spoken;
	}

	/**
	 * Spells out a number using the ordinal rule set.
	 *
	 * @param number
	 *            the value to spell out
	 * @return the text produced by the number formatter
	 */
	protected String expandOrdinal(double number) {
		rbnf.setDefaultRuleSet(ordinalRule);
		final String spoken = rbnf.format(number);
		return spoken;
	}

	/**
	 * Spells out a number using the year rule set.
	 *
	 * @param number
	 *            the value to spell out
	 * @return the text produced by the number formatter
	 */
	protected String expandYear(double number) {
		rbnf.setDefaultRuleSet(yearRule);
		final String spoken = rbnf.format(number);
		return spoken;
	}

	/**
	 * Spells out a duration of the form hours:minutes:seconds with an optional fourth field that is read as
	 * milliseconds. The word "and" is placed in front of whichever field comes last.
	 *
	 * @param duration
	 *            the token text to expand
	 * @return the spoken form; it ends with a space
	 */
	protected String expandDuration(String duration) {
		final Matcher fields = durationPattern.matcher(duration);
		fields.find();
		final StringBuilder spoken = new StringBuilder();
		spoken.append(expandNumber(Double.parseDouble(fields.group(1)))).append(" hours ");
		spoken.append(expandNumber(Double.parseDouble(fields.group(2)))).append(" minutes ");
		final String seconds = expandNumber(Double.parseDouble(fields.group(3))) + " seconds ";
		if (fields.group(4) == null) {
			// no fourth field: the seconds are the final item
			spoken.append("and ").append(seconds);
		} else {
			spoken.append(seconds);
			spoken.append("and ").append(expandNumber(Double.parseDouble(fields.group(5)))).append(" milliseconds ");
		}
		return spoken.toString();
	}

	/**
	 * Turns every dot of an acronym into a space so that the letters become separate words.
	 *
	 * @param acronym
	 *            the token text to expand
	 * @return the acronym with each '.' replaced by ' '
	 */
	protected String expandAcronym(String acronym) {
		final String spaced = acronym.replace('.', ' ');
		return spaced;
	}

	/**
	 * Partially expands a URL or email string: the text is cut immediately before and after every '@', '/'
	 * and '.' (the symbols themselves are kept as pieces) and the pieces are joined with single spaces.
	 * Commas and square brackets are removed from the result.
	 *
	 * @param email
	 *            the URL or email text to split
	 * @return the space separated pieces
	 */
	protected String expandURL(String email) {
		final String[] pieces = email.split("((?<=[\\.@\\/])|(?=[\\.@\\/]))");
		final String listed = Arrays.toString(pieces);
		// stripping the list punctuation of Arrays.toString leaves the pieces separated by spaces
		return listed.replaceAll("[,\\]\\[]", "");
	}

	/**
	 * Expands a year that is directly followed by an era marker (bc/ad, with or without dots). The number is
	 * spelled out as a year and the letters of the marker are separated by spaces.
	 *
	 * @param year
	 *            the token text to expand
	 * @return the spoken year followed by the spaced era letters
	 */
	protected String expandYearBCAD(String year) {
		final Matcher parts = yearPattern.matcher(year);
		parts.find();
		final String era = parts.group(2);
		final String spokenEra;
		if (!era.contains(".")) {
			spokenEra = expandConsonants(era);
		} else {
			// dotted form: the dots already delimit the letters
			spokenEra = Arrays.toString(era.split("\\.")).replaceAll("[,\\]\\[]", "");
		}
		final String spokenYear = expandYear(Double.parseDouble(parts.group(1)));
		return spokenYear + " " + spokenEra;
	}

	/**
	 * Puts a single space between each pair of adjacent characters of a string.
	 *
	 * @param consonants
	 *            the text whose characters are to be separated
	 * @return the characters of {@code consonants} separated by single spaces
	 */
	protected String expandConsonants(String consonants) {
		final StringBuilder spaced = new StringBuilder();
		for (int i = 0; i < consonants.length(); i++) {
			if (i > 0) {
				spaced.append(" ");
			}
			spaced.append(consonants.charAt(i));
		}
		return spaced.toString();
	}

	/**
	 * Expands a hashtag: the '#' is replaced by its spoken symbol and the tag is split into words at
	 * letter/digit boundaries and in front of an upper case letter that does not follow another upper case
	 * letter (camel case).
	 * <p>
	 * Tags made up only of lower case or only of upper case letters are passed through unchanged. The
	 * previous guard ({@code !lower || !upper}) was always true, which left that shortcut unreachable.
	 *
	 * @param hashtag
	 *            the token text to expand, e.g. "#someTag2015"
	 * @return the spoken symbol, a space and the split tag
	 */
	protected String expandHashtag(String hashtag) {
		Matcher hashTagMatcher = hashtagPattern.matcher(hashtag);
		hashTagMatcher.find();
		String tag = hashTagMatcher.group(2);
		String expandedTag;
		if (tag.matches("[a-z]+") || tag.matches("[A-Z]+")) {
			// single-case tag: nothing to split
			expandedTag = tag;
		} else {
			StringBuilder words = new StringBuilder();
			for (char c : tag.toCharArray()) {
				boolean atStart = words.length() == 0;
				char previous = atStart ? '\0' : words.charAt(words.length() - 1);
				boolean startsNewWord;
				if (Character.isDigit(c)) {
					// digits stay together; a digit after anything else opens a new word
					startsNewWord = !atStart && !Character.isDigit(previous);
				} else if (!atStart && Character.isUpperCase(c)) {
					// runs of capitals stay together; a capital after anything else opens a new word
					startsNewWord = !Character.isUpperCase(previous);
				} else if (!atStart && Character.isAlphabetic(c)) {
					// a letter directly after a digit opens a new word
					startsNewWord = Character.isDigit(previous);
				} else {
					startsNewWord = false;
				}
				if (startsNewWord) {
					words.append(' ');
				}
				words.append(c);
			}
			expandedTag = words.toString();
		}
		return symbols.get(hashTagMatcher.group(1)) + " " + expandedTag;
	}

	/**
	 * Expands a numeric range such as "18-35" into both numbers spelled out with " to " between them.
	 *
	 * @param range
	 *            the token text to expand
	 * @return the spoken range
	 */
	protected String expandRange(String range) {
		final Matcher bounds = rangePattern.matcher(range);
		bounds.find();
		final String from = expandNumber(Double.parseDouble(bounds.group(1)));
		final String to = expandNumber(Double.parseDouble(bounds.group(2)));
		return from + " to " + to;
	}

	/**
	 * expands a digit followed by an s. e.g. 7s and 8s and the 60s
	 * <p>
	 * The number is spelled out and turned into a plural: a trailing "x" gets "es" (six -> sixes), a
	 * trailing "y" becomes "ies" (sixty -> sixties), anything else gets "s". Only the final letter is
	 * rewritten, so a 'y' earlier in the spelled-out number is left alone.
	 *
	 * @param numberS
	 *            numberS
	 * @return the plural of the spelled-out number
	 */
	protected String expandNumberS(String numberS) {
		Matcher numberSMatcher = numberSPattern.matcher(numberS);
		numberSMatcher.find();
		String number = expandNumber(Double.parseDouble(numberSMatcher.group(1)));
		if (number.endsWith("x")) {
			number += "es";
		} else if (number.endsWith("y")) {
			// replace the trailing y only
			number = number.substring(0, number.length() - 1) + "ies";
		} else {
			number += "s";
		}
		return number;
	}

	/**
	 * Inserts a space in front of the first apostrophe of a contraction, e.g. "it's" becomes "it 's".
	 *
	 * @param contraction
	 *            text containing an apostrophe
	 * @return the part before the apostrophe, a space, and the rest starting at the apostrophe
	 */
	protected String splitContraction(String contraction) {
		final int cut = contraction.indexOf('\'');
		final String stem = contraction.substring(0, cut);
		return stem + " " + contraction.substring(cut);
	}

	/**
	 * Looks up an abbreviation (dots removed, lower-cased) in the abbreviation map. An entry may hold two
	 * comma separated expansions; in that case the first is used when the following token is capitalized
	 * and the second otherwise. Unknown abbreviations are logged and returned unchanged.
	 *
	 * @param abbrev
	 *            the token to be expanded
	 * @param isCapital
	 *            whether the following token begins with a capital letter
	 * @return the expansion, or {@code abbrev} itself when it is not in the map
	 */
	protected String expandAbbreviation(String abbrev, boolean isCapital) {
		final String key = abbrev.replaceAll("\\.", "").toLowerCase();
		if (abbrevMap.containsKey(key)) {
			final String expansion = (String) abbrevMap.get(key);
			final String[] variants = expansion.split(",");
			if (variants.length <= 1) {
				return expansion;
			}
			return isCapital ? variants[0] : variants[1];
		}
		logger.warn(String.format("Could not expand unknown abbreviation \"%s\", ignoring", abbrev));
		return abbrev;
	}

	/**
	 * Expands a numeric date. The text is parsed, rendered with the long English date format, and the second
	 * and third whitespace separated parts of that rendering are spelled out as an ordinal and as a year.
	 *
	 * @param date
	 *            the token text to expand; read as month, day, year
	 * @return the spoken date
	 * @throws ParseException
	 *             if the text cannot be parsed as a date
	 */
	protected String expandDate(String date) throws ParseException {
		// date format is "month/day/year"
		final Date parsed = df.getPatternInstance("MM.dd.yyyy", ULocale.ENGLISH).parse(date);
		final String longForm = df.format(parsed);
		final String[] parts = longForm.replaceAll(",", "").split("\\s");
		parts[1] = expandOrdinal(Double.parseDouble(parts[1]));
		parts[2] = expandYear(Double.parseDouble(parts[2]));
		final String listed = Arrays.toString(parts);
		return listed.replaceAll("[,\\]\\[]", "");
	}

	/**
	 * Expands a clock time such as "9:05", "0:30" or "14:30pm" on a twelve hour basis.
	 * <p>
	 * Hours 13-23 are reduced by twelve, and hour zero (written "0" or "00") as well as hour 12 are spoken
	 * "twelve". Minutes "00" are omitted and minutes below ten are preceded by "oh". Unless the following
	 * token already is an am/pm marker, the marker attached to the token is spelled letter by letter, or,
	 * if there is none, " a m" (hours 0-11) or " p m" (hours 12-23) is appended.
	 *
	 * @param time
	 *            the token to be expanded
	 * @param isNextTokenTime
	 *            whether the following token contains am or pm
	 * @return the spoken time
	 */
	protected String expandTime(String time, boolean isNextTokenTime) {
		Matcher timeMatch = timePattern.matcher(time);
		timeMatch.find();
		StringBuilder theTime = new StringBuilder();
		boolean pastNoon;
		// hour: groups 2 and 3 cover 0-11, groups 4 and 5 cover 12-23
		if (timeMatch.group(2) != null || timeMatch.group(3) != null) {
			pastNoon = false;
			String hour = (timeMatch.group(2) != null) ? timeMatch.group(2) : timeMatch.group(3);
			double hourValue = Double.parseDouble(hour);
			// midnight is "twelve", whether it was written "0" or "00"
			theTime.append(expandNumber(hourValue == 0 ? 12 : hourValue));
		} else {
			pastNoon = true;
			String hour = (timeMatch.group(4) != null) ? timeMatch.group(4) : timeMatch.group(5);
			double pmHour = Double.parseDouble(hour) - 12;
			// noon stays "twelve"
			theTime.append(expandNumber(pmHour == 0 ? 12 : pmHour));
		}
		// minutes
		String minutes = timeMatch.group(6);
		if (!minutes.equals("00")) {
			theTime.append(minutes.matches("0\\d") ? " oh " : " ");
			theTime.append(expandNumber(Double.parseDouble(minutes)));
		}
		// am/pm marker, only when the next token does not supply it
		if (!isNextTokenTime) {
			String marker = timeMatch.group(7);
			if (marker != null) {
				for (char c : marker.replaceAll("\\.", "").toCharArray()) {
					theTime.append(' ').append(c);
				}
			} else {
				theTime.append(!pastNoon ? " a m" : " p m");
			}
		}
		return theTime.toString();
	}

	/**
	 * Expands a real number: an optional leading minus, the integer part spelled out as one number, and an
	 * optional fractional part read digit by digit after "point", followed by "per cent" when the fraction
	 * ends in '%'.
	 *
	 * @param number
	 *            the token text to expand
	 * @return the spoken number without leading or trailing whitespace
	 */
	protected String expandRealNumber(String number) {
		final Matcher parts = realNumPattern.matcher(number);
		parts.find();
		final StringBuilder spoken = new StringBuilder();
		if (parts.group(1) != null) {
			spoken.append("minus ");
		}
		final String integerDigits = parts.group(2);
		if (integerDigits != null) {
			spoken.append(expandNumber(Double.parseDouble(integerDigits))).append(" ");
		}
		if (parts.group(3) != null) {
			spoken.append("point ");
			final String fractionDigits = parts.group(4);
			for (int i = 0; i < fractionDigits.length(); i++) {
				final String digit = String.valueOf(fractionDigits.charAt(i));
				spoken.append(expandNumber(Double.parseDouble(digit))).append(" ");
			}
			if (parts.group(5) != null) {
				spoken.append("per cent");
			}
		}
		return spoken.toString().trim();
	}

	/**
	 * Expands a token that mixes letters and digits. The text is cut at every digit/non-digit boundary;
	 * digit runs are read digit by digit, other pieces are kept, and all pieces are joined with spaces.
	 * Commas and square brackets are removed from the result.
	 *
	 * @param wordnumseq
	 *            the token text to expand
	 * @return the space separated pieces
	 */
	protected String expandWordNumber(String wordnumseq) {
		final String[] pieces = wordnumseq.split("(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)");
		for (int idx = 0; idx < pieces.length; idx++) {
			if (!pieces[idx].matches("\\d+")) {
				continue;
			}
			final StringBuilder digitWords = new StringBuilder();
			for (char digit : pieces[idx].toCharArray()) {
				digitWords.append(expandNumber(Double.parseDouble(String.valueOf(digit)))).append(" ");
			}
			pieces[idx] = digitWords.toString();
		}
		final String listed = Arrays.toString(pieces);
		return listed.replaceAll("[,\\]\\[]", "");
	}

	/**
	 * Expands an amount of money for the currencies "$", "£" and "€".
	 * <p>
	 * The integer part is spelled out followed by the currency name; a fractional part, if present, is
	 * spelled out as one number followed by the minor unit. For dollars the singular is used only for
	 * exactly one, so zero gives "zero dollars". The fraction is taken from the matched digits, so other
	 * trailing characters (such as '%') no longer cause a NumberFormatException. For an unknown currency,
	 * or text without any digits, a warning is logged and the text is returned unchanged.
	 *
	 * @param money
	 *            the amount as text, e.g. "12.50"
	 * @param currency
	 *            the currency symbol that preceded the amount
	 * @return the spoken amount, or {@code money} itself if it could not be expanded
	 */
	protected String expandMoney(String money, String currency) {
		Matcher currencyMatch = moneyPattern.matcher(money);
		if (!currencyMatch.find()) {
			// nothing numeric to expand; group() would throw on a failed match
			logger.warn(String.format("Could not expand amount [%s] for currency [%s]", money, currency));
			return money;
		}
		double wholeUnits = Double.parseDouble(currencyMatch.group(1));
		String majorUnit;
		String minorUnit;
		switch (currency) {
		case "$":
			majorUnit = (wholeUnits == 1) ? " dollar" : " dollars";
			minorUnit = " cents";
			break;
		case "£":
			majorUnit = " pound sterling";
			minorUnit = " pence";
			break;
		case "€":
			majorUnit = " euro";
			minorUnit = " cents";
			break;
		default:
			logger.warn(String.format("Could not expand amount [%s] for currency [%s]", money, currency));
			return money;
		}
		String spoken = expandNumber(wholeUnits) + majorUnit;
		String fraction = currencyMatch.group(2);
		if (fraction != null) {
			// group 2 includes the leading dot
			spoken = spoken + " " + expandNumber(Double.parseDouble(fraction.substring(1))) + minorUnit;
		}
		return spoken;
	}

	/**
	 * Determines which rule set of the given RuleBasedNumberFormat is to be used for spelling out ordinals.
	 * <p>
	 * Preference order: exactly "%spellout-ordinal", then "%spellout-ordinal-masculine", then the first rule
	 * set whose name starts with "%spellout-ordinal".
	 *
	 * @param rbnf
	 *            The RuleBasedNumberFormat whose rule set names are inspected.
	 * @return The rule name for "ordinal spell out".
	 * @throws UnsupportedOperationException
	 *             if no rule set name starts with "%spellout-ordinal"
	 */
	protected static String getOrdinalRuleName(final RuleBasedNumberFormat rbnf) {
		final List<String> ruleSets = Arrays.asList(rbnf.getRuleSetNames());
		for (String preferred : new String[] { "%spellout-ordinal", "%spellout-ordinal-masculine" }) {
			if (ruleSets.contains(preferred)) {
				return preferred;
			}
		}
		for (String candidate : ruleSets) {
			if (candidate.startsWith("%spellout-ordinal")) {
				return candidate;
			}
		}
		throw new UnsupportedOperationException("The locale " + rbnf.getLocale(ULocale.ACTUAL_LOCALE)
				+ " doesn't support ordinal spelling.");
	}

	/**
	 * Determines which rule set of the given RuleBasedNumberFormat is to be used for spelling out years.
	 * <p>
	 * Preference order: exactly "%spellout-numbering-year", then the first rule set whose name starts with
	 * that text.
	 *
	 * @param rbnf
	 *            The RuleBasedNumberFormat whose rule set names are inspected.
	 * @return The rule name for "year spell out".
	 * @throws UnsupportedOperationException
	 *             if no rule set name starts with "%spellout-numbering-year"
	 */
	protected static String getYearRuleName(final RuleBasedNumberFormat rbnf) {
		final String[] ruleSets = rbnf.getRuleSetNames();
		if (Arrays.asList(ruleSets).contains("%spellout-numbering-year")) {
			return "%spellout-numbering-year";
		}
		for (String candidate : ruleSets) {
			if (candidate.startsWith("%spellout-numbering-year")) {
				return candidate;
			}
		}
		throw new UnsupportedOperationException("The locale " + rbnf.getLocale(ULocale.ACTUAL_LOCALE)
				+ " doesn't support year spelling.");
	}

	/**
	 * Loads the abbreviation expansions from the properties resource "preprocess/abbrev.dat", resolved
	 * relative to this class.
	 * <p>
	 * The stream is closed after loading. A missing resource is reported as an IOException rather than as
	 * a NullPointerException from {@link Properties#load(java.io.InputStream)}.
	 *
	 * @return the loaded properties, as a map from abbreviation to expansion
	 * @throws IOException
	 *             if the resource does not exist or cannot be read
	 */
	public static Map<Object, Object> loadAbbrevMap() throws IOException {
		Properties abbMap = new Properties();
		// fully qualified name so that no additional import is required
		try (java.io.InputStream in = Preprocess.class.getResourceAsStream("preprocess/abbrev.dat")) {
			if (in == null) {
				throw new IOException("Abbreviation resource preprocess/abbrev.dat not found relative to "
						+ Preprocess.class.getName());
			}
			abbMap.load(in);
		}
		return abbMap;
	}
}