NumberEP.java example

Explorer
marytts-master
/**
 * Copyright 2002 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package marytts.language.de.preprocess;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

import marytts.datatypes.MaryXML;
import marytts.util.MaryUtils;
import marytts.util.dom.MaryDomUtils;

import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

/**
 * An expansion pattern implementation for basic number patterns.
 *
 * @author Marc Schröder
 */

public class NumberEP extends ExpansionPattern {
	private final String[] _knownTypes = { "number", "number:float", "number:integer", "number:ordinal", "number:roman",
			"number:digits", "number:cardinal" };
	/**
	 * Every subclass has its own list knownTypes, an internal string representation of known types. These are possible values of
	 * the <code>type</code> attribute to the <code>say-as</code> element, as defined in MaryXML.dtd. If there is more than one
	 * known type, the first type (<code>knownTypes[0]</code>) is expected to be the most general one, of which the others are
	 * specializations.
	 */
	private final List<String> knownTypes = Arrays.asList(_knownTypes);

	public List<String> knownTypes() {
		return knownTypes;
	}

	// Domain-specific primitives:
	// (floats and integers are recognised up to hundreds of millions (nine digits))
	protected static final String sFloat = "(?:-?(?:[1-9][0-9]{0,8}|0)?(?:\\.|,)[0-9]+)";
	protected static final String sInteger = "(?:-?[1-9][0-9]{0,8}|0)";
	protected static final String sOrdinal = "(?:" + sInteger + "\\.)";
	protected static final String sRoman = "(?:[MDCLXVI]+\\.?)";
	protected static final String sDigits = "(?:[0-9.,]*[0-9][.,]?)";
	protected static final String sCardinal = sFloat + "|" + sInteger;

	// Now the actual match patterns:
	protected final Pattern reFloat = Pattern.compile(sFloat);
	protected final Pattern reInteger = Pattern.compile(sInteger);
	protected final Pattern reOrdinal = Pattern.compile(sOrdinal);
	protected final Pattern reRoman = Pattern.compile(sRoman);
	protected final Pattern reDigits = Pattern.compile(sDigits);

	// We do not use reMatchingChars here, but override isCandidate
	private final Pattern reMatchingChars = null;

	public Pattern reMatchingChars() {
		return reMatchingChars;
	}

	/**
	 * Simple numbers are expected to be entire tokens. They should not be joined together out of several tokens.
	 * 
	 * @return false
	 */
	protected boolean allowMultipleTokens() {
		return false;
	}

	protected boolean isCandidate(Element t) {
		String s = MaryDomUtils.tokenText(t);
		return reFloat.matcher(s).matches() || reInteger.matcher(s).matches() || reOrdinal.matcher(s).matches()
				|| reRoman.matcher(s).matches() || reDigits.matcher(s).matches();
	}

	/**
	 * Every subclass has its own logger. The important point is that if several threads are accessing the variable at the same
	 * time, the logger needs to be thread-safe or it will produce rubbish.
	 */
	private Logger logger = MaryUtils.getLogger("NumberEP");

	public NumberEP() {
		super();
	}

	protected int match(String s, int type) {
		switch (type) {
		case 0:
			if (matchFloat(s))
				return 1;
			if (matchInteger(s))
				return 2;
			if (matchOrdinal(s))
				return 3;
			// if (matchRoman(s)) return 4;
			// Disable unconditional Roman number recognition.
			// Problem: Even Abbreviations that happen to only use
			// roman digits are pronounced as numbers. Example: LDC
			// !!!! Would have to be replaced by a context dependent
			// pronounciation -- only in specific contexts is a roman number
			// expanded as such.
			// For the moment, roman numbers need to be specifically requested
			// via markup.
			if (matchDigits(s))
				return 5;
			// case"6" Cardinal = Float | Integer
			break;
		case 1:
			if (matchFloat(s))
				return 1;
			break;
		case 2:
			if (matchInteger(s))
				return 2;
			break;
		case 3:
			if (matchOrdinal(s))
				return 3;
			break;
		case 4:
			if (matchRoman(s))
				return 4;
			break;
		case 5:
			if (matchDigits(s))
				return 5;
			break;
		case 6:
			if (matchInteger(s))
				return 2;
			if (matchFloat(s))
				return 1;
			break;
		}
		return -1;
	}

	protected int canDealWith(String input, int typeCode) {
		switch (typeCode) {
		case 0:
			if (matchFloat(input))
				return 1;
			if (matchInteger(input))
				return 2;
			if (matchOrdinal(input))
				return 3;
			if (matchRoman(input))
				return 4;
			if (matchDigits(input))
				return 5;
			break;
		case 1:
			if (matchFloat(input))
				return 1;
			break;
		case 2: // integer
			if (matchInteger(input) || matchRoman(input))
				return 2;
			break;
		case 3: // ordinal
			if (matchOrdinal(input) || matchInteger(input) || matchRoman(input))
				return 3;
			break;
		case 4:
			if (matchRoman(input))
				return 4;
			break;
		case 5:
			if (matchDigits(input))
				return 5;
			break;
		case 6: // cardinal; is either integer or float
			if (matchInteger(input))
				return 2;
			if (matchFloat(input))
				return 1;
			break;
		}
		return -1; // no, cannot deal with it as the given type
	}

	protected List<Element> expand(List<Element> tokens, String s, int type) {
		if (tokens == null)
			throw new NullPointerException("Received null argument");
		if (tokens.isEmpty())
			throw new IllegalArgumentException("Received empty list");
		Document doc = ((Element) tokens.get(0)).getOwnerDocument();
		// we expect type to be one of the return values of match():
		List<Element> expanded = null;
		switch (type) {
		case 1:
			expanded = expandFloat(doc, s, true);
			break;
		case 2:
			expanded = expandInteger(doc, s, true);
			break;
		case 3:
			expanded = expandOrdinal(doc, s, true);
			break;
		case 4:
			expanded = expandRoman(doc, s, true);
			break;
		case 5:
			expanded = expandDigits(doc, s, true);
			break;
		}
		replaceTokens(tokens, expanded);
		return expanded;
	}

	protected boolean matchFloat(String s) {
		return reFloat.matcher(s).matches();
	}

	protected boolean matchInteger(String s) {
		return reInteger.matcher(s).matches();
	}

	protected boolean matchOrdinal(String s) {
		return reOrdinal.matcher(s).matches();
	}

	protected boolean matchRoman(String s) {
		return reRoman.matcher(s).matches();
	}

	protected boolean matchDigits(String s) {
		return reDigits.matcher(s).matches();
	}

	protected List<Element> expandInteger(Document doc, String s, boolean createMtu) {
		long value;
		// In canDealWith(), we have made a commitment to deal with
		// roman numbers to be pronounced as integers.
		if (matchRoman(s)) {
			return expandRoman(doc, s, createMtu, false); // roman integer
		}
		try {
			while (s.length() > 1 && s.startsWith("0"))
				s = s.substring(1);
			value = Long.parseLong(s);
		} catch (NumberFormatException e) {
			logger.info("Cannot convert string \"" + s + "\" to long.");
			throw e;
		}

		return expandInteger(doc, value, createMtu, s);
	}

	protected List<Element> expandInteger(Document doc, long value, boolean createMtu, String orig) {
		String expString = expandInteger(value);
		return makeNewTokens(doc, expString, createMtu, orig);
	}

	protected String expandInteger(String s) {
		long value;
		try {
			while (s.length() > 1 && s.startsWith("0"))
				s = s.substring(1);
			value = Long.decode(s).longValue();
		} catch (NumberFormatException e) {
			logger.info("Cannot convert string \"" + s + "\" to long.");
			throw e;
		}

		return expandInteger(value);
	}

	protected String expandInteger(long value) {
		long millions;
		int thousands;
		int hundreds;
		int tens;
		int rest;
		StringBuilder buf = new StringBuilder();

		// Special treatment for the 0:
		if (value == 0) {
			return (new String("null"));
		}
		if (value < 0) {
			buf.append("Minus ");
		}
		millions = value / 1000000;
		rest = (int) value % 1000000; // the part of value below 1 000 000
		if (millions > 1) {
			buf.append(expandInteger(millions)); // recursive call
			buf.append(" ");
			buf.append("Millionen ");
		} else if (millions == 1) {
			buf.append("eine Million ");
		}
		thousands = rest / 1000;
		rest = rest % 1000;
		if (thousands > 1) {
			buf.append(expandInteger(thousands));
			buf.append(" ");
			buf.append("Tausend ");
		} else if (thousands == 1) {
			buf.append("ein Tausend ");
		}
		hundreds = rest / 100;
		rest = rest % 100;
		if (hundreds > 1) {
			buf.append(expandInteger(hundreds));
			buf.append(" ");
			buf.append("Hundert ");
		} else if (hundreds == 1) {
			buf.append("ein Hundert ");
		}
		if (rest >= 20) {
			tens = rest / 10;
			rest = rest % 10;
			switch (rest) {
			case 1:
				buf.append("einund");
				break;
			case 2:
				buf.append("zweiund");
				break;
			case 3:
				buf.append("dreiund");
				break;
			case 4:
				buf.append("vierund");
				break;
			case 5:
				buf.append("fünfund");
				break;
			case 6:
				buf.append("sechsund");
				break;
			case 7:
				buf.append("siebenund");
				break;
			case 8:
				buf.append("achtund");
				break;
			case 9:
				buf.append("neunund");
				break;
			default: // 0: do nothing
			}
			switch (tens) {
			case 2:
				buf.append("zwanzig ");
				break;
			case 3:
				buf.append("dreißig ");
				break;
			case 4:
				buf.append("vierzig ");
				break;
			case 5:
				buf.append("fünfzig ");
				break;
			case 6:
				buf.append("sechzig ");
				break;
			case 7:
				buf.append("siebzig ");
				break;
			case 8:
				buf.append("achtzig ");
				break;
			case 9:
				buf.append("neunzig ");
				break;
			default: // shouldn't happen
			}
		} else { // rest < 20
			switch (rest) {
			case 1:
				buf.append("eins ");
				break;
			case 2:
				buf.append("zwei ");
				break;
			case 3:
				buf.append("drei ");
				break;
			case 4:
				buf.append("vier ");
				break;
			case 5:
				buf.append("fünf ");
				break;
			case 6:
				buf.append("sechs ");
				break;
			case 7:
				buf.append("sieben ");
				break;
			case 8:
				buf.append("acht ");
				break;
			case 9:
				buf.append("neun ");
				break;
			case 10:
				buf.append("zehn ");
				break;
			case 11:
				buf.append("elf ");
				break;
			case 12:
				buf.append("zwölf ");
				break;
			case 13:
				buf.append("dreizehn ");
				break;
			case 14:
				buf.append("vierzehn ");
				break;
			case 15:
				buf.append("fünfzehn ");
				break;
			case 16:
				buf.append("sechzehn ");
				break;
			case 17:
				buf.append("siebzehn ");
				break;
			case 18:
				buf.append("achtzehn ");
				break;
			case 19:
				buf.append("neunzehn ");
				break;
			default: // shouldn't happen
			}
		}
		return buf.toString().trim();
	}

	/**
	 * This will correctly expand integers as well, although matchFloat() does not match them. This seems to be convenient in
	 * cases where "some number", i.e. integer or float, was matched, and needs to be expanded.
	 * 
	 * @param doc
	 *            doc
	 * @param s
	 *            s
	 * @param createMtu
	 *            createMtu
	 * @return makeNewTokens(doc, expString, createMtu, s)
	 */
	protected List<Element> expandFloat(Document doc, String s, boolean createMtu) {
		String expString = expandFloat(s);
		return makeNewTokens(doc, expString, createMtu, s);
	}

	protected String expandFloat(String number) {
		// String <code>number</code> must contain exactly one ',' or '.'
		long whole = 0; // the integer part of the number
		StringBuilder buf = new StringBuilder();
		int i = 0; // index in <code>number</code>
		while (i < number.length() && Character.isDigit(number.charAt(i))) {
			whole *= 10;
			// presupposing charset where '0' + 1 == '1' etc.
			whole += number.charAt(i) - '0';
			i++;
		}
		// Now, if the komma / dot was string-initial, whole is 0,
		// which will be pronounced also.
		// Say the integer part of the float like an integer:
		buf.append(expandInteger(whole));
		buf.append(" ");
		// Spell out the rest:
		if (i < number.length())
			buf.append(expandDigits(number.substring(i)));
		return buf.toString().trim();
	}

	protected List<Element> expandDigits(Document doc, String s, boolean createMtu) {
		String expString = expandDigits(s);
		return makeNewTokens(doc, expString, createMtu, s);
	}

	protected String expandDigits(String digits) {
		StringBuilder buf = new StringBuilder();
		for (int i = 0; i < digits.length(); i++) {
			switch (digits.charAt(i)) {
			case ',':
				buf.append("Komma ");
				break;
			case '.':
				buf.append("Punkt ");
				break;
			case '0':
				buf.append("null ");
				break;
			case '1':
				buf.append("eins ");
				break;
			case '2':
				buf.append("zwei ");
				break;
			case '3':
				buf.append("drei ");
				break;
			case '4':
				buf.append("vier ");
				break;
			case '5':
				buf.append("fünf ");
				break;
			case '6':
				buf.append("sechs ");
				break;
			case '7':
				buf.append("sieben ");
				break;
			case '8':
				buf.append("acht ");
				break;
			case '9':
				buf.append("neun ");
				break;
			default: // other characters (e.g., letters): output individually
				buf.append(digits.charAt(i));
				buf.append(' ');
			}
		}
		return buf.toString().trim();
	}

	/**
	 * For ordinals we put the expanded form in the sounds_like attribute and keep the surface form. This is for the POS tagger to
	 * tell a later module whether the ordinal is adverbial or adjectival.
	 * 
	 * @param doc
	 *            doc
	 * @param s
	 *            s
	 * @param createMtu
	 *            createMtu
	 * @return expandOrdinal(doc, value, createMtu, s)
	 */
	protected List<Element> expandOrdinal(Document doc, String s, boolean createMtu) {
		long value;
		// In canDealWith(), we have made a commitment to deal with
		// integers and roman numbers to be pronounced as ordinals.
		if (matchRoman(s)) {
			return expandRoman(doc, s, createMtu, true); // roman ordinal
		}
		String intString; // the string s without final dot, if any
		if (matchInteger(s)) {
			intString = s;
		} else {
			intString = s.substring(0, s.length() - 1);
		}
		try {
			while (intString.length() > 1 && intString.startsWith("0"))
				intString = intString.substring(1);
			value = Long.decode(intString).longValue();
		} catch (NumberFormatException e) {
			logger.info("Cannot convert string \"" + intString + "\" to long.");
			throw e;
		}
		return expandOrdinal(doc, value, createMtu, s);
	}

	protected List<Element> expandOrdinal(Document doc, long value, boolean createMtu, String orig) {
		StringBuilder exp = new StringBuilder();
		int rest = (int) value % 100;
		if (Math.abs(value) >= 100) {
			exp.append(expandInteger(value - rest));
			if (rest != 0)
				exp.append(" ");
		}
		switch (rest) {
		case 0:
			if (value == 0)
				exp.append("null");
			// else it is 100. etc.
			break;
		case 1:
			exp.append("ers");
			break;
		case 3:
			exp.append("drit");
			break;
		case 7:
			exp.append("sieb");
			break;
		case 8:
			exp.append("ach");
			break;
		default:
			exp.append(expandInteger(rest));
		}
		if (rest == 0 && value != 0 || rest >= 20)
			exp.append("s");
		exp.append("te");
		// OK, exp construction complete.
		// Now create the t element.
		Element t = MaryXML.createElement(doc, MaryXML.TOKEN);
		// Original surface form as graphemic form:
		MaryDomUtils.setTokenText(t, orig);
		// Expanded form in sounds_like attribute:
		t.setAttribute("sounds_like", exp.toString());
		t.setAttribute("ending", "ordinal");
		t.setAttribute("pos", "ADJA"); // part-of-speech: adjective
		List<Element> result = new ArrayList<Element>();
		if (createMtu) {
			// create mtu element enclosing the expanded tokens:
			Element mtu = MaryXML.createElement(doc, MaryXML.MTU);
			mtu.setAttribute("orig", orig);
			mtu.appendChild(t);
			result.add(mtu);
		} else {
			result.add(t);
		}
		return result;
	}

	protected List<Element> expandRoman(Document doc, String number, boolean createMtu) {
		// First, find out whether it is an ordinal or a simple integer:
		boolean isOrdinal = false;
		if (number.charAt(number.length() - 1) == '.') {
			isOrdinal = true;
			number = number.substring(0, number.length() - 1);
		}
		return expandRoman(doc, number, createMtu, isOrdinal);
	}

	protected List<Element> expandRoman(Document doc, String number, boolean createMtu, boolean isOrdinal) {
		// First make sure there is no dot at the end of number:
		// (here, we consider the dot an artefact of the fact that
		// reRoman allows an optional dot. This causes, e.g.,
		// <SAYAS MODE="cardinal">V.</SAYAS> to accept V., but
		// it is to be spoken as an integer.)
		if (number.charAt(number.length() - 1) == '.') {
			number = number.substring(0, number.length() - 1);
		}
		int value = MaryUtils.romanToInt(number);
		if (isOrdinal)
			return expandOrdinal(doc, value, createMtu, number);
		else
			return expandInteger(doc, value, createMtu, number);
	}

}