/**
* Copyright 2002 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.language.de.preprocess;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
import marytts.datatypes.MaryXML;
import marytts.util.MaryUtils;
import marytts.util.dom.MaryDomUtils;
import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
/**
* An expansion pattern implementation for basic number patterns.
*
* @author Marc Schröder
*/
public class NumberEP extends ExpansionPattern {
private final String[] _knownTypes = { "number", "number:float", "number:integer", "number:ordinal", "number:roman",
"number:digits", "number:cardinal" };
/**
* Every subclass has its own list knownTypes, an internal string representation of known types. These are possible values of
* the <code>type</code> attribute to the <code>say-as</code> element, as defined in MaryXML.dtd. If there is more than one
* known type, the first type (<code>knownTypes[0]</code>) is expected to be the most general one, of which the others are
* specializations.
*/
private final List<String> knownTypes = Arrays.asList(_knownTypes);
public List<String> knownTypes() {
return knownTypes;
}
// Domain-specific primitives:
// (floats and integers are recognised up to hundreds of millions (nine digits))
protected static final String sFloat = "(?:-?(?:[1-9][0-9]{0,8}|0)?(?:\\.|,)[0-9]+)";
protected static final String sInteger = "(?:-?[1-9][0-9]{0,8}|0)";
protected static final String sOrdinal = "(?:" + sInteger + "\\.)";
protected static final String sRoman = "(?:[MDCLXVI]+\\.?)";
protected static final String sDigits = "(?:[0-9.,]*[0-9][.,]?)";
protected static final String sCardinal = sFloat + "|" + sInteger;
// Now the actual match patterns:
protected final Pattern reFloat = Pattern.compile(sFloat);
protected final Pattern reInteger = Pattern.compile(sInteger);
protected final Pattern reOrdinal = Pattern.compile(sOrdinal);
protected final Pattern reRoman = Pattern.compile(sRoman);
protected final Pattern reDigits = Pattern.compile(sDigits);
// We do not use reMatchingChars here, but override isCandidate
private final Pattern reMatchingChars = null;
public Pattern reMatchingChars() {
return reMatchingChars;
}
/**
* Simple numbers are expected to be entire tokens. They should not be joined together out of several tokens.
*
* @return false
*/
protected boolean allowMultipleTokens() {
return false;
}
protected boolean isCandidate(Element t) {
String s = MaryDomUtils.tokenText(t);
return reFloat.matcher(s).matches() || reInteger.matcher(s).matches() || reOrdinal.matcher(s).matches()
|| reRoman.matcher(s).matches() || reDigits.matcher(s).matches();
}
/**
* Every subclass has its own logger. The important point is that if several threads are accessing the variable at the same
* time, the logger needs to be thread-safe or it will produce rubbish.
*/
private Logger logger = MaryUtils.getLogger("NumberEP");
public NumberEP() {
super();
}
protected int match(String s, int type) {
switch (type) {
case 0:
if (matchFloat(s))
return 1;
if (matchInteger(s))
return 2;
if (matchOrdinal(s))
return 3;
// if (matchRoman(s)) return 4;
// Disable unconditional Roman number recognition.
// Problem: Even Abbreviations that happen to only use
// roman digits are pronounced as numbers. Example: LDC
// !!!! Would have to be replaced by a context dependent
// pronounciation -- only in specific contexts is a roman number
// expanded as such.
// For the moment, roman numbers need to be specifically requested
// via markup.
if (matchDigits(s))
return 5;
// case"6" Cardinal = Float | Integer
break;
case 1:
if (matchFloat(s))
return 1;
break;
case 2:
if (matchInteger(s))
return 2;
break;
case 3:
if (matchOrdinal(s))
return 3;
break;
case 4:
if (matchRoman(s))
return 4;
break;
case 5:
if (matchDigits(s))
return 5;
break;
case 6:
if (matchInteger(s))
return 2;
if (matchFloat(s))
return 1;
break;
}
return -1;
}
protected int canDealWith(String input, int typeCode) {
switch (typeCode) {
case 0:
if (matchFloat(input))
return 1;
if (matchInteger(input))
return 2;
if (matchOrdinal(input))
return 3;
if (matchRoman(input))
return 4;
if (matchDigits(input))
return 5;
break;
case 1:
if (matchFloat(input))
return 1;
break;
case 2: // integer
if (matchInteger(input) || matchRoman(input))
return 2;
break;
case 3: // ordinal
if (matchOrdinal(input) || matchInteger(input) || matchRoman(input))
return 3;
break;
case 4:
if (matchRoman(input))
return 4;
break;
case 5:
if (matchDigits(input))
return 5;
break;
case 6: // cardinal; is either integer or float
if (matchInteger(input))
return 2;
if (matchFloat(input))
return 1;
break;
}
return -1; // no, cannot deal with it as the given type
}
protected List<Element> expand(List<Element> tokens, String s, int type) {
if (tokens == null)
throw new NullPointerException("Received null argument");
if (tokens.isEmpty())
throw new IllegalArgumentException("Received empty list");
Document doc = ((Element) tokens.get(0)).getOwnerDocument();
// we expect type to be one of the return values of match():
List<Element> expanded = null;
switch (type) {
case 1:
expanded = expandFloat(doc, s, true);
break;
case 2:
expanded = expandInteger(doc, s, true);
break;
case 3:
expanded = expandOrdinal(doc, s, true);
break;
case 4:
expanded = expandRoman(doc, s, true);
break;
case 5:
expanded = expandDigits(doc, s, true);
break;
}
replaceTokens(tokens, expanded);
return expanded;
}
protected boolean matchFloat(String s) {
return reFloat.matcher(s).matches();
}
protected boolean matchInteger(String s) {
return reInteger.matcher(s).matches();
}
protected boolean matchOrdinal(String s) {
return reOrdinal.matcher(s).matches();
}
protected boolean matchRoman(String s) {
return reRoman.matcher(s).matches();
}
protected boolean matchDigits(String s) {
return reDigits.matcher(s).matches();
}
protected List<Element> expandInteger(Document doc, String s, boolean createMtu) {
long value;
// In canDealWith(), we have made a commitment to deal with
// roman numbers to be pronounced as integers.
if (matchRoman(s)) {
return expandRoman(doc, s, createMtu, false); // roman integer
}
try {
while (s.length() > 1 && s.startsWith("0"))
s = s.substring(1);
value = Long.parseLong(s);
} catch (NumberFormatException e) {
logger.info("Cannot convert string \"" + s + "\" to long.");
throw e;
}
return expandInteger(doc, value, createMtu, s);
}
protected List<Element> expandInteger(Document doc, long value, boolean createMtu, String orig) {
String expString = expandInteger(value);
return makeNewTokens(doc, expString, createMtu, orig);
}
protected String expandInteger(String s) {
long value;
try {
while (s.length() > 1 && s.startsWith("0"))
s = s.substring(1);
value = Long.decode(s).longValue();
} catch (NumberFormatException e) {
logger.info("Cannot convert string \"" + s + "\" to long.");
throw e;
}
return expandInteger(value);
}
protected String expandInteger(long value) {
long millions;
int thousands;
int hundreds;
int tens;
int rest;
StringBuilder buf = new StringBuilder();
// Special treatment for the 0:
if (value == 0) {
return (new String("null"));
}
if (value < 0) {
buf.append("Minus ");
}
millions = value / 1000000;
rest = (int) value % 1000000; // the part of value below 1 000 000
if (millions > 1) {
buf.append(expandInteger(millions)); // recursive call
buf.append(" ");
buf.append("Millionen ");
} else if (millions == 1) {
buf.append("eine Million ");
}
thousands = rest / 1000;
rest = rest % 1000;
if (thousands > 1) {
buf.append(expandInteger(thousands));
buf.append(" ");
buf.append("Tausend ");
} else if (thousands == 1) {
buf.append("ein Tausend ");
}
hundreds = rest / 100;
rest = rest % 100;
if (hundreds > 1) {
buf.append(expandInteger(hundreds));
buf.append(" ");
buf.append("Hundert ");
} else if (hundreds == 1) {
buf.append("ein Hundert ");
}
if (rest >= 20) {
tens = rest / 10;
rest = rest % 10;
switch (rest) {
case 1:
buf.append("einund");
break;
case 2:
buf.append("zweiund");
break;
case 3:
buf.append("dreiund");
break;
case 4:
buf.append("vierund");
break;
case 5:
buf.append("fünfund");
break;
case 6:
buf.append("sechsund");
break;
case 7:
buf.append("siebenund");
break;
case 8:
buf.append("achtund");
break;
case 9:
buf.append("neunund");
break;
default: // 0: do nothing
}
switch (tens) {
case 2:
buf.append("zwanzig ");
break;
case 3:
buf.append("dreißig ");
break;
case 4:
buf.append("vierzig ");
break;
case 5:
buf.append("fünfzig ");
break;
case 6:
buf.append("sechzig ");
break;
case 7:
buf.append("siebzig ");
break;
case 8:
buf.append("achtzig ");
break;
case 9:
buf.append("neunzig ");
break;
default: // shouldn't happen
}
} else { // rest < 20
switch (rest) {
case 1:
buf.append("eins ");
break;
case 2:
buf.append("zwei ");
break;
case 3:
buf.append("drei ");
break;
case 4:
buf.append("vier ");
break;
case 5:
buf.append("fünf ");
break;
case 6:
buf.append("sechs ");
break;
case 7:
buf.append("sieben ");
break;
case 8:
buf.append("acht ");
break;
case 9:
buf.append("neun ");
break;
case 10:
buf.append("zehn ");
break;
case 11:
buf.append("elf ");
break;
case 12:
buf.append("zwölf ");
break;
case 13:
buf.append("dreizehn ");
break;
case 14:
buf.append("vierzehn ");
break;
case 15:
buf.append("fünfzehn ");
break;
case 16:
buf.append("sechzehn ");
break;
case 17:
buf.append("siebzehn ");
break;
case 18:
buf.append("achtzehn ");
break;
case 19:
buf.append("neunzehn ");
break;
default: // shouldn't happen
}
}
return buf.toString().trim();
}
/**
* This will correctly expand integers as well, although matchFloat() does not match them. This seems to be convenient in
* cases where "some number", i.e. integer or float, was matched, and needs to be expanded.
*
* @param doc
* doc
* @param s
* s
* @param createMtu
* createMtu
* @return makeNewTokens(doc, expString, createMtu, s)
*/
protected List<Element> expandFloat(Document doc, String s, boolean createMtu) {
String expString = expandFloat(s);
return makeNewTokens(doc, expString, createMtu, s);
}
protected String expandFloat(String number) {
// String <code>number</code> must contain exactly one ',' or '.'
long whole = 0; // the integer part of the number
StringBuilder buf = new StringBuilder();
int i = 0; // index in <code>number</code>
while (i < number.length() && Character.isDigit(number.charAt(i))) {
whole *= 10;
// presupposing charset where '0' + 1 == '1' etc.
whole += number.charAt(i) - '0';
i++;
}
// Now, if the komma / dot was string-initial, whole is 0,
// which will be pronounced also.
// Say the integer part of the float like an integer:
buf.append(expandInteger(whole));
buf.append(" ");
// Spell out the rest:
if (i < number.length())
buf.append(expandDigits(number.substring(i)));
return buf.toString().trim();
}
protected List<Element> expandDigits(Document doc, String s, boolean createMtu) {
String expString = expandDigits(s);
return makeNewTokens(doc, expString, createMtu, s);
}
protected String expandDigits(String digits) {
StringBuilder buf = new StringBuilder();
for (int i = 0; i < digits.length(); i++) {
switch (digits.charAt(i)) {
case ',':
buf.append("Komma ");
break;
case '.':
buf.append("Punkt ");
break;
case '0':
buf.append("null ");
break;
case '1':
buf.append("eins ");
break;
case '2':
buf.append("zwei ");
break;
case '3':
buf.append("drei ");
break;
case '4':
buf.append("vier ");
break;
case '5':
buf.append("fünf ");
break;
case '6':
buf.append("sechs ");
break;
case '7':
buf.append("sieben ");
break;
case '8':
buf.append("acht ");
break;
case '9':
buf.append("neun ");
break;
default: // other characters (e.g., letters): output individually
buf.append(digits.charAt(i));
buf.append(' ');
}
}
return buf.toString().trim();
}
/**
* For ordinals we put the expanded form in the sounds_like attribute and keep the surface form. This is for the POS tagger to
* tell a later module whether the ordinal is adverbial or adjectival.
*
* @param doc
* doc
* @param s
* s
* @param createMtu
* createMtu
* @return expandOrdinal(doc, value, createMtu, s)
*/
protected List<Element> expandOrdinal(Document doc, String s, boolean createMtu) {
long value;
// In canDealWith(), we have made a commitment to deal with
// integers and roman numbers to be pronounced as ordinals.
if (matchRoman(s)) {
return expandRoman(doc, s, createMtu, true); // roman ordinal
}
String intString; // the string s without final dot, if any
if (matchInteger(s)) {
intString = s;
} else {
intString = s.substring(0, s.length() - 1);
}
try {
while (intString.length() > 1 && intString.startsWith("0"))
intString = intString.substring(1);
value = Long.decode(intString).longValue();
} catch (NumberFormatException e) {
logger.info("Cannot convert string \"" + intString + "\" to long.");
throw e;
}
return expandOrdinal(doc, value, createMtu, s);
}
protected List<Element> expandOrdinal(Document doc, long value, boolean createMtu, String orig) {
StringBuilder exp = new StringBuilder();
int rest = (int) value % 100;
if (Math.abs(value) >= 100) {
exp.append(expandInteger(value - rest));
if (rest != 0)
exp.append(" ");
}
switch (rest) {
case 0:
if (value == 0)
exp.append("null");
// else it is 100. etc.
break;
case 1:
exp.append("ers");
break;
case 3:
exp.append("drit");
break;
case 7:
exp.append("sieb");
break;
case 8:
exp.append("ach");
break;
default:
exp.append(expandInteger(rest));
}
if (rest == 0 && value != 0 || rest >= 20)
exp.append("s");
exp.append("te");
// OK, exp construction complete.
// Now create the t element.
Element t = MaryXML.createElement(doc, MaryXML.TOKEN);
// Original surface form as graphemic form:
MaryDomUtils.setTokenText(t, orig);
// Expanded form in sounds_like attribute:
t.setAttribute("sounds_like", exp.toString());
t.setAttribute("ending", "ordinal");
t.setAttribute("pos", "ADJA"); // part-of-speech: adjective
List<Element> result = new ArrayList<Element>();
if (createMtu) {
// create mtu element enclosing the expanded tokens:
Element mtu = MaryXML.createElement(doc, MaryXML.MTU);
mtu.setAttribute("orig", orig);
mtu.appendChild(t);
result.add(mtu);
} else {
result.add(t);
}
return result;
}
protected List<Element> expandRoman(Document doc, String number, boolean createMtu) {
// First, find out whether it is an ordinal or a simple integer:
boolean isOrdinal = false;
if (number.charAt(number.length() - 1) == '.') {
isOrdinal = true;
number = number.substring(0, number.length() - 1);
}
return expandRoman(doc, number, createMtu, isOrdinal);
}
protected List<Element> expandRoman(Document doc, String number, boolean createMtu, boolean isOrdinal) {
// First make sure there is no dot at the end of number:
// (here, we consider the dot an artefact of the fact that
// reRoman allows an optional dot. This causes, e.g.,
// <SAYAS MODE="cardinal">V.</SAYAS> to accept V., but
// it is to be spoken as an integer.)
if (number.charAt(number.length() - 1) == '.') {
number = number.substring(0, number.length() - 1);
}
int value = MaryUtils.romanToInt(number);
if (isOrdinal)
return expandOrdinal(doc, value, createMtu, number);
else
return expandInteger(doc, value, createMtu, number);
}
}