/** * Copyright 2002 DFKI GmbH. * All Rights Reserved. Use is subject to license terms. * * This file is part of MARY TTS. * * MARY TTS is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, version 3 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package marytts.language.de.preprocess; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import marytts.util.MaryUtils; import marytts.util.dom.MaryDomUtils; import org.w3c.dom.Document; import org.w3c.dom.Element; /** * An expansion pattern implementation for currency patterns. * * @author Marc Schröder */ public class CurrencyEP extends ExpansionPattern { private final String[] _knownTypes = { "currency" }; /** * Every subclass has its own list knownTypes, an internal string representation of known types. These are possible values of * the <code>type</code> attribute to the <code>say-as</code> element, as defined in MaryXML.dtd. If there is more than one * known type, the first type (<code>knownTypes[0]</code>) is expected to be the most general one, of which the others are * specialisations. */ private final List<String> knownTypes = Arrays.asList(_knownTypes); public List<String> knownTypes() { return knownTypes; } private final String[] _currencySymbolNames = { "DM", "Mark", new Character((char) 8364).toString(), "Euro", "$", "Dollar", "FF", "Francs['frO~]", new Character((char) 165).toString(), "Yen['jEn]", new Character((char) 163).toString(), "Pfund", "sFr.", "Franken", "Kr.", "Kronen", "USD", "U S Dollar", "ATS", "Schilling", "BEF", "belgische Francs['frO~]", "GBP", "britische Pfund", // avoid "CAD", "kanadische Dollar", because of Computer aided design "DKK", "daenische Kronen", "NLG", "Gulden", "EUR", "Euro", "Euro", "Euro", "FRF", "Francs['frO~]", "DEM", "Mark", "GRD", "Drachmen", "IEP", "irische Pfund", "ITL", "Lire", "JPY", "Yen['jEn]", "LUF", "luxemburgische Francs['frO~]", // avoid PLZ polish Zloty because of PLZ Postleitzahl "PTE", "Escudo[Es-'ku:-do:]", "RUB", "Rubel", "ESP", "Peseten", "SEK", "schwedische Kronen", "CHF", "Franken", }; private final Map<String, String> currencySymbolNames = MaryUtils.arrayToMap(_currencySymbolNames); private final String[] _currencySymbolNamesSingular = { "DM", "eine Mark", new Character((char) 8364).toString(), "ein Euro", "$", "ein Dollar", "FF", "ein Francs['frO~]", new Character((char) 165).toString(), "ein Yen['jEn]", new Character((char) 163).toString(), "ein Pfund", "sFr.", "ein Franken", "Kr.", "eine Krone", "USD", "ein U S Dollar", "ATS", "ein Schilling", "BEF", "ein belgischer Francs['frO~]", "GBP", "ein britisches Pfund", // avoid "CAD", "kanadische Dollar", because of Computer aided design "DKK", "eine daenische Krone", "NLG", "ein Gulden", "EUR", "ein Euro", "FRF", "ein Francs['frO~]", "DEM", "eine Mark", "GRD", "eine Drachme", "IEP", "ein irisches Pfund", "ITL", "eine Lire", "JPY", "ein Yen['jEn]", "LUF", "ein luxemburgischer Francs['frO~]", // avoid PLZ polish Zloty because of PLZ Postleitzahl "PTE", "ein Escudo[Es-'ku:-do:]", "RUB", "ein Rubel", "ESP", "eine Pesete", "SEK", "eine schwedische Krone", "CHF", "ein Franken", }; private final Map<String, String> currencySymbolNamesSingular = MaryUtils.arrayToMap(_currencySymbolNamesSingular); // Domain-specific primitives: protected final String sCurrencySymbol = getCurrencySymbols(); protected final String sCurrencyAmount = "(?:" + NumberEP.sInteger + "(?:[,.](?:-|[0-9][0-9]))?)"; protected final String sCurrencyAmountSubstructure = "(?:(" + NumberEP.sInteger + ")(?:[,.](-|[0-9][0-9]))?)"; // in this, first parenthesis are the wholes and second paren are the cents. // We don't use sMatchingChars here, but override isCandidate(). // Now the actual match patterns: protected final Pattern reCurrencyLeading = Pattern.compile("(" + sCurrencySymbol + ")(" + sCurrencyAmount + ")"); protected final Pattern reCurrencyTrailing = Pattern.compile("(" + sCurrencyAmount + ")(" + sCurrencySymbol + ")"); protected final Pattern reCurrencyAmountSubstructure = Pattern.compile(sCurrencyAmountSubstructure); private final Pattern reMatchingChars = null; public Pattern reMatchingChars() { return reMatchingChars; } /** * Every subclass has its own logger. The important point is that if several threads are accessing the variable at the same * time, the logger needs to be thread-safe or it will produce rubbish. * * @return _sCurrencySymbol.toString() */ // private Logger logger = MaryUtils.getLogger("CurrencyEP"); // Only used to initialise sCurrencySymbol from _currencySymbolNames[]: private String getCurrencySymbols() { StringBuilder _sCurrencySymbol = new StringBuilder("(?:\\$"); for (int i = 0; i < _currencySymbolNames.length; i += 2) { if (!_currencySymbolNames[i].equals("$")) { // $ needs to be quoted in regular expression _sCurrencySymbol.append("|" + _currencySymbolNames[i]); } } _sCurrencySymbol.append(")"); return _sCurrencySymbol.toString(); } public CurrencyEP() { super(); } protected boolean isCandidate(Element t) { String s = MaryDomUtils.tokenText(t); return (s.length() <= 4 || number.isCandidate(t) || matchCurrency(s)); } protected int canDealWith(String s, int type) { return match(s, type); } protected int match(String s, int type) { switch (type) { case 0: if (matchCurrency(s)) return 0; break; } return -1; } protected List<Element> expand(List<Element> tokens, String s, int type) { if (tokens == null) throw new NullPointerException("Received null argument"); if (tokens.isEmpty()) throw new IllegalArgumentException("Received empty list"); Document doc = ((Element) tokens.get(0)).getOwnerDocument(); // we expect type to be one of the return values of match(): List<Element> expanded = null; switch (type) { case 0: expanded = expandCurrency(doc, s); break; } replaceTokens(tokens, expanded); return expanded; } private boolean matchCurrency(String s) { return reCurrencyLeading.matcher(s).matches() || reCurrencyTrailing.matcher(s).matches(); } protected List<Element> expandCurrency(Document doc, String s) { ArrayList<Element> exp = new ArrayList<Element>(); StringBuilder sb = new StringBuilder(); String currency = null; String amount = null; Matcher reMatcher = reCurrencyLeading.matcher(s); if (reMatcher.find()) { // OK, matched currency = reMatcher.group(1); amount = reMatcher.group(2); } else { reMatcher = reCurrencyTrailing.matcher(s); if (!reMatcher.find()) return null; amount = reMatcher.group(1); currency = reMatcher.group(2); } // Now in amount, find wholes and cents: reMatcher = reCurrencyAmountSubstructure.matcher(amount); reMatcher.find(); String wholes = reMatcher.group(1); // Special treatment of singular. // This does not accout for case (dativ, akkusativ). if (wholes.equals("1")) { String singularExpansion = (String) currencySymbolNamesSingular.get(currency); sb.append(singularExpansion); } else { sb.append(number.expandInteger(wholes)); sb.append(" "); String currencyName = (String) currencySymbolNames.get(currency); sb.append(currencyName); } String cents = reMatcher.group(2); if (cents != null && cents.length() > 0 && !cents.equals("-")) { // OK, cents are two digits sb.append(" "); sb.append(number.expandInteger(cents)); } exp.addAll(makeNewTokens(doc, sb.toString(), true, s)); return exp; } }