/** * Copyright 2002 DFKI GmbH. * All Rights Reserved. Use is subject to license terms. * * This file is part of MARY TTS. * * MARY TTS is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, version 3 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package marytts.language.de.preprocess; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.List; import java.util.regex.Pattern; import marytts.util.dom.MaryDomUtils; import org.w3c.dom.Document; import org.w3c.dom.Element; /** * An expansion pattern implementation for telephone number patterns. * * @author Marc Schröder */ public class TelephoneEP extends ExpansionPattern { private final String[] _knownTypes = { "telephone", }; /** * Every subclass has its own list knownTypes, an internal string representation of known types. These are possible values of * the <code>type</code> attribute to the <code>say-as</code> element, as defined in MaryXML.dtd. If there is more than one * known type, the first type (<code>knownTypes[0]</code>) is expected to be the most general one, of which the others are * specializations. */ private final List<String> knownTypes = Arrays.asList(_knownTypes); public List<String> knownTypes() { return knownTypes; } // Domain-specific primitives: protected final String sTelephone = "(?:[0+][0-9/\\-\\.]+)"; protected final String sMatchingChars = "[0-9\\+\\/\\-\\.]"; // Now the actual match patterns: protected final Pattern reTelephone = Pattern.compile(sTelephone); private final Pattern reMatchingChars = Pattern.compile(sMatchingChars); public Pattern reMatchingChars() { return reMatchingChars; } /** * Every subclass has its own logger. The important point is that if several threads are accessing the variable at the same * telephone, the logger needs to be thread-safe or it will produce rubbish. */ // private Logger logger = MaryUtils.getLogger("TelephoneEP"); public TelephoneEP() { super(); } protected int match(String s, int type) { switch (type) { case 0: if (matchTelephone(s)) return 0; break; } return -1; } protected List<Element> expand(List<Element> tokens, String s, int type) { if (tokens == null) throw new NullPointerException("Received null argument"); if (tokens.isEmpty()) throw new IllegalArgumentException("Received empty list"); Document doc = ((Element) tokens.get(0)).getOwnerDocument(); // we expect type to be one of the return values of match(): List<Element> expanded = null; switch (type) { case 0: expanded = expandTelephone(doc, tokens); break; } replaceTokens(tokens, expanded); return expanded; } protected boolean matchTelephone(String s) { return reTelephone.matcher(s).matches(); } protected int canDealWith(String input, int typeCode) { if (typeCode != 0) return -1; if (REPattern.digit.matcher(input).find()) // contains at least one digit return 0; // OK else return -1; // failure } /** * This method, differently from what is usually done, does not take a string argument, but the original tokens. The reason is * that grouping of telephone number parts is often done using whitespace, information that would be lost if the * whitespace-free string was used. * * @param doc * doc * @param tokens * tokens * @return exp */ protected List<Element> expandTelephone(Document doc, List<Element> tokens) { // Before expansion, split into parts as follows: // - token boundaries are separators // - non-digits are separators // - If a part is longer than 3 digits, split it in // 3-2-...-2 (odd number of digits) or // 2-2-...-2 (even number of digits) digit parts. if (tokens == null || tokens.size() == 0) return null; ArrayList<Element> exp = new ArrayList<Element>(); ArrayList<String> parts = new ArrayList<String>(); // The very first character in the telephone number may be a + // (for international area code). Element firstToken = (Element) tokens.get(0); String firstText = MaryDomUtils.tokenText(firstToken); if (firstText != null && firstText.length() > 0 && firstText.charAt(0) == '+') { exp.addAll(makeNewTokens(doc, "Plus")); MaryDomUtils.setTokenText(firstToken, firstText.substring(1)); // remove + sign } for (Iterator<Element> it = tokens.iterator(); it.hasNext();) { Element t = (Element) it.next(); String s = MaryDomUtils.tokenText(t); if (!REPattern.digit.matcher(s).find()) // no digits in this token continue; // skip this token if (REPattern.onlyDigits.matcher(s).matches()) { parts.add(s); } else { int first = -1; // index in s of first digit of a new part for (int i = 0; i < s.length(); i++) { if (Character.isDigit(s.charAt(i))) { if (first == -1) { // first digit of new part found first = i; } } else { // not a digit if (first != -1) { // first non-digit after a part found parts.add(s.substring(first, i)); first = -1; } } } if (first != -1) { // s ends in digits parts.add(s.substring(first)); } } } // So now parts contains the digit groups. // Now find long digit groups and // split according to number of digits. for (int i = 0; i < parts.size(); i++) { String p = (String) parts.get(i); if (p.length() > 3) { if (p.length() % 2 != 0) { // odd number of digits // replace long entry by one group of three parts.set(i, p.substring(0, 3)); p = p.substring(3); } else { // replace long group by one group of two parts.set(i, p.substring(0, 2)); p = p.substring(2); } // now remove groups of two while (p.length() > 0) { i++; // the current insert position parts.add(i, p.substring(0, 2)); p = p.substring(2); } } } // Now parts contains the groups we are to speak. for (Iterator<String> it = parts.iterator(); it.hasNext();) { exp.addAll(number.expandDigits(doc, (String) it.next(), true)); // Force accent on last token in mtu: Element mtu = (Element) exp.get(exp.size() - 1); Element t = (Element) mtu.getLastChild(); t.setAttribute("accent", "unknown"); // And add a boundary after the group, unless it is the last group: if (it.hasNext()) { exp.add(MaryDomUtils.createBoundary(doc)); } } return exp; } }