// TelephoneEP.java example
//
// Explorer
// marytts-master
/**
 * Copyright 2002 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package marytts.language.de.preprocess;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

import marytts.util.dom.MaryDomUtils;

import org.w3c.dom.Document;
import org.w3c.dom.Element;

/**
 * An expansion pattern implementation for telephone number patterns.
 * <p>
 * A string is recognised as a telephone number when it starts with <code>0</code> or <code>+</code> and continues with one or
 * more digits or separator characters (<code>/</code>, <code>-</code>, <code>.</code>). For expansion, the digits are cut into
 * short groups which are expanded one after the other, with a boundary element between consecutive groups.
 *
 * @author Marc Schröder
 */
public class TelephoneEP extends ExpansionPattern {
	/**
	 * Every subclass has its own list knownTypes, an internal string representation of known types. These are possible values of
	 * the <code>type</code> attribute to the <code>say-as</code> element, as defined in MaryXML.dtd. If there is more than one
	 * known type, the first type (<code>knownTypes[0]</code>) is expected to be the most general one, of which the others are
	 * specializations.
	 */
	private final List<String> knownTypes = Arrays.asList("telephone");

	/**
	 * Returns the types this pattern knows about.
	 *
	 * @return a list containing the single entry <code>"telephone"</code>
	 */
	public List<String> knownTypes() {
		return knownTypes;
	}

	// Domain-specific primitives:
	/** Regular expression source: one <code>0</code> or <code>+</code>, then one or more digits, slashes, hyphens or dots. */
	protected final String sTelephone = "(?:[0+][0-9/\\-\\.]+)";
	/** Regular expression source: a single character out of digits, <code>+</code>, <code>/</code>, <code>-</code>, <code>.</code>. */
	protected final String sMatchingChars = "[0-9\\+\\/\\-\\.]";

	// Now the actual match patterns:
	/** Compiled form of {@link #sTelephone}. */
	protected final Pattern reTelephone = Pattern.compile(sTelephone);
	/** Compiled form of {@link #sMatchingChars}. */
	private final Pattern reMatchingChars = Pattern.compile(sMatchingChars);

	/**
	 * Returns the pattern describing a single character that may occur in a telephone number.
	 *
	 * @return the compiled {@link #sMatchingChars} pattern
	 */
	public Pattern reMatchingChars() {
		return reMatchingChars;
	}

	/*
	 * Every subclass has its own logger. The important point is that if several threads are accessing the variable at the same
	 * time, the logger needs to be thread-safe or it will produce rubbish.
	 */
	// private Logger logger = MaryUtils.getLogger("TelephoneEP");

	public TelephoneEP() {
		super();
	}

	/**
	 * Tests whether the string is a telephone number of the given type.
	 *
	 * @param s
	 *            the string to test
	 * @param type
	 *            the type code; only <code>0</code> is handled here
	 * @return <code>0</code> if <code>type</code> is 0 and <code>s</code> matches {@link #reTelephone} as a whole,
	 *         <code>-1</code> otherwise
	 */
	protected int match(String s, int type) {
		if (type == 0 && matchTelephone(s)) {
			return 0;
		}
		return -1;
	}

	/**
	 * Expands the given tokens and hands the result to <code>replaceTokens</code> together with the original tokens.
	 *
	 * @param tokens
	 *            the tokens to expand; must be non-null and non-empty
	 * @param s
	 *            not used by this implementation
	 * @param type
	 *            we expect type to be one of the return values of {@link #match(String, int)}
	 * @return the expanded elements, or <code>null</code> if <code>type</code> is not 0
	 * @throws NullPointerException
	 *             if <code>tokens</code> is null
	 * @throws IllegalArgumentException
	 *             if <code>tokens</code> is empty
	 */
	protected List<Element> expand(List<Element> tokens, String s, int type) {
		if (tokens == null) {
			throw new NullPointerException("Received null argument");
		}
		if (tokens.isEmpty()) {
			throw new IllegalArgumentException("Received empty list");
		}
		Document doc = tokens.get(0).getOwnerDocument();
		List<Element> expanded = null;
		if (type == 0) {
			expanded = expandTelephone(doc, tokens);
		}
		replaceTokens(tokens, expanded);
		return expanded;
	}

	/**
	 * Tests whether the entire string matches {@link #reTelephone}.
	 *
	 * @param s
	 *            the string to test
	 * @return true if the whole of <code>s</code> matches
	 */
	protected boolean matchTelephone(String s) {
		return reTelephone.matcher(s).matches();
	}

	/**
	 * Quick plausibility check for the given input.
	 *
	 * @param input
	 *            the string to inspect
	 * @param typeCode
	 *            the type code; only <code>0</code> is accepted
	 * @return <code>0</code> if <code>typeCode</code> is 0 and <code>REPattern.digit</code> is found in <code>input</code>
	 *         (contains at least one digit), <code>-1</code> otherwise
	 */
	protected int canDealWith(String input, int typeCode) {
		if (typeCode != 0) {
			return -1;
		}
		boolean containsDigit = REPattern.digit.matcher(input).find();
		return containsDigit ? 0 : -1;
	}

	/**
	 * This method, differently from what is usually done, does not take a string argument, but the original tokens. The reason is
	 * that grouping of telephone number parts is often done using whitespace, information that would be lost if the
	 * whitespace-free string was used.
	 * <p>
	 * Before expansion, the input is split into parts as follows:
	 * <ul>
	 * <li>token boundaries are separators;</li>
	 * <li>non-digits are separators;</li>
	 * <li>a part longer than 3 digits is split into 3-2-...-2 (odd number of digits) or 2-2-...-2 (even number of digits) digit
	 * parts.</li>
	 * </ul>
	 * A leading <code>+</code> on the first token is expanded as "Plus" and removed from that token's text, i.e. the first
	 * token is modified.
	 *
	 * @param doc
	 *            the document passed on to the calls that create the new elements
	 * @param tokens
	 *            the original tokens making up the telephone number
	 * @return the expanded elements, or <code>null</code> if <code>tokens</code> is null or empty
	 */
	protected List<Element> expandTelephone(Document doc, List<Element> tokens) {
		if (tokens == null || tokens.isEmpty()) {
			return null;
		}
		List<Element> exp = new ArrayList<Element>();

		// The very first character in the telephone number may be a +
		// (for international area code).
		Element firstToken = tokens.get(0);
		String firstText = MaryDomUtils.tokenText(firstToken);
		if (firstText != null && firstText.length() > 0 && firstText.charAt(0) == '+') {
			exp.addAll(makeNewTokens(doc, "Plus"));
			MaryDomUtils.setTokenText(firstToken, firstText.substring(1)); // remove + sign
		}

		// Collect the digit groups we are to speak, in order of appearance.
		List<String> groups = new ArrayList<String>();
		for (Element token : tokens) {
			String text = MaryDomUtils.tokenText(token);
			// A null text is tolerated for the first token above, so tolerate it here as well
			// instead of failing inside the matcher; tokens without any digit are skipped.
			if (text == null || !REPattern.digit.matcher(text).find()) {
				continue;
			}
			if (REPattern.onlyDigits.matcher(text).matches()) {
				appendSpokenGroups(groups, text);
			} else {
				for (String run : digitRuns(text)) {
					appendSpokenGroups(groups, run);
				}
			}
		}

		// Expand group by group.
		int lastGroup = groups.size() - 1;
		for (int g = 0; g <= lastGroup; g++) {
			int sizeBefore = exp.size();
			exp.addAll(number.expandDigits(doc, groups.get(g), true));
			// Force accent on last token in mtu. Only look at what this group actually added, so an
			// empty expansion can neither index an empty list nor touch the "Plus" token or a boundary.
			if (exp.size() > sizeBefore) {
				Element mtu = exp.get(exp.size() - 1);
				if (mtu.getLastChild() instanceof Element) {
					((Element) mtu.getLastChild()).setAttribute("accent", "unknown");
				}
			}
			// And add a boundary after the group, unless it is the last group:
			if (g < lastGroup) {
				exp.add(MaryDomUtils.createBoundary(doc));
			}
		}
		return exp;
	}

	/** Returns the maximal runs of consecutive digits found in <code>text</code>, in order of appearance. */
	private static List<String> digitRuns(String text) {
		List<String> runs = new ArrayList<String>();
		int runStart = -1; // index of the first digit of the current run, -1 while outside a run
		for (int i = 0; i < text.length(); i++) {
			if (Character.isDigit(text.charAt(i))) {
				if (runStart == -1) {
					runStart = i;
				}
			} else if (runStart != -1) {
				runs.add(text.substring(runStart, i));
				runStart = -1;
			}
		}
		if (runStart != -1) { // text ends in digits
			runs.add(text.substring(runStart));
		}
		return runs;
	}

	/** Appends <code>digits</code> to <code>target</code>, cut into 3-2-...-2 or 2-2-...-2 pieces if longer than 3. */
	private static void appendSpokenGroups(List<String> target, String digits) {
		int length = digits.length();
		if (length <= 3) {
			target.add(digits);
			return;
		}
		// An odd length starts with a group of three, which leaves an even remainder for the pairs.
		int headLength = (length % 2 != 0) ? 3 : 2;
		target.add(digits.substring(0, headLength));
		for (int pos = headLength; pos < length; pos += 2) {
			target.add(digits.substring(pos, pos + 2));
		}
	}

}