// CompositeEP.java example (Explorer: marytts-master)
/**
 * Copyright 2002 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package marytts.language.it.preprocess;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import marytts.datatypes.MaryXML;
import marytts.util.MaryUtils;
import marytts.util.dom.MaryDomUtils;

import org.apache.log4j.Logger;
import org.w3c.dom.DOMException;
import org.w3c.dom.Element;

/**
 * An expansion pattern implementation for composite patterns. Words consisting of digits and letters and pseudo-composites with a
 * hyphen are split into their components. These will then need to be looked at by the other pattern expanders. CompositeEP
 * directly overrides process(), and does not care about the usual subclass methods isCandidate(), match(), and expand().
 * 
 * @author Marc Schröder
 */

public class CompositeEP extends ExpansionPattern {
	// Domain-specific primitives:

	// A letter directly followed by a hyphen at the very end of the token.
	Pattern reTrailingHyphen = Pattern.compile("([A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú])-$");
	// A hyphen at the very start of the token, directly followed by a letter.
	Pattern reLeadingHyphen = Pattern.compile("^-([A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú])");
	// LettersDigitsAndHyphens consists of parts separated by hyphens,
	// each part containing at least one digit or letter.
	Pattern reLettersDigitsAndHyphens = Pattern
			.compile("([^-]*[A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú0-9][^-]*)(-[^-]*[A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú0-9][^-]*)+");

	// Same shape as above, but with apostrophes as separators.
	// Not referenced by the code in this class.
	Pattern reLettersDigitsAndApostrophe = Pattern
			.compile("([^']*[A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú0-9][^']*)('[^']*[A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú0-9][^']*)+");

	// this can be used for dell' un' etc..
	/*
	 * Pattern reLettersAndApostrophe =
	 * //Pattern.compile("([^']*[A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú][^']*)('[^']*[A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú][^']*)+");
	 * Pattern.compile("([A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú]+)('([A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú])+)+");
	 */
	// TODO: FABIO Check if better with new REPattern...
	// This is used for c'X t'X d'X (): a part ending in a non-vowel, then one
	// or more parts that start with an apostrophe, an optional h, and a vowel.
	// NOTE(review): the negated class at the end of the first group lists
	// "EIOUY" but not "A", unlike the vowel class in the second group -
	// confirm whether the missing "A" is intentional.
	Pattern reOneLetterAndApostrophe = Pattern
			.compile("([^']*[^EIOUYaeiouyÀÁÈÉÌÍÒÓÄÖÜËÏäöüëïÙÚàáèéìíòóùú])('[hH]?([AEIOUYaeiouyÀÁÈÉÌÍÒÓÄÖÜËÏäöüëïÙÚàáèéìíòóùú][^']*)+)+");

	// Both letters and digits, in any order.
	// The trailing character class accepts the same accented capitals as the
	// leading alternatives (plus the previously accepted ÄÖÜ), so a token such
	// as "a1È" is recognised just like "È1a".
	Pattern reLettersAndDigits = Pattern
			.compile("(?:(?:[A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú]+[0-9]+)|(?:[0-9]+[A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú]+))[A-ZÀÁÈÉÌÍÒÓÙÚÄÖÜa-zàáèéìíòóùú0-9]*");

	/**
	 * The types known to this pattern.
	 * 
	 * @return a new, empty list on every call.
	 */
	public List knownTypes() {
		return new ArrayList();
	}

	private final Pattern reMatchingChars = Pattern.compile("");

	/**
	 * @return the pattern compiled from the empty string.
	 */
	public Pattern reMatchingChars() {
		return reMatchingChars;
	}

	/**
	 * Every subclass has its own logger. The important point is that if several threads are accessing the variable at the same
	 * time, the logger needs to be thread-safe or it will produce rubbish.
	 */
	private Logger logger = MaryUtils.getLogger("CompositeEP");

	public CompositeEP() {
		super();
	}

	/**
	 * Process and expand a list of tokens.
	 * 
	 * @param tokens
	 *            the list of tokens to be expanded one after the other.
	 * @return a list of expanded forms of all the tokens, i.e. the concatenation of the expanded form (or unexpanded form if no
	 *         expansion is possible) of all the tokens.
	 * @throws DOMException
	 *             if one of the list entries is not a token element.
	 */
	private List process(List tokens) {
		List result = new ArrayList();
		for (Iterator it = tokens.iterator(); it.hasNext();) {
			Element t = (Element) it.next();
			if (!t.getTagName().equals(MaryXML.TOKEN))
				throw new DOMException(DOMException.INVALID_ACCESS_ERR, "Expected t element");
			List expanded = new ArrayList();
			process(t, expanded);
			if (expanded.isEmpty()) // no expansion
				result.add(t);
			else
				result.addAll(expanded);
		}
		return result;
	}

	/**
	 * Process this token. The CompositeEP works as a splitter of single tokens, iteratively expanding a token into its
	 * components.
	 * 
	 * @param t
	 *            the element to expand. After processing, this Element will still exist and be a valid Element, but possibly with
	 *            a different content, and possibly enclosed by an &lt;mtu&gt; element. In addition, &lt;t&gt; may have new
	 *            right-hand neighbors.
	 * @param expanded
	 *            an empty list into which the expanded Elements are placed if an expansion occurred. The list will remain empty
	 *            if no expansion was performed.
	 * @return true if this pattern is confident to have fully expanded this token, false if nothing could be done or more
	 *         expansion may be necessary. CompositeEP always returns false, in order to have other ExpansionPatterns look at the
	 *         components as well.
	 * @throws NullPointerException
	 *             if <code>t</code> or <code>expanded</code> is null.
	 * @throws DOMException
	 *             if <code>t</code> is not a token element.
	 * @throws IllegalArgumentException
	 *             if <code>expanded</code> is not empty.
	 */
	public boolean process(Element t, final List expanded) {
		if (t == null || expanded == null)
			throw new NullPointerException("Received null argument");
		if (!t.getTagName().equals(MaryXML.TOKEN))
			throw new DOMException(DOMException.INVALID_ACCESS_ERR, "Expected t element");
		if (!expanded.isEmpty())
			throw new IllegalArgumentException("Expected empty list, but list has " + expanded.size() + " elements.");
		// Only modify tokens for which no pronunciation is given:
		if (t.hasAttribute("ph") || t.hasAttribute("sounds_like")) {
			return false;
		}

		// /// First, some cleaning up of the token: /////
		String s = MaryDomUtils.tokenText(t);
		// remove trailing hyphens after letters:
		s = stripHyphen(t, s, reTrailingHyphen);
		// remove leading hyphens before letters:
		s = stripHyphen(t, s, reLeadingHyphen);

		// /// Then, see if we can split it: /////
		if (reLettersDigitsAndHyphens.matcher(s).matches()) {
			// OK, a hyphen between parts containing letters and/or digits.
			// In pseudo-composita, accent is on the first component:
			MaryDomUtils.encloseWithMTU(t, s, "first");
			StringTokenizer st = new StringTokenizer(s, "-");
			assert st.hasMoreTokens();
			splitIntoTokens(t, st, "", expanded);
		} else if (reOneLetterAndApostrophe.matcher(s).matches()) {
			// One-letter proclitics (c'X, d'X, l'X, ...) and qual'X.
			// The first component keeps its apostrophe ("c'"); apostrophes
			// between any later components are consumed as delimiters.
			MaryDomUtils.encloseWithMTU(t, s, "last-proclitics");
			StringTokenizer st = new StringTokenizer(s, "'");
			assert st.hasMoreTokens();
			splitIntoTokens(t, st, "'", expanded);
		} else if (reLettersAndDigits.matcher(s).matches()) {
			// Token consists only of letters and digits.
			// Split between letters and digits.
			// In pseudo-composita, accent is on the first component:
			MaryDomUtils.encloseWithMTU(t, s, "first");
			splitLettersAndDigits(t, s, expanded);
		} else if (s.equals("'s")) {
			// a standalone 's: simply pronounce it as [s].
			t.setAttribute("ph", "s");
			expanded.add(t);
		} else if (s.endsWith("'s")) {
			// Cases like "geht's": Simply have it pronounced like "gehts".
			// No iteration.
			t.setAttribute("sounds_like", s.substring(0, s.length() - 2));
			t.setAttribute("ph", "*s");
			expanded.add(t);
		} else if (ExpansionPattern.reSplitAtChars().matcher(s).find()
				&& (REPattern.letter.matcher(s).find() || REPattern.digit.matcher(s).find())) {
			// Split into parts, keeping each special char as one part
			// For special characters, accent is on the last component:
			MaryDomUtils.encloseWithMTU(t, s, "last");
			StringTokenizer st = new StringTokenizer(s, ExpansionPattern.getSplitAtChars(), true); // return delimiters
			splitIntoTokens(t, st, "", expanded);
		}

		// iterative call: give every component another chance to be split.
		if (expanded.size() > 0) {
			List newExpanded = process(expanded);
			expanded.clear();
			expanded.addAll(newExpanded);
		}

		// Never return true, in order to allow other ExpansionPatterns to
		// expand the components.
		return false;
	}

	/**
	 * Remove a hyphen adjacent to a letter if the given pattern finds one, and write the cleaned text back into the token.
	 * 
	 * @param t
	 *            the token whose text is updated when the pattern matches.
	 * @param s
	 *            the current token text.
	 * @param reHyphen
	 *            a pattern whose first group captures the letter to keep.
	 * @return the cleaned text, or <code>s</code> unchanged if the pattern does not match.
	 */
	private String stripHyphen(Element t, String s, Pattern reHyphen) {
		Matcher hyphenMatcher = reHyphen.matcher(s);
		if (!hyphenMatcher.find()) {
			return s;
		}
		// replaceFirst() resets the matcher itself, so it can be reused here.
		String cleaned = hyphenMatcher.replaceFirst("$1");
		MaryDomUtils.setTokenText(t, cleaned);
		return cleaned;
	}

	/**
	 * Distribute the parts delivered by a tokenizer over the token and newly appended right-hand neighbors.
	 * 
	 * @param t
	 *            the token receiving the first part.
	 * @param st
	 *            the tokenizer delivering the parts; must deliver at least one.
	 * @param firstSuffix
	 *            text appended to the first part only (may be empty).
	 * @param expanded
	 *            the list to which the token and every appended token are added, in order.
	 */
	private void splitIntoTokens(Element t, StringTokenizer st, String firstSuffix, List expanded) {
		MaryDomUtils.setTokenText(t, st.nextToken() + firstSuffix);
		expanded.add(t);
		Element current = t;
		while (st.hasMoreTokens()) {
			current = MaryDomUtils.appendToken(current, st.nextToken());
			expanded.add(current);
		}
	}

	/**
	 * Split a token text into alternating runs as matched by <code>REPattern.initialNonDigits</code> and
	 * <code>REPattern.initialDigits</code>, one token per run.
	 * 
	 * @param t
	 *            the token receiving the first run.
	 * @param s
	 *            the text to split.
	 * @param expanded
	 *            the list to which the token and every appended token are added, in order.
	 * @throws IllegalStateException
	 *             if neither pattern matches the remaining text.
	 */
	private void splitLettersAndDigits(Element t, String s, List expanded) {
		Element current = t;
		String rest = s;
		boolean isFirst = true;
		while (rest.length() > 0) {
			Matcher runMatcher = REPattern.initialNonDigits.matcher(rest);
			if (!runMatcher.find()) {
				runMatcher = REPattern.initialDigits.matcher(rest);
				if (!runMatcher.find()) {
					// Matcher.group() would throw the same exception type here,
					// only without saying which text was the problem.
					throw new IllegalStateException("Cannot split '" + rest + "' into letters and digits");
				}
			}
			// group() must be read before replaceFirst(), which resets the matcher.
			String part = runMatcher.group();
			rest = runMatcher.replaceFirst("");
			if (isFirst)
				MaryDomUtils.setTokenText(current, part);
			else
				current = MaryDomUtils.appendToken(current, part);
			expanded.add(current);
			isFirst = false;
		}
	}

	/**
	 * Delegates to {@link #match(String, int)}, which always throws.
	 */
	protected int canDealWith(String input, int typeCode) {
		return match(input, typeCode);
	}

	/**
	 * Not supported: CompositeEP does all its work in process().
	 * 
	 * @throws RuntimeException
	 *             always.
	 */
	protected int match(String input, int typeCode) {
		throw new RuntimeException("This method should not be called.");
	}

	/**
	 * Not supported: CompositeEP does all its work in process().
	 * 
	 * @throws RuntimeException
	 *             always.
	 */
	protected List expand(List tokens, String text, int typeCode) {
		throw new RuntimeException("This method should not be called.");
	}

}