/**
* Copyright 2002 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.language.it.preprocess;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import marytts.datatypes.MaryXML;
import marytts.util.MaryUtils;
import marytts.util.dom.MaryDomUtils;
import org.apache.log4j.Logger;
import org.w3c.dom.DOMException;
import org.w3c.dom.Element;
/**
* An expansion pattern implementation for composite patterns. Words consisting of digits and letters and pseudo-composites with a
* hyphen are split into their components. These will then need to be looked at by the other pattern expanders. CompositeEP
* directly overrides process(), and does not care about the usual subclass methods isCandidate(), match(), and expand().
*
* @author Marc Schröder
*/
public class CompositeEP extends ExpansionPattern {
// Domain-specific primitives:
Pattern reTrailingHyphen = Pattern.compile("([A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú])-$");
Pattern reLeadingHyphen = Pattern.compile("^-([A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú])");
// LettersDigitsAndHyphens consists of parts separated by hyphens,
// each part containing at least one digit or letter.
Pattern reLettersDigitsAndHyphens = Pattern
.compile("([^-]*[A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú0-9][^-]*)(-[^-]*[A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú0-9][^-]*)+");
Pattern reLettersDigitsAndApostrophe = Pattern
.compile("([^']*[A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú0-9][^']*)('[^']*[A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú0-9][^']*)+");
// this can be used for dell' un' etc..
/*
* Pattern reLettersAndApostrophe =
* //Pattern.compile("([^']*[A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú][^']*)('[^']*[A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú][^']*)+");
* Pattern.compile("([A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú]+)('([A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú])+)+");
*/
// TODO: FABIO Check if better with new REPattern...
// This is used for c'X t'X d'X ()
Pattern reOneLetterAndApostrophe = Pattern
.compile("([^']*[^EIOUYaeiouyÀÁÈÉÌÍÒÓÄÖÜËÏäöüëïÙÚàáèéìíòóùú])('[hH]?([AEIOUYaeiouyÀÁÈÉÌÍÒÓÄÖÜËÏäöüëïÙÚàáèéìíòóùú][^']*)+)+");
// Both letters and digits, in any order:
Pattern reLettersAndDigits = Pattern
.compile("(?:(?:[A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú]+[0-9]+)|(?:[0-9]+[A-ZÀÁÈÉÌÍÒÓÙÚa-zàáèéìíòóùú]+))[A-ZÄÖÜa-zàáèéìíòóùú0-9]*");
public List knownTypes() {
return new ArrayList();
}
private final Pattern reMatchingChars = Pattern.compile("");
public Pattern reMatchingChars() {
return reMatchingChars;
}
/**
* Every subclass has its own logger. The important point is that if several threads are accessing the variable at the same
* time, the logger needs to be thread-safe or it will produce rubbish.
*/
private Logger logger = MaryUtils.getLogger("CompositeEP");
public CompositeEP() {
super();
}
/**
* Process and expand a list of tokens.
*
* @param tokens
* the list of tokens to be expanded one after the other.
* @return a list of expanded forms of all the tokens, i.e. the concatenation of the expanded form (or unexpanded form if no
* expansion is possible) of all the tokens.
*/
private List process(List tokens) {
List result = new ArrayList();
for (Iterator it = tokens.iterator(); it.hasNext();) {
Element t = (Element) it.next();
if (!t.getTagName().equals(MaryXML.TOKEN))
throw new DOMException(DOMException.INVALID_ACCESS_ERR, "Expected t element");
List expanded = new ArrayList();
process(t, expanded);
if (expanded.isEmpty()) // no expansion
result.add(t);
else
result.addAll(expanded);
}
return result;
}
/**
* Process this token. The CompositeEP works as a splitter of single tokens, iteratively expanding a token into its
* components.
*
* @param t
* the element to expand. After processing, this Element will still exist and be a valid Element, but possibly with
* a different content, and possibly enclosed by an <mtu> element. In addition, <t> may have new
* right-hand neighbors.
* @param expanded
* an empty list into which the expanded Elements are placed if an expansion occurred. The list will remain empty
* if no expansion was performed.
* @return true if this pattern is confident to have fully expanded this token, false if nothing could be done or more
* expansion may be necessary. CompositeEP always returns false, in order to have other ExpansionPatterns look at the
* components as well.
*/
public boolean process(Element t, final List expanded) {
if (t == null || expanded == null)
throw new NullPointerException("Received null argument");
if (!t.getTagName().equals(MaryXML.TOKEN))
throw new DOMException(DOMException.INVALID_ACCESS_ERR, "Expected t element");
if (!expanded.isEmpty())
throw new IllegalArgumentException("Expected empty list, but list has " + expanded.size() + " elements.");
// Only modify tokens for which no pronunciation is given:
if (t.hasAttribute("ph") || t.hasAttribute("sounds_like")) {
return false;
}
// /// First, some cleaning up of the token: /////
String s = MaryDomUtils.tokenText(t);
// System.err.println("Trying to split " + s);
if (reTrailingHyphen.matcher(s).find()) {
// System.err.println("reTrailingHyphen");
// remove trailing hyphens after letters:
s = reTrailingHyphen.matcher(s).replaceFirst("$1");
MaryDomUtils.setTokenText(t, s);
}
if (reLeadingHyphen.matcher(s).find()) {
// System.err.println("reLeadingHyphen");
// remove leading hyphens before letters:
s = reLeadingHyphen.matcher(s).replaceFirst("$1");
MaryDomUtils.setTokenText(t, s);
}
// /// Then, see if we can split it: /////
// System.err.println("Then, see if we can split it: /////");
if (reLettersDigitsAndHyphens.matcher(s).matches()) {
// OK, a hyphen between parts containing letters and/or digits.
// In pseudo-composita, accent is on the first component:
Element mtu = MaryDomUtils.encloseWithMTU(t, s, "first");
StringTokenizer st = new StringTokenizer(s, "-");
assert st.hasMoreTokens();
MaryDomUtils.setTokenText(t, st.nextToken());
expanded.add(t);
while (st.hasMoreTokens()) {
t = MaryDomUtils.appendToken(t, st.nextToken());
expanded.add(t);
}
}
// This is for one letter proclitics (c'X, d'X, ...) and qual'X
else if (reOneLetterAndApostrophe.matcher(s).matches()) {
// System.err.println("one letter and apostrophe");
// OK, a hyphen between parts containing letters and/or digits.
// In pseudo-composita, accent is on the first component:
// c'X l'X d'X c' is the first-proclitics part
Element mtu = MaryDomUtils.encloseWithMTU(t, s, "last-proclitics");
StringTokenizer st = new StringTokenizer(s, "'");
assert st.hasMoreTokens();
MaryDomUtils.setTokenText(t, st.nextToken() + "'");
expanded.add(t);
while (st.hasMoreTokens()) {
t = MaryDomUtils.appendToken(t, st.nextToken());
expanded.add(t);
}
} else if (reLettersAndDigits.matcher(s).matches()) {
// Token consists only of letters and digits.
// Split between letters and digits.
// In pseudo-composita, accent is on the first component:
Element mtu = MaryDomUtils.encloseWithMTU(t, s, "first");
String s1 = s;
boolean isFirst = true;
while (s1.length() > 0) {
String part;
Matcher reMatcher = REPattern.initialNonDigits.matcher(s1);
if (reMatcher.find()) {
part = reMatcher.group();
s1 = reMatcher.replaceFirst("");
} else {
reMatcher = REPattern.initialDigits.matcher(s1);
reMatcher.find();
part = reMatcher.group();
s1 = reMatcher.replaceFirst("");
}
if (isFirst)
MaryDomUtils.setTokenText(t, part);
else
t = MaryDomUtils.appendToken(t, part);
expanded.add(t);
isFirst = false;
}
} else if (s.equals("'s")) {
// a standalone 's: simply pronounce it as [s].
t.setAttribute("ph", "s");
expanded.add(t);
} else if (s.endsWith("'s")) {
// Cases like "geht's": Simply have it pronounced like "gehts".
// No iteration.
t.setAttribute("sounds_like", s.substring(0, s.length() - 2));
t.setAttribute("ph", "*s");
expanded.add(t);
} else if (ExpansionPattern.reSplitAtChars().matcher(s).find()
&& (REPattern.letter.matcher(s).find() || REPattern.digit.matcher(s).find())) {
// Split into parts, keeping each special char as one part
// For special characters, accent is on the last component:
Element mtu = MaryDomUtils.encloseWithMTU(t, s, "last");
StringTokenizer st = new StringTokenizer(s, ExpansionPattern.getSplitAtChars(), true); // return delimiters
MaryDomUtils.setTokenText(t, st.nextToken());
expanded.add(t);
while (st.hasMoreTokens()) {
t = MaryDomUtils.appendToken(t, st.nextToken());
expanded.add(t);
}
}
// iterative call:
if (expanded.size() > 0) {
List newExpanded = process(expanded);
expanded.clear();
expanded.addAll(newExpanded);
}
// Never return true, in order to allow other ExpansionPatterns to
// expand the components.
return false;
}
protected int canDealWith(String input, int typeCode) {
return match(input, typeCode);
}
protected int match(String input, int typeCode) {
throw new RuntimeException("This method should not be called.");
}
protected List expand(List tokens, String text, int typeCode) {
throw new RuntimeException("This method should not be called.");
}
}