/**
* Copyright 2002 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.language.it.preprocess;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import marytts.datatypes.MaryXML;
import marytts.util.MaryUtils;
import marytts.util.dom.MaryDomUtils;
import org.apache.log4j.Logger;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
/**
* For preprocessing, this class serves as a base class for the different types of possible expansion patterns. For simplicity's
* sake, it is implemented in a "greedy" way: as soon as an expansion pattern matches, it is applied, i.e. the matched tokens
* are expanded according to the expansion rules in the pattern.
*
* @author Marc Schröder
*/
public abstract class ExpansionPattern {
// Shared instances of the expansion patterns. The ones that are not commented out are created and registered in the
// static initializer of this class; the commented-out declarations belong to pattern types whose registration code
// in the static initializer is commented out as well.
// protected static MultiWordEP multiword;
protected static CompositeEP composite;
// protected static NetEP net;
// protected static DateEP date;
// protected static TimeEP time;
// protected static DurationEP duration;
// protected static CurrencyEP currency;
// protected static MeasureEP measure;
// protected static TelephoneEP telephone;
protected static NumberEP number;
// protected static AbbrevEP abbrev;
protected static SpecialCharEP specialChar;
// All registered patterns, in the order in which the static initializer added them.
private static List expansionPatterns;
// Maps each type name reported by a registered pattern's knownTypes() to that pattern.
private static Map patternTable;
/**
 * Creates and registers the expansion patterns. The registration order is most important: if several patterns could
 * match a given input, the one registered first is the one that gets applied. Frequent and well-identifiable cases
 * therefore belong at the front, while exotic or fall-back cases (like simple integer expansion) belong at the end.
 */
static {
    expansionPatterns = new ArrayList();
    patternTable = new HashMap();

    // Disabled; would be registered first, in this order: MultiWordEP (multiword), NetEP (net).

    composite = new CompositeEP();
    registerPattern(composite);

    // Disabled; would be registered here, in this order: DateEP (date), TimeEP (time), DurationEP (duration),
    // CurrencyEP (currency), MeasureEP (measure), TelephoneEP (telephone), AbbrevEP (abbrev).
    // Putting duration after time means that duration patterns, which have the same form as a subset of time
    // patterns, will actually never match without being explicitly requested through <say-as> annotation.

    number = new NumberEP();
    registerPattern(number);

    specialChar = new SpecialCharEP();
    registerPattern(specialChar);
}

/**
 * Appends a pattern to the ordered pattern list and enters it into the lookup table under each of its known types.
 *
 * @param pattern
 *            the pattern instance to register
 */
private static void registerPattern(ExpansionPattern pattern) {
    expansionPatterns.add(pattern);
    for (Object type : pattern.knownTypes()) {
        patternTable.put(type, pattern);
    }
}
/**
* Returns all registered expansion patterns in registration order. The returned object is the internal list itself, not a
* copy.
*
* @return the list of registered patterns
*/
public static List allPatterns() {
return expansionPatterns;
}
/**
* Looks up the expansion pattern registered for a type name.
*
* @param typeString
*            a type name, i.e. one of the entries of a registered pattern's <code>knownTypes()</code>
* @return the pattern registered for <code>typeString</code>, or <code>null</code> if the lookup table has no entry for it
*/
public static ExpansionPattern getPattern(String typeString) {
return (ExpansionPattern) patternTable.get(typeString);
}
/**
* A regular expression matching the characters at which a token should be split into parts before any preprocessing patterns
* are applied. The value is obtained from the shared <code>specialChar</code> pattern.
*
* @return the result of <code>specialChar.getRESplitAtChars()</code>
* @see SpecialCharEP#getRESplitAtChars
*/
public static Pattern reSplitAtChars() {
return specialChar.getRESplitAtChars();
}
/**
* A string containing the characters at which a token should be split into parts before any preprocessing patterns are
* applied. The value is obtained from the shared <code>specialChar</code> pattern.
*
* @return the result of <code>specialChar.splitAtChars()</code>
* @see SpecialCharEP#splitAtChars
*/
public static String getSplitAtChars() {
return specialChar.splitAtChars();
}
// Logger used by this class for its debug and info messages; obtained under the name "ExpansionPattern".
private static Logger logger = MaryUtils.getLogger("ExpansionPattern");
/**
* Creates an expansion pattern. This base constructor performs no work of its own.
*/
public ExpansionPattern() {
}
/**
* Whether patterns of this type can be composed of several tokens. When this returns true,
* <code>process(Element, List)</code> looks ahead over the following sibling tokens and tries to match their concatenated
* text; when it returns false, only the single start token is considered.
*
* @return <code>true</code> in this base implementation
*/
protected boolean allowMultipleTokens() {
return true;
}
/**
* Inform whether this module performs a full expansion of the input, or whether other patterns should be applied after this
* one. <code>process(Element, List)</code> only returns true if an expansion took place and this method returns true.
*
* @return <code>true</code> in this base implementation
*/
protected boolean doesFullExpansion() {
return true;
}
/**
* Returns the types known by this ExpansionPattern. These are possible values of the <code>type</code> attribute to the
* <code>say-as</code> element, as defined in MaryXML.dtd. Each subclass needs to override this to return something
* meaningful.
* <p>
* The position of a type in this list is the integer type code passed to and returned from <code>match(String, int)</code>,
* <code>canDealWith(String, int)</code> and <code>expand(List, String, int)</code>; position 0 is used by
* <code>process(Element, List)</code> as the most general type.
*
* @return the list of known type names
*/
public abstract List knownTypes();
/**
* Returns the regular expression object matching any of the chars occurring in the pattern. Each subclass needs to override
* this to return something meaningful. <code>isCandidate(Element)</code> uses it with <code>find()</code> on a token's text
* to preselect tokens worth a closer look.
*
* @return the regular expression matching the characters that can occur in this pattern
*/
public abstract Pattern reMatchingChars();
/**
 * Try to match this pattern starting at token <code>t</code>. If successful, replace the matched tokens with their
 * expanded form.
 * <p>
 * If {@link #allowMultipleTokens()} is true, the longest run of candidate sibling tokens starting at <code>t</code> is
 * collected first; the concatenated text of that run is then matched, dropping the last token of the run after every
 * failed attempt until a match is found or no token is left. Otherwise only the text of <code>t</code> itself is
 * matched. Tokens carrying a <code>ph</code> or <code>sounds_like</code> attribute are never candidates.
 *
 * @param t
 *            the element to expand. After processing, this Element will still exist and be a valid Element, but
 *            possibly with a different content, and possibly enclosed by an {@code <mtu>} element. In addition,
 *            {@code <t>} may have new right-hand neighbors.
 * @param expanded
 *            an empty list into which the expanded Elements are placed if an expansion occurred. The list will remain
 *            empty if no expansion was performed. Elements placed in the list are not guaranteed to be only t
 *            elements, but may be elements enclosing the expanded t elements, such as mtu elements, as well as non-t
 *            empty elements (such as boundary elements). If the list is non-empty, it is guaranteed to contain (either
 *            directly or as descendants of the list items) at least one t element.
 * @return true if this pattern is confident to have fully expanded this list of tokens, false if nothing could be done
 *         or more expansion may be necessary.
 * @throws NullPointerException
 *             if <code>t</code> or <code>expanded</code> is null
 * @throws DOMException
 *             if <code>t</code> is not a token element
 * @throws IllegalArgumentException
 *             if <code>expanded</code> is not empty
 */
public boolean process(Element t, final List expanded) {
    if (t == null || expanded == null) {
        throw new NullPointerException("Received null argument");
    }
    if (!t.getTagName().equals(MaryXML.TOKEN)) {
        throw new DOMException(DOMException.INVALID_ACCESS_ERR, "Expected t element");
    }
    if (!expanded.isEmpty()) {
        throw new IllegalArgumentException("Expected empty list, but list has " + expanded.size() + " elements.");
    }
    StringBuilder sb = new StringBuilder();
    int matchedType = -1;
    List candidates = new ArrayList();
    if (allowMultipleTokens()) {
        // Do a look-forward preselection in order to find possible candidates for tokens forming a pattern with t:
        // they need to be siblings and contain at least one of the characters occurring in the pattern (as
        // represented by the regular expression reMatchingChars).
        Element n = t;
        while (n != null && n.getTagName().equals(MaryXML.TOKEN) && !n.hasAttribute("ph")
                && !n.hasAttribute("sounds_like") && isCandidate(n)) {
            candidates.add(n);
            n = MaryDomUtils.getNextSiblingElement(n);
        }
        if (candidates.isEmpty()) { // t itself is not a candidate
            return false; // quick exit for non-candidates
        }
        // Now candidates contains the list of tokens that are worth looking at more closely. Try the longest run
        // first and shorten it from the right until something matches.
        while (!candidates.isEmpty()) {
            sb.setLength(0);
            for (Iterator it = candidates.iterator(); it.hasNext();) {
                sb.append(MaryDomUtils.tokenText((Element) it.next()));
            }
            matchedType = match(sb.toString(), 0); // 0 == most general type
            if (matchedType != -1) {
                break; // OK, found a match
            }
            candidates.remove(candidates.size() - 1); // remove last in list
        }
    } else { // only a single token allowed
        if (!t.hasAttribute("ph") && !t.hasAttribute("sounds_like") && isCandidate(t)) {
            sb.append(MaryDomUtils.tokenText(t));
            matchedType = match(sb.toString(), 0); // 0 == most general type
            candidates.add(t);
        }
    }
    if (matchedType == -1) { // no match found
        return false;
    }
    String matchedText = sb.toString();
    if (logger.isDebugEnabled()) { // avoid building the message when debug output is switched off
        logger.debug("Found match, type " + knownTypes().get(matchedType) + ": " + matchedText + " ("
                + candidates.size() + " tokens)");
    }
    // An expand() implementation may hand back null (makeNewTokens() reports failure that way); treat that like an
    // empty result instead of failing with a NullPointerException in addAll().
    List result = expand(candidates, matchedText, matchedType);
    if (result != null) {
        expanded.addAll(result);
    }
    if (expanded.isEmpty() && !knownTypes().get(matchedType).equals("specialChar")) {
        logger.info("Could match, but not expand string \"" + matchedText + "\" as type "
                + knownTypes().get(matchedType));
    }
    return !expanded.isEmpty() && doesFullExpansion();
}
/**
* Preselection test used by <code>process(Element, List)</code>: a token is a candidate if its text contains at least one
* match of <code>reMatchingChars()</code>.
*
* @param t
*            the token element to test
* @return true if <code>reMatchingChars()</code> finds a match anywhere in the token's text
*/
protected boolean isCandidate(Element t) {
return reMatchingChars().matcher(MaryDomUtils.tokenText(t)).find();
}
/**
 * Try to match and expand the entirety of tokens enclosed by the say-as tag <code>sayas</code>. The <code>type</code>
 * of data to expand is given. If {@link #canDealWith(String, int)} accepts the concatenated text of the enclosed tokens
 * for that type, the tokens are expanded; otherwise, or if the expansion yields nothing, an info message is logged and
 * nothing else happens.
 *
 * @param sayas
 *            the say-as element whose descendant token elements are to be expanded
 * @param typeString
 *            the requested type name; it is translated into a type code via its position in
 *            <code>knownTypes()</code> (-1 if it is not in that list)
 * @throws DOMException
 *             if <code>sayas</code>'s tag name is not "say-as"
 */
public void match(Element sayas, String typeString) throws DOMException {
    if (!sayas.getTagName().equals(MaryXML.SAYAS)) {
        throw new DOMException(DOMException.INVALID_ACCESS_ERR, "Expected " + MaryXML.SAYAS + " element, got "
                + sayas.getTagName());
    }
    List tokens = MaryDomUtils.getNodeListAsList(sayas.getElementsByTagName(MaryXML.TOKEN));
    StringBuilder sb = new StringBuilder();
    for (Object token : tokens) {
        sb.append(MaryDomUtils.tokenText((Element) token));
    }
    String text = sb.toString();
    int type = knownTypes().indexOf(typeString);
    int expandType = canDealWith(text, type);
    if (expandType == -1) { // cannot expand according to sayas wish
        logger.info("Cannot expand string \"" + text + "\" as requested type \"" + typeString + "\"");
        return;
    }
    // OK, we can expand this. A null result (makeNewTokens() reports failure that way) counts as a failed expansion
    // rather than causing a NullPointerException.
    List expanded = expand(tokens, text, expandType);
    if (expanded == null || expanded.isEmpty()) {
        logger.info("Failure expanding string \"" + text + "\" as type \"" + knownTypes().get(expandType) + "\"");
    }
}
/**
* Decide whether we can expand a string according to type <code>typeCode</code>. This is important in cases where a
* particular expansion is requested via a <code>say-as</code> element. This method is abstract, so every subclass has to
* decide for itself; a subclass may simply accept whatever <code>match(String, int)</code> would match (see the note below
* the declaration), or apply less strict requirements.
*
* @param input
*            the string to be expanded
* @param typeCode
*            the index in <code>knownTypes()</code> of the requested type; <code>match(Element, String)</code> passes the
*            result of <code>knownTypes().indexOf(typeString)</code>, which is -1 for a type name that is not in the list
* @return the type code as which <code>input</code> can be expanded, or -1 if expansion is not possible
*/
protected abstract int canDealWith(String input, int typeCode);
// formerly: {return match(input, typeCode); }
/**
* Subclasses do their matching in this method.
*
* @param input
*            is the String to be matched,
* @param typeCode
*            is the index in <code>knownTypes</code> to match with.
* @return type actually matched on successful match with this type (if <code>typeCode</code> is a general type (
*         <code>typeCode == 0</code>), it may have matched with a more specific subtype). On failure, <code>-1</code> is
*         returned.
*/
protected abstract int match(String input, int typeCode);
/**
* Subclasses do their expansion in this method.
*
* @param tokens
*            is a list of token Elements to be replaced with their expanded form. The expanded forms are inserted into the
*            DOM tree at the same positions as the tokens in List <code>tokens</code>. If there are more new tokens than old
*            tokens, the rest are inserted as siblings at the position of the last old token.
* @param text
*            is the String to be expanded,
* @param typeCode
*            is the index in <code>knownTypes</code> this string has matched with before.
* @return the list of expanded (=new) tokens. The callers in this class treat an empty list as "nothing was expanded".
*/
protected abstract List expand(List tokens, String text, int typeCode);
/**
* The default way to create new token DOM elements from whitespace-separated tokens in a string. String tokens have the form<br>
* <code>graph</code> or <code>graph[phon]</code>, where the optional <code>phon</code>, if present, is set as value to the
* <code>ph</code> attribute of the <code>t</code> element.
* <p>
* All expansion patterns that do not require any special attribute settings should create their new tokens using this method.
* <p>
* Returns a list of token elements created from Document <code>doc</code>, but not yet attached in the tree.
*
* @param doc
*            the document used to create the new elements
* @param newText
*            the whitespace-separated text to turn into tokens
* @return the result of <code>makeNewTokens(doc, newText, false, null)</code>: the list of new token elements, or
*         <code>null</code> if <code>newText</code> is <code>null</code> or empty
*/
protected List makeNewTokens(Document doc, String newText) {
return makeNewTokens(doc, newText, false, null);
}
/**
* Creates new token elements like <code>makeNewTokens(Document, String)</code>, optionally enclosing them in a single
* <code>mtu</code> element. Accents are not forced.
*
* @param doc
*            the document used to create the new elements
* @param newText
*            the whitespace-separated text to turn into tokens
* @param createMtu
*            whether to enclose the new tokens in one new mtu element
* @param origText
*            the value for the <code>orig</code> attribute of the mtu element; only used if <code>createMtu</code> is true
* @return the result of <code>makeNewTokens(doc, newText, createMtu, origText, false)</code>
*/
protected List makeNewTokens(Document doc, String newText, boolean createMtu, String origText) {
return makeNewTokens(doc, newText, createMtu, origText, false);
}
/**
 * Creates one new token element per whitespace-separated word of <code>newText</code>. A word of the form
 * <code>graph[phon]</code> yields a token with text <code>graph</code> and a <code>ph</code> attribute holding
 * <code>phon</code>. The created elements belong to <code>doc</code> but are not attached to the tree.
 *
 * @param doc
 *            the document used to create the new elements
 * @param newText
 *            the whitespace-separated text to turn into tokens
 * @param createMtu
 *            if true, the tokens are appended to one new mtu element (with <code>accent="last"</code>), and that
 *            element is the only entry of the returned list
 * @param origText
 *            the value for the <code>orig</code> attribute of the mtu element; only used if <code>createMtu</code> is
 *            true
 * @param forceAccents
 *            if true, every new token gets the attribute <code>accent="unknown"</code>
 * @return the list of new tokens, or a one-element list holding the enclosing mtu element; <code>null</code> if
 *         <code>newText</code> is <code>null</code> or empty
 */
protected List makeNewTokens(Document doc, String newText, boolean createMtu, String origText, boolean forceAccents) {
    boolean unusable = newText == null || newText.length() == 0;
    if (unusable) {
        return null; // failure
    }
    // An optional pronunciation is given in square brackets.
    final Pattern pronunciation = Pattern.compile("\\[(.*)\\]");
    final StringTokenizer words = new StringTokenizer(newText);
    final ArrayList created = new ArrayList();
    while (words.hasMoreTokens()) {
        String graphemes = words.nextToken();
        Element token = MaryXML.createElement(doc, MaryXML.TOKEN);
        Matcher bracketed = pronunciation.matcher(graphemes);
        if (bracketed.find()) {
            String phonemes = bracketed.group(1);
            // replaceFirst() resets the matcher, so this strips the first bracketed part from the word
            graphemes = bracketed.replaceFirst("");
            token.setAttribute("ph", phonemes);
        }
        MaryDomUtils.setTokenText(token, graphemes);
        if (forceAccents) {
            token.setAttribute("accent", "unknown");
        }
        created.add(token);
    }
    if (!createMtu) {
        return created;
    }
    // Wrap all new tokens in a single mtu element and return just that element.
    Element wrapper = MaryXML.createElement(doc, MaryXML.MTU);
    wrapper.setAttribute("orig", origText);
    wrapper.setAttribute("accent", "last");
    for (Object token : created) {
        wrapper.appendChild((Element) token);
    }
    List wrapped = new ArrayList();
    wrapped.add(wrapper);
    return wrapped;
}
/**
* Replaces the elements in <code>oldTokens</code> by the elements in <code>newTokens</code> in the DOM tree. The n-th new
* element is inserted in front of the n-th old element; if there are more new than old elements, the surplus new elements are
* inserted in front of the last old element, in order. Afterwards all old elements have been removed from their parents.
* <p>
* In a second pass, a new <code>mtu</code> element that has ended up as the only child element of another <code>mtu</code>
* element with the same <code>orig</code> attribute value takes the place of that enclosing element.
*
* @param oldTokens
*            the elements to be replaced; must be non-null and non-empty
* @param newTokens
*            the elements to insert; must be non-null and non-empty
* @throws NullPointerException
*             if either list is <code>null</code> or empty
*/
protected void replaceTokens(List oldTokens, List newTokens) {
if (oldTokens == null || oldTokens.isEmpty() || newTokens == null || newTokens.isEmpty()) {
// unusable input
throw new NullPointerException("Have received null or empty argument.");
}
// oldT is the insertion anchor: the current old token, or the last old token once the old list is used up.
Element oldT = null;
Iterator itOld = oldTokens.iterator();
Iterator itNew = newTokens.iterator();
while (itNew.hasNext()) {
Element newT = (Element) itNew.next();
// Retrieve old token element:
if (itOld.hasNext()) // this is true at least once
oldT = (Element) itOld.next();
oldT.getParentNode().insertBefore(newT, oldT);
// The last old token is kept in the tree for now, because further new tokens may still need it as anchor.
if (itOld.hasNext()) // only remove this old t if there is another one
oldT.getParentNode().removeChild(oldT);
}
if (!itOld.hasNext()) { // only need to remove oldT
// All old tokens were visited; only the last one (kept as anchor above) is still attached.
oldT.getParentNode().removeChild(oldT);
} else {
// there were more old than new tokens
// (the old tokens visited so far were already removed inside the loop above)
while (itOld.hasNext()) {
oldT = (Element) itOld.next();
oldT.getParentNode().removeChild(oldT);
}
}
// Now go through the new tokens again and see if there are any
// useless mtu combinations. If so, the "inner" one wins.
itNew = newTokens.iterator();
while (itNew.hasNext()) {
Element mtu = (Element) itNew.next();
if (!mtu.getTagName().equals(MaryXML.MTU))
continue;
Element parent = (Element) mtu.getParentNode();
if (!parent.getTagName().equals(MaryXML.MTU))
continue;
// OK, got an mtu inside an mtu
if (MaryDomUtils.getPreviousSiblingElement(mtu) != null || MaryDomUtils.getNextSiblingElement(mtu) != null)
continue;
if (!parent.getAttribute("orig").equals(mtu.getAttribute("orig")))
continue;
// OK, mtu and parent are mtu tags, there is no other element in parent
// than mtu, and both have the same orig value
// => delete parent
Element grandParent = (Element) parent.getParentNode();
// Move the inner mtu up to the parent's position, then drop the now redundant parent.
grandParent.insertBefore(mtu, parent);
grandParent.removeChild(parent);
}
}
/**
 * Enclose token in a {@code <prosody rate="...">} tag in order to slow the spelling down, and in a
 * {@code <phonology>} tag in order to enforce precise pronunciation.
 * <p>
 * If the element's parent already is a phonology element, that element is reused; likewise a prosody element directly
 * enclosing that phonology element is reused. Whatever is missing is created: either a new prosody element around the
 * existing phonology element, or a new prosody/phonology pair around <code>e</code>. In all cases the prosody
 * element's <code>rate</code> is set to <code>-20%</code> and the phonology element's <code>precision</code> to
 * <code>precise</code>, overwriting any previous values of these attributes.
 *
 * @param e
 *            the element to slow down; it must be attached to a parent node
 */
protected void slowDown(Element e) {
    Document doc = e.getOwnerDocument();
    Element prosody;
    Element phonol;
    if (e.getParentNode().getNodeName().equals(MaryXML.PHONOLOGY)) {
        // There is already a phonology tag enclosing us.
        phonol = (Element) e.getParentNode();
        if (phonol.getParentNode().getNodeName().equals(MaryXML.PROSODY)) {
            // And also a prosody tag enclosing us.
            prosody = (Element) phonol.getParentNode();
        } else {
            // The phonology tag has no prosody tag around it yet: wrap it in a new one. Without this branch,
            // prosody stayed null and the setAttribute() call below failed with a NullPointerException.
            prosody = MaryXML.createElement(doc, MaryXML.PROSODY);
            phonol.getParentNode().insertBefore(prosody, phonol);
            prosody.appendChild(phonol); // moves phonol (and e with it) into the new prosody element
        }
    } else {
        // Neither tag exists: put a new prosody/phonology pair where e is, then move e into it.
        phonol = MaryXML.createElement(doc, MaryXML.PHONOLOGY);
        prosody = MaryXML.createElement(doc, MaryXML.PROSODY);
        prosody.appendChild(phonol);
        e.getParentNode().insertBefore(prosody, e);
        phonol.appendChild(e);
    }
    prosody.setAttribute("rate", "-20%");
    phonol.setAttribute("precision", "precise");
}
/**
 * Slows down a range of nodes. The range from <code>first</code> to <code>last</code> is enclosed in a new phonology
 * element with <code>precision="precise"</code> (created by <code>MaryDomUtils.encloseNodesWithNewElement</code>), and
 * that phonology element is in turn enclosed in a new prosody element with <code>rate="-20%"</code>.
 *
 * @param first
 *            the first element of the range to enclose
 * @param last
 *            the last element of the range to enclose
 */
protected void slowDown(Element first, Element last) {
    Element precise = MaryDomUtils.encloseNodesWithNewElement(first, last, MaryXML.PHONOLOGY);
    precise.setAttribute("precision", "precise");

    Element slower = MaryXML.createElement(precise.getOwnerDocument(), MaryXML.PROSODY);
    slower.setAttribute("rate", "-20%");

    // Put the prosody element at the phonology element's position, then move the phonology element into it.
    precise.getParentNode().insertBefore(slower, precise);
    slower.appendChild(precise);
}
}