/** * Copyright 2002 DFKI GmbH. * All Rights Reserved. Use is subject to license terms. * * This file is part of MARY TTS. * * MARY TTS is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, version 3 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package marytts.language.it.preprocess; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.StringTokenizer; import java.util.regex.Matcher; import java.util.regex.Pattern; import marytts.datatypes.MaryXML; import marytts.util.MaryUtils; import marytts.util.dom.MaryDomUtils; import org.apache.log4j.Logger; import org.w3c.dom.DOMException; import org.w3c.dom.Document; import org.w3c.dom.Element; /** * For preprocessing, serve as a base class for the different types of possible expansion patterns. For simplicity's sake, it is * implemented in a "greedy" way: As soon as an expansion pattern matches, it is applied, i.e. the matched tokens are expanded * according to the expansion rules in the pattern. * * @author Marc Schröder */ public abstract class ExpansionPattern { // protected static MultiWordEP multiword; protected static CompositeEP composite; // protected static NetEP net; // protected static DateEP date; // protected static TimeEP time; // protected static DurationEP duration; // protected static CurrencyEP currency; // protected static MeasureEP measure; // protected static TelephoneEP telephone; protected static NumberEP number; // protected static AbbrevEP abbrev; protected static SpecialCharEP specialChar; private static List expansionPatterns; private static Map patternTable; /** * Initialize the various patterns. Notice that the order in which they are added to List expansionPatterns is most important: * If several patterns potentially would match a given input, the one first found in the list will be applied. Therefore, * frequent and well-identifiable cases should come first, while exotic or fall-back cases (like simple integer expansion) * should come last in the list. */ static { expansionPatterns = new ArrayList(); patternTable = new HashMap(); Iterator it; /* * multiword = new MultiWordEP(); expansionPatterns.add(multiword); for (it = multiword.knownTypes().iterator(); * it.hasNext();) patternTable.put(it.next(), multiword); * * net = new NetEP(); expansionPatterns.add(net); for (it = net.knownTypes().iterator(); it.hasNext();) * patternTable.put(it.next(), net); */ composite = new CompositeEP(); expansionPatterns.add(composite); for (it = composite.knownTypes().iterator(); it.hasNext();) patternTable.put(it.next(), composite); /* * date = new DateEP(); expansionPatterns.add(date); for (it = date.knownTypes().iterator(); it.hasNext();) * patternTable.put(it.next(), date); time = new TimeEP(); expansionPatterns.add(time); for (it = * time.knownTypes().iterator(); it.hasNext();) patternTable.put(it.next(), time); */ // Putting duration after time means that duration patterns, // which have the same form as a subset of time patterns, // will actually never match without being explicitly requested // through <say-as> annotation. /* * duration = new DurationEP(); expansionPatterns.add(duration); for (it = duration.knownTypes().iterator(); * it.hasNext();) patternTable.put(it.next(), duration); currency = new CurrencyEP(); expansionPatterns.add(currency); for * (it = currency.knownTypes().iterator(); it.hasNext();) patternTable.put(it.next(), currency); measure = new * MeasureEP(); expansionPatterns.add(measure); for (it = measure.knownTypes().iterator(); it.hasNext();) * patternTable.put(it.next(), measure); telephone = new TelephoneEP(); expansionPatterns.add(telephone); for (it = * telephone.knownTypes().iterator(); it.hasNext();) patternTable.put(it.next(), telephone); abbrev = new AbbrevEP(); * expansionPatterns.add(abbrev); for (it = abbrev.knownTypes().iterator(); it.hasNext();) patternTable.put(it.next(), * abbrev); */ number = new NumberEP(); expansionPatterns.add(number); for (it = number.knownTypes().iterator(); it.hasNext();) patternTable.put(it.next(), number); specialChar = new SpecialCharEP(); expansionPatterns.add(specialChar); for (it = specialChar.knownTypes().iterator(); it.hasNext();) patternTable.put(it.next(), specialChar); } public static List allPatterns() { return expansionPatterns; } public static ExpansionPattern getPattern(String typeString) { return (ExpansionPattern) patternTable.get(typeString); } /** * A regular expression matching the characters at which a token should be split into parts before any preprocessing patterns * are applied. * * @return return specialChar.getRESplitAtChars * @see SpecialCharEP#getRESplitAtChars */ public static Pattern reSplitAtChars() { return specialChar.getRESplitAtChars(); } /** * A string containing the characters at which a token should be split into parts before any preprocessing patterns are * applied. * * @return specialChar.splitAtChars * @see SpecialCharEP#splitAtChars */ public static String getSplitAtChars() { return specialChar.splitAtChars(); } private static Logger logger = MaryUtils.getLogger("ExpansionPattern"); public ExpansionPattern() { } /** * Whether patterns of this type can be composed of several tokens. * * @return true */ protected boolean allowMultipleTokens() { return true; } /** * Inform whether this module performs a full expansion of the input, or whether other patterns should be applied after this * one. * * @return true */ protected boolean doesFullExpansion() { return true; } /** * Returns the types known by this ExpansionPattern. These are possible values of the <code>type</code> attribute to the * <code>say-as</code> element, as defined in MaryXML.dtd. Each subclass needs to override this to return something * meaningful. * * @return known types */ public abstract List knownTypes(); /** * Returns the regular expression object matching any of the chars occurring in the pattern. Each subclass needs to override * this to return something meaningful. * * @return reMatchingChars */ public abstract Pattern reMatchingChars(); /** * Try to match this pattern starting at token <code>t</code>. If successful, replace the matched tokens with the replaced * form. * * @param t * the element to expand. After processing, this Element will still exist and be a valid Element, but possibly with * a different content, and possibly enclosed by an <mtu> element. In addition, <t> may have new * right-hand neighbors. * @param expanded * an empty list into which the expanded Elements are placed if an expansion occurred. The list will remain empty * if no expansion was performed. Elements placed in the list are not guaranteed to be only t elements, but may be * elements enclosing the expanded t elements, such as mtu elements, as well as non-t empty elements (such as * boundary elements). If the list is non-empty, it is guaranteed to contain (either directly or as descendants of * the list items) at least one t element. * @return true if this pattern is confident to have fully expanded this list of tokens, false if nothing could be done or * more expansion may be necessary. */ public boolean process(Element t, final List expanded) { if (t == null || expanded == null) throw new NullPointerException("Received null argument"); if (!t.getTagName().equals(MaryXML.TOKEN)) throw new DOMException(DOMException.INVALID_ACCESS_ERR, "Expected t element"); if (!expanded.isEmpty()) throw new IllegalArgumentException("Expected empty list, but list has " + expanded.size() + " elements."); StringBuilder sb = new StringBuilder(); int matchedType = -1; ArrayList candidates = new ArrayList(); if (allowMultipleTokens()) { Element n = t; // Do a look-forward preselection in order to find possible // candidates for tokens forming a pattern with t: They need to be // siblings and contain at least one of the characters occurring in // the pattern (as represented by the regular expression // reMatchingChars). while (n != null && n.getTagName().equals(MaryXML.TOKEN) && !n.hasAttribute("ph") && !n.hasAttribute("sounds_like") && isCandidate(n)) { // System.err.println("Found candidate \"" + MaryDomUtils.tokenText(n) + "\" for " + this.getClass().getName()); candidates.add(n); n = MaryDomUtils.getNextSiblingElement(n); } if (candidates.isEmpty()) // t itself is not a candidate return false; // quick exit for non-candidates // Now candidates contains the list of tokens that are worth // looking at more closely. while (!candidates.isEmpty()) { sb.setLength(0); Iterator it = candidates.iterator(); while (it.hasNext()) { sb.append(MaryDomUtils.tokenText((Element) it.next())); } // System.err.println(this.getClass().getName() + ", trying to match: " + sb.toString() + "(t=" + // MaryDomUtils.tokenText(t) + ", candidates.size()=" + candidates.size() + ")"); matchedType = match(sb.toString(), 0); // 0 == most general type if (matchedType != -1) break; // OK, found a match candidates.remove(candidates.size() - 1); // remove last in list } } else { // only a single token allowed if (!t.hasAttribute("ph") && !t.hasAttribute("sounds_like") && isCandidate(t)) { sb.setLength(0); sb.append(MaryDomUtils.tokenText(t)); matchedType = match(sb.toString(), 0); // 0 == most general type candidates.add(t); } } if (matchedType != -1) { // found a match logger.debug("Found match, type " + knownTypes().get(matchedType) + ": " + sb.toString() + " (" + candidates.size() + " tokens)"); expanded.addAll(expand(candidates, sb.toString(), matchedType)); if (expanded.isEmpty() && !knownTypes().get(matchedType).equals("specialChar")) { logger.info("Could match, but not expand string \"" + sb + "\" as type " + knownTypes().get(matchedType)); } return !expanded.isEmpty() && doesFullExpansion(); } else { // no match found return false; } } protected boolean isCandidate(Element t) { return reMatchingChars().matcher(MaryDomUtils.tokenText(t)).find(); } /** * Try to match and expand the entirety of tokens enclosed by the say-as tag <code>sayas</code>. The <code>type</code> of data * to expand is given. If the tokens can be matched according to <code>type</code>, they are expanded. Throws DOMException if * <code>sayas</code>'s tag name is not "say-as". * * @param sayas * sayas * @param typeString * typeString * @throws DOMException * DOMException */ public void match(Element sayas, String typeString) throws DOMException { if (!sayas.getTagName().equals(MaryXML.SAYAS)) throw new DOMException(DOMException.INVALID_ACCESS_ERR, "Expected " + MaryXML.SAYAS + " element, got " + sayas.getTagName()); List tokens = MaryDomUtils.getNodeListAsList(sayas.getElementsByTagName(MaryXML.TOKEN)); StringBuilder sb = new StringBuilder(); for (Iterator it = tokens.iterator(); it.hasNext();) { sb.append(MaryDomUtils.tokenText((Element) it.next())); } int type = knownTypes().indexOf(typeString); int expandType = canDealWith(sb.toString(), type); if (expandType != -1) { // OK, we can expand this // System.err.println("Say-as requested type \"" + knownTypes().get(type) + "\" for text \"" + sb.toString() + // "\": can expand."); List expanded = expand(tokens, sb.toString(), expandType); if (expanded.isEmpty()) logger.info("Failure expanding string \"" + sb + "\" as type \"" + knownTypes().get(expandType) + "\""); } else { // cannot expand according to sayas wish logger.info("Cannot expand string \"" + sb.toString() + "\" as requested type \"" + typeString + "\""); } } /** * Decide whether we can expand a string according to type <code>typeCode</code>. This is important in cases where a * particular expansion is requested via a <code>say-as</code> element. As a default, reply that a string can be expanded if * it would be matched by the pattern recognizer. Subclasses may wish to override this with less strict requirements. Returns * the type as which it can be expanded, or -1 if expansion is not possible. * * @param input * input * @param typeCode * typeCode * @return true if it can deal with input and typeCode */ protected abstract int canDealWith(String input, int typeCode); // formerly: {return match(input, typeCode); } /** * Subclasses do their matching in this class. * * @param input * is the String to be matched, * @param typeCode * is the index in <code>knownTypes</code> to match with. * @return type actually matched on successful match with this type (if <code>typeCode</code> is a general type ( * <code>typeCode == 0</code>), it may have matched with a more specific subtype). On failure, <code>-1</code> is * returned. */ protected abstract int match(String input, int typeCode); /** * Subclasses do their expansion in this class. * * @param tokens * is a list of token Elements to be replaced with their expanded form. The expanded forms are inserted into the * DOM tree at the same positions as the tokens in List <code>tokens</code>. If there are more new tokens than old * tokens, the rest are inserted as siblings at the position of the last old token. * @param text * is the String to be expanded, * @param typeCode * is the index in <code>knownTypes</code> this string has matched with before. * @return the list of expanded (=new) tokens. */ protected abstract List expand(List tokens, String text, int typeCode); /** * The default way to create new token DOM elements from whitespace-separated tokens in a string. String tokens have the form<br> * <code>graph</code> or <code>graph[phon]</code>, where the optional <code>phon</code>, if present, is set as value to the * <code>sampa</code> attribute of the <code>t</code> element. * <p> * All expansion patterns that do not require any special attribute settings should create their new tokens using this method. * <p> * Returns a list of token elements created from Document <code>doc</code>, but not yet attached in the tree. * * @param doc * doc * @param newText * newText * @return makeNewTokens(doc, newText, false, null) */ protected List makeNewTokens(Document doc, String newText) { return makeNewTokens(doc, newText, false, null); } protected List makeNewTokens(Document doc, String newText, boolean createMtu, String origText) { return makeNewTokens(doc, newText, createMtu, origText, false); } protected List makeNewTokens(Document doc, String newText, boolean createMtu, String origText, boolean forceAccents) { if (newText == null || newText.length() == 0) { // unusable input return null; // failure } Pattern rePron = Pattern.compile("\\[(.*)\\]"); // pronunciation in square brackets StringTokenizer st = new StringTokenizer(newText); ArrayList newTokens = new ArrayList(); while (st.hasMoreTokens()) { // Create new token element: String text = st.nextToken(); Element newT = MaryXML.createElement(doc, MaryXML.TOKEN); Matcher remPron = rePron.matcher(text); if (remPron.find()) { String pron = remPron.group(1); // would be $1 in perl text = rePron.matcher(text).replaceFirst(""); // delete pronunciation from word newT.setAttribute("ph", pron); } MaryDomUtils.setTokenText(newT, text); if (forceAccents) newT.setAttribute("accent", "unknown"); newTokens.add(newT); } if (createMtu) { // create mtu element enclosing the expanded tokens: Element mtu = MaryXML.createElement(doc, MaryXML.MTU); mtu.setAttribute("orig", origText); mtu.setAttribute("accent", "last"); for (Iterator it = newTokens.iterator(); it.hasNext();) { mtu.appendChild((Element) it.next()); } List result = new ArrayList(); result.add(mtu); return result; } else { return newTokens; } } protected void replaceTokens(List oldTokens, List newTokens) { if (oldTokens == null || oldTokens.isEmpty() || newTokens == null || newTokens.isEmpty()) { // unusable input throw new NullPointerException("Have received null or empty argument."); } Element oldT = null; Iterator itOld = oldTokens.iterator(); Iterator itNew = newTokens.iterator(); while (itNew.hasNext()) { Element newT = (Element) itNew.next(); // Retrieve old token element: if (itOld.hasNext()) // this is true at least once oldT = (Element) itOld.next(); oldT.getParentNode().insertBefore(newT, oldT); if (itOld.hasNext()) // only remove this old t if there is another one oldT.getParentNode().removeChild(oldT); } if (!itOld.hasNext()) { // only need to remove oldT oldT.getParentNode().removeChild(oldT); } else { // there were more old than new tokens while (itOld.hasNext()) { oldT = (Element) itOld.next(); oldT.getParentNode().removeChild(oldT); } } // Now go through the new tokens again and see if there are any // useless mtu combinations. If so, the "inner" one wins. itNew = newTokens.iterator(); while (itNew.hasNext()) { Element mtu = (Element) itNew.next(); if (!mtu.getTagName().equals(MaryXML.MTU)) continue; Element parent = (Element) mtu.getParentNode(); if (!parent.getTagName().equals(MaryXML.MTU)) continue; // OK, got an mtu inside an mtu if (MaryDomUtils.getPreviousSiblingElement(mtu) != null || MaryDomUtils.getNextSiblingElement(mtu) != null) continue; if (!parent.getAttribute("orig").equals(mtu.getAttribute("orig"))) continue; // OK, mtu and parent are mtu tags, there is no other element in parent // than mtu, and both have the same orig value // => delete parent Element grandParent = (Element) parent.getParentNode(); grandParent.insertBefore(mtu, parent); grandParent.removeChild(parent); } } /** * Enclose token in a <prosody rate="..."> tag in order to slow the spelling down, and in a <phonology> tag in * order to enforce precise pronunciation. * * @param e * e */ protected void slowDown(Element e) { Document doc = e.getOwnerDocument(); Element whereToInsert = e; Element prosody = null; Element phonol = null; if (whereToInsert.getParentNode().getNodeName().equals(MaryXML.PHONOLOGY)) { // There is already a phonology tag enclosing us. phonol = (Element) whereToInsert.getParentNode(); if (phonol.getParentNode().getNodeName().equals(MaryXML.PROSODY)) { // And also a prosody tag enclosing us. prosody = (Element) phonol.getParentNode(); } } else { phonol = MaryXML.createElement(doc, MaryXML.PHONOLOGY); prosody = MaryXML.createElement(doc, MaryXML.PROSODY); prosody.appendChild(phonol); whereToInsert.getParentNode().insertBefore(prosody, whereToInsert); phonol.appendChild(whereToInsert); } prosody.setAttribute("rate", "-20%"); phonol.setAttribute("precision", "precise"); } /** * Enclose the elements' closest common ancestor. * * @param first * first * @param last * last */ protected void slowDown(Element first, Element last) { Element phonol = MaryDomUtils.encloseNodesWithNewElement(first, last, MaryXML.PHONOLOGY); phonol.setAttribute("precision", "precise"); Document doc = phonol.getOwnerDocument(); Element prosody = MaryXML.createElement(doc, MaryXML.PROSODY); prosody.setAttribute("rate", "-20%"); phonol.getParentNode().insertBefore(prosody, phonol); prosody.appendChild(phonol); } }