/**
* Copyright 2002 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.language.it;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import marytts.datatypes.MaryData;
import marytts.datatypes.MaryDataType;
import marytts.datatypes.MaryXML;
import marytts.language.it.preprocess.ExpansionPattern;
import marytts.modules.InternalModule;
import marytts.util.dom.MaryDomUtils;
import marytts.util.dom.NameNodeFilter;
import org.apache.log4j.Level;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.w3c.dom.traversal.DocumentTraversal;
import org.w3c.dom.traversal.NodeFilter;
import org.w3c.dom.traversal.TreeWalker;
/**
* The preprocessing module.
*
* @author Marc Schröder
*/
public class Preprocess extends InternalModule {
public Preprocess() {
super("Preprocess", MaryDataType.TOKENS, MaryDataType.WORDS, Locale.ITALIAN);
}
public MaryData process(MaryData d) throws Exception {
Document doc = d.getDocument();
logger.info("Expanding say-as elements...");
expandSayasElements(doc);
logger.info("Matching and expanding patterns...");
matchAndExpandPatterns(doc);
logger.info("Done.");
MaryData result = new MaryData(outputType(), d.getLocale());
result.setDocument(doc);
return result;
}
private void expandSayasElements(Document doc) {
NodeList sayasElements = doc.getElementsByTagName(MaryXML.SAYAS);
for (int i = 0; i < sayasElements.getLength(); i++) {
Element sayas = (Element) sayasElements.item(i);
String type = sayas.getAttribute("type");
ExpansionPattern ep = ExpansionPattern.getPattern(type);
if (ep != null) {
if (logger.getEffectiveLevel().equals(Level.DEBUG)) {
logger.debug("Expanding say-as element of type " + type + ", containing text `"
+ MaryDomUtils.getPlainTextBelow(sayas) + "'");
}
ep.match(sayas, type);
} else {
// Don't know how to handle type -- ignore
logger.info("Don't know how to expand say-as type=\"" + type + "\"");
}
}
}
private void matchAndExpandPatterns(Document doc) {
TreeWalker tw = ((DocumentTraversal) doc).createTreeWalker(doc, NodeFilter.SHOW_ELEMENT,
new NameNodeFilter(MaryXML.TOKEN), false);
Element t = null;
while ((t = (Element) tw.nextNode()) != null) {
// System.err.println("matching and expanding " + MaryDomUtils.tokenText(t));
// Skip tokens inside say-as tags, as well as tokens
// for which a pronunciation is given:
if (MaryDomUtils.hasAncestor(t, MaryXML.SAYAS) || t.hasAttribute("ph") || t.hasAttribute("sounds_like")) {
// ignore token
continue;
}
Iterator it = ExpansionPattern.allPatterns().iterator();
boolean fullyExpanded = false;
while (!fullyExpanded && it.hasNext()) {
ExpansionPattern ep = (ExpansionPattern) it.next();
logger.debug("Now applying ep " + ep + " to token " + MaryDomUtils.getPlainTextBelow(t));
List expanded = new ArrayList();
fullyExpanded = ep.process(t, expanded);
// Element replacements may have been caused by ep.process());
// Update t and tw accordingly: the next position to look at is
// - if fully expanded, the token after the last expanded token
// - else,
// -- if no expansion occurred, the same position as before;
// -- if partial expansion occurred, the first of the expanded tokens.
if (fullyExpanded) {
logger.debug("fully expanded");
assert !expanded.isEmpty();
// need to correct tw
Element lastToken = getLastToken(expanded);
assert lastToken != null;
tw.setCurrentNode(lastToken);
logger.debug("set treewalker position:" + MaryDomUtils.getPlainTextBelow((Element) tw.getCurrentNode()));
} else { // not fully expanded
if (!expanded.isEmpty()) { // partial expansion
logger.debug("non-final expansion");
// need to set t
t = getFirstToken(expanded);
assert t != null;
// set tw as if fully expanded, just in case no further expansions occur
// Element lastToken = getLastToken(expanded);
// assert lastToken != null;
tw.setCurrentNode(t);
}
}
} // all patterns
} // all tokens
}
/**
* Find the last token in the list of elements l. Starting from the last element in the list, if the element itself is a
* token, return it; else, if it has a direct or indirect descendant which is a token, return that one; else, go backwards in
* the list.
*
* @param l
* a list of elements
* @return the last token, or null if no such token can be found
*/
private Element getLastToken(List l) {
if (l == null)
throw new NullPointerException("Received null argument");
if (l.isEmpty())
throw new IllegalArgumentException("Received empty list");
for (int i = l.size() - 1; i >= 0; i--) {
Element e = (Element) l.get(i);
Element t = null;
if (e.getTagName().equals(MaryXML.TOKEN)) {
t = e;
} else {
t = MaryDomUtils.getLastElementByTagName(e, MaryXML.TOKEN);
}
if (t != null)
return t;
}
return null;
}
/**
* Find the first token in the list of elements l. Starting from the first element in the list, if the element itself is a
* token, return it; else, if it has a direct or indirect descendant which is a token, return that one; else, go forward in
* the list.
*
* @param l
* a list of elements
* @return the first token, or null if no such token can be found
*/
private Element getFirstToken(List l) {
if (l == null)
throw new NullPointerException("Received null argument");
if (l.isEmpty())
throw new IllegalArgumentException("Received empty list");
for (int i = 0; i < l.size(); i++) {
Element e = (Element) l.get(i);
Element t = null;
if (e.getTagName().equals(MaryXML.TOKEN)) {
t = e;
} else {
t = MaryDomUtils.getFirstElementByTagName(e, MaryXML.TOKEN);
}
if (t != null)
return t;
}
return null;
}
}