// Postlex.java example
//
// Explorer
// marytts-master
/**
 * Copyright 2002 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package marytts.language.it;

import java.util.Iterator;
import java.util.List;
import java.util.Locale;

import marytts.datatypes.MaryData;
import marytts.datatypes.MaryDataType;
import marytts.datatypes.MaryXML;
import marytts.modules.InternalModule;
import marytts.modules.PronunciationModel;
import marytts.modules.phonemiser.AllophoneSet;
import marytts.util.dom.MaryDomUtils;
import marytts.util.dom.NameNodeFilter;

import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.w3c.dom.traversal.DocumentTraversal;
import org.w3c.dom.traversal.NodeFilter;
import org.w3c.dom.traversal.TreeWalker;

/**
 * The postlexical phonological processes module. Used as first option to solve proclitics tokens and sillabification merging
 * 
 * @author Fabio Tesser
 */

public class Postlex extends PronunciationModel {

	public Postlex() {
		super(Locale.ITALIAN);
	}

	public MaryData process(MaryData d) throws Exception {
		// System.err.println("Italian Postlex START");
		Document doc = d.getDocument();
		mtuMergeTokenPostlex(doc);
		return super.process(d);
	}

	/*
	 * Return Quote space if quote present
	 */
	String returnQuoteIfStress(String lPhones) {
		if (lPhones.indexOf("'") != -1) {
			// System.out.println("there is ' in temp string");
			return "' ";
		} else {
			// System.out.println("there is no ' in temp string");
			return "";
		}
	}

	/*
	 * This method is used when proclitics are found in mtu proclitics is c'X (if there is in it_clitics.xml)
	 */
	private void mtuMergeTokenPostlex(Document doc) throws DOMException {
		TreeWalker tw = ((DocumentTraversal) doc).createTreeWalker(doc, NodeFilter.SHOW_ELEMENT, new NameNodeFilter(MaryXML.MTU),
				false);
		Element m = null;
		while ((m = (Element) tw.nextNode()) != null) {
			if (MaryDomUtils.hasAncestor(m, MaryXML.MTU)) // not highest-level
				continue;
			// Now m is a highest-level mtu element
			// Search for the token whose accent is retained;
			// all other accents will be deleted.
			Element c = m;
			while (c != null && !c.getTagName().equals(MaryXML.TOKEN)) {
				String whatToAccent = c.getAttribute("accent");
				// check for last-proclitics (c' t' d' X)
				if (whatToAccent != null && whatToAccent.equals("last-proclitics")) {
					// System.err.println("token to join!!! in " + c.getNodeName());
					Element c1 = MaryDomUtils.getFirstChildElement(c);
					// get the anchor as reference in order to delete the children after
					Element c_anchor = c;
					// set c as last (second) child element in this case
					// WARNING we treat the case with 2 token only!!!
					c = MaryDomUtils.getLastChildElement(c);
					// merge ph and POS?
					c.setAttribute("merged-token", "yes");
					c.setAttribute("g2p_method", c1.getAttribute("g2p_method") + "+" + c.getAttribute("g2p_method"));
					// TODO: accent= to merge? take the first or the second?
					// c.setAttribute("accent", c1.getAttribute("accent")); // + "+" + c.getAttribute("accent"));
					// c.removeAttribute("accent");
					c.setTextContent(c1.getTextContent() + "+" + c.getTextContent());
					// Merge the ph and write the quote if necessary
					c.setAttribute("ph",
							returnQuoteIfStress(c.getAttribute("ph")) + c1.getAttribute("ph") + " " + c.getAttribute("ph"));
					// TODO: POS are not merged if you want to merge the POS:
					// c.setAttribute("pos", c1.getAttribute("pos") + "+" +c.getAttribute("pos"));

					// c.setAttribute("pos",c1.getAttribute("pos");

					// remove child token
					c_anchor.removeChild(c1);
				} else
					c = MaryDomUtils.getLastChildElement(c);
			}

			/*
			 * Element retainAccentToken = c;
			 * 
			 * // Now all token below m except retainAccentToken get // their accent deleted. System.err.println("the olio" +
			 * m.getNodeName()); NodeList tokens = m.getElementsByTagName(MaryXML.TOKEN); System.err.println("OK number" +
			 * tokens.getLength()); for (int i=0; i<tokens.getLength(); i++) { Element t = (Element) tokens.item(i);
			 * System.err.println("OK" ); if (t == retainAccentToken) // not the same *Object*! {
			 * 
			 * System.err.println("VAI!!!!:" + t.getNodeName()); t.setNodeValue("aaaa"); //System.err.println("VAI!!!!:" + t.get);
			 * //Element syl = MaryDomUtils.getFirstChildElement(t); // System.err.println("VAI!!!!: " + syl.getLocalName()); } }
			 */

		} // for all highest-level mtu elements
	}

}