JPhonemiser.java example

Explorer
marytts-master
/**
 * Copyright 2002-2008 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package marytts.language.te;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

import marytts.datatypes.MaryData;
import marytts.datatypes.MaryDataType;
import marytts.datatypes.MaryXML;
import marytts.exceptions.MaryConfigurationException;
import marytts.fst.FSTLookup;
import marytts.language.te.phonemiser.TeluguLTS;
import marytts.modules.InternalModule;
import marytts.modules.phonemiser.AllophoneSet;
import marytts.server.MaryProperties;
import marytts.util.MaryRuntimeUtils;
import marytts.util.dom.MaryDomUtils;

import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.traversal.NodeIterator;

/**
 * Telugu phonemiser module
 *
 * @author Marc Schröder, Sathish
 */

public class JPhonemiser extends InternalModule {

	protected Map<String, List<String>> userdict;
	protected FSTLookup lexicon;
	protected TeluguLTS lts;

	protected AllophoneSet allophoneSet;

	public JPhonemiser(String propertyPrefix) throws IOException, MaryConfigurationException {
		this("JPhonemiser", MaryDataType.PARTSOFSPEECH, MaryDataType.PHONEMES, propertyPrefix + "allophoneset", propertyPrefix
				+ "userdict", propertyPrefix + "utf8toit3map");
	}

	/**
	 * Constructor providing the individual filenames of files that are required.
	 * 
	 * @param componentName
	 *            componentName
	 * @param inputType
	 *            inputType
	 * @param outputType
	 *            outputType
	 * @param allophonesProperty
	 *            allophonesProperty
	 * @param userdictProperty
	 *            userdictProperty
	 * @param utf8toit3mapProperty
	 *            utf8toit3mapProperty
	 * @throws IOException
	 *             IOException
	 * @throws MaryConfigurationException
	 *             MaryConfigurationException
	 */
	public JPhonemiser(String componentName, MaryDataType inputType, MaryDataType outputType, String allophonesProperty,
			String userdictProperty, String utf8toit3mapProperty) throws IOException, MaryConfigurationException {
		super(componentName, inputType, outputType, MaryRuntimeUtils.needAllophoneSet(allophonesProperty).getLocale());
		allophoneSet = MaryRuntimeUtils.needAllophoneSet(allophonesProperty);
		// userdict is optional
		String userdictFilename = MaryProperties.getFilename(userdictProperty);
		if (userdictFilename != null) {
			if (new File(userdictFilename).exists()) {
				userdict = readLexicon(userdictFilename);
			} else {
				logger.info("User dictionary '" + userdictFilename + "' for locale '" + getLocale()
						+ "' does not exist. Ignoring.");
			}
		}
		InputStream utf8toit3mapStream = MaryProperties.needStream(utf8toit3mapProperty);
		lts = new TeluguLTS(utf8toit3mapStream);
	}

	public MaryData process(MaryData d) throws Exception {
		Document doc = d.getDocument();
		NodeIterator it = MaryDomUtils.createNodeIterator(doc, doc, MaryXML.TOKEN);
		Element t = null;
		while ((t = (Element) it.nextNode()) != null) {
			String text;

			// Do not touch tokens for which a transcription is already
			// given (exception: transcription contains a '*' character:
			if (t.hasAttribute("ph") && !t.getAttribute("ph").contains("*")) {
				continue;
			}
			if (t.hasAttribute("sounds_like"))
				text = t.getAttribute("sounds_like");
			else
				text = MaryDomUtils.tokenText(t);

			String pos = null;
			// use part-of-speech if available
			if (t.hasAttribute("pos")) {
				pos = t.getAttribute("pos");
			}

			if (text != null && !text.equals("")) {
				// If text consists of several parts (e.g., because that was
				// inserted into the sounds_like attribute), each part
				// is transcribed separately.
				StringBuilder ph = new StringBuilder();
				String g2pMethod = null;
				StringTokenizer st = new StringTokenizer(text, " -");
				while (st.hasMoreTokens()) {
					String graph = st.nextToken();
					StringBuilder helper = new StringBuilder();
					if (pos.equals("$PUNCT")) {
						continue;
					}
					String phon = phonemise(graph, pos, helper);
					if (ph.length() == 0) { // first part
						// The g2pMethod of the combined beast is
						// the g2pMethod of the first constituant.
						g2pMethod = helper.toString();
						ph.append(phon);
					} else { // following parts
						ph.append(" - ");
						// Reduce primary to secondary stress:
						ph.append(phon.replace('\'', ','));
					}
				}

				if (ph != null && ph.length() > 0) {
					setPh(t, ph.toString());
					t.setAttribute("g2p_method", g2pMethod);
				}
			}
		}
		MaryData result = new MaryData(outputType(), d.getLocale());
		result.setDocument(doc);
		return result;
	}

	/**
	 * Phonemise the word text. This starts with a simple lexicon lookup, followed by some heuristics, and finally applies
	 * letter-to-sound rules if nothing else was successful.
	 * 
	 * @param text
	 *            the textual (graphemic) form of a word.
	 * @param pos
	 *            the part-of-speech of the word
	 * @param g2pMethod
	 *            This is an awkward way to return a second String parameter via a StringBuilder. If a phonemisation of the text
	 *            is found, this parameter will be filled with the method of phonemisation ("lexicon", ... "rules").
	 * @return a phonemisation of the text if one can be generated, or null if no phonemisation method was successful.
	 * @throws IOException
	 *             IOException
	 */
	public String phonemise(String text, String pos, StringBuilder g2pMethod) throws IOException {
		// First, try a simple userdict lookup:

		String result = userdictLookup(text, pos);
		if (result != null) {
			g2pMethod.append("userdict");
			return result;
		}

		// Cannot find it in the lexicon -- apply letter-to-sound rules
		// to the normalised form

		result = lts.phonemise(text);
		if (result != null) {
			g2pMethod.append("rules");
			return result;
		}

		return null;
	}

	/**
	 * look a given text up in the userdict. part-of-speech is used in case of ambiguity.
	 * 
	 * @param text
	 *            IOException
	 * @param pos
	 *            pos
	 * @return transcr
	 */
	public String userdictLookup(String text, String pos) {
		if (userdict == null || text == null || text.length() == 0)
			return null;
		List<String> entries = userdict.get(text);
		// If entry is not found directly, try the following changes:
		// - lowercase the word
		// - all lowercase but first uppercase
		if (entries == null) {
			text = text.toLowerCase(getLocale());
			entries = userdict.get(text);
		}
		if (entries == null) {
			text = text.substring(0, 1).toUpperCase(getLocale()) + text.substring(1);
			entries = userdict.get(text);
		}

		if (entries == null)
			return null;

		String transcr = null;
		for (String entry : entries) {
			String[] parts = entry.split("\\|");
			transcr = parts[0];
			if (parts.length > 1 && pos != null) {
				StringTokenizer tokenizer = new StringTokenizer(entry);
				while (tokenizer.hasMoreTokens()) {
					String onePos = tokenizer.nextToken();
					if (pos.equals(onePos))
						return transcr; // found
				}
			}
		}
		// no match of POS: return last entry
		return transcr;
	}

	/**
	 * Read a lexicon. Lines must have the format
	 * 
	 * graphemestring | phonestring | optional-parts-of-speech
	 * 
	 * The pos-item is optional. Different pos's belonging to one grapheme chain may be separated by whitespace
	 * 
	 * 
	 * @param lexiconFilename
	 *            lexiconFilename
	 * @throws IOException
	 *             IOException
	 * @return fLexicon
	 */
	protected Map<String, List<String>> readLexicon(String lexiconFilename) throws IOException {
		String line;
		Map<String, List<String>> fLexicon = new HashMap<String, List<String>>();

		BufferedReader lexiconFile = new BufferedReader(new InputStreamReader(new FileInputStream(lexiconFilename), "UTF-8"));
		while ((line = lexiconFile.readLine()) != null) {
			// Ignore empty lines and comments:
			if (line.trim().equals("") || line.startsWith("#"))
				continue;

			String[] lineParts = line.split("\\s*\\|\\s*");
			String graphStr = lineParts[0];
			String phonStr = lineParts[1];
			try {
				allophoneSet.splitIntoAllophones(phonStr);
			} catch (RuntimeException re) {
				logger.warn("Lexicon '" + lexiconFilename + "': invalid entry for '" + graphStr + "'", re);
			}
			String phonPosStr = phonStr;
			if (lineParts.length > 2) {
				String pos = lineParts[2];
				if (!pos.trim().equals(""))
					phonPosStr += "|" + pos;
			}

			List<String> transcriptions = fLexicon.get(graphStr);
			if (null == transcriptions) {
				transcriptions = new ArrayList<String>();
				fLexicon.put(graphStr, transcriptions);
			}
			transcriptions.add(phonPosStr);
		}
		lexiconFile.close();
		return fLexicon;
	}

	protected void setPh(Element t, String ph) {
		if (!t.getTagName().equals(MaryXML.TOKEN))
			throw new DOMException(DOMException.INVALID_ACCESS_ERR, "Only t elements allowed, received " + t.getTagName() + ".");
		if (t.hasAttribute("ph")) {
			String prevPh = t.getAttribute("ph");
			// In previous sampa, replace star with sampa:
			String newPh = prevPh.replaceFirst("\\*", ph);
			t.setAttribute("ph", newPh);
		} else {
			t.setAttribute("ph", ph);
		}
	}

}