/**
* Copyright 2002-2008 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.language.te;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import marytts.datatypes.MaryData;
import marytts.datatypes.MaryDataType;
import marytts.datatypes.MaryXML;
import marytts.exceptions.MaryConfigurationException;
import marytts.fst.FSTLookup;
import marytts.language.te.phonemiser.TeluguLTS;
import marytts.modules.InternalModule;
import marytts.modules.phonemiser.AllophoneSet;
import marytts.server.MaryProperties;
import marytts.util.MaryRuntimeUtils;
import marytts.util.dom.MaryDomUtils;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.traversal.NodeIterator;
/**
* Telugu phonemiser module
*
* @author Marc Schröder, Sathish
*/
public class JPhonemiser extends InternalModule {
    /** Part-of-speech value for which {@link #process(MaryData)} produces no transcription. */
    private static final String PUNCTUATION_POS = "$PUNCT";

    /** Optional user dictionary: grapheme string to a list of "transcription" or "transcription|pos-list" entries. */
    protected Map<String, List<String>> userdict;
    /** Lexicon lookup; not assigned anywhere in this class. */
    protected FSTLookup lexicon;
    /** Letter-to-sound component used when the user dictionary has no entry. */
    protected TeluguLTS lts;
    /** Allophone set used to validate transcriptions read from the user dictionary. */
    protected AllophoneSet allophoneSet;

    /**
     * Convenience constructor deriving all property names from a common prefix.
     *
     * @param propertyPrefix
     *            prefix to which "allophoneset", "userdict" and "utf8toit3map" are appended
     * @throws IOException
     *             if a required resource cannot be read
     * @throws MaryConfigurationException
     *             if the configuration is invalid
     */
    public JPhonemiser(String propertyPrefix) throws IOException, MaryConfigurationException {
        this("JPhonemiser", MaryDataType.PARTSOFSPEECH, MaryDataType.PHONEMES, propertyPrefix + "allophoneset", propertyPrefix
                + "userdict", propertyPrefix + "utf8toit3map");
    }

    /**
     * Constructor providing the individual property names of the resources that are required.
     *
     * @param componentName
     *            name of this module
     * @param inputType
     *            data type consumed by this module
     * @param outputType
     *            data type produced by this module
     * @param allophonesProperty
     *            property naming the allophone set; its locale becomes the module locale
     * @param userdictProperty
     *            property naming the user dictionary file (optional; a missing file is logged and ignored)
     * @param utf8toit3mapProperty
     *            property naming the stream handed to {@link TeluguLTS}
     * @throws IOException
     *             if the user dictionary cannot be read
     * @throws MaryConfigurationException
     *             if a required resource is not configured
     */
    public JPhonemiser(String componentName, MaryDataType inputType, MaryDataType outputType, String allophonesProperty,
            String userdictProperty, String utf8toit3mapProperty) throws IOException, MaryConfigurationException {
        super(componentName, inputType, outputType, MaryRuntimeUtils.needAllophoneSet(allophonesProperty).getLocale());
        allophoneSet = MaryRuntimeUtils.needAllophoneSet(allophonesProperty);
        // userdict is optional
        String userdictFilename = MaryProperties.getFilename(userdictProperty);
        if (userdictFilename != null) {
            if (new File(userdictFilename).exists()) {
                userdict = readLexicon(userdictFilename);
            } else {
                logger.info("User dictionary '" + userdictFilename + "' for locale '" + getLocale()
                        + "' does not exist. Ignoring.");
            }
        }
        InputStream utf8toit3mapStream = MaryProperties.needStream(utf8toit3mapProperty);
        lts = new TeluguLTS(utf8toit3mapStream);
    }

    /**
     * Add a "ph" transcription and a "g2p_method" attribute to every token element of the document that does not
     * already carry a complete transcription.
     *
     * @param d
     *            the input data whose document is modified in place
     * @return a new MaryData of this module's output type wrapping the same document
     * @throws Exception
     *             if phonemisation fails
     */
    public MaryData process(MaryData d) throws Exception {
        Document doc = d.getDocument();
        NodeIterator it = MaryDomUtils.createNodeIterator(doc, doc, MaryXML.TOKEN);
        Element t = null;
        while ((t = (Element) it.nextNode()) != null) {
            // Do not touch tokens for which a transcription is already
            // given (exception: transcription contains a '*' character):
            if (t.hasAttribute("ph") && !t.getAttribute("ph").contains("*")) {
                continue;
            }
            String text = t.hasAttribute("sounds_like") ? t.getAttribute("sounds_like") : MaryDomUtils.tokenText(t);
            // use part-of-speech if available; it stays null otherwise
            String pos = t.hasAttribute("pos") ? t.getAttribute("pos") : null;
            // Constant-first comparison: a token without a pos attribute must not cause a NullPointerException.
            if (text == null || text.equals("") || PUNCTUATION_POS.equals(pos)) {
                continue;
            }
            // If text consists of several parts (e.g., because that was
            // inserted into the sounds_like attribute), each part
            // is transcribed separately.
            StringBuilder ph = new StringBuilder();
            String g2pMethod = null;
            StringTokenizer st = new StringTokenizer(text, " -");
            while (st.hasMoreTokens()) {
                String graph = st.nextToken();
                StringBuilder helper = new StringBuilder();
                String phon = phonemise(graph, pos, helper);
                if (phon == null) {
                    // No transcription for this part: skip it rather than writing the text "null" into ph.
                    continue;
                }
                if (ph.length() == 0) { // first part
                    // The g2pMethod of the combined token is
                    // the g2pMethod of the first constituent.
                    g2pMethod = helper.toString();
                    ph.append(phon);
                } else { // following parts
                    ph.append(" - ");
                    // Reduce primary to secondary stress:
                    ph.append(phon.replace('\'', ','));
                }
            }
            if (ph.length() > 0) {
                setPh(t, ph.toString());
                t.setAttribute("g2p_method", g2pMethod);
            }
        }
        MaryData result = new MaryData(outputType(), d.getLocale());
        result.setDocument(doc);
        return result;
    }

    /**
     * Phonemise the word text. This first tries a user dictionary lookup and, if that yields nothing, applies the
     * letter-to-sound component.
     *
     * @param text
     *            the textual (graphemic) form of a word.
     * @param pos
     *            the part-of-speech of the word; may be null
     * @param g2pMethod
     *            This is an awkward way to return a second String parameter via a StringBuilder. If a phonemisation of the
     *            text is found, the method of phonemisation ("userdict" or "rules") is appended to it.
     * @return a phonemisation of the text if one can be generated, or null if no phonemisation method was successful.
     * @throws IOException
     *             IOException
     */
    public String phonemise(String text, String pos, StringBuilder g2pMethod) throws IOException {
        // First, try a simple userdict lookup:
        String result = userdictLookup(text, pos);
        if (result != null) {
            g2pMethod.append("userdict");
            return result;
        }
        // Not in the userdict -- apply letter-to-sound rules
        result = lts.phonemise(text);
        if (result != null) {
            g2pMethod.append("rules");
            return result;
        }
        return null;
    }

    /**
     * Look a given text up in the userdict. Part-of-speech is used in case of ambiguity.
     *
     * @param text
     *            the graphemic form to look up
     * @param pos
     *            the part-of-speech used to choose between several entries; may be null
     * @return the transcription of the first entry listing {@code pos}; if no entry lists it, the transcription of the
     *         last entry; null if there is no userdict, the text is null or empty, or no entry exists
     */
    public String userdictLookup(String text, String pos) {
        if (userdict == null || text == null || text.length() == 0)
            return null;
        List<String> entries = userdict.get(text);
        // If entry is not found directly, try the following changes:
        // - lowercase the word
        // - all lowercase but first uppercase
        if (entries == null) {
            text = text.toLowerCase(getLocale());
            entries = userdict.get(text);
        }
        if (entries == null) {
            text = text.substring(0, 1).toUpperCase(getLocale()) + text.substring(1);
            entries = userdict.get(text);
        }
        if (entries == null)
            return null;
        String transcr = null;
        for (String entry : entries) {
            String[] parts = entry.split("\\|");
            transcr = parts[0];
            if (parts.length > 1 && pos != null) {
                // Tokenise only the POS field. Tokenising the whole entry would fuse the first
                // POS tag with the transcription ("phon|NN"), so it could never match.
                StringTokenizer tokenizer = new StringTokenizer(parts[1]);
                while (tokenizer.hasMoreTokens()) {
                    String onePos = tokenizer.nextToken();
                    if (pos.equals(onePos))
                        return transcr; // found
                }
            }
        }
        // no match of POS: return last entry
        return transcr;
    }

    /**
     * Read a lexicon. Lines must have the format
     *
     * graphemestring | phonestring | optional-parts-of-speech
     *
     * The pos-item is optional. Different pos's belonging to one grapheme chain may be separated by whitespace.
     * Empty lines and lines starting with '#' are ignored. Lines without a phonestring field are logged and skipped.
     * A phonestring that the allophone set cannot split is logged, but the entry is still stored.
     *
     * @param lexiconFilename
     *            path of the UTF-8 encoded lexicon file
     * @throws IOException
     *             if the file cannot be opened or read
     * @return a map from grapheme string to its "phonestring" or "phonestring|pos-list" entries, in file order
     */
    protected Map<String, List<String>> readLexicon(String lexiconFilename) throws IOException {
        String line;
        Map<String, List<String>> fLexicon = new HashMap<String, List<String>>();
        BufferedReader lexiconFile = new BufferedReader(new InputStreamReader(new FileInputStream(lexiconFilename), "UTF-8"));
        // try/finally so the file handle is released even when reading fails part-way.
        try {
            while ((line = lexiconFile.readLine()) != null) {
                // Ignore empty lines and comments:
                if (line.trim().equals("") || line.startsWith("#"))
                    continue;
                String[] lineParts = line.split("\\s*\\|\\s*");
                if (lineParts.length < 2) {
                    // A line without a '|' separator has no transcription; skip it instead of failing the whole load.
                    logger.warn("Lexicon '" + lexiconFilename + "': ignoring malformed line '" + line + "'");
                    continue;
                }
                String graphStr = lineParts[0];
                String phonStr = lineParts[1];
                try {
                    allophoneSet.splitIntoAllophones(phonStr);
                } catch (RuntimeException re) {
                    logger.warn("Lexicon '" + lexiconFilename + "': invalid entry for '" + graphStr + "'", re);
                }
                String phonPosStr = phonStr;
                if (lineParts.length > 2) {
                    String pos = lineParts[2];
                    if (!pos.trim().equals(""))
                        phonPosStr += "|" + pos;
                }
                List<String> transcriptions = fLexicon.get(graphStr);
                if (null == transcriptions) {
                    transcriptions = new ArrayList<String>();
                    fLexicon.put(graphStr, transcriptions);
                }
                transcriptions.add(phonPosStr);
            }
        } finally {
            lexiconFile.close();
        }
        return fLexicon;
    }

    /**
     * Store a transcription in the "ph" attribute of a token element. If the element already has a "ph" attribute,
     * the first '*' in it is replaced by the new transcription; otherwise the attribute is created.
     *
     * @param t
     *            the token element to modify
     * @param ph
     *            the transcription to store
     * @throws DOMException
     *             if {@code t} is not a token element
     */
    protected void setPh(Element t, String ph) {
        if (!t.getTagName().equals(MaryXML.TOKEN))
            throw new DOMException(DOMException.INVALID_ACCESS_ERR, "Only t elements allowed, received " + t.getTagName() + ".");
        if (t.hasAttribute("ph")) {
            String prevPh = t.getAttribute("ph");
            // In previous sampa, replace star with sampa. This is done by plain string surgery and not
            // String.replaceFirst, because the latter would interpret '$' and '\' in ph as regex
            // replacement syntax.
            int star = prevPh.indexOf('*');
            String newPh = star < 0 ? prevPh : prevPh.substring(0, star) + ph + prevPh.substring(star + 1);
            t.setAttribute("ph", newPh);
        } else {
            t.setAttribute("ph", ph);
        }
    }
}