/**
* Copyright 2002 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.language.de;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.StringTokenizer;
import java.util.TreeMap;
import marytts.datatypes.MaryData;
import marytts.datatypes.MaryDataType;
import marytts.datatypes.MaryXML;
import marytts.exceptions.MaryConfigurationException;
import marytts.fst.FSTLookup;
import marytts.language.de.phonemiser.Inflection;
import marytts.language.de.phonemiser.PhonemiseDenglish;
import marytts.language.de.phonemiser.Result;
import marytts.modules.synthesis.PAConverter;
import marytts.server.MaryProperties;
import marytts.util.MaryUtils;
import marytts.util.dom.MaryDomUtils;
import org.apache.commons.io.FileUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.traversal.NodeIterator;
/**
* The phonemiser module -- java implementation.
*
* @author Marc Schröder
*/
public class JPhonemiser extends marytts.modules.JPhonemiser {
private Inflection inflection;
private FSTLookup usEnglishLexicon = null;
private String logUnknownFileName = null;
private Map<String, Integer> unknown2Frequency = null;
private String logEnglishFileName = null;
private Map<String, Integer> english2Frequency = null;
private PhonemiseDenglish phonemiseDenglish;
public JPhonemiser() throws IOException, MaryConfigurationException {
super("JPhonemiser_de", MaryDataType.PARTSOFSPEECH, MaryDataType.PHONEMES, "de.allophoneset", "de.userdict",
"de.lexicon", "de.lettertosound");
}
public void startup() throws Exception {
super.startup();
phonemiseDenglish = new PhonemiseDenglish(this);
inflection = new Inflection();
if (MaryProperties.getBoolean("de.phonemiser.logunknown")) {
String logBasepath = MaryProperties.maryBase() + File.separator + "log" + File.separator;
File logDir = new File(logBasepath);
try {
if (!logDir.isDirectory()) {
logger.info("Creating log directory " + logDir.getCanonicalPath());
FileUtils.forceMkdir(logDir);
}
logUnknownFileName = MaryProperties.getFilename("de.phonemiser.logunknown.filename", logBasepath
+ "de_unknown.txt");
unknown2Frequency = new HashMap<String, Integer>();
logEnglishFileName = MaryProperties.getFilename("de.phonemiser.logenglish.filename", logBasepath
+ "de_english-words.txt");
english2Frequency = new HashMap<String, Integer>();
} catch (IOException e) {
logger.info("Could not create log directory " + logDir.getCanonicalPath() + " Logging disabled!", e);
}
}
if (MaryProperties.getBoolean("de.phonemiser.useenglish")) {
InputStream usLexStream = MaryProperties.getStream("en_US.lexicon");
if (usLexStream != null) {
try {
usEnglishLexicon = new FSTLookup(usLexStream, MaryProperties.getProperty("en_US.lexicon"));
} catch (Exception e) {
logger.info("Cannot load English lexicon '" + MaryProperties.getProperty("en_US.lexicon") + "'", e);
}
}
}
}
public void shutdown() {
if (logUnknownFileName != null || logEnglishFileName != null) {
try {
/* print unknown words */
// open file
PrintWriter logUnknown = new PrintWriter(
new OutputStreamWriter(new FileOutputStream(logUnknownFileName), "UTF-8"));
// sort the words
Set<String> unknownWords = unknown2Frequency.keySet();
SortedMap<Integer, List<String>> freq2Unknown = new TreeMap<Integer, List<String>>();
for (String nextUnknown : unknownWords) {
int nextFreq = unknown2Frequency.get(nextUnknown);
// logUnknown.println(nextFreq+" "+nextUnknown);
if (freq2Unknown.containsKey(nextFreq)) {
List<String> unknowns = freq2Unknown.get(nextFreq);
unknowns.add(nextUnknown);
} else {
List<String> unknowns = new ArrayList<String>();
unknowns.add(nextUnknown);
freq2Unknown.put(nextFreq, unknowns);
}
}
// print the words
for (int nextFreq : freq2Unknown.keySet()) {
List<String> unknowns = freq2Unknown.get(nextFreq);
for (int i = 0; i < unknowns.size(); i++) {
String unknownWord = (String) unknowns.get(i);
logUnknown.println(nextFreq + " " + unknownWord);
}
}
// close file
logUnknown.flush();
logUnknown.close();
/* print english words */
// open the file
PrintWriter logEnglish = new PrintWriter(
new OutputStreamWriter(new FileOutputStream(logEnglishFileName), "UTF-8"));
// sort the words
SortedMap<Integer, List<String>> freq2English = new TreeMap<Integer, List<String>>();
for (String nextEnglish : english2Frequency.keySet()) {
int nextFreq = english2Frequency.get(nextEnglish);
if (freq2English.containsKey(nextFreq)) {
List<String> englishWords = freq2English.get(nextFreq);
englishWords.add(nextEnglish);
} else {
List<String> englishWords = new ArrayList<String>();
englishWords.add(nextEnglish);
freq2English.put(nextFreq, englishWords);
}
}
// print the words
for (int nextFreq : freq2English.keySet()) {
List<String> englishWords = freq2English.get(nextFreq);
for (int i = 0; i < englishWords.size(); i++) {
logEnglish.println(nextFreq + " " + englishWords.get(i));
}
}
// close file
logEnglish.flush();
logEnglish.close();
} catch (Exception e) {
logger.info("Error printing log files for english and unknown words", e);
}
}
}
@Override
public MaryData process(MaryData d) throws Exception {
Document doc = d.getDocument();
inflection.determineEndings(doc);
NodeIterator it = MaryDomUtils.createNodeIterator(doc, doc, MaryXML.TOKEN);
Element t = null;
while ((t = (Element) it.nextNode()) != null) {
String text;
// Do not touch tokens for which a transcription is already
// given (exception: transcription contains a '*' character:
if (t.hasAttribute("ph") && t.getAttribute("ph").indexOf('*') == -1) {
continue;
}
if (t.hasAttribute("sounds_like"))
text = t.getAttribute("sounds_like");
else
text = MaryDomUtils.tokenText(t);
// use part-of-speech if available
String pos = null;
if (t.hasAttribute("pos")) {
pos = t.getAttribute("pos");
}
boolean isEnglish = false;
if (t.hasAttribute("xml:lang")
&& MaryUtils.subsumes(Locale.ENGLISH, MaryUtils.string2locale(t.getAttribute("xml:lang")))) {
isEnglish = true;
}
if (maybePronounceable(text, pos)) {
// If text consists of several parts (e.g., because that was
// inserted into the sounds_like attribute), each part
// is transcribed separately.
StringBuilder ph = new StringBuilder();
String g2pMethod = null;
StringTokenizer st = new StringTokenizer(text, " -");
while (st.hasMoreTokens()) {
String graph = st.nextToken();
StringBuilder helper = new StringBuilder();
String phon = null;
if (isEnglish && usEnglishLexicon != null) {
phon = phonemiseEn(graph);
if (phon != null)
helper.append("foreign:en");
}
if (phon == null) {
phon = phonemise(graph, pos, helper);
}
// null result should not be processed
if (phon == null) {
continue;
}
if (ph.length() == 0) { // first part
// The g2pMethod of the combined beast is
// the g2pMethod of the first constituant.
g2pMethod = helper.toString();
ph.append(phon);
} else { // following parts
ph.append(" - ");
// Reduce primary to secondary stress:
ph.append(phon.replace('\'', ','));
}
}
if (ph != null && ph.length() > 0) {
setPh(t, ph.toString());
t.setAttribute("g2p_method", g2pMethod);
}
}
}
MaryData result = new MaryData(outputType(), d.getLocale());
result.setDocument(doc);
return result;
}
/**
* Phonemise the word text. This starts with a simple lexicon lookup, followed by some heuristics, and finally applies
* letter-to-sound rules if nothing else was successful.
*
* @param text
* the textual (graphemic) form of a word.
* @param pos
* pos
* @param g2pMethod
* This is an awkward way to return a second String parameter via a StringBuilder. If a phonemisation of the text
* is found, this parameter will be filled with the method of phonemisation ("lexicon", ... "rules").
* @return a phonemisation of the text if one can be generated, or null if no phonemisation method was successful.
*/
@Override
public String phonemise(String text, String pos, StringBuilder g2pMethod) {
// First, try a simple userdict and lexicon lookup:
String result = userdictLookup(text, pos);
if (result != null) {
g2pMethod.append("userdict");
return result;
}
result = lexiconLookup(text, pos);
if (result != null) {
g2pMethod.append("lexicon");
return result;
}
/**
* // Not found? Try a compound "analysis": result = compoundSearch(text);
* //logger.debug("Compound here: "+compoundSearch(text)); if (result != null) { g2pMethod.append("compound"); return
* result; }
**/
// Lookup attempts failed. Try normalising exotic letters
// (diacritics on vowels, etc.), look up again:
String normalised = MaryUtils.normaliseUnicodeLetters(text, Locale.GERMAN);
if (!normalised.equals(text)) {
// First, try a simple userdict and lexicon lookup:
result = userdictLookup(normalised, pos);
if (result != null) {
g2pMethod.append("userdict");
return result;
}
result = lexiconLookup(normalised, pos);
if (result != null) {
g2pMethod.append("lexicon");
return result;
}
/**
* // Not found? Try a compound "analysis": result = compoundSearch(normalised); if (result != null) {
* g2pMethod.append("compound"); return result; }
**/
}
// plain English word must be looked up in English lexicon before phonemiseDenglish starts
if (usEnglishLexicon != null) {
String englishTranscription = phonemiseEn(text);
if (englishTranscription != null) {
g2pMethod.append("foreign:en");
logger.debug(text + " is English");
if (logEnglishFileName != null) {
String englishText = text.trim();
if (english2Frequency.containsKey(englishText)) {
int textFreq = english2Frequency.get(englishText);
textFreq++;
english2Frequency.put(englishText, textFreq);
} else {
english2Frequency.put(englishText, 1);
}
}
return englishTranscription;
}
}
Result resultingWord = null;
boolean usedOtherLanguageToPhonemise = false;
try {
resultingWord = phonemiseDenglish.processWord(text, usEnglishLexicon != null);
result = resultingWord.getTranscription();
usedOtherLanguageToPhonemise = resultingWord.isUsedOtherLanguageToPhonemise();
} catch (NullPointerException e) {
logger.debug(String.format("Word is Null: ", e.getMessage()));
}
// logger.debug("input for PD: "+text);
if (result != null) {
result = allophoneSet.splitAllophoneString(result);
if (usedOtherLanguageToPhonemise) {
g2pMethod.append("phonemiseDenglish");
return result;
} else {
g2pMethod.append("compound");
return result;
}
}
// Cannot find it in the lexicon -- apply letter-to-sound rules
// to the normalised form
String phones = ""; // added
try {
phones = lts.predictPronunciation(normalised); // added
result = lts.syllabify(phones);
} catch (IllegalArgumentException e) {
logger.error(String.format("Problem with token <%s> [%s]: %s", normalised, phones, e.getMessage()));
} catch (ClassCastException e) {
logger.error(String.format("Problem with token <%s> : %s", normalised, e.getMessage())); // added
}
if (result != null) {
if (logUnknownFileName != null) {
String unknownText = text.trim();
if (unknown2Frequency.containsKey(unknownText)) {
int textFreq = unknown2Frequency.get(unknownText);
textFreq++;
unknown2Frequency.put(unknownText, textFreq);
} else {
unknown2Frequency.put(unknownText, new Integer(1));
}
}
g2pMethod.append("rules");
return result;
}
return null;
}
/**
* Try to determine an English transcription of the text according to English rules, but using German Sampa.
*
* @param text
* Word to transcribe
* @return the transcription, or null if none could be determined.
*/
public String phonemiseEn(String text) {
assert usEnglishLexicon != null;
// We get here only if there is an English lexicon
String normalisedEn = MaryUtils.normaliseUnicodeLetters(text, Locale.US);
normalisedEn = normalisedEn.toLowerCase();
String[] transcriptions = usEnglishLexicon.lookup(normalisedEn);
assert transcriptions != null; // if nothing is found, an array of length 0 is returned.
if (transcriptions.length == 0) {
return null;
}
String usSampa = transcriptions[0];
String deSampa = PAConverter.sampaEnString2sampaDeString(usSampa);
// logger.debug("converted "+usSampa+" to "+deSampa);
return deSampa;
}
/**
* This method tries to decompose a compound. It calls itself recursively.
*
* @param text
* the word to be transcribed.
* @return the SAMPA transcription of text, or null if none was found.
*/
/*
* private String compoundSearch(String text) { // Chop off longest possible prefixes and try to look them up // in the
* lexicon. Any part must have a minimum length of 3 characters.
* //System.out.println("Compound Search is starting with: "+text);
*
* for (int i=text.length() - 3; i >= 3; i--) { //-3!!! >= 3!!!
*
* String firstPhon = null; String fugePhon = null; String restPhon = null; String prefix = text.substring(0, i);
*
*
* firstPhon = userdictLookup(prefix);
*
* if (firstPhon == null) firstPhon = lexiconLookup(prefix);
*
* if (firstPhon != null) { // found a valid prefix String rest = text.substring(i); logger.debug("Rest is: "+rest);
*
* // Is the rest a simple lexical entry? restPhon = userdictLookup(rest);
*
* if (restPhon == null) restPhon = lexiconLookup(rest);
*
* // Or can the rest be analysed as a compound? if (restPhon == null) restPhon = compoundSearch(rest);
*
* // Or does it help if we cut off a Fuge? if (restPhon == null) { String [] helper = fugeSearch(rest); //hier scheint er
* nicht mehr reinzugehen //logger.debug("fugeSearch(rest) is: " + fugeSearch(rest)); if (helper != null && helper.length ==
* 2) { fugePhon = helper[0]; String rest2 = helper[1]; assert fugePhon != null; assert rest2 != null; restPhon =
* userdictLookup(rest2); if (restPhon == null) restPhon = lexiconLookup(rest2); if (restPhon == null) restPhon =
* compoundSearch(rest2); } } if (restPhon != null) // success! return firstPhon + (fugePhon != null ? fugePhon : "") + "-" +
* restPhon; } } return null; }
*/
/**
* Try to cut off a Fuge morpheme at the beginning of suffix.
*
* @param suffix
* a part of a word with a prefix already removed.
* @return a two-item String array. First string is the transcription of the Fuge found, second is the suffix after the Fuge
* was removed. Returns null if no Fuge was found.
*/
/*
* private String[] fugeSearch(String suffix) { String fugePhon = null; int fugeLength = 0; if (suffix.startsWith("es")) {
* fugePhon = "@s"; fugeLength = 2; } else if (suffix.startsWith("en")) { fugePhon = "@n"; fugeLength = 2; } else if
* (suffix.startsWith("n")) { fugePhon = "n"; fugeLength = 1; } else if (suffix.startsWith("s")) { fugePhon = "s"; fugeLength
* = 1; } else if (suffix.startsWith("e")) { fugePhon = "@"; fugeLength = 1; } if (fugePhon != null) { // found a Fuge
* String[] returnValue = new String[2]; returnValue[0] = fugePhon; returnValue[1] = suffix.substring(fugeLength); return
* returnValue; } else { return null; } }
*/
}