/** * Copyright 2002-2008 DFKI GmbH. * All Rights Reserved. Use is subject to license terms. * * This file is part of MARY TTS. * * MARY TTS is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, version 3 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package marytts.modules; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.StringTokenizer; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import marytts.datatypes.MaryData; import marytts.datatypes.MaryDataType; import marytts.datatypes.MaryXML; import marytts.exceptions.MaryConfigurationException; import marytts.fst.FSTLookup; import marytts.modules.phonemiser.AllophoneSet; import marytts.modules.phonemiser.TrainedLTS; import marytts.server.MaryProperties; import marytts.util.MaryRuntimeUtils; import marytts.util.MaryUtils; import marytts.util.dom.MaryDomUtils; import org.w3c.dom.DOMException; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.traversal.NodeIterator; /** * The phonemiser module -- java implementation. * * @author Marc Schröder, Sathish * @author ingmar */ public class JPhonemiser extends InternalModule { protected Map<String, List<String>> userdict; protected FSTLookup lexicon; protected TrainedLTS lts; protected boolean removeTrailingOneFromPhones = true; protected AllophoneSet allophoneSet; protected Pattern punctuationPosRegex; protected Pattern unpronounceablePosRegex; public JPhonemiser(String propertyPrefix) throws IOException, MaryConfigurationException { this("JPhonemiser", MaryDataType.PARTSOFSPEECH, MaryDataType.PHONEMES, propertyPrefix + "allophoneset", propertyPrefix + "userdict", propertyPrefix + "lexicon", propertyPrefix + "lettertosound", propertyPrefix + "removeTrailingOneFromPhones"); } /** * Constructor providing the individual filenames of files that are required. * * @param componentName * componentName * @param inputType * inputType * @param outputType * outputType * @param allophonesProperty * allophonesProperty * @param userdictProperty * userdictProperty * @param lexiconProperty * lexiconProperty * @param ltsProperty * ltsProperty * @throws IOException * IOException * @throws MaryConfigurationException * MaryConfigurationException */ public JPhonemiser(String componentName, MaryDataType inputType, MaryDataType outputType, String allophonesProperty, String userdictProperty, String lexiconProperty, String ltsProperty) throws IOException, MaryConfigurationException { this(componentName, inputType, outputType, allophonesProperty, userdictProperty, lexiconProperty, ltsProperty, null); } /** * Constructor providing the individual filenames of files that are required. * * @param componentName * componentName * @param inputType * inputType * @param outputType * outputType * @param allophonesProperty * allophonesProperty * @param userdictProperty * userdictProperty * @param lexiconProperty * lexiconProperty * @param ltsProperty * ltsProperty * @param removetrailingonefromphonesProperty * removetrailingonefromphonesProperty * @throws IOException * IOException * @throws MaryConfigurationException * MaryConfigurationException */ public JPhonemiser(String componentName, MaryDataType inputType, MaryDataType outputType, String allophonesProperty, String userdictProperty, String lexiconProperty, String ltsProperty, String removetrailingonefromphonesProperty) throws IOException, MaryConfigurationException { super(componentName, inputType, outputType, MaryRuntimeUtils.needAllophoneSet(allophonesProperty).getLocale()); allophoneSet = MaryRuntimeUtils.needAllophoneSet(allophonesProperty); // userdict is optional String userdictFilename = MaryProperties.getFilename(userdictProperty); // may be null if (userdictFilename != null) { if (new File(userdictFilename).exists()) { userdict = readLexicon(userdictFilename); } else { logger.info("User dictionary '" + userdictFilename + "' for locale '" + getLocale() + "' does not exist. Ignoring."); } } InputStream lexiconStream = MaryProperties.needStream(lexiconProperty); lexicon = new FSTLookup(lexiconStream, lexiconProperty); InputStream ltsStream = MaryProperties.needStream(ltsProperty); if (removetrailingonefromphonesProperty != null) { this.removeTrailingOneFromPhones = MaryProperties.getBoolean(removetrailingonefromphonesProperty, true); } lts = new TrainedLTS(allophoneSet, ltsStream, this.removeTrailingOneFromPhones); } public void startup() throws Exception { super.startup(); setPunctuationPosRegex(); setUnpronounceablePosRegex(); } public MaryData process(MaryData d) throws Exception { Document doc = d.getDocument(); NodeIterator it = MaryDomUtils.createNodeIterator(doc, doc, MaryXML.TOKEN); Element t = null; while ((t = (Element) it.nextNode()) != null) { String text; // Do not touch tokens for which a transcription is already // given (exception: transcription contains a '*' character: if (t.hasAttribute("ph") && !t.getAttribute("ph").contains("*")) { continue; } if (t.hasAttribute("sounds_like")) text = t.getAttribute("sounds_like"); else text = MaryDomUtils.tokenText(t); // use part-of-speech if available String pos = null; if (t.hasAttribute("pos")) { pos = t.getAttribute("pos"); } if (maybePronounceable(text, pos)) { // If text consists of several parts (e.g., because that was // inserted into the sounds_like attribute), each part // is transcribed separately. StringBuilder ph = new StringBuilder(); String g2pMethod = null; StringTokenizer st = new StringTokenizer(text, " -"); while (st.hasMoreTokens()) { String graph = st.nextToken(); StringBuilder helper = new StringBuilder(); String phon = phonemise(graph, pos, helper); // null result should not be processed if (phon == null) { continue; } if (ph.length() == 0) { // first part // The g2pMethod of the combined beast is // the g2pMethod of the first constituant. g2pMethod = helper.toString(); ph.append(phon); } else { // following parts ph.append(" - "); // Reduce primary to secondary stress: ph.append(phon.replace('\'', ',')); } } if (ph != null && ph.length() > 0) { setPh(t, ph.toString()); t.setAttribute("g2p_method", g2pMethod); } } } MaryData result = new MaryData(outputType(), d.getLocale()); result.setDocument(doc); return result; } /** * Phonemise the word text. This starts with a simple lexicon lookup, followed by some heuristics, and finally applies * letter-to-sound rules if nothing else was successful. * * @param text * the textual (graphemic) form of a word. * @param pos * the part-of-speech of the word * @param g2pMethod * This is an awkward way to return a second String parameter via a StringBuilder. If a phonemisation of the text * is found, this parameter will be filled with the method of phonemisation ("lexicon", ... "rules"). * @return a phonemisation of the text if one can be generated, or null if no phonemisation method was successful. */ public String phonemise(String text, String pos, StringBuilder g2pMethod) { // First, try a simple userdict and lexicon lookup: String result = userdictLookup(text, pos); if (result != null) { g2pMethod.append("userdict"); return result; } result = lexiconLookup(text, pos); if (result != null) { g2pMethod.append("lexicon"); return result; } // Lookup attempts failed. Try normalising exotic letters // (diacritics on vowels, etc.), look up again: String normalised = MaryUtils.normaliseUnicodeLetters(text, getLocale()); if (!normalised.equals(text)) { result = userdictLookup(normalised, pos); if (result != null) { g2pMethod.append("userdict"); return result; } result = lexiconLookup(normalised, pos); if (result != null) { g2pMethod.append("lexicon"); return result; } } // Cannot find it in the lexicon -- apply letter-to-sound rules // to the normalised form String phones = lts.predictPronunciation(text); try { result = lts.syllabify(phones); } catch (IllegalArgumentException e) { logger.error(String.format("Problem with token <%s> [%s]: %s", text, phones, e.getMessage())); } if (result != null) { g2pMethod.append("rules"); return result; } return null; } /** * Look a given text up in the (standard) lexicon. part-of-speech is used in case of ambiguity. * * @param text * text * @param pos * pos * @return null if text == null or text.length is 0, null if entries.length is 0, entries[0] otherwise */ public String lexiconLookup(String text, String pos) { if (text == null || text.length() == 0) return null; String[] entries; entries = lexiconLookupPrimitive(text, pos); // If entry is not found directly, try the following changes: // - lowercase the word // - all lowercase but first uppercase if (entries.length == 0) { text = text.toLowerCase(getLocale()); entries = lexiconLookupPrimitive(text, pos); } if (entries.length == 0) { text = text.substring(0, 1).toUpperCase(getLocale()) + text.substring(1); entries = lexiconLookupPrimitive(text, pos); } if (entries.length == 0) return null; return entries[0]; } private String[] lexiconLookupPrimitive(String text, String pos) { String[] entries; if (pos != null) { // look for pos-specific version first entries = lexicon.lookup(text + pos); if (entries.length == 0) { // not found -- lookup without pos entries = lexicon.lookup(text); } } else { entries = lexicon.lookup(text); } return entries; } /** * look a given text up in the userdict. part-of-speech is used in case of ambiguity. * * @param text * text * @param pos * pos * @return null if userdict is null or text is null or text.length is 0, null if entries is null, transcr otherwise */ public String userdictLookup(String text, String pos) { if (userdict == null || text == null || text.length() == 0) return null; List<String> entries = userdict.get(text); // If entry is not found directly, try the following changes: // - lowercase the word // - all lowercase but first uppercase if (entries == null) { text = text.toLowerCase(getLocale()); entries = userdict.get(text); } if (entries == null) { text = text.substring(0, 1).toUpperCase(getLocale()) + text.substring(1); entries = userdict.get(text); } if (entries == null) return null; String transcr = null; for (String entry : entries) { String[] parts = entry.split("\\|"); transcr = parts[0]; if (parts.length > 1 && pos != null) { StringTokenizer tokenizer = new StringTokenizer(entry); while (tokenizer.hasMoreTokens()) { String onePos = tokenizer.nextToken(); if (pos.equals(onePos)) return transcr; // found } } } // no match of POS: return last entry return transcr; } /** * Access the allophone set underlying this phonemiser. * * @return allophoneSet */ public AllophoneSet getAllophoneSet() { return allophoneSet; } /** * Read a lexicon. Lines must have the format * * graphemestring | phonestring | optional-parts-of-speech * * The pos-item is optional. Different pos's belonging to one grapheme chain may be separated by whitespace * * * @param lexiconFilename * lexiconFilename * @throws IOException * IOException * @return fLexicon */ protected Map<String, List<String>> readLexicon(String lexiconFilename) throws IOException { logger.debug(String.format("Reading lexicon from '%s'", lexiconFilename)); String line; Map<String, List<String>> fLexicon = new HashMap<String, List<String>>(); BufferedReader lexiconFile = new BufferedReader(new InputStreamReader(new FileInputStream(lexiconFilename), "UTF-8")); while ((line = lexiconFile.readLine()) != null) { // Ignore empty lines and comments: if (line.trim().equals("") || line.startsWith("#")) continue; String[] lineParts = line.split("\\s*\\|\\s*"); String graphStr = lineParts[0]; String phonStr = null; try { phonStr = lineParts[1]; } catch (ArrayIndexOutOfBoundsException e) { logger.warn(String.format("Lexicon '%s': missing transcription for '%s'", lexiconFilename, graphStr)); continue; } try { allophoneSet.splitIntoAllophones(phonStr); } catch (IllegalArgumentException e) { logger.warn(String.format("Lexicon '%s': invalid entry for '%s': %s", lexiconFilename, graphStr, e.getMessage())); continue; } String phonPosStr = phonStr; if (lineParts.length > 2) { String pos = lineParts[2]; if (!pos.trim().equals("")) phonPosStr += "|" + pos; } List<String> transcriptions = fLexicon.get(graphStr); if (null == transcriptions) { transcriptions = new ArrayList<String>(); fLexicon.put(graphStr, transcriptions); } transcriptions.add(phonPosStr); } lexiconFile.close(); return fLexicon; } protected void setPh(Element t, String ph) { if (!t.getTagName().equals(MaryXML.TOKEN)) throw new DOMException(DOMException.INVALID_ACCESS_ERR, "Only t elements allowed, received " + t.getTagName() + "."); if (t.hasAttribute("ph")) { String prevPh = t.getAttribute("ph"); // In previous sampa, replace star with sampa: String newPh = prevPh.replaceFirst("\\*", ph); t.setAttribute("ph", newPh); } else { t.setAttribute("ph", ph); } } /** * Compile a regex pattern used to determine whether tokens are processed as punctuation or not, based on whether their * <code>pos</code> attribute matches the pattern. * */ protected void setPunctuationPosRegex() { String language = getLocale().getLanguage(); String propertyName = language + ".pos.punct.regex"; String defaultRegex = "\\$PUNCT"; String regex = MaryProperties.getProperty(propertyName); if (regex == null) { logger.debug(String.format("Property %s not set, using default", propertyName)); regex = defaultRegex; } else { logger.debug(String.format("Using property %s", propertyName)); } try { punctuationPosRegex = Pattern.compile(regex); } catch (PatternSyntaxException e) { logger.error(String.format("Could not compile regex pattern /%s/, using default instead", regex)); punctuationPosRegex = Pattern.compile(defaultRegex); } logger.debug(String.format("Punctuation regex pattern set to /%s/", punctuationPosRegex)); } /** * Compile a regex pattern used to determine whether tokens are processed as unprounounceable or not, based on whether their * <code>pos</code> attribute matches the pattern. * */ protected void setUnpronounceablePosRegex() { String language = getLocale().getLanguage(); String propertyName = language + ".pos.unprounounceable.regex"; String defaultRegex = "^[^a-zA-Z]+$"; String regex = MaryProperties.getProperty(propertyName); if (regex == null) { logger.debug(String.format("Property %s not set, using default", propertyName)); regex = defaultRegex; } else { logger.debug(String.format("Using property %s", propertyName)); } try { unpronounceablePosRegex = Pattern.compile(regex); } catch (PatternSyntaxException e) { logger.error(String.format("Could not compile regex pattern /%s/, using default instead", regex)); unpronounceablePosRegex = Pattern.compile(defaultRegex); } logger.debug(String.format("Punctuation regex pattern set to /%s/", unpronounceablePosRegex)); } /** * Based on the regex compiled in {@link #setPunctuationPosRegex()}, determine whether a given POS string is classified as * punctuation * * @param pos * the POS tag * @return <b>true</b> if the POS tag matches the regex pattern; <b>false</b> otherwise * @throws NullPointerException * if the regex pattern is null (because it hasn't been set during module startup) * */ public boolean isPosPunctuation(String pos) { if (pos != null && punctuationPosRegex.matcher(pos).matches()) { return true; } return false; } public boolean isUnpronounceable(String pos) { if (pos != null && unpronounceablePosRegex.matcher(pos).matches()) { return true; } return false; } /** * Determine whether token should be pronounceable, based on text and POS tag. * * @param text * the text of the token * @param pos * the POS tag of the token * @return <b>false</b> if the text is empty, or if it contains no word characters <em>and</em> the POS tag indicates * punctuation; <b>true</b> otherwise */ public boolean maybePronounceable(String text, String pos) { // does text contain anything at all? if (text == null || text.isEmpty()) { return false; } // does text contain at least one word character? if (text.matches(".*\\w.*")) { return true; } // does POS tag indicate punctuation? if (isPosPunctuation(pos)) { return false; } // does POS tag indicate punctuation? if (isUnpronounceable(pos)) { return false; } // by default, just try to pronounce anyway return true; } }