// CMUDict2MaryFST.java example
//
// Explorer
// marytts-master
/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.tools.newlanguage.en_US;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
import java.util.StringTokenizer;

import marytts.modules.phonemiser.AllophoneSet;
import marytts.modules.phonemiser.Syllabifier;
import marytts.tools.newlanguage.LexiconCreator;

import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.ConsoleAppender;
import org.apache.log4j.PatternLayout;

/**
 * This class does a one-time, offline conversion from the CMUDict in Festival format (cmudict-0.4.scm and cmudict_extensions.scm)
 * into MARY format. Specifically, the following steps are performed:
 * <ol>
 * <li>conversion to a text format without brackets, using '|' as the delimiter between three fields:
 * <code>graphemes | allophones | part-of-speech(optional)</code></li>
 * <li>conversion of the phonetic alphabet used from MRPA to SAMPA</li>
 * <li>creation of a compact FST representing the lexicon</li>
 * <li>training of Letter-to-sound rules from the data</li>
 * </ol>
 * 
 * @author marc
 *
 */
public class CMUDict2MaryFST extends LexiconCreator {
	private static final String LEXPATH = "lib/modules/en/us/lexicon/";

	/** Lookup table from a voice (MRPA) phone symbol to its SAMPA equivalent; rebuilt by {@link #prepareLexicon()}. */
	private Map<String, String> mrpa2sampa;

	/**
	 * Sets up the converter with the en_US allophone set and the lexicon, FST and LTS file names below {@link #LEXPATH}.
	 * 
	 * @throws Exception
	 *             if the allophone set cannot be loaded or the superclass constructor fails
	 */
	public CMUDict2MaryFST() throws Exception {
		super(AllophoneSet.getAllophoneSet(LEXPATH + "allophones.en_US.xml"), LEXPATH + "cmudictSampa.txt", LEXPATH
				+ "cmudict.fst", LEXPATH + "cmudict.lts", true, // convert to lowercase
				true, // predict stress
				3 // number of characters to the left and to the right to use for prediction
		);
	}

	/**
	 * Converts the Festival-format CMUDict files (<code>cmudict-0.4.scm</code> followed by
	 * <code>cmudict_extensions.scm</code>) into the MARY text lexicon named by <code>lexiconFilename</code>.
	 * 
	 * @throws IOException
	 *             if an input file cannot be read or the output file cannot be written
	 * @throws IllegalStateException
	 *             if the CMUDict source file is not found relative to the current working directory
	 */
	@Override
	protected void prepareLexicon() throws IOException {
		File cmudict = new File(LEXPATH + "cmu/cmudict-0.4.scm");
		if (!cmudict.exists())
			throw new IllegalStateException("This program should be called from the MARY base directory.");
		File extensions = new File(LEXPATH + "cmu/cmudict_extensions.scm");
		File cmudictSampa = new File(lexiconFilename);

		// Convert to SAMPA text dictionary
		logger.info("Converting dictionary to MARY text format...");
		mrpa2sampa = new HashMap<String, String>();
		fillSampaMap();

		PrintWriter toSampa = new PrintWriter(cmudictSampa, "UTF-8");
		try {
			convertFile(cmudict, toSampa);
			convertFile(extensions, toSampa);
			// PrintWriter never throws on write failures; checkError() flushes and reports them.
			if (toSampa.checkError()) {
				throw new IOException("Could not write converted lexicon to '" + lexiconFilename + "'");
			}
		} finally {
			// Release the output file even if the conversion failed half-way.
			toSampa.close();
		}
		logger.info("...done!\n");

	}

	/**
	 * Reads one Festival-format dictionary file (ASCII) and appends its converted entries to the given writer. The reader is
	 * always closed, also when the conversion throws.
	 * 
	 * @param festivalLexicon
	 *            the .scm file to read
	 * @param toSampa
	 *            the writer receiving the converted entries; not closed here
	 * @throws IOException
	 *             if the file cannot be opened or read
	 */
	private void convertFile(File festivalLexicon, PrintWriter toSampa) throws IOException {
		BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(festivalLexicon), "ASCII"));
		try {
			convertToSampa(br, toSampa);
		} finally {
			br.close();
		}
	}

	/**
	 * Loads the phone mapping file into {@link #mrpa2sampa}. Empty lines and lines starting with '#' are ignored.
	 * 
	 * @throws IllegalArgumentException
	 *             if the map file cannot be read or contains an entry that cannot be parsed
	 */
	private void fillSampaMap() {
		// Any phone inventory mappings?
		String sampamapFilename = "lib/modules/en/synthesis/sampa2mrpa_en.map";
		try {
			BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(sampamapFilename), "UTF-8"));
			try {
				String line;
				while ((line = br.readLine()) != null) {
					line = line.trim();
					if (line.equals("") || line.startsWith("#")) {
						continue; // ignore empty and comment lines
					}
					try {
						addSampaMapEntry(line);
					} catch (IllegalArgumentException iae) {
						// A broken mapping would silently corrupt the generated lexicon, so abort and name the line.
						throw new IllegalArgumentException("Invalid entry in sampa map file " + sampamapFilename + ": '"
								+ line + "'", iae);
					}
				}
			} finally {
				br.close();
			}
		} catch (IOException ioe) {
			throw new IllegalArgumentException("Cannot open file '" + sampamapFilename + "'", ioe);
		}
	}

	/**
	 * Parses one mapping line of the form <code>sampa&lt;-&gt;voice</code>, <code>sampa-&gt;voice</code> or
	 * <code>sampa&lt;-voice</code>. Only the voice-to-sampa direction (<code>&lt;-&gt;</code> and <code>&lt;-</code>) is
	 * stored; a pure <code>-&gt;</code> entry is validated but adds nothing to the table.
	 * 
	 * @param entry
	 *            a trimmed, non-empty, non-comment line from the map file
	 * @throws IllegalArgumentException
	 *             if the line has no mapping arrow or does not split into exactly two parts
	 */
	private void addSampaMapEntry(String entry) throws IllegalArgumentException {
		boolean v2s = false;
		String[] parts = null;
		// For one-to-many mappings, '+' can be used to group phone symbols.
		// E.g., the line "EI->E:+I" would map "EI" to "E:" and "I"
		// NOTE(review): a grouped left-hand side ends up as one space-separated map value - confirm the
		// consumers of the converted phone string cope with that.
		entry = entry.replace('+', ' ');
		// "<->" must be tested first, because it also contains "->" and "<-".
		if (entry.indexOf("<->") != -1) {
			parts = entry.split("<->");
			v2s = true;
		} else if (entry.indexOf("->") != -1) {
			parts = entry.split("->");
		} else if (entry.indexOf("<-") != -1) {
			parts = entry.split("<-");
			v2s = true;
		}
		if (parts == null || parts.length != 2) { // invalid entry
			throw new IllegalArgumentException();
		}
		if (v2s) {
			mrpa2sampa.put(parts[1].trim(), parts[0].trim());
		}
	}

	/**
	 * Converts a single phonetic symbol in MRPA representation into its equivalent in MARY sampa representation.
	 * 
	 * @param voicePhoneme
	 *            the MRPA phone symbol, without stress digit
	 * @return the converted phone, or the input string if no known conversion exists.
	 */
	private String mrpa2sampa(String voicePhoneme) {
		String sampa = mrpa2sampa.get(voicePhoneme);
		// The table is only ever filled with non-null values, so null means "no mapping".
		return sampa != null ? sampa : voicePhoneme;
	}

	/**
	 * Converts a whitespace-separated MRPA phone string into a SAMPA phone string. A trailing "1" on a phone is kept as a
	 * suffix of the converted phone, a trailing "0" is dropped. The converted list is then passed through a
	 * {@link Syllabifier} (presumably inserting syllable structure in place) and joined with single spaces.
	 * 
	 * @param mrpaString
	 *            the phones of one lexicon entry, e.g. <code>ax k w ay1 er0</code>
	 * @return the converted, syllabified phone string
	 */
	private String mrpaString2sampaString(String mrpaString) {
		StringTokenizer st = new StringTokenizer(mrpaString);
		LinkedList<String> sampaList = new LinkedList<String>();
		while (st.hasMoreTokens()) {
			String mrpa = st.nextToken();
			String sampa;
			if (mrpa.endsWith("1")) {
				sampa = mrpa2sampa(mrpa.substring(0, mrpa.length() - 1)) + "1";
			} else if (mrpa.endsWith("0")) {
				sampa = mrpa2sampa(mrpa.substring(0, mrpa.length() - 1));
			} else {
				sampa = mrpa2sampa(mrpa);
			}
			sampaList.add(sampa);
		}
		new Syllabifier(allophoneSet).syllabify(sampaList);
		StringBuilder sb = new StringBuilder();
		for (String s : sampaList) {
			if (sb.length() > 0)
				sb.append(" ");
			sb.append(s);
		}
		return sb.toString();
	}

	/**
	 * Converts every entry read from <code>br</code> and writes it to <code>toSampa</code> as
	 * <code>graphemes | allophones | (pos)</code>, with an empty third field when the part of speech is <code>nil</code>.
	 * Empty lines and lines starting with ';' are ignored. Lines that do not match the expected layout are reported on
	 * stderr and skipped.
	 * 
	 * @param br
	 *            reader over a Festival-format lexicon; not closed here
	 * @param toSampa
	 *            writer receiving one line per converted entry; not closed here
	 * @throws IOException
	 *             if reading from <code>br</code> fails
	 */
	private void convertToSampa(BufferedReader br, PrintWriter toSampa) throws IOException {
		String line;
		while ((line = br.readLine()) != null) {
			line = line.trim();
			// skip comments:
			if (line.startsWith(";") || line.equals(""))
				continue;
			// expected line format:
			// ("acquirer" nil (ax k w ay1 er0 er0))
			int firstQuote = line.indexOf('"');
			if (firstQuote < 0) {
				reportStrangeLine("no first quote", line);
				continue;
			}
			int secondQuote = line.indexOf('"', firstQuote + 1);
			if (secondQuote <= firstQuote) {
				reportStrangeLine("no second quote", line);
				continue;
			}
			int firstSpace = secondQuote + 1;
			// The length check guards against a line that ends right after the closing quote.
			if (firstSpace >= line.length() || line.charAt(firstSpace) != ' ') {
				reportStrangeLine("no first space", line);
				continue;
			}
			int secondSpace = line.indexOf(' ', firstSpace + 1);
			if (secondSpace <= firstSpace) {
				reportStrangeLine("no second space", line);
				continue;
			}
			int firstBracket = secondSpace + 1;
			if (firstBracket >= line.length() || line.charAt(firstBracket) != '(') {
				reportStrangeLine("no first bracket", line);
				continue;
			}
			int secondBracket = line.indexOf(')', firstBracket + 1);
			if (secondBracket <= firstBracket) {
				reportStrangeLine("no second bracket", line);
				continue;
			}
			String graphemes = line.substring(firstQuote + 1, secondQuote);
			String pos = line.substring(firstSpace + 1, secondSpace);
			if (pos.equals("nil"))
				pos = "";
			else
				pos = "(" + pos + ")";
			String allophones = line.substring(firstBracket + 1, secondBracket);
			String sampaString = mrpaString2sampaString(allophones);
			toSampa.println(graphemes + " | " + sampaString + " | " + pos);
		}
	}

	/**
	 * Reports a lexicon line that does not have the expected layout.
	 * 
	 * @param problem
	 *            short description of what is missing, e.g. "no first quote"
	 * @param line
	 *            the offending input line
	 */
	private static void reportStrangeLine(String problem, String line) {
		System.err.println("Skipping strange line (" + problem + "): " + line);
	}

	/**
	 * Command-line entry point: configures console logging and runs the lexicon creation.
	 * 
	 * @param args
	 *            ignored
	 * @throws Exception
	 *             if the converter cannot be set up or the lexicon creation fails
	 */
	public static void main(String[] args) throws Exception {
		PatternLayout layout = new PatternLayout("%d %m\n");
		BasicConfigurator.configure(new ConsoleAppender(layout));

		CMUDict2MaryFST c2m = new CMUDict2MaryFST();
		c2m.createLexicon();
	}

}