LexiconCreator.java example

Explorer
marytts-master
/**
 * Copyright 2008 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.tools.newlanguage;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import marytts.cart.CART;
import marytts.cart.io.MaryCARTReader;
import marytts.cart.io.MaryCARTWriter;
import marytts.exceptions.MaryConfigurationException;
import marytts.fst.AlignerTrainer;
import marytts.fst.FSTLookup;
import marytts.fst.TransducerTrie;
import marytts.modules.phonemiser.AllophoneSet;
import marytts.modules.phonemiser.TrainedLTS;
import marytts.util.MaryUtils;

import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.ConsoleAppender;
import org.apache.log4j.Logger;
import org.apache.log4j.PatternLayout;

/**
 * The LexiconCreator is the base class for creating the files needed to run the phonemiser component for a new language. From a
 * list of phonetically transcribed words, the class will create:
 * <ul>
 * <li>a lexicon file, efficiently stored as a Finite State Transducer;</li>
 * <li>a letter-to-sound prediction file, as a decision tree in MARY format.</li>
 * </ul>
 * 
 * The input file is expected to contain data in the following format:
 * <code>grapheme | ' a l - l o - p h o n e s | (optional-part-of-speech)</code> Hereby, the allophones must correspond to a
 * defined allophone set, given in the constructor. The file's encoding is expected to be UTF-8. Subclasses of LexiconCreator can
 * override prepareLexicon() to provide data in this format.
 * 
 * @see AllophoneSet
 * @author marc
 *
 */
public class LexiconCreator {
	protected Logger logger;
	protected AllophoneSet allophoneSet;
	protected String lexiconFilename;
	protected String fstFilename;
	protected String ltsFilename;
	protected boolean convertToLowercase;
	protected boolean predictStress;
	protected int context;

	/**
	 * Initialise a new lexicon creator. Letter to sound rules built with this lexicon creator will convert graphemes to lowercase
	 * before prediction, using the locale given in the allophone set; letter-to-sound rules will also predict stress; a context
	 * of 2 characters to the left and to the right of the current character will be used as predictive features.
	 * 
	 * @param allophoneSet
	 *            this specifies the set of phonetic symbols that can be used in the lexicon, and provides the locale of the
	 *            lexicon
	 * @param lexiconFilename
	 *            where to find the plain-text lexicon
	 * @param fstFilename
	 *            where to create the compressed lexicon FST file
	 * @param ltsFilename
	 *            where to create the letter-to-sound prediction tree.
	 */
	public LexiconCreator(AllophoneSet allophoneSet, String lexiconFilename, String fstFilename, String ltsFilename) {
		this(allophoneSet, lexiconFilename, fstFilename, ltsFilename, true, true, 2);
	}

	/**
	 * Initialize a new lexicon creator.
	 * 
	 * @param allophoneSet
	 *            this specifies the set of phonetic symbols that can be used in the lexicon, and provides the locale of the
	 *            lexicon
	 * @param lexiconFilename
	 *            where to find the plain-text lexicon
	 * @param fstFilename
	 *            where to create the compressed lexicon FST file
	 * @param ltsFilename
	 *            where to create the letter-to-sound prediction tree.
	 * @param convertToLowercase
	 *            if true, Letter to sound rules built with this lexicon creator will convert graphemes to lowercase before
	 *            prediction, using the locale given in the allophone set.
	 * @param predictStress
	 *            if true, letter-to-sound rules will predict stress.
	 * @param context
	 *            the number of characters to the left and to the right of the current character will be used as predictive
	 *            features.
	 */
	public LexiconCreator(AllophoneSet allophoneSet, String lexiconFilename, String fstFilename, String ltsFilename,
			boolean convertToLowercase, boolean predictStress, int context) {
		this.allophoneSet = allophoneSet;
		this.lexiconFilename = lexiconFilename;
		this.fstFilename = fstFilename;
		this.ltsFilename = ltsFilename;
		this.convertToLowercase = convertToLowercase;
		this.predictStress = predictStress;
		this.context = context;
		this.logger = MaryUtils.getLogger("LexiconCreator");
	}

	/**
	 * This base implementation does nothing. Subclasses can override this method to prepare a lexicon in the expected format,
	 * which should then be found at lexiconFilename.
	 * 
	 * @throws IOException
	 *             IOException
	 */
	protected void prepareLexicon() throws IOException {
	}

	protected void compileFST() throws IOException {
		logger.info("Compressing into FST:");
		logger.info(" - aligning graphemes and allophones...");
		BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(lexiconFilename), "UTF-8"));
		AlignerTrainer at = new AlignerTrainer(false, true);
		at.readLexicon(br, "\\s*\\|\\s*");
		br.close();

		// make some alignment iterations
		for (int i = 0; i < 4; i++) {
			logger.info("     iteration " + (i + 1));
			at.alignIteration();
		}
		logger.info(" - entering alignments in trie...");
		TransducerTrie t = new TransducerTrie();
		for (int i = 0, size = at.lexiconSize(); i < size; i++) {
			t.add(at.getAlignment(i));
			t.add(at.getInfoAlignment(i));
		}
		logger.info(" - minimizing trie...");
		t.computeMinimization();
		logger.info(" - writing transducer to disk...");
		File of = new File(fstFilename);
		DataOutputStream os = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(of)));
		t.writeFST(os, "UTF-8");
		os.flush();
		os.close();
	}

	protected void testFST() throws IOException {
		List<String> testGraphemes = new ArrayList<String>();
		List<String> testAllophones = new ArrayList<String>();
		List<String> testPos = new ArrayList<String>();
		int N = 100; // every N'th entry is put into tests...
		loadTestWords(testGraphemes, testAllophones, testPos, N);
		logger.info(" - looking up " + testGraphemes.size() + " test words...");
		FSTLookup fst = new FSTLookup(fstFilename);
		for (int i = 0, max = testGraphemes.size(); i < max; i++) {
			String key = testGraphemes.get(i);
			String expected = testAllophones.get(i);
			String[] result = fst.lookup(key);
			if (testPos.get(i) != null) {
				String key2 = key + testPos.get(i);
				String[] result2 = fst.lookup(key2);
				if (!expected.equals(result2[0]))
					logger.info("    " + key2 + " -> " + Arrays.toString(result2) + " (expected: " + expected + ")");
				// in addition, expected should be one of the results of a lookup without pos
				boolean found = false;
				for (String r : result) {
					if (expected.equals(r)) {
						found = true;
						break;
					}
				}
				if (!found)
					logger.info("    " + key + " -> " + Arrays.toString(result) + " (expected: " + expected + ")");
			} else {
				if (!expected.equals(result[0]))
					logger.info("    " + key + " -> " + Arrays.toString(result) + " (expected: " + expected + ")");
			}
		}
		logger.info("...done!\n");
	}

	private void loadTestWords(List<String> testGraphemes, List<String> testAllophones, List<String> testPos, int N)
			throws UnsupportedEncodingException, FileNotFoundException, IOException {
		int n = 0;
		BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(lexiconFilename), "UTF-8"));
		String line;
		while ((line = br.readLine()) != null) {
			String[] parts = line.split("\\s*\\|\\s*");
			String graphemes = parts[0];
			String allophones = parts[1];
			String pos = (parts.length > 2 && parts[2].length() > 0) ? parts[2] : null;
			n++;
			if (n == N) {
				testGraphemes.add(graphemes);
				testAllophones.add(allophones);
				testPos.add(pos);
				n = 0;
			}
		}
	}

	protected void compileLTS() throws IOException {
		logger.info("Training letter-to-sound rules...");
		// initialize trainer
		LTSTrainer tp = new LTSTrainer(allophoneSet, convertToLowercase, predictStress, context);
		BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(lexiconFilename), "UTF-8"));

		logger.info(" - reading lexicon...");
		// read lexicon for training
		tp.readLexicon(br, "\\s*\\|\\s*");

		logger.info(" - aligning...");
		// make some alignment iterations
		for (int i = 0; i < 5; i++) {
			logger.info("     iteration " + (i + 1));
			tp.alignIteration();

		}
		logger.info(" - training decision tree...");
		CART st = tp.trainTree(10);
		logger.info(" - saving...");
		// new MARY cart format:
		MaryCARTWriter mcw = new MaryCARTWriter();
		mcw.dumpMaryCART(st, ltsFilename);

		// Alternative ways of saving the CART would be:
		// MARY cart text format:
		// PrintWriter pw = new PrintWriter("lib/modules/en/us/lexicon/cmudict.lts.tree.txt", "UTF-8");
		// mcw.toTextOut(st, pw);
		// pw.close();
		// old wagon cart, text and binary format:
		// WagonCARTWriter wcw = new WagonCARTWriter();
		// wcw.dumpWagonCART(st, "lib/modules/en/us/lexicon/cmudict.lts.wagontree.binary");
		// pw = new PrintWriter("lib/modules/en/us/lexicon/cmudict.lts.wagontree.txt", "UTF-8");
		// wcw.toTextOut(st, pw);
		// pw.close();
		// For all of these, it would also be necessary to separately save the feature definition:
		// pw = new PrintWriter("lib/modules/en/us/lexicon/cmudict.lts.pfeats", "UTF-8");
		// st.getFeatureDefinition().writeTo(pw, false);
		// pw.close();

	}

	protected void testLTS() throws IOException, MaryConfigurationException {
		List<String> testGraphemes = new ArrayList<String>();
		List<String> testAllophones = new ArrayList<String>();
		List<String> testPos = new ArrayList<String>();
		int N = 100; // every N'th entry is put into tests...
		loadTestWords(testGraphemes, testAllophones, testPos, N);

		logger.info(" - loading LTS rules...");
		MaryCARTReader cartReader = new MaryCARTReader();
		CART st = cartReader.load(ltsFilename);
		TrainedLTS lts = new TrainedLTS(allophoneSet, st);

		logger.info(" - looking up " + testGraphemes.size() + " test words...");
		int max = testGraphemes.size();
		int correct = 0;
		for (int i = 0; i < max; i++) {
			String key = testGraphemes.get(i);
			String expected = testAllophones.get(i);
			String result = lts.syllabify(lts.predictPronunciation(key));
			if (!expected.equals(result))
				logger.info("    " + key + " -> " + result + " (expected: " + expected + ")");
			else
				correct++;
		}
		logger.info("   for " + correct + " out of " + max + " prediction is identical to lexicon entry.");
		logger.info("...done!\n");
	}

	public void createLexicon() throws Exception {
		prepareLexicon();
		compileFST();
		testFST();
		System.gc();
		compileLTS();
		testLTS();
	}

	/**
	 * @param args
	 *            args
	 * @throws Exception
	 *             Exception
	 */
	public static void main(String[] args) throws Exception {
		PatternLayout layout = new PatternLayout("%d %m\n");
		BasicConfigurator.configure(new ConsoleAppender(layout));
		AllophoneSet allophoneSet = AllophoneSet.getAllophoneSet(args[0]);
		String lexiconFilename = args[1];
		String fstFilename = args[2];
		String ltsFilename = args[3];
		LexiconCreator lc = new LexiconCreator(allophoneSet, lexiconFilename, fstFilename, ltsFilename);
		lc.createLexicon();
	}

}