// HMMModel.java example (Explorer: marytts-master)
/**
 * Copyright 2010 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package marytts.modules.acoustic;

import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.WeakHashMap;

import marytts.exceptions.MaryConfigurationException;
import marytts.features.FeatureDefinition;
import marytts.features.FeatureProcessorManager;
import marytts.features.FeatureVector;
import marytts.htsengine.CartTreeSet;
import marytts.htsengine.HMMData;
import marytts.htsengine.HTSModel;
import marytts.htsengine.HTSParameterGeneration;
import marytts.htsengine.HTSUttModel;
import marytts.unitselection.select.Target;
import marytts.util.MaryUtils;

import org.apache.log4j.Logger;
import org.w3c.dom.Element;

/**
 * Model for predicting duration and F0 from HMMs
 *
 * @author marcela
 *
 */
public class HMMModel extends Model {
	/**
	 * Configuration information of the model. Created on first use in {@code loadData()} and initialised there for the
	 * voice given by {@code voiceName}.
	 */
	private HMMData htsData = null;
	/**
	 * HMM trees and pdfs for this model, obtained from {@code htsData} in {@code loadData()}.
	 */
	private CartTreeSet cart;
	/**
	 * Feature definition used when training HMMs. NOTE(review): this field is neither assigned nor read inside this class
	 * (the methods use {@code htsData.getFeatureDefinition()} instead) -- confirm whether anything else in the package
	 * still relies on it.
	 */
	FeatureDefinition hmmFeatureDefinition;
	/**
	 * Duration of one frame in seconds (frame period divided by the rate, both read from {@code htsData} in
	 * {@code loadData()}); used to convert between frame counts and durations in seconds.
	 */
	private float fperiodsec;

	/** Logger shared by all instances of this class. */
	protected static Logger logger = MaryUtils.getLogger("HMMModel");

	/**
	 * If the model is instantiated because the same HMMModel is used for predicting both F0 and duration, set this variable
	 * true; according to the original note this is done in Voice.loadAcousticModels(), when creating the models.
	 */
	private boolean predictDurAndF0 = false;

	/**
	 * Utterance models remembered between the duration call and the F0 call when the same HMMModel predicts both. The key is
	 * the element list passed to {@code applyTo()}; {@code applyFromTo()} looks the model up again with its
	 * {@code applyToElements} list, so that the state durations predicted together with the durations are reused for F0.
	 * Being a {@link WeakHashMap}, entries may disappear once the key list is no longer referenced elsewhere.
	 */
	private Map<List<Element>, HTSUttModel> uttModels = new WeakHashMap<List<Element>, HTSUttModel>();

	/**
	 * Model constructor
	 *
	 * @param featureManager
	 *            the feature processor manager used to compute the symbolic features used for prediction
	 * @param voiceName
	 *            in HMM models this data file corresponds to the configuration file of the HMM voice
	 * @param dataStream
	 *            dataStream
	 * @param targetAttributeName
	 *            attribute in MARYXML to predict
	 * @param targetAttributeFormat
	 *            print style, not used in HMM models
	 * @param featureName
	 *            not used in HMMModel
	 * @param predictFrom
	 *            not used in HMMModel
	 * @param applyTo
	 *            not used in HMMModel
	 *
	 * @throws MaryConfigurationException
	 *             if there are missing files or problems loading trees and pdf files.
	 */
	public HMMModel(FeatureProcessorManager featureManager, String voiceName, InputStream dataStream, String targetAttributeName,
			String targetAttributeFormat, String featureName, String predictFrom, String applyTo)
					throws MaryConfigurationException {
		super(featureManager, voiceName, dataStream, targetAttributeName, targetAttributeFormat, featureName, predictFrom,
				applyTo);
		if (!(targetAttributeName.contentEquals("d") || targetAttributeName.contentEquals("f0"))) {
			throw new MaryConfigurationException("targetAttributeName = " + targetAttributeName + " Not known");
		}
		load();
	}

	/**
	 * Tells this model whether it is used for both duration and F0 prediction. When {@code true}, {@code applyTo()} stores
	 * the utterance model it builds and {@code applyFromTo()} looks it up again instead of building a new one. The default is
	 * {@code false}, i.e. separate HMMModels predict duration and F0 and no predicted state durations are available for F0.
	 *
	 * @param bval
	 *            {@code true} if this same instance predicts both duration and F0
	 */
	public void setPredictDurAndF0(boolean bval) {
		this.predictDurAndF0 = bval;
	}

	/**
	 * Initialises the HMM data for the voice named {@code voiceName} and caches what the prediction methods need: the tree
	 * set, the frame period in seconds and the names of the prediction features.
	 *
	 * @throws IOException
	 *             declared by the overridden method.
	 * @throws MaryConfigurationException
	 *             if there are missing files or problems loading trees and pdf files.
	 */
	@Override
	protected void loadData() throws IOException, MaryConfigurationException {
		if (htsData == null) {
			htsData = new HMMData();
		}
		// the configuration of the HMM voice decides which hmm models are used
		htsData.initHMMDataForHMMModel(voiceName);
		cart = htsData.getCartTreeSet();

		final float framePeriod = (float) htsData.getFperiod();
		final float rate = (float) htsData.getRate();
		fperiodsec = framePeriod / rate;

		final FeatureDefinition trainedFeatures = htsData.getFeatureDefinition();
		predictionFeatureNames = trainedFeatures.getFeatureNames();
	}

	/**
	 * Predicts durations for the given elements and writes them back to the same elements. If this instance also predicts
	 * F0 ({@code predictDurAndF0}), the utterance model built here is remembered under the element list so that the
	 * following F0 call can reuse its state durations.
	 *
	 * @param elements
	 *            elements from MaryXML for which to predict the values
	 *
	 * @throws MaryConfigurationException
	 *             if error searching in HMM trees.
	 */
	@Override
	public void applyTo(List<Element> elements) throws MaryConfigurationException {
		logger.debug("predicting duration");
		final HTSUttModel utteranceModel = predictAndSetDuration(elements, elements);
		if (!predictDurAndF0) {
			return;
		}
		// this same model will be asked for F0 next -- keep the utterance model for that call
		uttModels.put(elements, utteranceModel);
	}

	/**
	 * Predict F0 for the list of elements and apply to another list of elements. If the same HMMModel is used to predict
	 * duration and F0, the utterance model stored by a previous {@code applyTo()} call (looked up with
	 * {@code applyToElements}) is reused so that the same state durations are applied. If no such model is available, or if
	 * this instance only predicts F0, a new utterance model is created from {@code predictFromElements} with the duration of
	 * each element spread equally over its states.
	 *
	 * @param predictFromElements
	 *            elements from MaryXML for which to predict the values
	 * @param applyToElements
	 *            elements from MaryXML for which to apply the predicted values
	 *
	 * @throws MaryConfigurationException
	 *             if error searching in HMM trees.
	 */
	@Override
	public void applyFromTo(List<Element> predictFromElements, List<Element> applyToElements) throws MaryConfigurationException {
		logger.debug("predicting F0");
		HTSUttModel um = null;
		if (predictDurAndF0) {
			// a previous call to applyTo() should have stored the model under this very list
			um = uttModels.get(applyToElements);
			if (um != null) {
				logger.debug("using already created utterance model, it contains predicted state durations.");
			} else {
				// The entry can be missing, e.g. when applyTo() was not called for this list. Assertions are
				// normally disabled, so recover explicitly instead of failing later with a NullPointerException.
				logger.warn("no stored utterance model found for the given elements, falling back to equal state durations.");
			}
		}
		if (um == null) {
			logger.debug("creating utterance model with equal values for state durations.");
			um = createUttModel(predictFromElements);
		}
		predictAndSetF0(applyToElements, um);
	}

	/**
	 * Predict durations and state durations from predictFromElements and apply durations to applyToElements. For every
	 * target an {@code HTSModel} is appended to a new utterance model; its LF0 pdfs and voicing decisions are looked up as
	 * well so that the returned model can be used for F0 generation afterwards.
	 * <p>
	 * The value written to the "d" attribute is the duration in seconds, formatted with {@code targetAttributeFormat}. An
	 * already present "d" attribute is read as milliseconds, converted to seconds and written back in the same way.
	 * <p>
	 * NOTE(review): for elements that already carry a "d" attribute the duration tree is not searched, so the state
	 * durations of that model are left as created by {@code new HTSModel(...)} while the total frame count of the utterance
	 * is still increased -- confirm that this is intended.
	 *
	 * @param predictFromElements
	 *            elements to predict from
	 * @param applyToElements
	 *            elements to apply predicted durations, accessed by the same index as the targets
	 *
	 * @return HTSUttModel a utterance model
	 *
	 * @throws MaryConfigurationException
	 *             if error searching in HMM trees.
	 */
	private HTSUttModel predictAndSetDuration(List<Element> predictFromElements, List<Element> applyToElements)
			throws MaryConfigurationException {
		final String durAttributeName = "d";
		List<Target> predictorTargets = getTargets(predictFromElements);
		HTSUttModel um = new HTSUttModel();
		FeatureDefinition feaDef = htsData.getFeatureDefinition();
		// value returned by the duration search for one phone, fed into the search for the next phone
		double diffdur = 0.0;

		try {
			for (int i = 0; i < predictorTargets.size(); i++) {
				FeatureVector fv = predictorTargets.get(i).getFeatureVector();
				final int numStates = cart.getNumStates();
				um.addUttModel(new HTSModel(numStates));
				HTSModel m = um.getUttModel(i);
				Element element = applyToElements.get(i);

				/* this function also sets the phone name, the phone between - and + */
				m.setPhoneName(fv.getFeatureAsString(feaDef.getFeatureIndex("phone"), feaDef));

				/* Check if context-dependent gv (gv without sil) */
				if (htsData.getUseContextDependentGV() && m.getPhoneName().contentEquals("_")) {
					m.setGvSwitch(false);
				}

				/* increment number of models and number of states in utterance model */
				um.setNumModel(um.getNumModel() + 1);
				um.setNumState(um.getNumState() + numStates);

				double duration; // in seconds
				if (element.hasAttribute(durAttributeName)) {
					// keep an existing duration: the attribute is in milliseconds, convert it to seconds
					duration = Float.parseFloat(element.getAttribute(durAttributeName)) / 1000;
					um.setTotalFrame(um.getTotalFrame() + (int) Math.round(duration / fperiodsec));
				} else {
					// Estimate state duration from state duration model (Gaussian)
					diffdur = cart.searchDurInCartTree(m, fv, htsData, diffdur);
					duration = m.getTotalDur() * fperiodsec;
					um.setTotalFrame(um.getTotalFrame() + m.getTotalDur());
				}

				/*
				 * Find pdf for LF0, this function sets the pdf for each state. and determines, according to the HMM models,
				 * whether the states are voiced or unvoiced, (it can be possible that some states are voiced and some unvoiced).
				 */
				cart.searchLf0InCartTree(m, fv, feaDef, htsData.getUV());
				// every frame of a voiced state counts as one LF0 frame
				for (int mstate = 0; mstate < numStates; mstate++) {
					int stateDur = m.getDur(mstate);
					if (stateDur > 0 && m.getVoiced(mstate)) {
						um.setLf0Frame(um.getLf0Frame() + stateDur);
					}
				}

				// set the new attribute value:
				element.setAttribute(durAttributeName, String.format(targetAttributeFormat, duration));
			}

			return um;
		} catch (Exception e) {
			throw new MaryConfigurationException("Error searching in tree when predicting duration. ", e);
		}
	}

	/**
	 * Predict F0 from the utterance model and apply to elements. The log-F0 parameters are generated for the whole utterance;
	 * then, for every element, the voiced frames of the corresponding model are turned into a string of
	 * {@code (position,f0)} pairs, where position is the running number of the voiced frame within the model expressed in
	 * percent of the model's voiced frames and f0 is the exponential of the generated log-F0 value, both truncated to
	 * integers. Elements without any voiced frame do not get an "f0" attribute.
	 *
	 * @param applyToElements
	 *            elements to apply predicted F0s; element i receives the values of model i of the utterance model
	 * @param um
	 *            utterance model that contains the set of elements (phonemes) and state durations for generating F0.
	 *
	 * @throws MaryConfigurationException
	 *             if error generating F0 out of HMMs trees and pdfs.
	 */
	private void predictAndSetF0(List<Element> applyToElements, HTSUttModel um) throws MaryConfigurationException {
		final String f0AttributeName = "f0";
		try {
			/* Once we have all the phone models Process UttModel */
			/* Generate sequence of speech parameter vectors, generate parameters out of sequence of pdf's */
			/* this function generates features just for the trees and pdf that are not null in the HMM cart */
			HTSParameterGeneration pdf2par = new HTSParameterGeneration();
			pdf2par.htsMaximumLikelihoodParameterGeneration(um, htsData);

			// (2) include the predicted values in applicableElements (as it is done in Model)
			boolean[] voiced = pdf2par.getVoicedArray();
			// make sure that the number of applicable elements is the same as the predicted number of elements
			assert applyToElements.size() == um.getNumModel();

			int numVoiced = 0; // index of the next voiced frame in the generated lf0 stream
			int t = 0; // index of the next frame (voiced or not) in the utterance
			for (int i = 0; i < applyToElements.size(); i++) { // this will be the same as the utterance model set
				HTSModel m = um.getUttModel(i);
				int numVoicedInModel = m.getNumVoiced();
				StringBuilder f0Targets = new StringBuilder();
				int k = 1; // running number of the voiced frame inside this model
				for (int mstate = 0; mstate < cart.getNumStates(); mstate++) {
					for (int frame = 0; frame < m.getDur(mstate); frame++) {
						// numVoiced and t are not the same because voiced values can be true or false,
						// numVoiced counts just the voiced ones
						if (voiced[t++]) {
							float f0 = (float) Math.exp(pdf2par.getlf0Pst().getPar(numVoiced++, 0));
							f0Targets.append('(').append((int) ((k * 100.0) / numVoicedInModel)).append(',').append((int) f0)
									.append(')');
							k++;
						}
					}
				}
				Element element = applyToElements.get(i);
				// if the whole segment is unvoiced no f0 attribute is written
				if (f0Targets.length() > 0) {
					element.setAttribute(f0AttributeName, f0Targets.toString());
				}
			}
		} catch (Exception e) {
			throw new MaryConfigurationException("Error generating F0 out of HMMs trees and pdfs. ", e);
		}

	}

	/**
	 * Create a utterance model list from feature vectors predicted from elements. The duration of every element is read from
	 * its "d" attribute (taken as milliseconds), converted to frames and spread equally over the states of the model; frames
	 * that do not divide evenly among the states are dropped by the integer division. The LF0 pdfs and voicing decisions are
	 * then looked up in the trees.
	 *
	 * @param predictFromElements
	 *            elements from MaryXML from where to get feature vectors; each must carry a numeric "d" attribute.
	 *
	 * @return Utterance model um containing state durations and pdfs already searched on the trees to generate F0.
	 *
	 * @throws MaryConfigurationException
	 *             if error searching in HMM trees, or if a "d" attribute is missing or not a number.
	 */
	private HTSUttModel createUttModel(List<Element> predictFromElements) throws MaryConfigurationException {
		List<Target> predictorTargets = getTargets(predictFromElements);
		HTSUttModel um = new HTSUttModel();
		FeatureDefinition feaDef = htsData.getFeatureDefinition();
		try {
			for (int i = 0; i < predictorTargets.size(); i++) {
				FeatureVector fv = predictorTargets.get(i).getFeatureVector();
				Element e = predictFromElements.get(i);
				final int numStates = cart.getNumStates();
				um.addUttModel(new HTSModel(numStates));
				HTSModel m = um.getUttModel(i);
				/* this function also sets the phone name, the phone between - and + */
				m.setPhoneName(fv.getFeatureAsString(feaDef.getFeatureIndex("phone"), feaDef));
				/* Check if context-dependent gv (gv without sil) */
				if (htsData.getUseContextDependentGV() && m.getPhoneName().contentEquals("_")) {
					m.setGvSwitch(false);
				}
				/* increment number of models and number of states in utterance model */
				um.setNumModel(um.getNumModel() + 1);
				um.setNumState(um.getNumState() + numStates);

				// Get the duration from the element. parseFloat is used, as in predictAndSetDuration, so that
				// values such as "85.0" are accepted too; integer strings give the same result as before.
				float duration = Float.parseFloat(e.getAttribute("d")) * 0.001f; // in sec.
				// distribute the duration (in frames) among the states, here it is done the same amount for each state
				int durInFrames = (int) (duration / fperiodsec);
				int durStateInFrames = durInFrames / numStates;
				m.setTotalDur(0); // reset to set new value according to duration
				for (int s = 0; s < numStates; s++) {
					m.setDur(s, durStateInFrames);
					m.setTotalDur(m.getTotalDur() + m.getDur(s));
				}
				um.setTotalFrame(um.getTotalFrame() + m.getTotalDur());
				if (logger.isDebugEnabled()) {
					logger.debug(String.format(
							"createUttModel: duration=%.3f sec. durInFrames=%d  durStateInFrames=%d  m.getTotalDur()=%d",
							duration, durInFrames, durStateInFrames, m.getTotalDur()));
				}

				/*
				 * Find pdf for LF0, this function sets the pdf for each state. and determines, according to the HMM models,
				 * whether the states are voiced or unvoiced, (it can be possible that some states are voiced and some unvoiced).
				 */
				cart.searchLf0InCartTree(m, fv, feaDef, htsData.getUV());
				// every frame of a voiced state counts as one LF0 frame
				for (int mstate = 0; mstate < numStates; mstate++) {
					int stateDur = m.getDur(mstate);
					if (stateDur > 0 && m.getVoiced(mstate)) {
						um.setLf0Frame(um.getLf0Frame() + stateDur);
					}
				}
			}
			return um;
		} catch (Exception e) {
			throw new MaryConfigurationException("Error searching in tree when creating utterance model. ", e);
		}
	}

	/**
	 * Apply the HMM to a Target to get its predicted value. This operation is not supported by HMMModel: prediction is done
	 * for whole element lists in {@code applyTo()} and {@code applyFromTo()}, which do not call this method.
	 *
	 * @param target
	 *            ignored
	 *
	 * @return never returns normally
	 *
	 * @throws UnsupportedOperationException
	 *             (a {@link RuntimeException}) whenever this method is called.
	 */
	@Override
	protected float evaluate(Target target) {
		throw new UnsupportedOperationException("This method should never be called");
	}

}