/* ----------------------------------------------------------------- */
/*           The HMM-Based Speech Synthesis Engine "hts_engine API"  */
/*           developed by HTS Working Group                          */
/*           http://hts-engine.sourceforge.net/                      */
/* ----------------------------------------------------------------- */
/*                                                                   */
/*  Copyright (c) 2001-2010  Nagoya Institute of Technology          */
/*                           Department of Computer Science          */
/*                                                                   */
/*                2001-2008  Tokyo Institute of Technology           */
/*                           Interdisciplinary Graduate School of    */
/*                           Science and Engineering                 */
/*                                                                   */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/* - Redistributions of source code must retain the above copyright  */
/*   notice, this list of conditions and the following disclaimer.   */
/* - Redistributions in binary form must reproduce the above         */
/*   copyright notice, this list of conditions and the following     */
/*   disclaimer in the documentation and/or other materials provided */
/*   with the distribution.                                          */
/* - Neither the name of the HTS working group nor the names of its  */
/*   contributors may be used to endorse or promote products derived */
/*   from this software without specific prior written permission.   */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND            */
/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,       */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS */
/* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,          */
/* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED   */
/* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,     */
/* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON */
/* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,   */
/* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY    */
/* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE           */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/* ----------------------------------------------------------------- */
/**
 * Copyright 2011 DFKI GmbH.
 * All Rights Reserved. Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.htsengine;

import java.io.BufferedInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import marytts.htsengine.HMMData.FeatureType;
import marytts.signalproc.analysis.Mfccs;
import marytts.signalproc.analysis.PitchReaderWriter;
import marytts.util.MaryUtils;
import marytts.util.io.LEDataInputStream;

import org.apache.log4j.Logger;

/**
 * Parameter generation out of trained HMMs.
 *
 * Java port and extension of the HTS engine API version 1.04. Extension: mixed excitation.
 *
 * @author Marcela Charfuelan
 */
public class HTSParameterGeneration {

    public static final double INFTY = ((double) 1.0e+38);
    public static final double INFTY2 = ((double) 1.0e+19);
    public static final double INVINF = ((double) 1.0e-38);
    public static final double INVINF2 = ((double) 1.0e-19);

    private HTSPStream mcepPst = null;
    private HTSPStream strPst = null;
    private HTSPStream magPst = null;
    private HTSPStream lf0Pst = null;
    private boolean voiced[];
    private int totalUttFrame; // total number of frames in a mcep, str or mag Pst
    private int totalLf0Frame; // total number of voiced f0 frames in a lf0 Pst

    private Logger logger = MaryUtils.getLogger("ParameterGeneration");

    public HTSPStream getMcepPst() {
        return mcepPst;
    }

    public void setMcepPst(HTSPStream var) {
        mcepPst = var;
    }

    public HTSPStream getStrPst() {
        return strPst;
    }

    public void setStrPst(HTSPStream var) {
        strPst = var;
    }

    public HTSPStream getMagPst() {
        return magPst;
    }

    public void setMagPst(HTSPStream var) {
        magPst = var;
    }

    public HTSPStream getlf0Pst() {
        return lf0Pst;
    }

    public void setlf0Pst(HTSPStream var) {
        lf0Pst = var;
    }

    public boolean[] getVoicedArray() {
        return voiced;
    }

    public void setVoicedArray(boolean[] var) { // only used in HTSEngineTest
        voiced = var;
    }

    /* Inverse of a given double. */
    /* We actually need the inverse of the covariance matrix, but since this matrix is diagonal, */
    /* we only need to invert each of the numbers on the diagonal. */
    static public double finv(double x) {
        if (x >= INFTY2)
            return 0.0;
        if (x <= -INFTY2)
            return 0.0;
        if (x <= INVINF2 && x >= 0)
            return INFTY;
        if (x >= -INVINF2 && x < 0)
            return -INFTY;

        return 1.0 / x;
    }
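
    /*
     * Typical usage (illustrative sketch only; in MARY TTS the HTSEngine module drives this class,
     * and the variable names below are assumptions, not part of this file):
     *
     *   HMMData htsData = ...;   // voice configuration, HMM pdf streams and trees, loaded elsewhere
     *   HTSUttModel um = ...;    // utterance model built from the target context features
     *   HTSParameterGeneration pdf2par = new HTSParameterGeneration();
     *   pdf2par.htsMaximumLikelihoodParameterGeneration(um, htsData);
     *   HTSPStream mcep = pdf2par.getMcepPst();        // generated spectral (MGC) parameters
     *   boolean[] voicing = pdf2par.getVoicedArray();  // per-frame voicing decisions
     */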
    /**
     * HTS maximum likelihood parameter generation.
     *
     * @param um
     *            utterance model sequence, obtained after processing the MARY context features
     * @param htsData
     *            HMM pdf model set
     * @throws Exception
     *             if the parameter streams cannot be generated
     */
    public void htsMaximumLikelihoodParameterGeneration(HTSUttModel um, final HMMData htsData) throws Exception {

        CartTreeSet ms = htsData.getCartTreeSet();

        /* Initialisation of the PStream objects. */
        /* Initialise parameter generation using the utterance model um and the model set ms. */
        /* Initialise a PStream object for each parameter stream that is going to be generated: */
        /* mcepPst, strPst, magPst, lf0Pst. */
        /* Here the window files should be passed to initialise the dynamic windows dw; */
        /* for the moment the dw are all the same and hard-coded. */
        if (htsData.getPdfMgcStream() != null)
            mcepPst = new HTSPStream(ms.getMcepVsize(), um.getTotalFrame(), HMMData.FeatureType.MGC, htsData.getMaxMgcGvIter());

        /* for lf0, count just the number of lf0 frames that are voiced (non-zero) */
        if (htsData.getPdfLf0Stream() != null)
            lf0Pst = new HTSPStream(ms.getLf0Stream(), um.getLf0Frame(), HMMData.FeatureType.LF0, htsData.getMaxLf0GvIter());

        /* The following are optional, used when generating mixed excitation. */
        if (htsData.getPdfStrStream() != null)
            strPst = new HTSPStream(ms.getStrVsize(), um.getTotalFrame(), HMMData.FeatureType.STR, htsData.getMaxStrGvIter());
        if (htsData.getPdfMagStream() != null)
            magPst = new HTSPStream(ms.getMagVsize(), um.getTotalFrame(), HMMData.FeatureType.MAG, htsData.getMaxMagGvIter());

        int lf0Frame = 0; // counts voiced frames
        int uttFrame = 0; // counts all frames

        voiced = new boolean[um.getTotalFrame()];

        // local variables for faster access
        int msNumStates = ms.getNumStates();
        int totalFrames = um.getTotalFrame();

        for (int i = 0; i < um.getNumUttModel(); i++) {
            HTSModel m = um.getUttModel(i);
            for (int state = 0; state < msNumStates; state++) {
                int dur = m.getDur(state);
                Arrays.fill(voiced, uttFrame, uttFrame += dur, m.getVoiced(state));
                if (m.getVoiced(state))
                    lf0Frame += dur;
            }
        }

        /* mcepframe and lf0frame are used in the original code to initialise the T field */
        /* in each pst, but here the pst are already initialised. */
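
        /*
         * Worked example of the bookkeeping above (sketch, not real data): a phone model whose five
         * states have durations {2, 3, 4, 3, 2} frames and voicing flags {false, true, true, true, false}
         * contributes 14 entries to voiced[] (F F | T T T | T T T T | T T T | F F) and adds 10 to lf0Frame.
         */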
        logger.debug("utteranceFrame=" + uttFrame + " lf0frame=" + lf0Frame);

        // Step 1: initialise the fields in the parameter streams
        uttFrame = 0;
        lf0Frame = 0;
        /* copy pdfs */
        for (int i = 0; i < um.getNumUttModel(); i++) {
            HTSModel m = um.getUttModel(i);
            boolean gvSwitch = m.getGvSwitch();
            for (int state = 0; state < msNumStates; state++) {
                for (int frame = 0; frame < m.getDur(state); frame++) {

                    /* copy pdfs for mcep */
                    if (mcepPst != null) {
                        mcepPst.setMseq(uttFrame, m.getMean(FeatureType.MGC, state));
                        mcepPst.setVseq(uttFrame, m.getVariance(FeatureType.MGC, state));
                        if (!gvSwitch)
                            mcepPst.setGvSwitch(uttFrame, false);
                    }

                    /* copy pdfs for str */
                    if (strPst != null) {
                        strPst.setMseq(uttFrame, m.getMean(FeatureType.STR, state));
                        strPst.setVseq(uttFrame, m.getVariance(FeatureType.STR, state));
                        if (!gvSwitch)
                            strPst.setGvSwitch(uttFrame, false);
                    }

                    /* copy pdfs for mag */
                    if (magPst != null) {
                        magPst.setMseq(uttFrame, m.getMean(FeatureType.MAG, state));
                        magPst.setVseq(uttFrame, m.getVariance(FeatureType.MAG, state));
                        if (!gvSwitch)
                            magPst.setGvSwitch(uttFrame, false);
                    }

                    /* copy pdfs for lf0 */
                    if (lf0Pst != null && !htsData.getUseAcousticModels()) {
                        for (int k = 0; k < ms.getLf0Stream(); k++) {
                            boolean nobound = true;
                            /* check whether the current frame is on a voiced/unvoiced boundary */
                            for (int n = lf0Pst.getDWLeftBoundary(k); n <= lf0Pst.getDWRightBoundary(k); n++)
                                if ((uttFrame + n) <= 0 || totalFrames <= (uttFrame + n))
                                    nobound = false;
                                else
                                    nobound = (nobound && voiced[uttFrame + n]);
                            /* copy pdfs */
                            if (voiced[uttFrame]) {
                                lf0Pst.setMseq(lf0Frame, k, m.getLf0Mean(state, k));
                                if (nobound || k == 0)
                                    lf0Pst.setIvseq(lf0Frame, k, finv(m.getLf0Variance(state, k)));
                                else
                                    /* the variances of the dynamic features are set to inf on v/uv boundaries */
                                    lf0Pst.setIvseq(lf0Frame, k, 0.0);
                            }
                        }
                    }

                    if (voiced[uttFrame]) {
                        if (!gvSwitch)
                            lf0Pst.setGvSwitch(lf0Frame, false);
                        lf0Frame++;
                    }
                    uttFrame++;

                } /* for each frame in this state */
            } /* for each state in this model */
        } /* for each model in this utterance */

        GVModelSet gvms = htsData.getGVModelSet();

        // Step 2: set the dynamic features to infinity on the borders for MGC/STR/MAG
        if (mcepPst != null)
            mcepPst.fixDynFeatOnBoundaries();
        if (strPst != null)
            strPst.fixDynFeatOnBoundaries();
        if (magPst != null)
            magPst.fixDynFeatOnBoundaries();

        // Step 3: optimise the individual parameter streams

        /* parameter generation for mcep */
        if (mcepPst != null) {
            logger.info("Parameter generation for MGC: ");
            if (htsData.getUseGV() && (htsData.getPdfMgcGVStream() != null))
                mcepPst.setGvMeanVar(gvms.getGVmeanMgc(), gvms.getGVcovInvMgc());
            mcepPst.mlpg(htsData, htsData.getUseGV());
        }

        /* parameter generation for lf0 */
        if (htsData.getUseAcousticModels())
            loadMaryXmlF0(um, htsData);
        else if (lf0Pst != null) {
            logger.info("Parameter generation for LF0: ");
            if (htsData.getUseGV() && (htsData.getPdfLf0GVStream() != null))
                lf0Pst.setGvMeanVar(gvms.getGVmeanLf0(), gvms.getGVcovInvLf0());
            lf0Pst.mlpg(htsData, htsData.getUseGV());
            // set the realised F0 back into the utterance model
            setRealisedF0(lf0Pst, um, msNumStates);
        }

        /* parameter generation for str */
        boolean useGV = false;
        if (strPst != null) {
            logger.debug("Parameter generation for STR ");
            if (htsData.getUseGV() && (htsData.getPdfStrGVStream() != null)) {
                useGV = true;
                strPst.setGvMeanVar(gvms.getGVmeanStr(), gvms.getGVcovInvStr());
            }
            strPst.mlpg(htsData, useGV);
        }

        /* parameter generation for mag */
        useGV = false;
        if (magPst != null) {
            logger.info("Parameter generation for MAG ");
            if (htsData.getUseGV() && (htsData.getPdfMagGVStream() != null)) {
                useGV = true;
                magPst.setGvMeanVar(gvms.getGVmeanMag(), gvms.getGVcovInvMag());
            }
            magPst.mlpg(htsData, useGV);
        }

    } /* method htsMaximumLikelihoodParameterGeneration */
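
    /*
     * Orientation sketch (assumed access pattern, not part of the API contract): after mlpg() each
     * generated HTSPStream holds one static parameter vector per frame, e.g.
     *
     *   for (int t = 0; t < mcepPst.getT(); t++) {
     *       for (int m = 0; m < mcepPst.getOrder(); m++) {
     *           double c = mcepPst.getPar(t, m); // generated MGC coefficient m at frame t
     *       }
     *   }
     *
     * For LF0, getPar(vt, 0) is the log-F0 of the vt-th voiced frame only; the voiced[] array maps
     * voiced frames back to utterance frames (see the two saveParam methods below).
     */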
    /* Save generated parameters in a binary file (MARY format). */
    public void saveParamMaryFormat(String fileName, HTSPStream par, HMMData.FeatureType type) {
        int t, m, i;
        double ws = 0.025; /* window size in seconds */
        double ss = 0.005; /* skip size in seconds */
        int fs = 16000; /* sampling rate */

        try {
            if (type == HMMData.FeatureType.LF0) {
                fileName += ".ptc";

                i = 0;
                double f0s[] = new double[voiced.length];
                // par.getT() counts just the voiced frames, so the actual number of frames
                // has to be taken from the voiced array
                for (t = 0; t < voiced.length; t++) {
                    if (voiced[t]) {
                        f0s[t] = Math.exp(par.getPar(i, 0));
                        i++;
                    } else
                        f0s[t] = 0.0;
                    System.out.println("GEN f0s[" + t + "]=" + f0s[t]);
                }
                /*
                 * write_pitch_file converts ws and ss internally (ss * samplingRate + 0.5); for the HTS
                 * values ss=0.005 and ws=0.025 this is not a problem.
                 */
                PitchReaderWriter.write_pitch_file(fileName, f0s, (float) (ws), (float) (ss), fs);

            } else if (type == HMMData.FeatureType.MGC) {

                int numfrm = par.getT();
                int dimension = par.getOrder();
                Mfccs mgc = new Mfccs(numfrm, dimension);

                fileName += ".mfc";

                for (t = 0; t < par.getT(); t++)
                    for (m = 0; m < par.getOrder(); m++)
                        mgc.mfccs[t][m] = par.getPar(t, m);

                mgc.params.samplingRate = fs; /* samplingRateInHz */
                mgc.params.skipsize = (float) ss; /* skipSizeInSeconds */
                mgc.params.winsize = (float) ws; /* windowSizeInSeconds */

                mgc.writeMfccFile(fileName);

                /*
                 * The whole header is written in the following order: ler.writeInt(numfrm); ler.writeInt(dimension);
                 * ler.writeFloat(winsize); ler.writeFloat(skipsize); ler.writeInt(samplingRate);
                 */
            }

            logger.info("saveParam in file: " + fileName);

        } catch (IOException e) {
            logger.info("IO exception = " + e);
        }
    }
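
    /*
     * Layout sketch for the f0 output written above and below (assumed example values): with
     * voiced[] = {false, true, true, false} and generated log-F0 values {4.8, 5.0}, the contour
     * becomes {0.0, exp(4.8), exp(5.0), 0.0}, i.e. one value per utterance frame, 0.0 on unvoiced frames.
     */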
    /* Save generated parameters in a binary file. */
    public void saveParam(String fileName, HTSPStream par, HMMData.FeatureType type) {
        int t, m, i;
        try {
            if (type == HMMData.FeatureType.LF0) {
                fileName += ".f0";
                DataOutputStream data_out = new DataOutputStream(new FileOutputStream(fileName));
                i = 0;
                /* par.getT() counts just the voiced frames, so iterate over the voiced array */
                for (t = 0; t < voiced.length; t++) {
                    if (voiced[t]) {
                        data_out.writeFloat((float) Math.exp(par.getPar(i, 0)));
                        i++;
                    } else
                        data_out.writeFloat((float) 0.0);
                }
                data_out.close();

            } else if (type == HMMData.FeatureType.MGC) {
                fileName += ".mgc";
                DataOutputStream data_out = new DataOutputStream(new FileOutputStream(fileName));
                for (t = 0; t < par.getT(); t++)
                    for (m = 0; m < par.getOrder(); m++)
                        data_out.writeFloat((float) par.getPar(t, m));
                data_out.close();
            }

            logger.info("saveParam in file: " + fileName);

        } catch (IOException e) {
            logger.info("IO exception = " + e);
        }
    }

    private void loadMaryXmlF0(HTSUttModel um, HMMData htsData) throws Exception {
        logger.info("Using f0 from maryXML acoustparams");
        int i, n;
        HTSModel m;
        double[] dval;
        Vector<Double> f0Vector = new Vector<Double>();

        for (i = 0; i < um.getNumUttModel(); i++) {
            m = um.getUttModel(i);
            // get the contour for this model, if it has voiced frames and the maryXml contains f0 values
            dval = getContourSegment(m.getMaryXmlF0(), m.getNumVoiced());
            // accumulate the values
            for (n = 0; n < dval.length; n++)
                f0Vector.add(dval[n]);
        }
        // interpolate values if necessary
        interpolateSegments(f0Vector);

        // create a new lf0Pst with the values from maryXML
        HTSPStream newLf0Pst = new HTSPStream(3, f0Vector.size(), HMMData.FeatureType.LF0, htsData.getMaxLf0GvIter());
        for (n = 0; n < f0Vector.size(); n++)
            newLf0Pst.setPar(n, 0, Math.log(f0Vector.get(n)));
        setlf0Pst(newLf0Pst);
    }

    private double[] getContourSegment(String maryXmlF0, int numVoiced) throws Exception {
        int i, t = 0;
        // just fill the values in their approximate positions
        double[] f0Vector = new double[numVoiced];
        int key, n;
        double valF0;

        if (maryXmlF0 != null) {
            Pattern p = Pattern.compile("(\\d+,\\d+)");
            Matcher xml = p.matcher(maryXmlF0);
            SortedMap<Integer, Double> f0Map = new TreeMap<Integer, Double>();
            int numF0s = 0;
            while (xml.find()) {
                String[] f0Values = (xml.group().trim()).split(",");
                f0Map.put(new Integer(f0Values[0]), new Double(f0Values[1]));
                numF0s++;
            }

            Set<Map.Entry<Integer, Double>> s = f0Map.entrySet();
            Iterator<Map.Entry<Integer, Double>> if0 = s.iterator();

            if (numF0s == numVoiced) {
                t = 0;
                while (if0.hasNext() && t < numVoiced) {
                    Map.Entry<Integer, Double> mf0 = if0.next();
                    key = (Integer) mf0.getKey();
                    valF0 = (Double) mf0.getValue();
                    f0Vector[t++] = valF0;
                }
            } else {
                if (numF0s < numVoiced) {
                    // only some positions will be filled, so the others must stay 0
                    for (i = 0; i < numVoiced; i++)
                        f0Vector[i] = 0.0;
                }
                while (if0.hasNext() && t < numVoiced) {
                    Map.Entry<Integer, Double> mf0 = if0.next();
                    key = (Integer) mf0.getKey();
                    valF0 = (Double) mf0.getValue();
                    if (key == 0)
                        n = 0;
                    else if (key == 100)
                        n = numVoiced - 1;
                    else
                        n = (int) ((numVoiced * key) / 100.0);
                    if (n >= 0 && n < numVoiced)
                        f0Vector[n] = valF0;
                } // while(if0.hasNext())
            } // numF0s == numVoiced
        } // if maryXmlF0 != null

        return f0Vector;
    }
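
    /*
     * The method below linearly interpolates the gaps (zero entries) between non-zero f0 values.
     * Worked example (sketch, assumed values): {110, 0, 0, 0, 130} becomes {110, 115, 120, 125, 130};
     * a leading run of zeros is ramped up from 0.0 towards the first non-zero value.
     */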
    private void interpolateSegments(Vector<Double> f0) {
        int i, n, interval;
        double slope;
        // check where there are zeros and interpolate
        int[] index = new int[2];
        double[] value = new double[2];
        index[0] = 0;
        value[0] = 0.0;
        for (i = 0; i < f0.size(); i++) {
            if (f0.get(i) > 0.0) {
                index[1] = i;
                value[1] = f0.get(i);

                interval = index[1] - index[0];
                if (interval > 1) {
                    slope = ((value[1] - value[0]) / interval);
                    for (n = index[0]; n < index[1]; n++) {
                        double newVal = (slope * (n - index[0])) + value[0];
                        f0.set(n, newVal);
                    }
                }
                index[0] = index[1];
                value[0] = value[1];
            }
        }
    }

    private void setRealisedF0(HTSPStream lf0Pst, HTSUttModel um, int numStates) {
        int t = 0;
        int vt = 0;
        for (int i = 0; i < um.getNumUttModel(); i++) {
            HTSModel m = um.getUttModel(i);
            int numVoicedInModel = m.getNumVoiced();
            String formattedF0 = "";
            int k = 1;
            for (int state = 0; state < numStates; state++) {
                for (int frame = 0; frame < m.getDur(state); frame++) {
                    if (voiced[t++]) {
                        float f0 = (float) Math.exp(lf0Pst.getPar(vt++, 0));
                        formattedF0 += "(" + Integer.toString((int) ((k * 100.0) / numVoicedInModel)) + ","
                                + Integer.toString((int) f0) + ")";
                        k++;
                    }
                } // for each frame, voiced or unvoiced
            } // for each state
            if (!formattedF0.contentEquals("")) {
                m.setMaryXmlF0(formattedF0);
            }
        } // for each model in the utterance model list
    }

} /* class HTSParameterGeneration */