// NoisePartLpFilterPostHpfLpcSynthesizer.java example
//
// Explorer
// marytts-master
/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 * 
 * Permission is hereby granted, free of charge, to use and distribute
 * this software and its documentation without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of this work, and to
 * permit persons to whom this work is furnished to do so, subject to
 * the following conditions:
 * 
 * 1. The code must retain the above copyright notice, this list of
 *    conditions and the following disclaimer.
 * 2. Any modifications must be clearly marked as such.
 * 3. Original authors' names are not deleted.
 * 4. The authors' names are not used to endorse or promote products
 *    derived from this software without specific prior written
 *    permission.
 *
 * DFKI GMBH AND THE CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES WITH
 * REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL DFKI GMBH NOR THE
 * CONTRIBUTORS BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
 * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
 * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
 * THIS SOFTWARE.
 */

package marytts.signalproc.sinusoidal.hntm.synthesis;

import java.util.Arrays;

import marytts.signalproc.sinusoidal.hntm.analysis.FrameNoisePartLpc;
import marytts.signalproc.sinusoidal.hntm.analysis.HntmAnalyzerParams;
import marytts.signalproc.sinusoidal.hntm.analysis.HntmSpeechSignal;
import marytts.signalproc.window.Window;
import marytts.util.math.ArrayUtils;
import marytts.util.math.MathUtils;
import marytts.util.signal.SignalProcUtils;

/**
 * A time-domain LP synthesis filter based version of the HNM noise part synthesis algorithm described in:
 * 
 * Reference: Stylianou, Y., 1996, "Harmonic plus Noise Models for Speech, combined with Statistical Methods, for Speech and
 * Speaker Modification", Ph.D. thesis, Ecole Nationale Supérieure des Télécommunications. (Chapter 3, A Harmonic plus Noise
 * Model, HNM)
 * 
 * Supports optional triangular energy envelope weighting.
 * 
 * @author oytun.turk
 * 
 */
public class NoisePartLpFilterPostHpfLpcSynthesizer {
	// LPC based noise model + OLA approach + Gain normalization according to generated harmonic part gain
	public static double[] synthesize(HntmSpeechSignal hnmSignal, HntmAnalyzerParams analysisParams,
			HntmSynthesizerParams synthesisParams) {
		double[] noisePart = null;
		double[] noisePart2 = null;
		double[] weights = null;
		boolean isPrevNoised, isNoised, isNextNoised;
		boolean isVoiced, isNextVoiced;
		float tsi = 0.0f;
		float tsiNext; // Time in seconds
		int i, n, j;
		int startIndex = 0;
		int startIndexNext;
		int outputLen = SignalProcUtils.time2sample(hnmSignal.originalDurationInSeconds, hnmSignal.samplingRateInHz);
		int lpOrder = 0;

		double[] excitation = MathUtils.random(outputLen, -0.5, 0.5);

		int fftSizeNoise = SignalProcUtils.getDFTSize(hnmSignal.samplingRateInHz);

		for (i = 0; i < hnmSignal.frames.length; i++) {
			isNoised = ((hnmSignal.frames[i].maximumFrequencyOfVoicingInHz < 0.5f * hnmSignal.samplingRateInHz) ? true : false);
			if (isNoised && hnmSignal.frames[i].n != null && (hnmSignal.frames[i].n instanceof FrameNoisePartLpc)
					&& ((FrameNoisePartLpc) hnmSignal.frames[i].n).lpCoeffs != null) {
				lpOrder = ((FrameNoisePartLpc) hnmSignal.frames[i].n).lpCoeffs.length;
				break;
			}
		}

		if (lpOrder > 0) // At least one noisy frame with LP coefficients exist
		{
			noisePart = new double[outputLen]; // In fact, this should be prosody scaled length when you implement prosody
												// modifications
			Arrays.fill(noisePart, 0.0);

			noisePart2 = new double[outputLen]; // In fact, this should be prosody scaled length when you implement prosody
												// modifications
			Arrays.fill(noisePart2, 0.0);

			weights = new double[outputLen]; // In fact, this should be prosody scaled length when you implement prosody
												// modifications
			Arrays.fill(weights, 0.0);

			boolean bFirst = true;
			int pmInd = 0;
			int pmIndNext;

			int start = 0;
			double[] tmpy = null;
			double[] tmpalpha = null;
			double tmp;
			int count;

			for (i = 0; i < hnmSignal.frames.length; i++) {
				pmInd = SignalProcUtils.time2sample(hnmSignal.frames[i].tAnalysisInSeconds, hnmSignal.samplingRateInHz);
				if (i < hnmSignal.frames.length - 1)
					pmIndNext = SignalProcUtils.time2sample(hnmSignal.frames[i + 1].tAnalysisInSeconds,
							hnmSignal.samplingRateInHz);
				else
					pmIndNext = outputLen - 1;

				start = pmInd;

				isNoised = ((hnmSignal.frames[i].maximumFrequencyOfVoicingInHz < 0.5f * hnmSignal.samplingRateInHz) ? true
						: false);

				if (isNoised && hnmSignal.frames[i].n != null && (hnmSignal.frames[i].n instanceof FrameNoisePartLpc)
						&& ((FrameNoisePartLpc) hnmSignal.frames[i].n).lpCoeffs != null) {
					if (i < hnmSignal.frames.length - 1) {
						for (n = 0; n <= pmIndNext - pmInd; n++) {
							tmpy = new double[Math.max(start - 1, 0) - Math.max(start - lpOrder, 0) + 1];
							count = 0;
							for (j = Math.max(start - 1, 0); j >= Math.max(start - lpOrder, 0); j--)
								tmpy[count++] = noisePart[j];

							tmpalpha = new double[tmpy.length];
							for (j = 0; j < tmpy.length; j++)
								tmpalpha[j] = ((FrameNoisePartLpc) hnmSignal.frames[i].n).lpCoeffs[j];

							tmp = 0.0;
							for (j = 0; j < tmpalpha.length; j++)
								tmp += tmpalpha[j] * tmpy[j];

							if (start >= outputLen)
								break;

							noisePart[start] = ((FrameNoisePartLpc) hnmSignal.frames[i].n).lpGain * excitation[start] + tmp;
							start++;
						}
					} else // for the last frame
					{
						for (n = 0; n < outputLen - pmInd; n++) {
							tmpy = new double[Math.max(start - 1, 0) - Math.max(start - lpOrder, 0) + 1];
							count = 0;
							for (j = Math.max(start - 1, 0); j >= Math.max(start - lpOrder, 0); j--)
								tmpy[count++] = noisePart[j];

							tmpalpha = new double[tmpy.length];
							for (j = 0; j < tmpy.length; j++)
								tmpalpha[j] = ((FrameNoisePartLpc) hnmSignal.frames[i].n).lpCoeffs[j];

							tmp = 0.0;
							for (j = 0; j < tmpalpha.length; j++)
								tmp += tmpalpha[j] * tmpy[j];

							if (start >= outputLen)
								break;

							noisePart[start] = ((FrameNoisePartLpc) hnmSignal.frames[i].n).lpGain * excitation[start] + tmp;
							start++;
						}
					}
				}

				pmInd = pmIndNext;
			}

			if (analysisParams.preemphasisCoefNoise > 0.0f)
				noisePart = SignalProcUtils.removePreemphasis(noisePart, analysisParams.preemphasisCoefNoise);

			MathUtils.adjustMean(noisePart, 0.0);

			int startInd = 0;
			int endInd;

			for (i = 0; i < hnmSignal.frames.length - 2; i++) {
				pmInd = SignalProcUtils.time2sample(hnmSignal.frames[i].tAnalysisInSeconds, hnmSignal.samplingRateInHz);
				if (i <= hnmSignal.frames.length - 3)
					pmIndNext = SignalProcUtils.time2sample(hnmSignal.frames[i + 2].tAnalysisInSeconds,
							hnmSignal.samplingRateInHz);
				else
					pmIndNext = outputLen - 1;

				start = pmInd;
				startInd = start;

				isPrevNoised = false;
				if (i > 0)
					isPrevNoised = ((hnmSignal.frames[i - 1].maximumFrequencyOfVoicingInHz < 0.5f * hnmSignal.samplingRateInHz) ? true
							: false);

				isNoised = ((hnmSignal.frames[i].maximumFrequencyOfVoicingInHz < 0.5f * hnmSignal.samplingRateInHz) ? true
						: false);

				isNextNoised = false;
				if (i < hnmSignal.frames.length - 1)
					isNextNoised = ((hnmSignal.frames[i + 1].maximumFrequencyOfVoicingInHz < 0.5f * hnmSignal.samplingRateInHz) ? true
							: false);

				if (isNoised && hnmSignal.frames[i].n != null && (hnmSignal.frames[i].n instanceof FrameNoisePartLpc)
						&& ((FrameNoisePartLpc) hnmSignal.frames[i].n).lpCoeffs != null) {
					endInd = Math.min(pmIndNext, outputLen - 1);
					double[] tmpFrm = ArrayUtils.subarray(noisePart, startInd, endInd - startInd + 1);

					if (synthesisParams.hpfAfterNoiseSynthesis
							&& hnmSignal.frames[i].maximumFrequencyOfVoicingInHz
									- analysisParams.overlapBetweenHarmonicAndNoiseRegionsInHz > 0.0f)
						tmpFrm = SignalProcUtils.fdFilter(tmpFrm, hnmSignal.frames[i].maximumFrequencyOfVoicingInHz
								- analysisParams.overlapBetweenHarmonicAndNoiseRegionsInHz, 0.5f * hnmSignal.samplingRateInHz,
								hnmSignal.samplingRateInHz, fftSizeNoise);

					tmpFrm = SignalProcUtils.normalizeAverageSampleEnergy(tmpFrm,
							((FrameNoisePartLpc) hnmSignal.frames[i].n).origAverageSampleEnergy);

					Window winNoise = Window.get(analysisParams.noiseAnalysisWindowType, endInd - startInd + 1);
					winNoise.normalizePeakValue(1.0f);
					double[] wgtNoise = winNoise.getCoeffs();

					if (!isPrevNoised) {
						int halfLen = (int) Math.floor(0.5 * tmpFrm.length + 0.5);
						for (j = 0; j < halfLen; j++)
							noisePart2[startInd + j] += tmpFrm[j] * wgtNoise[j];

						for (j = halfLen + 1; j < tmpFrm.length; j++) {
							noisePart2[startInd + j] += tmpFrm[j] * wgtNoise[j];
							weights[startInd + j] += wgtNoise[j];
						}
					} else if (!isNextNoised) {
						int halfLen = (int) Math.floor(0.5 * tmpFrm.length + 0.5);
						for (j = 0; j < halfLen; j++) {
							noisePart2[startInd + j] += tmpFrm[j] * wgtNoise[j];
							weights[startInd + j] += wgtNoise[j];
						}

						for (j = halfLen + 1; j < tmpFrm.length; j++)
							noisePart2[startInd + j] += tmpFrm[j] * wgtNoise[j];
					} else {
						for (j = 0; j < tmpFrm.length; j++) {
							noisePart2[startInd + j] += tmpFrm[j] * wgtNoise[j];
							weights[startInd + j] += wgtNoise[j];
						}
					}
				}

				pmInd = pmIndNext;
			}

			for (i = 0; i < outputLen; i++) {
				if (weights[i] > 1e-20)
					noisePart2[i] /= weights[i];
			}

			System.arraycopy(noisePart2, 0, noisePart, 0, outputLen);

			// Now, apply the triangular noise envelope for voiced parts
			if (synthesisParams.applyTriangularNoiseEnvelopeForVoicedParts) {
				double[] enEnv;
				int enEnvLen;
				tsiNext = 0;
				int l1, lMid, l2;
				for (i = 0; i < hnmSignal.frames.length; i++) {
					isVoiced = ((hnmSignal.frames[i].maximumFrequencyOfVoicingInHz > 0.0f) ? true : false);
					if (isVoiced) {
						if (i == 0)
							tsi = 0.0f;
						else
							tsi = hnmSignal.frames[i].tAnalysisInSeconds;

						startIndex = SignalProcUtils.time2sample(tsi, hnmSignal.samplingRateInHz);

						if (i < hnmSignal.frames.length - 1) {
							tsiNext = Math.max(0.0f, hnmSignal.frames[i + 1].tAnalysisInSeconds);
							startIndexNext = SignalProcUtils.time2sample(tsiNext, hnmSignal.samplingRateInHz);
						} else {
							startIndexNext = outputLen - 1;
							tsiNext = SignalProcUtils.sample2time(startIndexNext, hnmSignal.samplingRateInHz);
						}

						enEnvLen = startIndexNext - startIndex + 1;
						if (enEnvLen > 0) {
							enEnv = new double[enEnvLen];

							l1 = SignalProcUtils.time2sample(0.15 * (tsiNext - tsi), hnmSignal.samplingRateInHz);
							l2 = SignalProcUtils.time2sample(0.85 * (tsiNext - tsi), hnmSignal.samplingRateInHz);
							lMid = (int) Math.floor(0.5 * (l1 + l2) + 0.5);
							for (n = 0; n < l1; n++)
								enEnv[n] = synthesisParams.energyTriangleLowerValue;
							for (n = l1; n < lMid; n++)
								enEnv[n] = (n - l1)
										* (synthesisParams.energyTriangleUpperValue - synthesisParams.energyTriangleLowerValue)
										/ (lMid - l1) + synthesisParams.energyTriangleLowerValue;
							for (n = lMid; n < l2; n++)
								enEnv[n] = (n - lMid)
										* (synthesisParams.energyTriangleLowerValue - synthesisParams.energyTriangleUpperValue)
										/ (l2 - lMid) + synthesisParams.energyTriangleUpperValue;
							for (n = l2; n < enEnvLen; n++)
								enEnv[n] = synthesisParams.energyTriangleLowerValue;

							for (n = startIndex; n <= Math.min(noisePart.length - 1, startIndexNext); n++)
								noisePart[n] *= enEnv[n - startIndex];
						}
					}
				}
			}
		}

		// MaryUtils.plot(noisePart);

		return noisePart;
	}
}