/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved. Use is subject to license terms.
 *
 * Permission is hereby granted, free of charge, to use and distribute
 * this software and its documentation without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of this work, and to
 * permit persons to whom this work is furnished to do so, subject to
 * the following conditions:
 *
 * 1. The code must retain the above copyright notice, this list of
 *    conditions and the following disclaimer.
 * 2. Any modifications must be clearly marked as such.
 * 3. Original authors' names are not deleted.
 * 4. The authors' names are not used to endorse or promote products
 *    derived from this software without specific prior written
 *    permission.
 *
 * DFKI GMBH AND THE CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES WITH
 * REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL DFKI GMBH NOR THE
 * CONTRIBUTORS BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
 * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
 * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
 * THIS SOFTWARE.
 */
package marytts.signalproc.sinusoidal.hntm.synthesis;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;

import javax.sound.sampled.AudioFileFormat;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.UnsupportedAudioFileException;

import marytts.signalproc.analysis.RegularizedCepstrumEstimator;
import marytts.signalproc.analysis.RegularizedPostWarpedCepstrumEstimator;
import marytts.signalproc.analysis.RegularizedPreWarpedCepstrumEstimator;
import marytts.signalproc.sinusoidal.hntm.analysis.HntmAnalyzerParams;
import marytts.signalproc.sinusoidal.hntm.analysis.HntmSpeechFrame;
import marytts.signalproc.sinusoidal.hntm.analysis.HntmSpeechSignal;
import marytts.signalproc.window.Window;
import marytts.util.data.BufferedDoubleDataSource;
import marytts.util.data.audio.DDSAudioInputStream;
import marytts.util.io.FileUtils;
import marytts.util.math.MathUtils;
import marytts.util.signal.SignalProcUtils;
import marytts.util.string.StringUtils;

/**
 * Synthesizes the harmonic part using the linear phase interpolation and phase unwrapping technique described in:
 *
 * Stylianou, Y., 1996, "Harmonic plus Noise Models for Speech, combined with Statistical Methods, for Speech and Speaker
 * Modification", Ph.D. thesis, Ecole Nationale Supérieure des Télécommunications. (Chapter 3, A Harmonic plus Noise Model, HNM)
 *
 * @author Oytun Türk
 *
 */
public class HarmonicPartLinearPhaseInterpolatorSynthesizer {
    // TODO: Decrease the buffer sizes: with the latest implementation, we do not have to keep the whole signal in memory.
    // When the user passes a reference file for writing the separate output tracks to files, set the buffer sizes as before,
    // i.e. large enough to hold the whole signal.
    // Note that in normal operation mode we do not write the harmonic tracks to separate files, i.e. the reference file is null.
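    /*
     * Typical batch usage (a sketch; the HntmSpeechSignal and the two parameter objects are assumed to come from a
     * preceding HNM analysis step, e.g. an HntmAnalyzer run):
     *
     *   HarmonicPartLinearPhaseInterpolatorSynthesizer synthesizer =
     *       new HarmonicPartLinearPhaseInterpolatorSynthesizer(hnmSignal, analysisParams, synthesisParams);
     *   double[] harmonicPart = synthesizer.synthesizeAll();
     *
     * A frame-by-frame alternative is sketched before isReseted() below.
     */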
    private double[] harmonicPart = null;
    private double[][] harmonicTracks;
    private double[][] winOverlapWgt;
    //

    private HntmAnalyzerParams analysisParams;
    private HntmSynthesizerParams synthesisParams;
    private int transitionLen;
    private double[] halfTransitionWinLeft;
    private double[] halfTransitionWinRight;
    private String referenceFile; // Reference audio file, used when the user wants to write the separate tracks to output files
    private int pipeOutStartIndex;
    private int pipeOutEndIndex;
    private int currentFrameIndex;
    private HntmSpeechSignal hnmSignal;
    private boolean isReseted;

    public HarmonicPartLinearPhaseInterpolatorSynthesizer(HntmSpeechSignal hnmSignalIn, HntmAnalyzerParams analysisParamsIn,
            HntmSynthesizerParams synthesisParamsIn) {
        this(hnmSignalIn, analysisParamsIn, synthesisParamsIn, null);
    }

    public HarmonicPartLinearPhaseInterpolatorSynthesizer(HntmSpeechSignal hnmSignalIn, HntmAnalyzerParams analysisParamsIn,
            HntmSynthesizerParams synthesisParamsIn, String referenceFileIn) {
        hnmSignal = hnmSignalIn;
        harmonicPart = null;
        harmonicTracks = null;
        winOverlapWgt = null;
        analysisParams = analysisParamsIn;
        synthesisParams = synthesisParamsIn;
        referenceFile = referenceFileIn;

        transitionLen = SignalProcUtils.time2sample(synthesisParams.unvoicedVoicedTrackTransitionInSeconds,
                hnmSignal.samplingRateInHz);
        Window transitionWin = Window.get(Window.HAMMING, transitionLen * 2);
        transitionWin.normalizePeakValue(1.0f);
        halfTransitionWinLeft = transitionWin.getCoeffsLeftHalf();
        halfTransitionWinRight = transitionWin.getCoeffsRightHalf();

        isReseted = false;
        reset();
    }

    // Reset synthesis variables to start synthesis from the beginning
    public void reset() {
        if (!isReseted) {
            isReseted = true;

            int outputLen = SignalProcUtils.time2sample(hnmSignal.originalDurationInSeconds, hnmSignal.samplingRateInHz);
            harmonicPart = new double[outputLen]; // In fact, this should be the prosody-scaled length once prosody
                                                  // modifications are implemented
            Arrays.fill(harmonicPart, 0.0);

            // Separate tracks
            int k;
            if (analysisParams.hnmPitchVoicingAnalyzerParams.maximumTotalHarmonics > 0) {
                harmonicTracks = new double[analysisParams.hnmPitchVoicingAnalyzerParams.maximumTotalHarmonics][];
                winOverlapWgt = new double[analysisParams.hnmPitchVoicingAnalyzerParams.maximumTotalHarmonics][];
                for (k = 0; k < analysisParams.hnmPitchVoicingAnalyzerParams.maximumTotalHarmonics; k++) {
                    harmonicTracks[k] = new double[outputLen];
                    Arrays.fill(harmonicTracks[k], 0.0);
                    if (synthesisParams.overlappingHarmonicPartSynthesis) {
                        winOverlapWgt[k] = new double[outputLen];
                        Arrays.fill(winOverlapWgt[k], 0.0);
                    }
                }
            }
            //

            pipeOutStartIndex = 0;
            pipeOutEndIndex = -1; // Increase this appropriately during frame-based synthesis; make sure it always
                                  // precedes any overlap region
            currentFrameIndex = 0;
            //
        }
    }
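    /*
     * Frame-by-frame (streaming) usage, a sketch of the same loop that synthesizeAll() runs internally;
     * consumeSamples() stands for any hypothetical consumer of the generated audio chunks:
     *
     *   synthesizer.reset();
     *   while (synthesizer.nextFrameAvailable()) {
     *       double[] chunk = synthesizer.synthesizeNext();
     *       if (chunk != null)
     *           consumeSamples(chunk);
     *   }
     *   double[] remaining = synthesizer.generateOutput(true); // flush whatever is left in the buffers
     *   if (remaining != null)
     *       consumeSamples(remaining);
     */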
    // Has the synthesizer been reset so that synthesis starts from the beginning?
    public boolean isReseted() {
        return isReseted;
    }

    public boolean nextFrameAvailable() {
        return currentFrameIndex + 1 < hnmSignal.frames.length;
    }

    // For frame-based synthesis from outside, create the same loop as this function does.
    // Make sure to call reset() if you want to run synthesis with the identical object more than once.
    public double[] synthesizeAll() {
        reset();

        double[] output = null;
        int harmonicPartIndex = 0;
        while (nextFrameAvailable()) {
            output = synthesizeNext();
            if (output != null) {
                System.arraycopy(output, 0, harmonicPart, harmonicPartIndex, output.length);
                harmonicPartIndex += output.length;
            }
        }

        // Generate remaining output
        output = generateOutput(true);
        if (output != null) {
            System.arraycopy(output, 0, harmonicPart, harmonicPartIndex, output.length);
            harmonicPartIndex += output.length;
        }
        //

        return harmonicPart;
    }

    public double[] synthesizeNext() {
        assert currentFrameIndex < hnmSignal.frames.length;

        double[] output = null;

        HntmSpeechFrame prevFrame, nextFrame;

        if (currentFrameIndex > 0)
            prevFrame = hnmSignal.frames[currentFrameIndex - 1];
        else
            prevFrame = null;

        if (currentFrameIndex < hnmSignal.frames.length - 1)
            nextFrame = hnmSignal.frames[currentFrameIndex + 1];
        else
            nextFrame = null;

        boolean isFirstSynthesisFrame = false;
        if (currentFrameIndex == 0)
            isFirstSynthesisFrame = true;

        boolean isLastSynthesisFrame = false;
        if (currentFrameIndex == hnmSignal.frames.length - 1)
            isLastSynthesisFrame = true;

        processFrame(prevFrame, hnmSignal.frames[currentFrameIndex], nextFrame, isFirstSynthesisFrame, isLastSynthesisFrame);

        // Start to generate output as soon as a few frames are processed
        if (currentFrameIndex > synthesisParams.synthesisFramesToAccumulateBeforeAudioGeneration) {
            pipeOutEndIndex = SignalProcUtils.time2sample(
                    hnmSignal.frames[currentFrameIndex - synthesisParams.synthesisFramesToAccumulateBeforeAudioGeneration].tAnalysisInSeconds,
                    hnmSignal.samplingRateInHz);
            output = generateOutput(false);
        }
        //

        isReseted = false;
        currentFrameIndex++;

        return output;
    }

    private void processFrame(HntmSpeechFrame prevFrame, HntmSpeechFrame currentFrame, HntmSpeechFrame nextFrame,
            boolean isFirstSynthesisFrame, boolean isLastSynthesisFrame) {
        int i, k, n;
        int currentHarmonicNo;
        int numHarmonicsCurrentFrame;
        float f0InHz, f0InHzNext;
        float f0Average, f0AverageNext;
        float[] currentCeps = null;
        float[] nextCeps = null;
        boolean isPrevVoiced = false;
        boolean isVoiced = false;
        boolean isNextVoiced = false;
        double aksi;
        double aksiPlusOne;
        double phaseki;
        double phasekiPlusOne;
        double ht;
        double phasekt = 0.0;
        double phasekiEstimate = 0.0;
        double phasekiPlusOneEstimate = 0.0;
        int Mk;
        boolean isTrackVoiced, isNextTrackVoiced, isPrevTrackVoiced;
        double tsik = 0.0; // Synthesis time in seconds
        double tsikPlusOne = 0.0; // Synthesis time in seconds
        double trackStartInSeconds, trackEndInSeconds;
        int trackStartIndex, trackEndIndex;
        double akt;
        double currentOverlapWinWgt;

        if (prevFrame != null && prevFrame.h != null && prevFrame.h.complexAmps != null && prevFrame.h.complexAmps.length > 0)
            isPrevVoiced = true;

        if (currentFrame.h != null && currentFrame.h.complexAmps != null && currentFrame.h.complexAmps.length > 0)
            isVoiced = true;

        if (nextFrame != null && nextFrame.h != null && nextFrame.h.complexAmps != null && nextFrame.h.complexAmps.length > 0)
            isNextVoiced = true;

        if (isVoiced)
            numHarmonicsCurrentFrame = currentFrame.h.complexAmps.length;
        else if (!isVoiced && isNextVoiced)
            numHarmonicsCurrentFrame = nextFrame.h.complexAmps.length;
        else
            numHarmonicsCurrentFrame = 0;

        f0InHz = currentFrame.f0InHz;
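        // The per-harmonic loop that follows synthesizes one track per harmonic k = 1..numHarmonicsCurrentFrame between
        // the analysis instants t_i (tsik) and t_i+1 (tsikPlusOne): amplitudes a_k are interpolated linearly, and phases
        // are interpolated linearly after unwrapping with the integer Mk that brings the measured phase at t_i+1 closest
        // to the value predicted from the average f0 (Stylianou 1996, Chapter 3):
        //
        //   phi_k(t) = phi_k(t_i) + (phi_k(t_i+1) + 2*pi*Mk - phi_k(t_i)) * (t - t_i) / (t_i+1 - t_i)
        //   Mk      = round((phi_predicted(t_i+1) - phi_measured(t_i+1)) / (2*pi))
        //
        // Each sample of track k then contributes a_k(t) * cos(phi_k(t)) to the harmonic part.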
        if (isNextVoiced)
            f0InHzNext = nextFrame.f0InHz;
        else
            f0InHzNext = f0InHz;

        f0Average = 0.5f * (f0InHz + f0InHzNext);

        if (!analysisParams.useHarmonicAmplitudesDirectly) {
            currentCeps = currentFrame.h.getCeps(f0InHz, hnmSignal.samplingRateInHz, analysisParams);
            if (nextFrame != null)
                nextCeps = nextFrame.h.getCeps(f0InHzNext, hnmSignal.samplingRateInHz, analysisParams);
            else
                nextCeps = null;
        }

        for (k = 0; k < numHarmonicsCurrentFrame; k++) {
            currentHarmonicNo = k + 1;

            aksi = 0.0;
            aksiPlusOne = 0.0;

            phaseki = 0.0f;
            phasekiPlusOne = 0.0f;

            isPrevTrackVoiced = false;
            isTrackVoiced = false;
            isNextTrackVoiced = false;

            if (prevFrame != null && prevFrame.h != null && prevFrame.h.complexAmps != null
                    && prevFrame.h.complexAmps.length > k)
                isPrevTrackVoiced = true;

            if (currentFrame != null && currentFrame.h != null && currentFrame.h.complexAmps != null
                    && currentFrame.h.complexAmps.length > k)
                isTrackVoiced = true;

            if (nextFrame != null && nextFrame.h != null && nextFrame.h.complexAmps != null
                    && nextFrame.h.complexAmps.length > k)
                isNextTrackVoiced = true;

            tsik = currentFrame.tAnalysisInSeconds;

            if (isFirstSynthesisFrame)
                trackStartInSeconds = 0.0;
            else
                trackStartInSeconds = tsik;

            if (isLastSynthesisFrame || nextFrame == null)
                tsikPlusOne = hnmSignal.originalDurationInSeconds;
            else
                tsikPlusOne = nextFrame.tAnalysisInSeconds;

            trackEndInSeconds = tsikPlusOne;

            if (synthesisParams.overlappingHarmonicPartSynthesis) {
                trackStartInSeconds -= synthesisParams.harmonicSynthesisOverlapInSeconds;
                trackEndInSeconds += synthesisParams.harmonicSynthesisOverlapInSeconds;
            }

            trackStartIndex = SignalProcUtils.time2sample(trackStartInSeconds, hnmSignal.samplingRateInHz);
            trackEndIndex = SignalProcUtils.time2sample(trackEndInSeconds, hnmSignal.samplingRateInHz);

            if (!synthesisParams.overlappingHarmonicPartSynthesis) {
                if (!isPrevTrackVoiced)
                    trackStartIndex -= transitionLen;
                if (!isNextTrackVoiced)
                    trackEndIndex += transitionLen;
            }

            Window overlapWin = null;
            double[] overlapWinWgt = null;
            if (synthesisParams.overlappingHarmonicPartSynthesis) {
                overlapWin = Window.get(Window.HAMMING, trackEndIndex - trackStartIndex + 1);
                overlapWin.normalizePeakValue(1.0f);
                overlapWinWgt = overlapWin.getCoeffs();
            }

            if (isTrackVoiced && trackEndIndex - trackStartIndex + 1 > 0) {
                // Amplitudes
                if (isTrackVoiced) {
                    if (!analysisParams.useHarmonicAmplitudesDirectly) {
                        if (analysisParams.regularizedCepstrumWarpingMethod == RegularizedCepstrumEstimator.REGULARIZED_CEPSTRUM_WITH_PRE_BARK_WARPING)
                            aksi = RegularizedPreWarpedCepstrumEstimator.cepstrum2linearSpectrumValue(currentCeps,
                                    currentHarmonicNo * f0InHz, hnmSignal.samplingRateInHz);
                        else if (analysisParams.regularizedCepstrumWarpingMethod == RegularizedCepstrumEstimator.REGULARIZED_CEPSTRUM_WITH_POST_MEL_WARPING)
                            aksi = RegularizedPostWarpedCepstrumEstimator.cepstrum2linearSpectrumValue(currentCeps,
                                    currentHarmonicNo * f0InHz, hnmSignal.samplingRateInHz);
                    } else {
                        if (k < currentFrame.h.complexAmps.length)
                            aksi = MathUtils.magnitudeComplex(currentFrame.h.complexAmps[k]); // Use amplitudes directly,
                                                                                              // without the cepstrum method
                    }
                } else
                    aksi = 0.0;

                if (isNextTrackVoiced) {
                    if (!analysisParams.useHarmonicAmplitudesDirectly) {
                        if (analysisParams.regularizedCepstrumWarpingMethod == RegularizedCepstrumEstimator.REGULARIZED_CEPSTRUM_WITH_PRE_BARK_WARPING)
                            aksiPlusOne = RegularizedPreWarpedCepstrumEstimator.cepstrum2linearSpectrumValue(nextCeps,
                                    currentHarmonicNo * f0InHzNext, hnmSignal.samplingRateInHz);
                        else if (analysisParams.regularizedCepstrumWarpingMethod ==
                                RegularizedCepstrumEstimator.REGULARIZED_CEPSTRUM_WITH_POST_MEL_WARPING)
                            aksiPlusOne = RegularizedPostWarpedCepstrumEstimator.cepstrum2linearSpectrumValue(nextCeps,
                                    currentHarmonicNo * f0InHzNext, hnmSignal.samplingRateInHz);
                    } else {
                        if (k < nextFrame.h.complexAmps.length)
                            aksiPlusOne = MathUtils.magnitudeComplex(nextFrame.h.complexAmps[k]); // Use amplitudes directly,
                                                                                                  // without the cepstrum method
                    }
                } else
                    aksiPlusOne = 0.0;
                //

                // Phases
                if (isTrackVoiced) {
                    if (currentHarmonicNo == 0)
                        phaseki = 0.0f;
                    else
                        phaseki = MathUtils.phaseInRadians(currentFrame.h.complexAmps[k]);
                }
                if (isNextTrackVoiced) {
                    if (currentHarmonicNo == 0)
                        phasekiPlusOne = 0.0f;
                    else
                        phasekiPlusOne = MathUtils.phaseInRadians(nextFrame.h.complexAmps[k]);
                }

                // phaseki += MathUtils.degrees2radian(-4.0);
                // phasekiPlusOne += MathUtils.degrees2radian(-4.0);

                if (!isTrackVoiced && isNextTrackVoiced) {
                    phaseki = (float) (phasekiPlusOne - currentHarmonicNo * MathUtils.TWOPI * f0InHzNext
                            * (tsikPlusOne - tsik)); // Equation (3.54)
                    aksi = 0.0;
                } else if (isTrackVoiced && !isNextTrackVoiced) {
                    phasekiPlusOne = phaseki + currentHarmonicNo * MathUtils.TWOPI * f0InHz * (tsikPlusOne - tsik); // Equation (3.55)
                    aksiPlusOne = 0.0;
                }

                phasekiPlusOneEstimate = phaseki + currentHarmonicNo * MathUtils.TWOPI * f0Average * (tsikPlusOne - tsik);
                // phasekiPlusOneEstimate = MathUtils.TWOPI*(Math.random()-0.5); //Random phase
                // System.out.println(String.valueOf(f0Average) + " - " + String.valueOf(f0InHz) + " - " +
                // String.valueOf(f0InHzNext));

                Mk = (int) Math.floor((phasekiPlusOneEstimate - phasekiPlusOne) / MathUtils.TWOPI + 0.5);
                //

                for (n = Math.max(0, trackStartIndex); n <= Math.min(trackEndIndex, harmonicPart.length - 1); n++) {
                    double t = SignalProcUtils.sample2time(n, hnmSignal.samplingRateInHz);
                    // if (t>=tsik && t<tsikPlusOne) {

                    // Amplitude estimate
                    if (t < tsik)
                        akt = MathUtils.interpolatedSample(tsik - synthesisParams.unvoicedVoicedTrackTransitionInSeconds,
                                t, tsik, 0.0, aksi);
                    else if (t > tsikPlusOne)
                        akt = MathUtils.interpolatedSample(tsikPlusOne, t,
                                tsikPlusOne + synthesisParams.unvoicedVoicedTrackTransitionInSeconds, aksiPlusOne, 0.0);
                    else
                        akt = MathUtils.interpolatedSample(tsik, t, tsikPlusOne, aksi, aksiPlusOne);
                    //

                    // Phase estimate
                    phasekt = phaseki + (phasekiPlusOne + MathUtils.TWOPI * Mk - phaseki) * (t - tsik) / (tsikPlusOne - tsik);
                    //

                    if (synthesisParams.overlappingHarmonicPartSynthesis) {
                        currentOverlapWinWgt = overlapWinWgt[n - Math.max(0, trackStartIndex)];
                        winOverlapWgt[k][n] += currentOverlapWinWgt;
                    } else
                        currentOverlapWinWgt = 1.0;

                    if (!isPrevTrackVoiced && n - trackStartIndex < transitionLen)
                        harmonicTracks[k][n] = currentOverlapWinWgt * halfTransitionWinLeft[n - trackStartIndex] * akt
                                * Math.cos(phasekt);
                    else if (!isNextTrackVoiced && trackEndIndex - n < transitionLen)
                        harmonicTracks[k][n] = currentOverlapWinWgt
                                * halfTransitionWinRight[transitionLen - (trackEndIndex - n) - 1] * akt * Math.cos(phasekt);
                    else
                        harmonicTracks[k][n] = currentOverlapWinWgt * akt * Math.cos(phasekt);
                }
            }
        }
    }

    public double[] generateOutput(boolean pipeOutAllOutput) {
        double[] output = null;

        if (harmonicTracks != null) {
            int k, n;

            if (pipeOutAllOutput)
                pipeOutEndIndex = harmonicPart.length;

            output = new double[Math.min(pipeOutEndIndex, harmonicPart.length - 1) - pipeOutStartIndex + 1];

            if (!synthesisParams.overlappingHarmonicPartSynthesis) {
                for (k = 0; k < harmonicTracks.length; k++) {
                    // for (n=0; n<harmonicPart.length; n++)
                    for (n = pipeOutStartIndex; n <= Math.min(pipeOutEndIndex, harmonicPart.length - 1); n++) {
                        //
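                        // Sum this harmonic track both into the full-length harmonic part buffer and into the chunk
                        // ([pipeOutStartIndex, pipeOutEndIndex]) that is returned to the caller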
                        harmonicPart[n] += harmonicTracks[k][n];
                        output[n - pipeOutStartIndex] += harmonicTracks[k][n];
                    }
                }
            } else {
                for (k = 0; k < harmonicTracks.length; k++) {
                    // for (n=0; n<harmonicPart.length; n++)
                    for (n = pipeOutStartIndex; n <= Math.min(pipeOutEndIndex, harmonicPart.length - 1); n++) {
                        if (winOverlapWgt[k][n] > 0.0f) {
                            // harmonicPart[n] += harmonicTracks[k][n]/winOverlapWgt[k][n];
                            output[n - pipeOutStartIndex] += harmonicTracks[k][n] / winOverlapWgt[k][n];
                        } else {
                            // harmonicPart[n] += harmonicTracks[k][n];
                            output[n - pipeOutStartIndex] += harmonicTracks[k][n];
                        }
                    }
                }
            }

            pipeOutStartIndex = pipeOutEndIndex + 1;

            if (pipeOutAllOutput && referenceFile != null && FileUtils.exists(referenceFile)
                    && synthesisParams.writeSeparateHarmonicTracksToOutputs) {
                // Write the separate tracks to output files
                AudioInputStream inputAudio = null;
                try {
                    inputAudio = AudioSystem.getAudioInputStream(new File(referenceFile));
                } catch (UnsupportedAudioFileException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                } catch (IOException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }

                if (inputAudio != null) {
                    for (k = 0; k < harmonicTracks.length; k++) {
                        harmonicTracks[k] = MathUtils.divide(harmonicTracks[k], 32767.0);

                        DDSAudioInputStream outputAudio = new DDSAudioInputStream(
                                new BufferedDoubleDataSource(harmonicTracks[k]), inputAudio.getFormat());
                        String outFileName = StringUtils.getFolderName(referenceFile) + "harmonicTrack"
                                + String.valueOf(k + 1) + ".wav";
                        try {
                            AudioSystem.write(outputAudio, AudioFileFormat.Type.WAVE, new File(outFileName));
                        } catch (IOException e) {
                            // TODO Auto-generated catch block
                            e.printStackTrace();
                        }
                    }
                }
            }
            //
        }

        return output;
    }
}