/**
* Copyright 2007 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* Permission is hereby granted, free of charge, to use and distribute
* this software and its documentation without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of this work, and to
* permit persons to whom this work is furnished to do so, subject to
* the following conditions:
*
* 1. The code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* 2. Any modifications must be clearly marked as such.
* 3. Original authors' names are not deleted.
* 4. The authors' names are not used to endorse or promote products
* derived from this software without specific prior written
* permission.
*
* DFKI GMBH AND THE CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES WITH
* REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL DFKI GMBH NOR THE
* CONTRIBUTORS BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
* DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
* PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
* THIS SOFTWARE.
*/
package marytts.signalproc.sinusoidal.hntm.synthesis;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import javax.sound.sampled.AudioFileFormat;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.UnsupportedAudioFileException;
import marytts.signalproc.analysis.RegularizedCepstrumEstimator;
import marytts.signalproc.analysis.RegularizedPostWarpedCepstrumEstimator;
import marytts.signalproc.analysis.RegularizedPreWarpedCepstrumEstimator;
import marytts.signalproc.sinusoidal.hntm.analysis.HntmAnalyzerParams;
import marytts.signalproc.sinusoidal.hntm.analysis.HntmSpeechFrame;
import marytts.signalproc.sinusoidal.hntm.analysis.HntmSpeechSignal;
import marytts.signalproc.window.Window;
import marytts.util.data.BufferedDoubleDataSource;
import marytts.util.data.audio.DDSAudioInputStream;
import marytts.util.io.FileUtils;
import marytts.util.math.MathUtils;
import marytts.util.signal.SignalProcUtils;
import marytts.util.string.StringUtils;
/**
* Synthesizes the harmonic part using the linear phase interpolation and phase unwrapping technique described in:
*
* Stylianou, Y., 1996, "Harmonic plus Noise Models for Speech, combined with Statistical Methods, for Speech and Speaker
* Modification", Ph.D. thesis, Ecole Nationale Supérieure des Télécommunications. (Chapter 3, A Harmonic plus Noise Model, HNM)
*
* @author Oytun Türk
*
*/
public class HarmonicPartLinearPhaseInterpolatorSynthesizer {
// TO DO: Decrease the buffer sizes since with the latest implementation, we do not have to keep all signal
// When the user enters a reference file to write separate output tracks to files, set the buffer sizes as we do previously,
// i.e. sufficient to keep all signal
// Note that, in normal operation mode, we do not write the harmonic tracks to separate files, i.e. reference file is null.
private double[] harmonicPart = null;
private double[][] harmonicTracks;
private double[][] winOverlapWgt;
//
private HntmAnalyzerParams analysisParams;
private HntmSynthesizerParams synthesisParams;
private int transitionLen;
private double[] halfTransitionWinLeft;
private double[] halfTransitionWinRight;
private String referenceFile; // Reference if the user wants to write the separate tracks to output
private int pipeOutStartIndex;
private int pipeOutEndIndex;
private int currentFrameIndex;
private HntmSpeechSignal hnmSignal;
private boolean isReseted;
public HarmonicPartLinearPhaseInterpolatorSynthesizer(HntmSpeechSignal hnmSignalIn, HntmAnalyzerParams analysisParamsIn,
HntmSynthesizerParams synthesisParamsIn) {
this(hnmSignalIn, analysisParamsIn, synthesisParamsIn, null);
}
public HarmonicPartLinearPhaseInterpolatorSynthesizer(HntmSpeechSignal hnmSignalIn, HntmAnalyzerParams analysisParamsIn,
HntmSynthesizerParams synthesisParamsIn, String referenceFileIn) {
hnmSignal = hnmSignalIn;
harmonicPart = null;
harmonicTracks = null;
winOverlapWgt = null;
analysisParams = analysisParamsIn;
synthesisParams = synthesisParamsIn;
referenceFile = referenceFileIn;
transitionLen = SignalProcUtils.time2sample(synthesisParams.unvoicedVoicedTrackTransitionInSeconds,
hnmSignal.samplingRateInHz);
Window transitionWin = Window.get(Window.HAMMING, transitionLen * 2);
transitionWin.normalizePeakValue(1.0f);
halfTransitionWinLeft = transitionWin.getCoeffsLeftHalf();
halfTransitionWinRight = transitionWin.getCoeffsRightHalf();
isReseted = false;
reset();
}
// Reset synthesis variables to start synthesis from the beginning
public void reset() {
if (!isReseted) {
isReseted = true;
int outputLen = SignalProcUtils.time2sample(hnmSignal.originalDurationInSeconds, hnmSignal.samplingRateInHz);
harmonicPart = new double[outputLen]; // In fact, this should be prosody scaled length when you implement prosody
// modifications
Arrays.fill(harmonicPart, 0.0);
// Separate tracks
int k;
if (analysisParams.hnmPitchVoicingAnalyzerParams.maximumTotalHarmonics > 0) {
harmonicTracks = new double[analysisParams.hnmPitchVoicingAnalyzerParams.maximumTotalHarmonics][];
winOverlapWgt = new double[analysisParams.hnmPitchVoicingAnalyzerParams.maximumTotalHarmonics][];
for (k = 0; k < analysisParams.hnmPitchVoicingAnalyzerParams.maximumTotalHarmonics; k++) {
harmonicTracks[k] = new double[outputLen];
Arrays.fill(harmonicTracks[k], 0.0);
if (synthesisParams.overlappingHarmonicPartSynthesis) {
winOverlapWgt[k] = new double[outputLen];
Arrays.fill(winOverlapWgt[k], 0.0);
}
}
}
//
pipeOutStartIndex = 0;
pipeOutEndIndex = -1; // You should increase this appropriately during frame based synthesis. Make sure it always
// precedes any overlap region
currentFrameIndex = 0;
//
}
}
// Is reseted for starting synthesis from the beginning?
public boolean isReseted() {
return isReseted;
}
public boolean nextFrameAvailable() {
return currentFrameIndex + 1 < hnmSignal.frames.length;
}
// For frame based synthesis from outside, create the same loop as this function does
// Make sure to call reset() if you want to do synthesis with the identical object more than once
public double[] synthesizeAll() {
reset();
double[] output = null;
int harmonicPartIndex = 0;
while (nextFrameAvailable()) {
output = synthesizeNext();
if (output != null) {
System.arraycopy(output, 0, harmonicPart, harmonicPartIndex, output.length);
harmonicPartIndex += output.length;
}
}
// Generate remaining output
output = generateOutput(true);
if (output != null) {
System.arraycopy(output, 0, harmonicPart, harmonicPartIndex, output.length);
harmonicPartIndex += output.length;
}
//
return harmonicPart;
}
public double[] synthesizeNext() {
assert currentFrameIndex < hnmSignal.frames.length;
double[] output = null;
HntmSpeechFrame prevFrame, nextFrame;
if (currentFrameIndex > 0)
prevFrame = hnmSignal.frames[currentFrameIndex - 1];
else
prevFrame = null;
if (currentFrameIndex < hnmSignal.frames.length - 1)
nextFrame = hnmSignal.frames[currentFrameIndex + 1];
else
nextFrame = null;
boolean isFirstSynthesisFrame = false;
if (currentFrameIndex == 0)
isFirstSynthesisFrame = true;
boolean isLastSynthesisFrame = false;
if (currentFrameIndex == hnmSignal.frames.length - 1)
isLastSynthesisFrame = true;
processFrame(prevFrame, hnmSignal.frames[currentFrameIndex], nextFrame, isFirstSynthesisFrame, isLastSynthesisFrame);
// Start to generate output as soon as a few frames are processed
if (currentFrameIndex > synthesisParams.synthesisFramesToAccumulateBeforeAudioGeneration) {
pipeOutEndIndex = SignalProcUtils.time2sample(hnmSignal.frames[currentFrameIndex
- synthesisParams.synthesisFramesToAccumulateBeforeAudioGeneration].tAnalysisInSeconds,
hnmSignal.samplingRateInHz);
output = generateOutput(false);
}
//
isReseted = false;
currentFrameIndex++;
return output;
}
private void processFrame(HntmSpeechFrame prevFrame, HntmSpeechFrame currentFrame, HntmSpeechFrame nextFrame,
boolean isFirstSynthesisFrame, boolean isLastSynthesisFrame) {
int i, k, n;
int currentHarmonicNo;
int numHarmonicsCurrentFrame;
float f0InHz, f0InHzNext;
float f0Average, f0AverageNext;
float[] currentCeps = null;
float[] nextCeps = null;
boolean isPrevVoiced = false;
boolean isVoiced = false;
boolean isNextVoiced = false;
double aksi;
double aksiPlusOne;
double phaseki;
double phasekiPlusOne;
double ht;
double phasekt = 0.0;
double phasekiEstimate = 0.0;
double phasekiPlusOneEstimate = 0.0;
int Mk;
boolean isTrackVoiced, isNextTrackVoiced, isPrevTrackVoiced;
double tsik = 0.0; // Synthesis time in seconds
double tsikPlusOne = 0.0; // Synthesis time in seconds
double trackStartInSeconds, trackEndInSeconds;
int trackStartIndex, trackEndIndex;
double akt;
double currentOverlapWinWgt;
if (prevFrame != null && prevFrame.h != null && prevFrame.h.complexAmps != null && prevFrame.h.complexAmps.length > 0)
isPrevVoiced = true;
if (currentFrame.h != null && currentFrame.h.complexAmps != null && currentFrame.h.complexAmps.length > 0)
isVoiced = true;
if (nextFrame != null && nextFrame.h != null && nextFrame.h.complexAmps != null && nextFrame.h.complexAmps.length > 0)
isNextVoiced = true;
if (isVoiced)
numHarmonicsCurrentFrame = currentFrame.h.complexAmps.length;
else if (!isVoiced && isNextVoiced)
numHarmonicsCurrentFrame = nextFrame.h.complexAmps.length;
else
numHarmonicsCurrentFrame = 0;
f0InHz = currentFrame.f0InHz;
if (isNextVoiced)
f0InHzNext = nextFrame.f0InHz;
else
f0InHzNext = f0InHz;
f0Average = 0.5f * (f0InHz + f0InHzNext);
if (!analysisParams.useHarmonicAmplitudesDirectly) {
currentCeps = currentFrame.h.getCeps(f0InHz, hnmSignal.samplingRateInHz, analysisParams);
if (nextFrame != null)
nextCeps = nextFrame.h.getCeps(f0InHzNext, hnmSignal.samplingRateInHz, analysisParams);
else
nextCeps = null;
}
for (k = 0; k < numHarmonicsCurrentFrame; k++) {
currentHarmonicNo = k + 1;
aksi = 0.0;
aksiPlusOne = 0.0;
phaseki = 0.0f;
phasekiPlusOne = 0.0f;
isPrevTrackVoiced = false;
isTrackVoiced = false;
isNextTrackVoiced = false;
if (prevFrame != null && prevFrame.h != null && prevFrame.h.complexAmps != null && prevFrame.h.complexAmps.length > k)
isPrevTrackVoiced = true;
if (currentFrame != null && currentFrame.h != null && currentFrame.h.complexAmps != null
&& currentFrame.h.complexAmps.length > k)
isTrackVoiced = true;
if (nextFrame != null && nextFrame.h != null && nextFrame.h.complexAmps != null && nextFrame.h.complexAmps.length > k)
isNextTrackVoiced = true;
tsik = currentFrame.tAnalysisInSeconds;
if (isFirstSynthesisFrame)
trackStartInSeconds = 0.0;
else
trackStartInSeconds = tsik;
if (isLastSynthesisFrame || nextFrame == null)
tsikPlusOne = hnmSignal.originalDurationInSeconds;
else
tsikPlusOne = nextFrame.tAnalysisInSeconds;
trackEndInSeconds = tsikPlusOne;
if (synthesisParams.overlappingHarmonicPartSynthesis) {
trackStartInSeconds -= synthesisParams.harmonicSynthesisOverlapInSeconds;
trackEndInSeconds += synthesisParams.harmonicSynthesisOverlapInSeconds;
}
trackStartIndex = SignalProcUtils.time2sample(trackStartInSeconds, hnmSignal.samplingRateInHz);
trackEndIndex = SignalProcUtils.time2sample(trackEndInSeconds, hnmSignal.samplingRateInHz);
if (!synthesisParams.overlappingHarmonicPartSynthesis) {
if (!isPrevTrackVoiced)
trackStartIndex -= transitionLen;
if (!isNextTrackVoiced)
trackEndIndex += transitionLen;
}
Window overlapWin = null;
double[] overlapWinWgt = null;
if (synthesisParams.overlappingHarmonicPartSynthesis) {
overlapWin = Window.get(Window.HAMMING, trackEndIndex - trackStartIndex + 1);
overlapWin.normalizePeakValue(1.0f);
overlapWinWgt = overlapWin.getCoeffs();
}
if (isTrackVoiced && trackEndIndex - trackStartIndex + 1 > 0) {
// Amplitudes
if (isTrackVoiced) {
if (!analysisParams.useHarmonicAmplitudesDirectly) {
if (analysisParams.regularizedCepstrumWarpingMethod == RegularizedCepstrumEstimator.REGULARIZED_CEPSTRUM_WITH_PRE_BARK_WARPING)
aksi = RegularizedPreWarpedCepstrumEstimator.cepstrum2linearSpectrumValue(currentCeps,
currentHarmonicNo * f0InHz, hnmSignal.samplingRateInHz);
else if (analysisParams.regularizedCepstrumWarpingMethod == RegularizedCepstrumEstimator.REGULARIZED_CEPSTRUM_WITH_POST_MEL_WARPING)
aksi = RegularizedPostWarpedCepstrumEstimator.cepstrum2linearSpectrumValue(currentCeps,
currentHarmonicNo * f0InHz, hnmSignal.samplingRateInHz);
} else {
if (k < currentFrame.h.complexAmps.length)
aksi = MathUtils.magnitudeComplex(currentFrame.h.complexAmps[k]); // Use amplitudes directly without
// cepstrum method
}
} else
aksi = 0.0;
if (isNextTrackVoiced) {
if (!analysisParams.useHarmonicAmplitudesDirectly) {
if (analysisParams.regularizedCepstrumWarpingMethod == RegularizedCepstrumEstimator.REGULARIZED_CEPSTRUM_WITH_PRE_BARK_WARPING)
aksiPlusOne = RegularizedPreWarpedCepstrumEstimator.cepstrum2linearSpectrumValue(nextCeps,
currentHarmonicNo * f0InHzNext, hnmSignal.samplingRateInHz);
else if (analysisParams.regularizedCepstrumWarpingMethod == RegularizedCepstrumEstimator.REGULARIZED_CEPSTRUM_WITH_POST_MEL_WARPING)
aksiPlusOne = RegularizedPostWarpedCepstrumEstimator.cepstrum2linearSpectrumValue(nextCeps,
currentHarmonicNo * f0InHzNext, hnmSignal.samplingRateInHz);
} else {
if (k < nextFrame.h.complexAmps.length)
aksiPlusOne = MathUtils.magnitudeComplex(nextFrame.h.complexAmps[k]); // Use amplitudes directly
// without cepstrum method
}
} else
aksiPlusOne = 0.0;
//
// Phases
if (isTrackVoiced) {
if (currentHarmonicNo == 0)
phaseki = 0.0f;
else
phaseki = MathUtils.phaseInRadians(currentFrame.h.complexAmps[k]);
}
if (isNextTrackVoiced) {
if (currentHarmonicNo == 0)
phasekiPlusOne = 0.0f;
else
phasekiPlusOne = MathUtils.phaseInRadians(nextFrame.h.complexAmps[k]);
}
// phaseki += MathUtils.degrees2radian(-4.0);
// phasekiPlusOne += MathUtils.degrees2radian(-4.0);
if (!isTrackVoiced && isNextTrackVoiced) {
phaseki = (float) (phasekiPlusOne - currentHarmonicNo * MathUtils.TWOPI * f0InHzNext * (tsikPlusOne - tsik)); // Equation
// (3.54)
aksi = 0.0;
} else if (isTrackVoiced && !isNextTrackVoiced) {
phasekiPlusOne = phaseki + currentHarmonicNo * MathUtils.TWOPI * f0InHz * (tsikPlusOne - tsik); // Equation
// (3.55)
aksiPlusOne = 0.0;
}
phasekiPlusOneEstimate = phaseki + currentHarmonicNo * MathUtils.TWOPI * f0Average * (tsikPlusOne - tsik);
// phasekiPlusOneEstimate = MathUtils.TWOPI*(Math.random()-0.5); //Random phase
// System.out.println(String.valueOf(f0Average) + " - " + String.valueOf(f0InHz) + " - " +
// String.valueOf(f0InHzNext));
Mk = (int) Math.floor((phasekiPlusOneEstimate - phasekiPlusOne) / MathUtils.TWOPI + 0.5);
//
for (n = Math.max(0, trackStartIndex); n <= Math.min(trackEndIndex, harmonicPart.length - 1); n++) {
double t = SignalProcUtils.sample2time(n, hnmSignal.samplingRateInHz);
// if (t>=tsik && t<tsikPlusOne)
{
// Amplitude estimate
if (t < tsik)
akt = MathUtils.interpolatedSample(tsik - synthesisParams.unvoicedVoicedTrackTransitionInSeconds, t,
tsik, 0.0, aksi);
else if (t > tsikPlusOne)
akt = MathUtils.interpolatedSample(tsikPlusOne, t, tsikPlusOne
+ synthesisParams.unvoicedVoicedTrackTransitionInSeconds, aksiPlusOne, 0.0);
else
akt = MathUtils.interpolatedSample(tsik, t, tsikPlusOne, aksi, aksiPlusOne);
//
// Phase estimate
phasekt = phaseki + (phasekiPlusOne + MathUtils.TWOPI * Mk - phaseki) * (t - tsik) / (tsikPlusOne - tsik);
//
if (synthesisParams.overlappingHarmonicPartSynthesis) {
currentOverlapWinWgt = overlapWinWgt[n - Math.max(0, trackStartIndex)];
winOverlapWgt[k][n] += currentOverlapWinWgt;
} else
currentOverlapWinWgt = 1.0;
if (!isPrevTrackVoiced && n - trackStartIndex < transitionLen)
harmonicTracks[k][n] = currentOverlapWinWgt * halfTransitionWinLeft[n - trackStartIndex] * akt
* Math.cos(phasekt);
else if (!isNextTrackVoiced && trackEndIndex - n < transitionLen)
harmonicTracks[k][n] = currentOverlapWinWgt
* halfTransitionWinRight[transitionLen - (trackEndIndex - n) - 1] * akt * Math.cos(phasekt);
else
harmonicTracks[k][n] = currentOverlapWinWgt * akt * Math.cos(phasekt);
}
}
}
}
}
public double[] generateOutput(boolean pipeOutAllOutput) {
double[] output = null;
if (harmonicTracks != null) {
int k, n;
if (pipeOutAllOutput)
pipeOutEndIndex = harmonicPart.length;
output = new double[Math.min(pipeOutEndIndex, harmonicPart.length - 1) - pipeOutStartIndex + 1];
if (!synthesisParams.overlappingHarmonicPartSynthesis) {
for (k = 0; k < harmonicTracks.length; k++) {
// for (n=0; n<harmonicPart.length; n++)
for (n = pipeOutStartIndex; n <= Math.min(pipeOutEndIndex, harmonicPart.length - 1); n++) {
// harmonicPart[n] += harmonicTracks[k][n];
output[n - pipeOutStartIndex] += harmonicTracks[k][n];
}
}
} else {
for (k = 0; k < harmonicTracks.length; k++) {
// for (n=0; n<harmonicPart.length; n++)
for (n = pipeOutStartIndex; n <= Math.min(pipeOutEndIndex, harmonicPart.length - 1); n++) {
if (winOverlapWgt[k][n] > 0.0f) {
// harmonicPart[n] += harmonicTracks[k][n]/winOverlapWgt[k][n];
output[n - pipeOutStartIndex] += harmonicTracks[k][n] / winOverlapWgt[k][n];
} else {
// harmonicPart[n] += harmonicTracks[k][n];
output[n - pipeOutStartIndex] += harmonicTracks[k][n];
}
}
}
}
pipeOutStartIndex = pipeOutEndIndex + 1;
if (pipeOutAllOutput && referenceFile != null && FileUtils.exists(referenceFile)
&& synthesisParams.writeSeparateHarmonicTracksToOutputs) {
// Write separate tracks to output
AudioInputStream inputAudio = null;
try {
inputAudio = AudioSystem.getAudioInputStream(new File(referenceFile));
} catch (UnsupportedAudioFileException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
if (inputAudio != null) {
for (k = 0; k < harmonicTracks.length; k++) {
harmonicTracks[k] = MathUtils.divide(harmonicTracks[k], 32767.0);
DDSAudioInputStream outputAudio = new DDSAudioInputStream(
new BufferedDoubleDataSource(harmonicTracks[k]), inputAudio.getFormat());
String outFileName = StringUtils.getFolderName(referenceFile) + "harmonicTrack" + String.valueOf(k + 1)
+ ".wav";
try {
AudioSystem.write(outputAudio, AudioFileFormat.Type.WAVE, new File(outFileName));
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
}
//
}
return output;
}
}