/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved. Use is subject to license terms.
 *
 * Permission is hereby granted, free of charge, to use and distribute
 * this software and its documentation without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of this work, and to
 * permit persons to whom this work is furnished to do so, subject to
 * the following conditions:
 *
 * 1. The code must retain the above copyright notice, this list of
 *    conditions and the following disclaimer.
 * 2. Any modifications must be clearly marked as such.
 * 3. Original authors' names are not deleted.
 * 4. The authors' names are not used to endorse or promote products
 *    derived from this software without specific prior written
 *    permission.
 *
 * DFKI GMBH AND THE CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES WITH
 * REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL DFKI GMBH NOR THE
 * CONTRIBUTORS BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
 * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
 * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
 * THIS SOFTWARE.
 */
package marytts.signalproc.sinusoidal.hntm.synthesis;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;

import javax.sound.sampled.AudioFileFormat;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.UnsupportedAudioFileException;

import marytts.signalproc.analysis.RegularizedCepstrumEstimator;
import marytts.signalproc.analysis.RegularizedPostWarpedCepstrumEstimator;
import marytts.signalproc.analysis.RegularizedPreWarpedCepstrumEstimator;
import marytts.signalproc.sinusoidal.hntm.analysis.FrameNoisePartPseudoHarmonic;
import marytts.signalproc.sinusoidal.hntm.analysis.HntmAnalyzerParams;
import marytts.signalproc.sinusoidal.hntm.analysis.HntmSpeechSignal;
import marytts.signalproc.window.Window;
import marytts.util.data.BufferedDoubleDataSource;
import marytts.util.data.audio.DDSAudioInputStream;
import marytts.util.io.FileUtils;
import marytts.util.math.MathUtils;
import marytts.util.signal.SignalProcUtils;
import marytts.util.string.StringUtils;

/**
 * A pseudo-harmonic representation based synthesizer for the noise part.
 *
 * Reference: Stylianou, Y., 1996, "Harmonic plus Noise Models for Speech, combined with Statistical Methods, for Speech
 * and Speaker Modification", Ph.D. thesis, Ecole Nationale Supérieure des Télécommunications.
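 *
 * In outline, the noise band above each frame's maximum frequency of voicing is modeled as a sum of pseudo-harmonics of
 * analysisParams.noiseF0InHz: every track gets an amplitude sampled from the frame's noise spectral envelope and
 * interpolated linearly between frames, a random initial phase, and an end-of-segment phase advanced by
 * 2*pi*(k + shift)*noiseF0*(t[i+1] - t[i]) (Equation (3.55) in the reference).
 *
 * A minimal usage sketch (hypothetical variable names; the three inputs are assumed to come from a prior HNM analysis
 * pass, and passing null as the reference file skips the optional per-track debug output):
 *
 * <pre>
 * double[] noise = NoisePartPseudoHarmonicSynthesizer.synthesize(hnmSignal, analysisParams, synthesisParams, null);
 * </pre>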
 *
 * @author oytun.turk
 */
public class NoisePartPseudoHarmonicSynthesizer {

    // Pseudo-harmonics based noise generation for pseudo periods
    public static double[] synthesize(HntmSpeechSignal hnmSignal, HntmAnalyzerParams analysisParams,
            HntmSynthesizerParams synthesisParams, String referenceFile) {
        double[] noisePart = null;
        int i, k, n;
        double t; // Time in seconds
        double tsik = 0.0; // Synthesis time in seconds
        double tsikPlusOne = 0.0; // Synthesis time in seconds
        double trackStartInSeconds, trackEndInSeconds;
        int trackStartIndex, trackEndIndex;
        double akt;
        int numHarmonicsCurrentFrame, numHarmonicsPrevFrame, numHarmonicsNextFrame;
        int harmonicIndexShiftPrev, harmonicIndexShiftCurrent, harmonicIndexShiftNext;

        // Upper bound on the number of pseudo-harmonic tracks over all frames
        int maxNumHarmonics = 0;
        for (i = 0; i < hnmSignal.frames.length; i++) {
            if (hnmSignal.frames[i].maximumFrequencyOfVoicingInHz > 0.0f && hnmSignal.frames[i].n != null) {
                numHarmonicsCurrentFrame = (int) Math.floor(hnmSignal.samplingRateInHz / analysisParams.noiseF0InHz + 0.5);
                numHarmonicsCurrentFrame = Math.max(0, numHarmonicsCurrentFrame);
                if (numHarmonicsCurrentFrame > maxNumHarmonics)
                    maxNumHarmonics = numHarmonicsCurrentFrame;
            }
        }

        double aksi;
        double aksiPlusOne;
        float[] phasekis = null;
        float phasekiPlusOne;
        float phasekt = 0.0f;

        boolean isPrevNoised, isNoised, isNextNoised;
        boolean isTrackNoised, isNextTrackNoised, isPrevTrackNoised;

        // In fact, this should be prosody scaled length when you implement prosody modifications
        int outputLen = SignalProcUtils.time2sample(hnmSignal.originalDurationInSeconds, hnmSignal.samplingRateInHz);
        noisePart = new double[outputLen];
        Arrays.fill(noisePart, 0.0);

        // Individual noise tracks; they are summed into noisePart at the end and can optionally be written out separately
        double[][] noiseTracks = null;
        if (maxNumHarmonics > 0) {
            noiseTracks = new double[maxNumHarmonics][];
            for (k = 0; k < maxNumHarmonics; k++) {
                noiseTracks[k] = new double[outputLen];
                Arrays.fill(noiseTracks[k], 0.0);
            }

            // Random initial phases in (-pi, pi)
            phasekis = new float[maxNumHarmonics];
            for (k = 0; k < maxNumHarmonics; k++)
                phasekis[k] = (float) (MathUtils.TWOPI * (Math.random() - 0.5));
        }

        // Left half of a Hamming window, used to fade tracks in after an unvoiced-to-voiced transition
        int transitionLen = SignalProcUtils.time2sample(synthesisParams.unvoicedVoicedTrackTransitionInSeconds,
                hnmSignal.samplingRateInHz);
        Window transitionWin = Window.get(Window.HAMMING, transitionLen * 2);
        transitionWin.normalizePeakValue(1.0f);
        double[] halfTransitionWinLeft = transitionWin.getCoeffsLeftHalf();

        float halfFs = 0.5f * hnmSignal.samplingRateInHz; // a noise band can only exist below the Nyquist frequency

        for (i = 0; i < hnmSignal.frames.length; i++) {
            isPrevNoised = false;
            isNoised = false;
            isNextNoised = false;

            if (i > 0 && hnmSignal.frames[i - 1].n != null
                    && hnmSignal.frames[i - 1].maximumFrequencyOfVoicingInHz < halfFs
                    && ((FrameNoisePartPseudoHarmonic) hnmSignal.frames[i - 1].n).ceps != null)
                isPrevNoised = true;

            if (hnmSignal.frames[i].n != null && hnmSignal.frames[i].maximumFrequencyOfVoicingInHz < halfFs
                    && ((FrameNoisePartPseudoHarmonic) hnmSignal.frames[i].n).ceps != null)
                isNoised = true;

            if (i < hnmSignal.frames.length - 1 && hnmSignal.frames[i + 1].n != null
                    && hnmSignal.frames[i + 1].maximumFrequencyOfVoicingInHz < halfFs
                    && ((FrameNoisePartPseudoHarmonic) hnmSignal.frames[i + 1].n).ceps != null)
                isNextNoised = true;

            numHarmonicsPrevFrame = 0;
            numHarmonicsCurrentFrame = 0;
            numHarmonicsNextFrame = 0;
            harmonicIndexShiftPrev = 0;
            harmonicIndexShiftCurrent = 0;
            harmonicIndexShiftNext = 0;
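
            // Number of pseudo-harmonics that fit between the frame's maximum frequency of voicing and the sampling
            // rate, plus the multiple of noiseF0InHz at which the first one sits; both are rounded to the nearest
            // integer, and the index shift is kept >= 1 so that no noise track starts at 0 Hz.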
            if (isPrevNoised) {
                numHarmonicsPrevFrame = (int) Math
                        .floor((hnmSignal.samplingRateInHz - hnmSignal.frames[i - 1].maximumFrequencyOfVoicingInHz)
                                / analysisParams.noiseF0InHz + 0.5);
                numHarmonicsPrevFrame = Math.max(0, numHarmonicsPrevFrame);
                harmonicIndexShiftPrev = (int) Math
                        .floor(hnmSignal.frames[i - 1].maximumFrequencyOfVoicingInHz / analysisParams.noiseF0InHz + 0.5);
                harmonicIndexShiftPrev = Math.max(1, harmonicIndexShiftPrev);
            }

            if (isNoised) {
                numHarmonicsCurrentFrame = (int) Math
                        .floor((hnmSignal.samplingRateInHz - hnmSignal.frames[i].maximumFrequencyOfVoicingInHz)
                                / analysisParams.noiseF0InHz + 0.5);
                numHarmonicsCurrentFrame = Math.max(0, numHarmonicsCurrentFrame);
                harmonicIndexShiftCurrent = (int) Math
                        .floor(hnmSignal.frames[i].maximumFrequencyOfVoicingInHz / analysisParams.noiseF0InHz + 0.5);
                harmonicIndexShiftCurrent = Math.max(1, harmonicIndexShiftCurrent);
            } else if (!isNoised && isNextNoised) {
                // Borrow the counts from the next frame so that its tracks can fade in during this frame
                numHarmonicsCurrentFrame = (int) Math
                        .floor((hnmSignal.samplingRateInHz - hnmSignal.frames[i + 1].maximumFrequencyOfVoicingInHz)
                                / analysisParams.noiseF0InHz + 0.5);
                numHarmonicsCurrentFrame = Math.max(0, numHarmonicsCurrentFrame);
                harmonicIndexShiftCurrent = (int) Math
                        .floor(hnmSignal.frames[i + 1].maximumFrequencyOfVoicingInHz / analysisParams.noiseF0InHz + 0.5);
                harmonicIndexShiftCurrent = Math.max(1, harmonicIndexShiftCurrent);
            }

            if (isNextNoised) {
                numHarmonicsNextFrame = (int) Math
                        .floor((hnmSignal.samplingRateInHz - hnmSignal.frames[i + 1].maximumFrequencyOfVoicingInHz)
                                / analysisParams.noiseF0InHz + 0.5);
                numHarmonicsNextFrame = Math.max(0, numHarmonicsNextFrame);
                harmonicIndexShiftNext = (int) Math
                        .floor(hnmSignal.frames[i + 1].maximumFrequencyOfVoicingInHz / analysisParams.noiseF0InHz + 0.5);
                harmonicIndexShiftNext = Math.max(1, harmonicIndexShiftNext);
            }

            for (k = 0; k < numHarmonicsCurrentFrame; k++) {
                aksi = 0.0;
                aksiPlusOne = 0.0;
                phasekiPlusOne = 0.0f;

                isPrevTrackNoised = false;
                isTrackNoised = false;
                isNextTrackNoised = false;

                if (i > 0 && hnmSignal.frames[i - 1].n != null && numHarmonicsPrevFrame > k)
                    isPrevTrackNoised = true;
                if (hnmSignal.frames[i].n != null && numHarmonicsCurrentFrame > k)
                    isTrackNoised = true;
                if (i < hnmSignal.frames.length - 1 && hnmSignal.frames[i + 1].n != null && numHarmonicsNextFrame > k)
                    isNextTrackNoised = true;

                // Each track segment spans the interval between this frame's and the next frame's analysis times
                tsik = hnmSignal.frames[i].tAnalysisInSeconds;

                if (i == 0)
                    trackStartInSeconds = 0.0;
                else
                    trackStartInSeconds = tsik;

                if (i == hnmSignal.frames.length - 1)
                    tsikPlusOne = hnmSignal.originalDurationInSeconds;
                else
                    tsikPlusOne = hnmSignal.frames[i + 1].tAnalysisInSeconds;

                trackEndInSeconds = tsikPlusOne;

                trackStartIndex = SignalProcUtils.time2sample(trackStartInSeconds, hnmSignal.samplingRateInHz);
                trackEndIndex = SignalProcUtils.time2sample(trackEndInSeconds, hnmSignal.samplingRateInHz);

                if (isTrackNoised && trackEndIndex - trackStartIndex + 1 > 0) {
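                    // Amplitudes: sample the track amplitude from the frame's regularized cepstrum envelope at the
                    // track frequency (k + shift) * noiseF0InHz, with the same frequency-warping variant (pre-Bark or
                    // post-Mel) that the analysis used; or, if configured, reuse the stored amplitudes directly.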
                    if (isTrackNoised) {
                        if (!analysisParams.useNoiseAmplitudesDirectly) {
                            if (analysisParams.regularizedCepstrumWarpingMethod == RegularizedCepstrumEstimator.REGULARIZED_CEPSTRUM_WITH_PRE_BARK_WARPING)
                                aksi = RegularizedPreWarpedCepstrumEstimator.cepstrum2linearSpectrumValue(
                                        ((FrameNoisePartPseudoHarmonic) hnmSignal.frames[i].n).ceps,
                                        (k + harmonicIndexShiftCurrent) * analysisParams.noiseF0InHz,
                                        hnmSignal.samplingRateInHz);
                            else if (analysisParams.regularizedCepstrumWarpingMethod == RegularizedCepstrumEstimator.REGULARIZED_CEPSTRUM_WITH_POST_MEL_WARPING)
                                aksi = RegularizedPostWarpedCepstrumEstimator.cepstrum2linearSpectrumValue(
                                        ((FrameNoisePartPseudoHarmonic) hnmSignal.frames[i].n).ceps,
                                        (k + harmonicIndexShiftCurrent) * analysisParams.noiseF0InHz,
                                        hnmSignal.samplingRateInHz);
                        } else {
                            // Use amplitudes directly, without the cepstrum method
                            if (k < ((FrameNoisePartPseudoHarmonic) hnmSignal.frames[i].n).ceps.length)
                                aksi = ((FrameNoisePartPseudoHarmonic) hnmSignal.frames[i].n).ceps[k];
                            else
                                aksi = 0.0;
                        }
                    } else
                        aksi = 0.0;

                    if (isNextTrackNoised) {
                        if (!analysisParams.useNoiseAmplitudesDirectly) {
                            if (analysisParams.regularizedCepstrumWarpingMethod == RegularizedCepstrumEstimator.REGULARIZED_CEPSTRUM_WITH_PRE_BARK_WARPING)
                                aksiPlusOne = RegularizedPreWarpedCepstrumEstimator.cepstrum2linearSpectrumValue(
                                        ((FrameNoisePartPseudoHarmonic) hnmSignal.frames[i + 1].n).ceps,
                                        (k + harmonicIndexShiftNext) * analysisParams.noiseF0InHz,
                                        hnmSignal.samplingRateInHz);
                            else if (analysisParams.regularizedCepstrumWarpingMethod == RegularizedCepstrumEstimator.REGULARIZED_CEPSTRUM_WITH_POST_MEL_WARPING)
                                aksiPlusOne = RegularizedPostWarpedCepstrumEstimator.cepstrum2linearSpectrumValue(
                                        ((FrameNoisePartPseudoHarmonic) hnmSignal.frames[i + 1].n).ceps,
                                        (k + harmonicIndexShiftNext) * analysisParams.noiseF0InHz,
                                        hnmSignal.samplingRateInHz);
                        } else {
                            // Use amplitudes directly, without the cepstrum method
                            if (k < ((FrameNoisePartPseudoHarmonic) hnmSignal.frames[i + 1].n).ceps.length)
                                aksiPlusOne = ((FrameNoisePartPseudoHarmonic) hnmSignal.frames[i + 1].n).ceps[k];
                            else
                                aksiPlusOne = 0.0;
                        }
                    } else
                        aksiPlusOne = 0.0;

                    // Phases: a fresh random phase per segment; the end-of-segment phase follows Equation (3.55)
                    phasekis[k] = (float) (MathUtils.TWOPI * (Math.random() - 0.5));
                    phasekiPlusOne = (float) (phasekis[k] + (k + harmonicIndexShiftCurrent) * MathUtils.TWOPI
                            * analysisParams.noiseF0InHz * (tsikPlusOne - tsik)); // Equation (3.55)

                    // Start earlier if this track is fading in, so the transition window has room
                    if (!isPrevTrackNoised)
                        trackStartIndex = Math.max(0, trackStartIndex - transitionLen);

                    for (n = trackStartIndex; n <= Math.min(trackEndIndex, outputLen - 1); n++) {
                        t = SignalProcUtils.sample2time(n, hnmSignal.samplingRateInHz);

                        // Amplitude estimate
                        akt = MathUtils.interpolatedSample(tsik, t, tsikPlusOne, aksi, aksiPlusOne);

                        // Phase estimate
                        phasekt = (float) (phasekiPlusOne * (t - tsik) / (tsikPlusOne - tsik));

                        if (!isPrevTrackNoised && n - trackStartIndex < transitionLen)
                            noiseTracks[k][n] = halfTransitionWinLeft[n - trackStartIndex] * akt * Math.cos(phasekt);
                        else
                            noiseTracks[k][n] = akt * Math.cos(phasekt);
                    }

                    phasekis[k] = phasekiPlusOne;
                }
            }
        }

        if (noiseTracks != null) {
            // Sum the individual tracks into the noise part
            for (k = 0; k < noiseTracks.length; k++) {
                for (n = 0; n < noisePart.length; n++)
                    noisePart[n] += noiseTracks[k][n];
            }
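
            // Optionally write each pseudo-harmonic noise track to its own wav file next to the reference file,
            // reusing the reference file's audio format; this is a debugging aid and is skipped when no valid
            // reference file is given.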
            if (referenceFile != null && FileUtils.exists(referenceFile)
                    && synthesisParams.writeSeparateHarmonicTracksToOutputs) {
                AudioInputStream inputAudio = null;
                try {
                    inputAudio = AudioSystem.getAudioInputStream(new File(referenceFile));
                } catch (UnsupportedAudioFileException e) {
                    e.printStackTrace(); // track output is optional, so log and continue
                } catch (IOException e) {
                    e.printStackTrace(); // track output is optional, so log and continue
                }

                if (inputAudio != null) {
                    for (k = 0; k < noiseTracks.length; k++) {
                        noiseTracks[k] = MathUtils.divide(noiseTracks[k], 32767.0);
                        DDSAudioInputStream outputAudio = new DDSAudioInputStream(
                                new BufferedDoubleDataSource(noiseTracks[k]), inputAudio.getFormat());
                        String outFileName = StringUtils.getFolderName(referenceFile) + "noiseTrack"
                                + String.valueOf(k + 1) + ".wav";
                        try {
                            AudioSystem.write(outputAudio, AudioFileFormat.Type.WAVE, new File(outFileName));
                        } catch (IOException e) {
                            e.printStackTrace(); // skip this track on write failure
                        }
                    }
                }
            }
        }

        return noisePart;
    }
}