/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved. Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.signalproc.sinusoidal;

import marytts.signalproc.analysis.PitchMarks;
import marytts.util.math.MathUtils;
import marytts.util.signal.SignalProcUtils;

/**
 * Time and pitch scale modification of sinusoidal tracks.
 *
 * @author Oytun Türk
 */
public class TrackModifier {
    // Default skip size (in seconds) to be used in sinusoidal analysis, modification, and synthesis.
    // Note that lower skip sizes might be required in order to obtain better performance for large duration
    // modification factors, or to realize more accurate final target lengths, because the time scaling
    // resolution will only be as fine as the skip size.
    public static float DEFAULT_MODIFICATION_SKIP_SIZE = 0.005f;

    public static final int FROM_ORIGINAL = 1;
    public static final int FROM_RESAMPLED = 2;
    public static final int FROM_CEPSTRUM = 3;
    public static final int FROM_INTERPOLATED = 4; // This is only available for phase

    public static SinusoidalTracks modifyTimeScale(SinusoidalTracks trIn, double[] f0s, float f0_ss, float f0_ws,
            int[] pitchMarks, float[] voicings, float numPeriods, boolean isVoicingAdaptiveTimeScaling,
            float timeScalingVoicingThreshold, boolean isVoicingAdaptivePitchScaling, float tScale, int offset,
            int sysAmpModMethod, int sysPhaseModMethod) {
        float[] tScales = new float[1];
        float[] tScalesTimes = new float[1];
        tScales[0] = tScale;
        tScalesTimes[0] = 0.02f;

        return modify(trIn, f0s, f0_ss, f0_ws, pitchMarks, voicings, numPeriods, isVoicingAdaptiveTimeScaling,
                timeScalingVoicingThreshold, isVoicingAdaptivePitchScaling, tScales, tScalesTimes, null, null, offset,
                sysAmpModMethod, sysPhaseModMethod);
    }
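    /**
     * Applies joint time and pitch scale modification to a set of sinusoidal tracks. The parameter descriptions reflect
     * how the arguments are used in this method.
     *
     * @param trIn sinusoidal tracks from the analysis stage
     * @param f0s f0 contour of the original signal
     * @param f0_ss f0 analysis skip size, in seconds
     * @param f0_ws f0 analysis window size, in seconds
     * @param pitchMarks pitch mark sample indices of the original signal
     * @param voicings voicing probabilities per analysis frame, indexed via the closest pitch mark; may be null
     * @param numPeriods number of pitch periods used in analysis (not referenced in this method)
     * @param isVoicingAdaptiveTimeScaling if true, the effective time scale factor is reduced in poorly voiced regions
     * @param timeScalingVoicingThreshold voicing probability below which no time scaling is applied
     * @param isVoicingAdaptivePitchScaling if true, pitch scaling is disabled above a voicing-dependent frequency limit
     * @param tScales time scale factors
     * @param tScalesTimes time instants (seconds) at which the corresponding tScales apply; null means uniform spacing
     * @param pScales pitch scale factors; null means no pitch scaling
     * @param pScalesTimes time instants (seconds) at which the corresponding pScales apply; null means uniform spacing
     * @param offset offset passed on to pitch mark generation
     * @param sysAmpModMethod FROM_ORIGINAL, FROM_RESAMPLED, or FROM_CEPSTRUM: how the system amplitude is re-estimated
     * @param sysPhaseModMethod FROM_ORIGINAL, FROM_RESAMPLED, FROM_CEPSTRUM, or FROM_INTERPOLATED: how the system phase
     *            is re-estimated
     * @return the modified sinusoidal tracks
     */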
    public static SinusoidalTracks modify(SinusoidalTracks trIn, double[] f0s, float f0_ss, float f0_ws, int[] pitchMarks,
            float[] voicings, float numPeriods, boolean isVoicingAdaptiveTimeScaling, float timeScalingVoicingThreshold,
            boolean isVoicingAdaptivePitchScaling, float[] tScales, float[] tScalesTimes, float[] pScales,
            float[] pScalesTimes, int offset, int sysAmpModMethod, int sysPhaseModMethod) {
        int i, j, lShift;

        if (tScalesTimes == null) {
            tScalesTimes = new float[tScales.length];
            for (i = 0; i < tScales.length; i++)
                tScalesTimes[i] = (float) ((i + 0.5) / tScales.length * trIn.origDur);
        }

        if (pScales == null) {
            // No pitch scaling requested (e.g. when called from modifyTimeScale): use a neutral scale factor
            // to avoid a NullPointerException below
            pScales = new float[1];
            pScales[0] = 1.0f;
        }

        if (pScalesTimes == null) {
            pScalesTimes = new float[pScales.length];
            for (i = 0; i < pScales.length; i++)
                pScalesTimes[i] = (float) ((i + 0.5) / pScales.length * trIn.origDur);
        }

        // Pitch scale pitch contour
        double[] f0sMod = SignalProcUtils.pitchScalePitchContour(f0s, f0_ws, f0_ss, pScales, pScalesTimes);

        // Time scale pitch contour
        f0sMod = SignalProcUtils.timeScalePitchContour(f0sMod, f0_ws, f0_ss, tScales, tScalesTimes);

        float maxDur = SignalProcUtils.timeScaledTime(trIn.origDur, tScales, tScalesTimes);

        // Find modified onsets
        PitchMarks pmMod = SignalProcUtils.pitchContour2pitchMarks(f0sMod, trIn.fs, (int) Math.floor(maxDur * trIn.fs + 0.5),
                f0_ws, f0_ss, false, offset);

        float tScaleCurrent;
        float pScaleCurrent;

        float pVoicing;

        float bandwidth = (float) (0.5f * MathUtils.TWOPI);

        float excPhase, excPhaseMod;
        float prevExcPhase, prevExcPhaseMod;
        float sysPhase, sysPhaseMod;
        float sysPhaseModReal;
        float sysPhaseModImag;
        float excAmp, excAmpMod;
        float sysAmp, sysAmpMod;
        float freq, freqMod;

        int closestInd;
        int closestIndMod;
        int sysTimeInd, sysFreqInd, sysFreqIndMod;
        double sysFreqIndDouble;
        int currentInd;

        int n0, n0Mod, n0Prev, n0ModPrev;
        int Pm;
        int J, JMod;
        int tempIndex;

        int middleAnalysisSample;
        int prevMiddleAnalysisSample;

        float middleSynthesisTime;
        int middleSynthesisSample;
        int prevMiddleSynthesisSample;

        float maxFreqOfVoicingInHz;
        float freqInHz;
        int maxFreqInd;

        SinusoidalTracks trMod = null;
        int trackSt, trackEn;

        boolean bSingleTrackTest = false;
        // boolean bSingleTrackTest = true;

        if (bSingleTrackTest) {
            trackSt = 7;
            trackEn = 7;
            trMod = new SinusoidalTracks(1, trIn.fs);
        } else {
            trackSt = 0;
            trackEn = trIn.totalTracks - 1;
            trMod = new SinusoidalTracks(trIn);
        }

        prevExcPhase = 0.0f;
        prevExcPhaseMod = 0.0f;
        prevMiddleAnalysisSample = 0;
        prevMiddleSynthesisSample = 0;

        float trackMeanFreqInHz, trackMeanFreqInRadians;

        for (i = trackSt; i <= trackEn; i++) {
            if (bSingleTrackTest)
                trMod.add(trIn.tracks[i]);

            n0Prev = 0;
            n0ModPrev = 0;

            trackMeanFreqInRadians = MathUtils.mean(trIn.tracks[i].freqs);
            trackMeanFreqInHz = SignalProcUtils.radian2hz(trackMeanFreqInRadians, trIn.fs);

            for (j = 0; j < trIn.tracks[i].totalSins; j++) {
                if (!bSingleTrackTest)
                    currentInd = i;
                else
                    currentInd = 0;

                if (trIn.tracks[i].states[j] == SinusoidalTrack.ACTIVE || trIn.tracks[i].states[j] == SinusoidalTrack.TURNED_OFF) {
                    middleAnalysisSample = SignalProcUtils.time2sample(trIn.tracks[i].times[j], trIn.fs);

                    closestInd = MathUtils.findClosest(pitchMarks, middleAnalysisSample);
                    sysTimeInd = MathUtils.findClosest(trIn.times, trIn.tracks[i].times[j]);

                    freqInHz = SignalProcUtils.radian2hz(trIn.tracks[i].freqs[j], trIn.fs);

                    int pScaleInd = MathUtils.findClosest(pScalesTimes, trIn.tracks[i].times[j]);
                    pScaleCurrent = pScales[pScaleInd];

                    maxFreqOfVoicingInHz = SignalProcUtils.radian2hz(trIn.tracks[i].maxFreqOfVoicings[j], trIn.fs); // Maximum frequency of voicing from HNM analysis
                    // maxFreqOfVoicingInHz = 3600.0f; // Manual

                    float newGain = 1.0f;
                    if (pScaleCurrent > 1.0f) {
                        if (freqInHz < 10.0f) { // Very low frequencies
                            pScaleCurrent = 1.0f;
                        } else if (freqInHz + (pScaleCurrent - 1.0) * trackMeanFreqInHz > maxFreqOfVoicingInHz) {
                            // Frequencies that should be noise-like
                            pScaleCurrent = 1.0f;
                            newGain = 0.0f; // This results in higher freqs not being synthesized
                        } else if (freqInHz > maxFreqOfVoicingInHz) {
                            // Do not include these components since they will interfere with pitch scale modified sines
                            pScaleCurrent = 1.0f;
                            newGain = 0.0f;
                        }
                    }
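                    // Illustrative example of the rule above (hypothetical numbers): with pScaleCurrent = 1.2,
                    // trackMeanFreqInHz = 2000 Hz and maxFreqOfVoicingInHz = 3500 Hz, a sinusoid at 3000 Hz would be
                    // shifted to roughly 3400 Hz and is kept, whereas one at 3300 Hz would land at roughly 3700 Hz,
                    // above the maximum frequency of voicing, so its pitch scale is reset to 1.0 and it is muted
                    // (newGain = 0.0f) so that it does not interfere with the noise-like band.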
                    // TODO: How about pscale < 1.0, how do we bridge the gap between voiced and unvoiced regions?
                    // This might not be necessary after the above implementation; check and remove as required, and also
                    // use isVoicingAdaptivePitchScaling above somehow

                    // Voicing dependent pitch scale modification factor estimation
                    if (voicings != null && isVoicingAdaptivePitchScaling) {
                        pVoicing = voicings[Math.min(closestInd, voicings.length - 1)];
                        float pitchScalingFreqThreshold = (float) (0.5f * pVoicing * MathUtils.TWOPI); // Frequency limit for pitch scaling needs some elaboration

                        if (trIn.tracks[i].freqs[j] > pitchScalingFreqThreshold)
                            pScaleCurrent = 1.0f;
                        else
                            pScaleCurrent = pScales[pScaleInd];
                    }

                    /*
                     * // Apply triangular decreasing of pitch scale in voiced/unvoiced transition region
                     * if (pScaleCurrent!=1.0f)
                     * {
                     *     float maxFreqOfVoicingInHz = SignalProcUtils.radian2Hz(trIn.tracks[i].maxFreqOfVoicings[j], trIn.fs);
                     *     float modFreqLowerCutoffInHz = maxFreqOfVoicingInHz; //3500.0f;
                     *     float modFreqUpperCutoffInHz = maxFreqOfVoicingInHz; //4500.0f;
                     *     if (freqInHz>=modFreqLowerCutoffInHz && freqInHz<modFreqLowerCutoffInHz)
                     *         pScaleCurrent = (freqInHz-modFreqLowerCutoffInHz)*(pScaleCurrent-1.0f)/(modFreqUpperCutoffInHz-modFreqLowerCutoffInHz)+1.0f;
                     *     else if (freqInHz>=modFreqLowerCutoffInHz)
                     *         pScaleCurrent = 1.0f;
                     * }
                     */

                    int tScaleInd = MathUtils.findClosest(tScalesTimes, trIn.tracks[i].times[j]);
                    tScaleCurrent = tScales[tScaleInd];

                    // Voicing dependent time scale modification factor estimation
                    if (voicings != null && isVoicingAdaptiveTimeScaling) {
                        pVoicing = voicings[Math.min(closestInd, voicings.length - 1)];

                        if (pVoicing < timeScalingVoicingThreshold)
                            tScaleCurrent = 1.0f;
                        else
                            tScaleCurrent = (1.0f - pVoicing) + pVoicing * tScales[tScaleInd];
                    }

                    sysFreqInd = SignalProcUtils.freq2index(freqInHz, trIn.fs, trIn.sysAmps.get(sysTimeInd).length - 1);
                    sysFreqIndDouble = SignalProcUtils.freq2indexDouble(freqInHz, trIn.fs, trIn.sysAmps.get(sysTimeInd).length - 1);
                    sysAmp = (float) (trIn.sysAmps.get(sysTimeInd)[sysFreqInd]);

                    // This is from van Santen et al.'s book, Chapter 5 (van Santen et al., Progress in Speech Synthesis)
                    // sysAmp = (float) SignalProcUtils.cepstrum2linearSpecAmp(trIn.sysCeps.get(sysTimeInd), trIn.tracks[i].freqs[j]);

                    excPhase = prevExcPhase + trIn.tracks[i].freqs[j] * (middleAnalysisSample - prevMiddleAnalysisSample);
                    sysPhase = trIn.tracks[i].phases[j] - excPhase;

                    excAmp = trIn.tracks[i].amps[j] / sysAmp;
                    // excAmp = 1.0f; // This should hold whenever an envelope that passes through the spectral peaks is used, i.e. SEEVOC

                    freq = trIn.tracks[i].freqs[j];
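                    // The track parameters are decomposed into an excitation part and a system (spectral envelope) part:
                    // sysAmp comes from the amplitude envelope, excAmp = trackAmp / sysAmp, and the excitation phase is
                    // accumulated as frequency times the sample hop. Below, the excitation phase is re-accumulated on the
                    // time-scaled (and possibly pitch-shifted) sample grid, while the system amplitude and phase are
                    // re-estimated from the envelope, in the spirit of the shape-invariant modification approach cited in
                    // the comments (Quatieri).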
                    // Estimate modified excitation phase
                    if (trIn.tracks[i].states[j] != SinusoidalTrack.TURNED_OFF)
                        middleSynthesisTime = SignalProcUtils.timeScaledTime(trIn.tracks[i].times[j], tScales, tScalesTimes);
                    else
                        middleSynthesisTime = trMod.tracks[currentInd].times[j - 1] + TrackGenerator.ZERO_AMP_SHIFT_IN_SECONDS;

                    middleSynthesisSample = (int) SignalProcUtils.time2sample(middleSynthesisTime, trIn.fs);
                    closestIndMod = MathUtils.findClosest(pmMod.pitchMarks, middleSynthesisSample);

                    excPhaseMod = prevExcPhaseMod + (freq + (pScaleCurrent - 1.0f) * trackMeanFreqInRadians)
                            * (middleSynthesisSample - prevMiddleSynthesisSample);

                    excAmpMod = excAmp;
                    // excAmpMod = 1.0f; // This should hold whenever an envelope that passes through the spectral peaks is used, i.e. SEEVOC

                    freqMod = (float) (freq + (pScaleCurrent - 1.0) * trackMeanFreqInRadians);
                    if (freqMod > Math.PI)
                        excAmpMod = 0.0f;

                    while (freqMod > MathUtils.TWOPI)
                        freqMod -= MathUtils.TWOPI;

                    sysFreqIndMod = sysFreqInd;
                    sysPhaseMod = sysPhase;
                    sysAmpMod = sysAmp;

                    if (pScaleCurrent != 1.0f) { // Modify system phase and amplitude according to the pitch scale modification factor
                        sysFreqIndMod = SignalProcUtils.freq2index(freqInHz + (pScaleCurrent - 1.0) * trackMeanFreqInHz, trIn.fs,
                                trIn.sysAmps.get(sysTimeInd).length - 1);
                        sysFreqIndMod = Math.min(sysFreqIndMod, trIn.sysAmps.get(sysTimeInd).length - 1);
                        sysFreqIndMod = Math.max(sysFreqIndMod, 0);

                        // System phase modification for pitch scaling
                        if (sysPhaseModMethod == FROM_ORIGINAL)
                            sysPhaseMod = sysPhase;
                        else if (sysPhaseModMethod == FROM_RESAMPLED)
                            sysPhaseMod = (float) (trIn.sysPhases.get(sysTimeInd)[sysFreqIndMod]); // This is wrong; create phase envelopes for the real and imaginary parts, and then resample
                        else if (sysPhaseModMethod == FROM_INTERPOLATED) {
                            if (freqInHz < 0.5 * trIn.fs - (pScaleCurrent - 1.0) * trackMeanFreqInHz - 50.0f) {
                                // This is from Quatieri's paper "Shape Invariant..."
                                tempIndex = (int) Math.floor(sysFreqIndDouble + SignalProcUtils.freq2index(
                                        (pScaleCurrent - 1.0) * trackMeanFreqInHz, trIn.fs, trIn.sysAmps.get(sysTimeInd).length - 1));

                                if (sysFreqInd < trIn.frameDfts.get(sysTimeInd).real.length - 1) {
                                    sysPhaseModReal = (float) MathUtils.interpolatedSample(tempIndex, sysFreqIndDouble, tempIndex + 1,
                                            trIn.frameDfts.get(sysTimeInd).real[tempIndex],
                                            trIn.frameDfts.get(sysTimeInd).real[tempIndex + 1]);
                                    sysPhaseModImag = (float) MathUtils.interpolatedSample(tempIndex, sysFreqIndDouble, tempIndex + 1,
                                            trIn.frameDfts.get(sysTimeInd).imag[tempIndex],
                                            trIn.frameDfts.get(sysTimeInd).imag[tempIndex + 1]);
                                } else {
                                    sysPhaseModReal = (float) MathUtils.interpolatedSample(tempIndex - 1, sysFreqIndDouble, tempIndex,
                                            trIn.frameDfts.get(sysTimeInd).real[tempIndex - 1],
                                            trIn.frameDfts.get(sysTimeInd).real[tempIndex]);
                                    sysPhaseModImag = (float) MathUtils.interpolatedSample(tempIndex - 1, sysFreqIndDouble, tempIndex,
                                            trIn.frameDfts.get(sysTimeInd).imag[tempIndex - 1],
                                            trIn.frameDfts.get(sysTimeInd).imag[tempIndex]);
                                }

                                sysPhaseMod = (float) Math.atan2(sysPhaseModImag, sysPhaseModReal);
                            } else
                                sysPhaseMod = sysPhase;
                        } else if (sysPhaseModMethod == FROM_CEPSTRUM) {
                            // This is from van Santen et al.'s book, Chapter 5 (van Santen et al., Progress in Speech Synthesis)
                            sysPhaseMod = (float) SignalProcUtils.cepstrum2minimumPhase(trIn.sysCeps.get(sysTimeInd),
                                    trIn.tracks[i].freqs[j] + (pScaleCurrent - 1.0f) * trackMeanFreqInRadians);
                        }
                        //

                        // System amplitude modification for pitch scaling
                        if (sysAmpModMethod == FROM_ORIGINAL) {
                            // This will make the vocal tract scaled in proportion to the pitch scale amount
                            sysAmpMod = sysAmp;
                        } else if (sysAmpModMethod == FROM_RESAMPLED) {
                            // This is from Quatieri's paper "Shape Invariant..."
                            // Get the system amplitude from the modified location
                            sysAmpMod = (float) (trIn.sysAmps.get(sysTimeInd)[sysFreqIndMod]);
                        } else if (sysAmpModMethod == FROM_CEPSTRUM) {
                            // This is from van Santen et al.'s book, Chapter 5 (van Santen et al., Progress in Speech Synthesis)
                            sysAmpMod = (float) SignalProcUtils.cepstrum2linearSpecAmp(trIn.sysCeps.get(sysTimeInd),
                                    pScaleCurrent * trIn.tracks[i].freqs[j]);
                        }
                        //

                        // MaryUtils.plot(trIn.sysAmps.get(sysTimeInd));
                    }

                    trMod.tracks[currentInd].amps[j] = newGain * excAmpMod * sysAmpMod;
                    trMod.tracks[currentInd].freqs[j] = freqMod;
                    trMod.tracks[currentInd].phases[j] = sysPhaseMod + excPhaseMod;
                    /*
                     * // Assign random phase to upper freq sines
                     * if (freqInHz > maxFreqOfVoicingInHz)
                     *     trMod.tracks[currentInd].phases[j] = (float) (MathUtils.TWOPI * (Math.random() - 0.5)); // Assign random phase to higher freq
                     * else
                     *     trMod.tracks[currentInd].phases[j] = sysPhaseMod + excPhaseMod;
                     */
                    trMod.tracks[currentInd].times[j] = middleSynthesisTime;

                    if (trMod.tracks[currentInd].times[j] > maxDur)
                        maxDur = trMod.tracks[currentInd].times[j];

                    if (j > 0 && trIn.tracks[i].states[j - 1] == SinusoidalTrack.TURNED_ON)
                        trMod.tracks[currentInd].times[j - 1] = Math.max(0.0f,
                                trMod.tracks[currentInd].times[j] - TrackGenerator.ZERO_AMP_SHIFT_IN_SECONDS);

                    prevExcPhase = excPhase;
                    prevExcPhaseMod = excPhaseMod;
                    prevMiddleSynthesisSample = middleSynthesisSample;
                    prevMiddleAnalysisSample = middleAnalysisSample;
                } else if (trIn.tracks[i].states[j] == SinusoidalTrack.TURNED_ON) {
                    prevMiddleAnalysisSample = SignalProcUtils.time2sample(trIn.tracks[i].times[j], trIn.fs);

                    middleSynthesisTime = SignalProcUtils.timeScaledTime(trIn.tracks[i].times[j], tScales, tScalesTimes);
                    prevMiddleSynthesisSample = (int) SignalProcUtils.time2sample(middleSynthesisTime, trIn.fs);

                    prevExcPhase = 0.0f;
                    prevExcPhaseMod = 0.0f;
                }
            }
        }

        trMod.origDur = maxDur;

        if (trMod != null) {
            System.out.println("--- Modified track statistics ---");
            trMod.getTrackStatistics();
            SinusoidalAnalyzer.getGrossStatistics(trMod);
        }

        return trMod;
    }
}
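/*
 * Usage sketch (illustrative only; the analysis objects and the numeric settings below are assumptions, not taken from
 * this file): given sinusoidal tracks, an f0 contour, pitch marks, and voicing probabilities from a preceding analysis
 * stage, a uniform 20% slow-down without pitch modification could be requested as
 *
 *     SinusoidalTracks slowed = TrackModifier.modifyTimeScale(tracks, f0s, 0.010f, 0.020f, pitchMarks, voicings, 2.5f,
 *             true, 0.5f, true, 1.2f, 0, TrackModifier.FROM_ORIGINAL, TrackModifier.FROM_ORIGINAL);
 */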