/** * Copyright 2007 DFKI GmbH. * All Rights Reserved. Use is subject to license terms. * * This file is part of MARY TTS. * * MARY TTS is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, version 3 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package marytts.signalproc.sinusoidal.hntm.analysis.pitch; import java.io.File; import java.io.IOException; import java.util.Arrays; import javax.sound.sampled.AudioInputStream; import javax.sound.sampled.AudioSystem; import javax.sound.sampled.UnsupportedAudioFileException; import marytts.signalproc.window.Window; import marytts.util.data.audio.AudioDoubleDataSource; import marytts.util.display.DisplayUtils; import marytts.util.math.ArrayUtils; import marytts.util.math.ComplexArray; import marytts.util.math.FFT; import marytts.util.math.FFTMixedRadix; import marytts.util.math.MathUtils; import marytts.util.signal.SignalProcUtils; /** * Initial pitch, voicing, maximum frequency of voicing, and refined pitch estimation as described in: * * Y. Stylianou (1996) “A pitch and maximum voiced frequency estimation technique adapted to harmonic models of speech,” IEEE * Nordic Signal Processing Symposium, Helsinki, Finland, Sept. 1996. * * @author Oytun Türk */ public class HnmPitchVoicingAnalyzer { public HnmPitchVoicingAnalyzer() { } public static int getDefaultFFTSize(int samplingRate) { if (samplingRate < 10000) return 2048; else if (samplingRate < 20000) return 4096; else return 8192; } public static float[] estimateInitialPitch(double[] x, int samplingRate, float f0MinInHz, float f0MaxInHz, int windowType, HnmPitchVoicingAnalyzerParams params) { int PMax = (int) Math.floor(samplingRate / f0MinInHz + 0.5); int PMin = (int) Math.floor(samplingRate / f0MaxInHz + 0.5); int ws = SignalProcUtils.time2sample(params.mvfAnalysisWindowSizeInSeconds, samplingRate); ws = Math.max(ws, (int) Math.floor(params.numPeriodsInitialPitchEstimation * PMin + 0.5)); int ss = (int) Math.floor(params.mvfAnalysisSkipSizeInSeconds * samplingRate + 0.5); int numfrm = (int) Math.floor(((double) x.length - ws) / ss + 0.5); int numCandidates = PMax - PMin + 1; double[] E = new double[numCandidates]; int P; int i, t, l, k; double term1, term2, term3, r; double[] frm = new double[ws]; Window win = Window.get(windowType, ws); double[] wgt2 = win.getCoeffs(); wgt2 = MathUtils.normalizeToSumUpTo(wgt2, 1.0); for (t = 0; t < ws; t++) wgt2[t] = wgt2[t] * wgt2[t]; double tmpSum = 0.0; for (t = 0; t < ws; t++) tmpSum += wgt2[t]; for (t = 0; t < ws; t++) wgt2[t] = wgt2[t] / tmpSum; double[] wgt4 = new double[ws]; System.arraycopy(wgt2, 0, wgt4, 0, ws); for (t = 0; t < ws; t++) wgt4[t] = wgt4[t] * wgt4[t]; double termTmp = 0.0; for (t = 0; t < ws; t++) termTmp += wgt4[t]; float[] initialF0s = new float[numfrm]; int minInd; int startIndex, endIndex; for (i = 0; i < numfrm; i++) { Arrays.fill(frm, 0.0); System.arraycopy(x, i * ss, frm, 0, Math.min(ws, x.length - i * ss)); startIndex = i * ss; endIndex = startIndex + Math.min(ws, x.length - i * ss) - 1; // MaryUtils.plot(frm); term1 = 0.0; for (t = 0; t < ws; t++) term1 += frm[t] * frm[t] * wgt2[t]; float lLim; for (P = PMin; P <= PMax; P++) { lLim = ((float) ws) / P; term2 = 0.0; for (l = (int) (Math.floor(-1.0f * lLim) + 1); l < lLim; l++) { r = 0.0; for (t = 0; t < ws; t++) { if (t + l * P >= 0 && t + l * P < ws) r += frm[t] * wgt2[t] * frm[t + l * P] * wgt2[t + l * P]; } term2 += r; } term2 *= P; term3 = 1.0 - P * termTmp; E[P - PMin] = (term1 - term2) / (term1 * term3); } // MaryUtils.plot(E); minInd = MathUtils.getMinIndex(E); if (E[minInd] < 0.5) initialF0s[i] = 1.0f / SignalProcUtils.sample2time(minInd + PMin, samplingRate); else initialF0s[i] = 0.0f; } return initialF0s; } public static float[] analyzeVoicings(double[] x, int samplingRate, float[] initialF0s, HnmPitchVoicingAnalyzerParams params, boolean isSilentAnalysis) { // First remove dc-bias and normalize signal energy // MaryUtils.plot(x); double[] xNorm = MathUtils.add(x, -1.0 * MathUtils.mean(x)); double xEn = SignalProcUtils.energy(x); xNorm = MathUtils.multiply(x, 500.0 / Math.sqrt(xEn / x.length)); // MaryUtils.plot(xNorm); // double durationInSeconds = SignalProcUtils.sample2time(x.length, samplingRate); int numfrm = (int) Math.floor((durationInSeconds - 0.5 * params.mvfAnalysisWindowSizeInSeconds) / params.mvfAnalysisSkipSizeInSeconds + 0.5); float[] mappedF0s = new float[numfrm]; double[] voicingErrors = new double[numfrm]; int ws = SignalProcUtils.time2sample(params.mvfAnalysisWindowSizeInSeconds, samplingRate); int ss = (int) Math.floor(params.mvfAnalysisSkipSizeInSeconds * samplingRate + 0.5); int startIndex, endIndex; boolean isVoiced; float[] maxFrequencyOfVoicings = new float[numfrm]; float currentTime; double[] frm = new double[ws]; Window w = Window.get(Window.HANNING, ws); int currentF0Index; int i; while (params.fftSize < ws) params.fftSize *= 2; int maxFreq = (int) (Math.floor(0.5 * params.fftSize + 0.5) + 1); ComplexArray Y = new ComplexArray(params.fftSize); double[][] allDBSpectra = new double[numfrm][maxFreq]; float prevMaxFreqVoicing, prevPrevMaxFreqVoicing; for (i = 0; i < numfrm; i++) { Arrays.fill(frm, 0.0); // System.arraycopy(xNorm, i*ss, frm, 0, Math.min(ws, x.length-i*ss)); if (i * ss + ws < x.length) System.arraycopy(xNorm, i * ss, frm, 0, ws); else System.arraycopy(xNorm, x.length - ws, frm, 0, ws); // Here is a trick to avoid zeros in the last frame frm = w.apply(frm, 0); startIndex = i * ss; endIndex = startIndex + Math.min(ws, x.length - i * ss) - 1; currentTime = i * params.mvfAnalysisSkipSizeInSeconds + 0.5f * params.mvfAnalysisWindowSizeInSeconds; currentF0Index = SignalProcUtils.time2frameIndex(currentTime, params.f0AnalysisWindowSizeInSeconds, params.f0AnalysisSkipSizeInSeconds); currentF0Index = MathUtils.CheckLimits(currentF0Index, 0, initialF0s.length - 1); mappedF0s[i] = initialF0s[currentF0Index]; Arrays.fill(Y.real, 0.0); Arrays.fill(Y.imag, 0.0); System.arraycopy(frm, 0, Y.real, 0, frm.length); // // Compute DFT if (MathUtils.isPowerOfTwo(params.fftSize)) FFT.transform(Y.real, Y.imag, false); else Y = FFTMixedRadix.fftComplex(Y); // // Compute magnitude spectrum in dB as peak frequency estimates tend to be more accurate double[] YAbs = MathUtils.abs(Y, 0, maxFreq - 1); double[] YAbsDB = MathUtils.amp2db(YAbs); allDBSpectra[i] = ArrayUtils.copy(YAbsDB); // // MaryUtils.plot(YAbsDB); isVoiced = (mappedF0s[i] > 10.0f) ? true : false; /* * isVoiced = false; if (mappedF0s[i]>10.0f) { voicingErrors[i] = estimateVoicingFromFrameSpectrum(YAbs, samplingRate, * mappedF0s[i], params.vuvSearchMinHarmonicMultiplier, params.vuvSearchMaxHarmonicMultiplier); * * if (voicingErrors[i]<-1.0) isVoiced = true; } */ VoicingAnalysisOutputData vo = null; if (isVoiced) { prevMaxFreqVoicing = 0.0f; prevPrevMaxFreqVoicing = 0.0f; if (i >= 1) prevMaxFreqVoicing = maxFrequencyOfVoicings[i - 1]; if (i >= 2) prevPrevMaxFreqVoicing = maxFrequencyOfVoicings[i - 2]; vo = estimateMaxFrequencyOfVoicingsFrame(YAbsDB, samplingRate, mappedF0s[i], isVoiced, prevMaxFreqVoicing, prevPrevMaxFreqVoicing, params, isSilentAnalysis); maxFrequencyOfVoicings[i] = vo.maxFreqOfVoicing; } else maxFrequencyOfVoicings[i] = 0.0f; if (!isSilentAnalysis) { if (isVoiced) System.out.println("Time=" + String.valueOf(currentTime) + " sec." + " f0=" + String.valueOf(mappedF0s[i]) + " Hz." + " Voiced" + " MaxVFreq=" + String.valueOf(maxFrequencyOfVoicings[i])); else System.out.println("Time=" + String.valueOf(currentTime) + " sec." + " f0=" + String.valueOf(mappedF0s[i]) + " Hz." + " Unvoiced" + " MaxVFreq=" + String.valueOf(maxFrequencyOfVoicings[i])); } } // MaryUtils.plot(maxFrequencyOfVoicings); maxFrequencyOfVoicings = smoothUsingPeaks(maxFrequencyOfVoicings); // MaryUtils.plot(maxFrequencyOfVoicings); maxFrequencyOfVoicings = smoothUsingFilters(maxFrequencyOfVoicings, params); // MaryUtils.plot(maxFrequencyOfVoicings); maxFrequencyOfVoicings = applyConstraints(maxFrequencyOfVoicings, mappedF0s, samplingRate, params); // MaryUtils.plot(maxFrequencyOfVoicings); // Arrays.fill(maxFrequencyOfVoicings, samplingRate*0.5f); // MaryUtils.plot(voicingErrors); // MaryUtils.plot(maxFrequencyOfVoicings); return maxFrequencyOfVoicings; } public static float[] smoothUsingFilters(float[] maxFrequencyOfVoicings, HnmPitchVoicingAnalyzerParams params) { for (int i = 0; i < params.numFilteringStages; i++) { // Smooth with a median filter if (params.medianFilterLength > 1) { maxFrequencyOfVoicings = SignalProcUtils.medianFilter(maxFrequencyOfVoicings, params.medianFilterLength); // maxFrequencyOfVoicings = SignalProcUtils.shift(maxFrequencyOfVoicings, // -1*(int)Math.floor(0.5*params.medianFilterLength+0.5)); } // Smooth with moving average filters if (params.movingAverageFilterLength > 1) { maxFrequencyOfVoicings = SignalProcUtils.meanFilter(maxFrequencyOfVoicings, params.movingAverageFilterLength); // maxFrequencyOfVoicings = SignalProcUtils.shift(maxFrequencyOfVoicings, // -1*(int)Math.floor(0.5*params.movingAverageFilterLength+0.5)); } } return maxFrequencyOfVoicings; // } public static float[] smoothUsingPeaks(float[] maxFrequencyOfVoicings) { int i, j; // Peak picking based smoothing here // Find segments int numSegments = 0; boolean bSegmentStarted = false; int startInd = -1; int endInd = -1; if (maxFrequencyOfVoicings[0] > 10.0 && maxFrequencyOfVoicings[1] > 10.0) { startInd = 0; bSegmentStarted = true; } for (i = 2; i < maxFrequencyOfVoicings.length; i++) { if (!bSegmentStarted) { if (maxFrequencyOfVoicings[i - 2] < 10.0 && maxFrequencyOfVoicings[i - 1] < 10.0 && maxFrequencyOfVoicings[i] > 10.0) { bSegmentStarted = true; startInd = i; } } else { if (maxFrequencyOfVoicings[i] < 10.0) { bSegmentStarted = false; endInd = i; if (endInd - startInd > 5) numSegments++; startInd = -1; endInd = -1; } } } if (numSegments > 0) { float[] tmpMaxFrequencyOfVoicings = ArrayUtils.copy(maxFrequencyOfVoicings); Arrays.fill(maxFrequencyOfVoicings, 0.0f); int[][] segmentInds = new int[numSegments][2]; int currentSegment = 0; bSegmentStarted = false; startInd = -1; endInd = -1; if (tmpMaxFrequencyOfVoicings[0] > 10.0 && tmpMaxFrequencyOfVoicings[1] > 10.0) { startInd = 0; bSegmentStarted = true; } for (i = 2; i < tmpMaxFrequencyOfVoicings.length; i++) { if (!bSegmentStarted) { if (tmpMaxFrequencyOfVoicings[i - 2] < 10.0 && tmpMaxFrequencyOfVoicings[i - 1] < 10.0 && tmpMaxFrequencyOfVoicings[i] > 10.0) { bSegmentStarted = true; startInd = i; } } else { if (tmpMaxFrequencyOfVoicings[i] < 10.0) { bSegmentStarted = false; endInd = i; if (endInd - startInd > 5) { segmentInds[currentSegment][0] = startInd; segmentInds[currentSegment][1] = endInd; } currentSegment++; startInd = -1; endInd = -1; if (currentSegment > numSegments - 1) break; } } } // Find peaks in each segment for (i = 0; i < numSegments; i++) { double[] tmpSegment = new double[segmentInds[i][1] - segmentInds[i][0] + 1]; for (j = segmentInds[i][0]; j <= segmentInds[i][1]; j++) tmpSegment[j - segmentInds[i][0]] = tmpMaxFrequencyOfVoicings[j]; int[] tmpPeakInds = MathUtils.getExtrema(tmpSegment, 3, 3, true); int[] peakInds = null; int peakIndsLen = 0; if (tmpPeakInds != null) { peakInds = new int[tmpPeakInds.length + 2]; peakInds[0] = 0; for (j = 0; j < tmpPeakInds.length; j++) peakInds[j + 1] = tmpPeakInds[j]; peakInds[peakInds.length - 1] = segmentInds[i][1] - segmentInds[i][0]; } // Interpolate peaks within each segment and assign final values to maxFrequencyOfVoicings if (peakInds != null) { for (j = 0; j < peakInds.length; j++) peakInds[j] += segmentInds[i][0]; double[] peakVals = new double[peakInds.length]; for (j = 0; j < peakInds.length; j++) peakVals[j] = tmpMaxFrequencyOfVoicings[peakInds[j]]; int[] inds = new int[segmentInds[i][1] - segmentInds[i][0] + 1]; for (j = segmentInds[i][0]; j <= segmentInds[i][1]; j++) inds[j - segmentInds[i][0]] = j; tmpSegment = MathUtils.interpolate_linear(peakInds, peakVals, inds); for (j = segmentInds[i][0]; j <= segmentInds[i][1]; j++) maxFrequencyOfVoicings[j] = (float) tmpSegment[j - segmentInds[i][0]]; } else { for (j = segmentInds[i][0]; j <= segmentInds[i][1]; j++) maxFrequencyOfVoicings[j] = tmpMaxFrequencyOfVoicings[j]; } } } // return maxFrequencyOfVoicings; } public static float[] applyConstraints(float[] maxFrequencyOfVoicings, float[] mappedF0s, int samplingRate, HnmPitchVoicingAnalyzerParams params) { int i; // MaryUtils.plot(maxFrequencyOfVoicings); for (i = 0; i < maxFrequencyOfVoicings.length; i++) { if (mappedF0s[i] < 10.0f) maxFrequencyOfVoicings[i] = 0.0f; } // MaryUtils.plot(maxFrequencyOfVoicings); for (i = 0; i < maxFrequencyOfVoicings.length; i++) { if (mappedF0s[i] < 10.0f) maxFrequencyOfVoicings[i] = 0.0f; else { maxFrequencyOfVoicings[i] = MathUtils.CheckLimits(maxFrequencyOfVoicings[i], params.minimumTotalHarmonics * mappedF0s[i] - 0.3f * mappedF0s[i], params.maximumTotalHarmonics * mappedF0s[i] + 0.3f * mappedF0s[i]); maxFrequencyOfVoicings[i] = MathUtils.CheckLimits(maxFrequencyOfVoicings[i], params.minimumVoicedFrequencyOfVoicing, params.maximumVoicedFrequencyOfVoicing); } } // MaryUtils.plot(maxFrequencyOfVoicings); maxFrequencyOfVoicings = MathUtils.add(maxFrequencyOfVoicings, params.maximumFrequencyOfVoicingFinalShift); // MaryUtils.plot(maxFrequencyOfVoicings); // Final check for voicings and number of harmonics constraints for voiced portions for (i = 0; i < maxFrequencyOfVoicings.length; i++) { if (mappedF0s[i] < 10.0f) maxFrequencyOfVoicings[i] = 0.0f; else maxFrequencyOfVoicings[i] = MathUtils.CheckLimits(maxFrequencyOfVoicings[i], 0.0f, 0.5f * samplingRate); } // MaryUtils.plot(maxFrequencyOfVoicings); for (i = 0; i < maxFrequencyOfVoicings.length; i++) { if (mappedF0s[i] < 10.0f) maxFrequencyOfVoicings[i] = 0.0f; } // MaryUtils.plot(maxFrequencyOfVoicings); return maxFrequencyOfVoicings; } public static VoicingAnalysisOutputData estimateMaxFrequencyOfVoicingsFrame(double[] absDBSpec, int samplingRate, float f0, boolean isVoiced, HnmPitchVoicingAnalyzerParams params) { return estimateMaxFrequencyOfVoicingsFrame(absDBSpec, samplingRate, f0, isVoiced, -1.0f, -1.0f, params, false); } public static VoicingAnalysisOutputData estimateMaxFrequencyOfVoicingsFrame(double[] absDBSpec, int samplingRate, float f0, boolean isVoiced, float prevMaxFreqVoicing, float prevPrevMaxFreqVoicing, HnmPitchVoicingAnalyzerParams params, boolean isSilentAnalysis) { // double[] ampSpec = MathUtils.db2amp(absDBSpec); VoicingAnalysisOutputData output = new VoicingAnalysisOutputData(); int i, n; output.maxFreqOfVoicing = 0.0f; // Means the spectrum is completely unvoiced if (!isVoiced) f0 = 100.0f; int maxFreqIndex = absDBSpec.length - 1; int numHarmonics = (int) Math.floor((0.5 * samplingRate - 1.5 * f0) / f0 + 0.5); // Leave some space at the end of // spectrum, i.e. 1.5*f0 to avoid very // narrow bands there int[] bandInds = new int[numHarmonics + 1]; // 0.5f0 1.5f0 2.5f0 ... (numHarmonics-0.5)f0 (numHarmonics+0.5)f0 for (i = 0; i < bandInds.length; i++) bandInds[i] = SignalProcUtils.freq2index((i + 0.5) * f0, samplingRate, maxFreqIndex); double[] voiceds = new double[numHarmonics]; double[] vals1 = new double[numHarmonics]; double[] vals2 = new double[numHarmonics]; double[] vals3 = new double[numHarmonics]; Arrays.fill(vals1, Double.NEGATIVE_INFINITY); Arrays.fill(vals2, Double.NEGATIVE_INFINITY); Arrays.fill(vals3, Double.NEGATIVE_INFINITY); Arrays.fill(voiceds, 0.0); int[] valleyInds = MathUtils.getExtrema(absDBSpec, 2, 2, false); int[] tmpPeakIndices = new int[numHarmonics]; double bandExtremaVal; int fcIndex; for (i = 0; i < numHarmonics; i++) { // Get local peak // int[] fcIndices = MathUtils.getExtrema(absDBSpec, 2, 2, true, bandInds[i], bandInds[i+1]); int neigh = (int) (Math.floor(params.neighsPercent / 100.0 * 0.5 * (bandInds[i + 1] - bandInds[i]) + 0.5)); if (neigh < 1) neigh = 1; int[] fcIndices = MathUtils.getExtrema(absDBSpec, 1, 1, true, bandInds[i], bandInds[i + 1]); if (fcIndices != null) { fcIndex = fcIndices[0]; bandExtremaVal = absDBSpec[fcIndex]; for (n = 1; n < fcIndices.length; n++) { if (absDBSpec[fcIndices[n]] > bandExtremaVal) { fcIndex = fcIndices[n]; bandExtremaVal = absDBSpec[fcIndex]; } } } else { fcIndex = MathUtils.getAbsMaxInd(absDBSpec, bandInds[i], bandInds[i + 1]); if (fcIndex == -1) fcIndex = (int) Math.floor(0.5 * (bandInds[i] + bandInds[i + 1]) + 0.5); } double fc = SignalProcUtils.index2freq(fcIndex, samplingRate, maxFreqIndex); // From db spec double Am = absDBSpec[fcIndex]; double Amc = computeAmc(absDBSpec, fcIndex, valleyInds); // // From linear spec // double Am = ampSpec[fcIndex]; // double Amc = computeAmc(ampSpec, fcIndex, valleyInds); // Search for other peaks and compute Ams and Amcs int startInd = SignalProcUtils.freq2index(fc - 0.5 * f0, samplingRate, maxFreqIndex); int endInd = SignalProcUtils.freq2index(fc + 0.5 * f0, samplingRate, maxFreqIndex); int[] peakInds = MathUtils.getExtrema(absDBSpec, 1, 1, true, startInd, endInd); // MaryUtils.plot(absDBSpec, startInd, endInd); double[] Ams = null; double[] Amcs = null; voiceds[i] = 0.0; double bandMeanVal; double bandMinVal; if (peakInds != null) { if (peakInds.length > 1) { int total = 0; for (n = 0; n < peakInds.length; n++) { if (peakInds[n] != fcIndex) total++; } Ams = new double[total]; Amcs = new double[total]; int counter = 0; for (n = 0; n < peakInds.length; n++) { if (peakInds[n] != fcIndex) { // From db spec Ams[counter] = absDBSpec[peakInds[n]]; Amcs[counter] = computeAmc(absDBSpec, peakInds[n], valleyInds); // // From linear spec // Ams[counter] = ampSpec[peakInds[n]]; // Amcs[counter] = computeAmc(ampSpec, peakInds[n], valleyInds); // counter++; } } } else // A very sharp single peak might exist in this range, check by comparing with range mean { // MaryUtils.plot(absDBSpec, startInd, endInd); // bandMeanVal = MathUtils.mean(absDBSpec, startInd, endInd); bandMinVal = absDBSpec[MathUtils.getMinIndex(absDBSpec, startInd, endInd)]; if (Am - bandMinVal > params.sharpPeakAmpDiffInDB) voiceds[i] = 1.0; } } // // Now do harmonic tests if (voiceds[i] != 1.0) { if (Amcs != null) { double meanAmcs = MathUtils.mean(Amcs); vals1[i] = Amc / meanAmcs; vals3[i] = (Math.abs(fc - (double) (i + 1) * f0) / ((double) (i + 1) * f0)); if (vals1[i] > params.cumulativeAmpThreshold && vals3[i] < (params.harmonicDeviationPercent / 100.0)) voiceds[i] = 1.0; } /* * else { vals3[i] = (Math.abs(fc-(double)(i+1)*f0)/((double)(i+1)*f0)); * * if (vals3[i]<(HntmAnalyzer.HARMONIC_DEVIATION_PERCENT/100.0)) voiceds[i]=1.0; } */ } if (voiceds[i] != 1.0) // if (voiceds[i]==1.0) { if (Ams != null) { double maxAms = MathUtils.max(Ams); vals2[i] = Am - maxAms; // val2 = (MathUtils.amp2db(Am)-MathUtils.amp2db(maxAms)); //in amp should be converted to db vals3[i] = (Math.abs(fc - (double) (i + 1) * f0) / ((double) (i + 1) * f0)); if (vals2[i] > params.maximumAmpThresholdInDB && vals3[i] < (params.harmonicDeviationPercent / 100.0)) voiceds[i] = 1.0; } /* * else { vals3[i] = (Math.abs(fc-(double)(i+1)*f0)/((double)(i+1)*f0)); * * if (vals3[i]<(HntmAnalyzer.HARMONIC_DEVIATION_PERCENT/100.0)) voiceds[i]=1.0; } */ } // // Save for sending peak indices to output tmpPeakIndices[i] = fcIndex; } /* * //Median filter voicing cut-off int numPoints = 3; voiceds = SignalProcUtils.medianFilter(voiceds, numPoints); * //Forward look int maxVoicedHarmonicBand = -1; for (i=0; i<voiceds.length-2; i++) { if (voiceds[i]==1.0 && * voiceds[i+1]==0.0 && voiceds[i+2]==0.0) { maxVoicedHarmonicBand=i; break; } } */ // Running average computer for voicing cut-off double[] runningMeans = new double[voiceds.length]; for (i = 0; i < voiceds.length; i++) runningMeans[i] = MathUtils.mean(voiceds, 0, i); int numPoints = 3; runningMeans = SignalProcUtils.meanFilter(runningMeans, numPoints); int maxVoicedHarmonicBand = -1; for (i = 0; i < runningMeans.length - 2; i++) { if (runningMeans[i] < params.runningMeanVoicingThreshold && runningMeans[i + 1] < params.runningMeanVoicingThreshold && runningMeans[i + 1] < params.runningMeanVoicingThreshold) { break; } else maxVoicedHarmonicBand = i + 2; } // // Get the max freq. of voicing considering the shift by filtering if (maxVoicedHarmonicBand > -1) output.maxFreqOfVoicing = (float) Math.min((maxVoicedHarmonicBand + 0.5) * f0 + 0.5 * numPoints * f0, 0.5 * samplingRate); else output.maxFreqOfVoicing = 0.0f; output.maxFreqOfVoicing = MathUtils.CheckLimits(output.maxFreqOfVoicing, 0.0f, 0.5f * samplingRate); // Put harmonic peak indices into output as well if (output.maxFreqOfVoicing > 0.0f) { int count = (int) Math.floor(output.maxFreqOfVoicing / f0 + 0.5); count = MathUtils.CheckLimits(count, 0, numHarmonics); if (count > 0) { output.peakIndices = new int[count]; System.arraycopy(tmpPeakIndices, 0, output.peakIndices, 0, count); } } // // String strDebug = ""; // for (i=0; i<voiceds.length; i++) // System.out.println(String.valueOf(voiceds[i]) + " " + String.valueOf(vals1[i]) + " " + String.valueOf(vals2[i]) + " " + // String.valueOf(vals3[i]) + " "); if (!isSilentAnalysis) System.out.println("Max freq of voicing=" + String.valueOf(output.maxFreqOfVoicing)); // MaryUtils.plot(absDBSpec); // MaryUtils.plot(runningMeans); if (prevMaxFreqVoicing > -1.0f) { if (prevPrevMaxFreqVoicing > -1.0f) output.maxFreqOfVoicing = 0.45f * output.maxFreqOfVoicing + 0.35f * prevMaxFreqVoicing + 0.2f * prevPrevMaxFreqVoicing; else output.maxFreqOfVoicing = 0.60f * output.maxFreqOfVoicing + 0.40f * prevMaxFreqVoicing; } return output; } private static double computeAmc(double[] spec, int fcIndex, int[] valleyInds) { double Amc = spec[fcIndex]; if (valleyInds != null) { // Find closest valley indices int vLeftInd = -1; int vRightInd = -1; int counter = 0; vLeftInd = 0; while (fcIndex > valleyInds[counter]) { vLeftInd = valleyInds[counter]; if (counter == valleyInds.length - 1) break; counter++; } counter = valleyInds.length - 1; vRightInd = spec.length - 1; while (valleyInds[counter] > fcIndex) { vRightInd = valleyInds[counter]; if (counter == 0) break; counter--; } for (int i = vLeftInd; i <= vRightInd; i++) { if (i != fcIndex) Amc += spec[i]; } } return Amc; } public static double estimateVoicingFromFrameSpectrum(double[] absSpec, int samplingRate, float f0, double vuvSearchMinHarmonicMultiplier, double vuvSearchMaxHarmonicMultiplier) { boolean isVoiced = false; int maxFreq = absSpec.length; int minFreqInd = SignalProcUtils.freq2index(vuvSearchMinHarmonicMultiplier * f0, samplingRate, maxFreq - 1); int maxFreqInd = SignalProcUtils.freq2index(vuvSearchMaxHarmonicMultiplier * f0, samplingRate, maxFreq - 1); int harmonicNoFirst = ((int) Math.floor(vuvSearchMinHarmonicMultiplier)) + 1; int harmonicNoLast = ((int) Math.floor(vuvSearchMaxHarmonicMultiplier)); int numHarmonicsInRange = harmonicNoLast - harmonicNoFirst + 1; int currentHarmonicInd; int i, j; int[] harmonicsInds = new int[numHarmonicsInRange]; for (i = harmonicNoFirst; i <= harmonicNoLast; i++) harmonicsInds[i - harmonicNoFirst] = SignalProcUtils.freq2index(i * f0, samplingRate, maxFreq - 1); double num = 0.0; double denum = 0.0; for (j = minFreqInd; j < harmonicsInds[0] - 2; j++) num += absSpec[j] * absSpec[j]; for (i = 0; i < numHarmonicsInRange - 1; i++) { for (j = harmonicsInds[i] + 1 + 2; j < harmonicsInds[i + 1] - 2; j++) num += absSpec[j] * absSpec[j]; } for (j = harmonicsInds[numHarmonicsInRange - 1] + 1 + 2; j <= maxFreqInd; j++) num += absSpec[j] * absSpec[j]; for (j = minFreqInd; j <= maxFreqInd; j++) denum += absSpec[j] * absSpec[j]; double E = num / denum; E = MathUtils.db(E); return E; } public static float[] estimateRefinedPitch(int fftSize, int samplingRateInHz, float leftNeighInHz, float rightNeighInHz, float searchStepInHz, float[] initialF0s, float[] maxFrequencyOfVoicings) { float[] f0s = new float[initialF0s.length]; for (int i = 0; i < initialF0s.length; i++) f0s[i] = estimateRefinedFramePitch(initialF0s[i], maxFrequencyOfVoicings[i], fftSize, samplingRateInHz, leftNeighInHz, rightNeighInHz, searchStepInHz); return f0s; } // Searches for a refined f0 value by searching the range [f0InHz-leftNeighInHz, f0InHz+rightNeighInHz] in steps // searchStepInHz // This is done by evaluating the error function // E(f0New) = sum_i=0^maxVoicedIndex |iInHz-i*f0New|^2 // for f0New=f0InHz-leftNeighInHz,f0InHz-leftNeighInHz+searchStepInHz,f0InHz-leftNeighInHz+2*searchStepInHz, // ...,f0InHz+rightNeighInHz // and finding the value of f0New that minimizes it public static float estimateRefinedFramePitch(float f0InHz, float maxFreqOfVoicingInHz, int fftSize, int samplingRateInHz, float leftNeighInHz, float rightNeighInHz, float searchStepInHz) { float refinedF0InHz = 0.0f; double E, Emin; int maxFreqIndex = fftSize / 2 + 1; int maxVoicedFreqInd = SignalProcUtils.freq2index(maxFreqOfVoicingInHz, samplingRateInHz, maxFreqIndex); float f0New; int i; float[] freqIndsInHz = new float[maxVoicedFreqInd]; for (i = 0; i < maxVoicedFreqInd; i++) freqIndsInHz[i] = (float) SignalProcUtils.index2freq(i, samplingRateInHz, maxFreqIndex - 1); Emin = Double.MAX_VALUE; refinedF0InHz = f0InHz; for (f0New = f0InHz - leftNeighInHz; f0New <= f0InHz + rightNeighInHz; f0New += searchStepInHz) { E = 0.0; for (i = 0; i < maxVoicedFreqInd; i++) E += Math.abs(freqIndsInHz[i] - i * f0New); if (E < Emin) { Emin = E; refinedF0InHz = f0New; } } return refinedF0InHz; } public static void main(String[] args) throws UnsupportedAudioFileException, IOException { AudioInputStream inputAudio = AudioSystem.getAudioInputStream(new File(args[0])); int samplingRate = (int) inputAudio.getFormat().getSampleRate(); AudioDoubleDataSource signal = new AudioDoubleDataSource(inputAudio); double[] x = signal.getAllData(); HnmPitchVoicingAnalyzerParams params = new HnmPitchVoicingAnalyzerParams(); params.mvfAnalysisWindowSizeInSeconds = 0.040f; params.mvfAnalysisSkipSizeInSeconds = 0.010f; int windowType = Window.BLACKMAN; float f0MinInHz = 60.0f; float f0MaxInHz = 500.0f; // Pitch refinement parameters float leftNeighInHz = 20.0f; float rightNeighInHz = 20.0f; float searchStepInHz = 0.01f; // params.fftSize = getDefaultFFTSize(samplingRate); float[] initialF0s = HnmPitchVoicingAnalyzer.estimateInitialPitch(x, samplingRate, f0MinInHz, f0MaxInHz, windowType, params); float[] maxFrequencyOfVoicings = HnmPitchVoicingAnalyzer.analyzeVoicings(x, samplingRate, initialF0s, params, false); float[] f0s = HnmPitchVoicingAnalyzer.estimateRefinedPitch(params.fftSize, samplingRate, leftNeighInHz, rightNeighInHz, searchStepInHz, initialF0s, maxFrequencyOfVoicings); for (int i = 0; i < f0s.length; i++) System.out.println(String.valueOf(i * params.mvfAnalysisSkipSizeInSeconds + 0.5f * params.mvfAnalysisWindowSizeInSeconds) + " sec. InitialF0=" + String.valueOf(initialF0s[i]) + " RefinedF0=" + String.valueOf(f0s[i])); DisplayUtils.plot(initialF0s); DisplayUtils.plot(f0s); } }