/**
* Copyright 2007 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.signalproc.sinusoidal;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.UnsupportedAudioFileException;
import marytts.signalproc.analysis.CepstrumSpeechAnalyser;
import marytts.signalproc.analysis.LpcAnalyser;
import marytts.signalproc.analysis.PitchReaderWriter;
import marytts.signalproc.analysis.RegularizedCepstrumEstimator;
import marytts.signalproc.analysis.RegularizedPostWarpedCepstrumEstimator;
import marytts.signalproc.analysis.RegularizedPreWarpedCepstrumEstimator;
import marytts.signalproc.analysis.SeevocAnalyser;
import marytts.signalproc.analysis.SpectrumWithPeakIndices;
import marytts.signalproc.sinusoidal.hntm.analysis.pitch.HnmPitchVoicingAnalyzer;
import marytts.signalproc.sinusoidal.hntm.analysis.pitch.VoicingAnalysisOutputData;
import marytts.signalproc.window.Window;
import marytts.util.data.audio.AudioDoubleDataSource;
import marytts.util.io.FileUtils;
import marytts.util.math.ArrayUtils;
import marytts.util.math.ComplexArray;
import marytts.util.math.FFT;
import marytts.util.math.FFTMixedRadix;
import marytts.util.math.MathUtils;
import marytts.util.signal.SignalProcUtils;
/**
* Sinusoidal Modeling Analysis Module Given a speech/audio signal, a set of amplitudes, frequencies and phases are estimated on a
* frame-by-frame basis Then, sinusoids that are close in frequency are grouped together to form sinusoidal tracks Optional
* amplitude and phase continuity constraints can be employed during track generation The implementation consists of ideas and
* algorithms from various papers as described in function headers
*
* References: Quatieri, T. F. Discrete-Time Speech Signal Processing: Principles and Practice. Prentice-Hall Inc. 2001. (Chapter
* 9 – Sinusoidal Analysis/Synthesis)
*
* R.J. McAulay and T.F. Quatieri, "Speech Analysis/Synthesis Based on a Sinusoidal Representation," IEEE Transactions on
* Acoustics, Speech and Signal Processing, vol. ASSP-34, no. 4, August 1986.
*
* @author Oytun Türk
*/
public class SinusoidalAnalyzer extends BaseSinusoidalAnalyzer {
public SinusoidalAnalysisParams params;
public SinusoidalAnalyzer(SinusoidalAnalysisParams paramsIn) {
params = new SinusoidalAnalysisParams(paramsIn);
}
//
// Returns number of neighbouring samples to search for a peak in the spectrum
// Set default search range for peak detection for different frequency intervals
// The aim is to eliminate some of the peaks, especially in the high frequency region to reduce phase mismatches at frame
// boundaries
// However, we should not set the range to broad as it is required to estimate a peak per 100 Hz on the average theoretically
// for an accurate representation
// We simply divide the spectrum into non-overlapping bands using Bark scale
// Groups of 5-equal Bark ranges are assigned the same peak search range
// The search ranges are increased as frequency increases, i.e. a higher freq peak candidate needs to be greater than a larger
// number of neighbours
// to be selected as a peak
public static int[] setNeighFreq(int fftSize, boolean bAdjustNeighFreqDependent, float fs) {
int maxFreq = (int) (Math.floor(0.5 * fftSize + 0.5) + 1);
int[] freqSampNeighs = new int[maxFreq];
int i;
if (!bAdjustNeighFreqDependent) {
for (i = 0; i < maxFreq; i++)
freqSampNeighs[i] = SinusoidalAnalysisParams.DEFAULT_FREQ_SAMP_NEIGHS_LOW;
} else {
float freq;
int[] vals = new int[6];
float maxSeparationInHz = 100.0f; // Maximum average separation allowed for peaks of noise
for (i = 0; i < vals.length; i++)
vals[i] = Math.min(i + 1, (int) Math.floor(0.5 * maxSeparationInHz / fs * fftSize + 0.5));
for (i = 0; i < maxFreq; i++) {
freq = ((float) i) / (maxFreq - 1.0f) * 0.5f * fs;
if (freq < 500.0f)
freqSampNeighs[i] = vals[0];
else if (freq < 1270.0f)
freqSampNeighs[i] = vals[1];
else if (freq < 2700.0f)
freqSampNeighs[i] = vals[2];
else if (freq < 6400.0f)
freqSampNeighs[i] = vals[3];
else if (freq < 15500.0f)
freqSampNeighs[i] = vals[4];
else
freqSampNeighs[i] = vals[5];
}
}
return freqSampNeighs;
}
//
/*
* Fixed rate analysis
*
* x: Speech/Audio signal to be analyzed
*/
public SinusoidalTracks analyzeFixedRate(double[] x) {
return analyzeFixedRate(x, SinusoidalAnalysisParams.DEFAULT_ANALYSIS_WINDOW_SIZE);
}
/*
* Fixed rate analysis
*
* x: Speech/Audio signal to be analyzed winSizeInSeconds: Integer array of sample indices for pitch period start instants
*/
public SinusoidalTracks analyzeFixedRate(double[] x, float winSizeInSeconds) {
return analyzeFixedRate(x, winSizeInSeconds, SinusoidalAnalysisParams.DEFAULT_ANALYSIS_SKIP_SIZE);
}
/*
* Fixed rate analysis
*
* x: Speech/Audio signal to be analyzed winSizeInSeconds: Integer array of sample indices for pitch period start instants
* skipSizeInSeconds: Number of pitch periods to be used in analysis
*/
public SinusoidalTracks analyzeFixedRate(double[] x, float winSizeInSeconds, float skipSizeInSeconds) {
return analyzeFixedRate(x, winSizeInSeconds, skipSizeInSeconds, SinusoidalAnalysisParams.DEFAULT_DELTA_IN_HZ);
}
/*
* Fixed rate analysis
*
* x: Speech/Audio signal to be analyzed winSizeInSeconds: Integer array of sample indices for pitch period start instants
* skipSizeInSeconds: Number of pitch periods to be used in analysis deltaInHz: Maximum allowed frequency deviance when
* creating sinusoidal tracks
*/
public SinusoidalTracks analyzeFixedRate(double[] x, float winSizeInSeconds, float skipSizeInSeconds, float deltaInHz) {
return analyzeFixedRate(x, winSizeInSeconds, skipSizeInSeconds, deltaInHz, SinusoidalAnalysisParams.LP_SPEC);
}
/*
* Fixed rate analysis
*
* x: Speech/Audio signal to be analyzed winSizeInSeconds: Integer array of sample indices for pitch period start instants
* skipSizeInSeconds: Number of pitch periods to be used in analysis deltaInHz: Maximum allowed frequency deviance when
* creating sinusoidal tracks spectralEnvelopeType: Spectral envelope estimation method with possible values NO_SPEC (do not
* compute spectral envelope) LP_SPEC (linear prediction based envelope) SEEVOC_SPEC (Spectral Envelope Estimation Vocoder
* based envelope) REGULARIZED_CEPS (Regularized cepstrum based envelope) See below for details...
*/
public SinusoidalTracks analyzeFixedRate(double[] x, float winSizeInSeconds, float skipSizeInSeconds, float deltaInHz,
int spectralEnvelopeType) {
return analyzeFixedRate(x, winSizeInSeconds, skipSizeInSeconds, deltaInHz, spectralEnvelopeType, null, -1.0f, -1.0f);
}
/*
* Fixed rate analysis
*
* x: Speech/Audio signal to be analyzed winSizeInSeconds: Integer array of sample indices for pitch period start instants
* skipSizeInSeconds: Number of pitch periods to be used in analysis deltaInHz: Maximum allowed frequency deviation when
* creating sinusoidal tracks spectralEnvelopeType: Spectral envelope estimation method with possible values NO_SPEC (do not
* compute spectral envelope) LP_SPEC (linear prediction based envelope) SEEVOC_SPEC (Spectral Envelope Estimation Vocoder
* based envelope) REGULARIZED_CEPS (Regularized cepstrum based envelope) f0s: f0 values in Hz (optional, required for SEEVOC
* based spectral envelope estimation. If not specified, SEEVOC based estimation will be performed at a fixed f0 value of
* 100.0 Hz ws_f0s: Window size in seconds used for f0 extraction (Functional only for SEEVOC based envelope estimation and
* when f0s are not null) ss_f0s: Skip size in seconds used for f0 extraction (Functional only for SEEVOC based envelope
* estimation and when f0s are not null)
*/
public SinusoidalTracks analyzeFixedRate(double[] x, float winSizeInSeconds, float skipSizeInSeconds, float deltaInHz,
int spectralEnvelopeType, double[] f0s, float ws_f0, float ss_f0) {
NonharmonicSinusoidalSpeechSignal sinSignal = extractSinusoidsFixedRate(x, winSizeInSeconds, skipSizeInSeconds,
deltaInHz, spectralEnvelopeType, f0s, ws_f0, ss_f0);
// Extract sinusoidal tracks
TrackGenerator tg = new TrackGenerator();
SinusoidalTracks sinTracks = tg.generateTracks(sinSignal, deltaInHz, params.fs);
if (sinTracks != null) {
sinTracks.getTrackStatistics(winSizeInSeconds, skipSizeInSeconds);
getGrossStatistics(sinTracks);
}
//
sinTracks.absMaxOriginal = (float) params.absMax;
sinTracks.totalEnergy = (float) params.totalEnergy;
return sinTracks;
}
public NonharmonicSinusoidalSpeechSignal extractSinusoidsFixedRate(double[] x, float winSizeInSeconds,
float skipSizeInSeconds, float deltaInHz) {
return extractSinusoidsFixedRate(x, winSizeInSeconds, skipSizeInSeconds, deltaInHz, SinusoidalAnalysisParams.LP_SPEC);
}
public NonharmonicSinusoidalSpeechSignal extractSinusoidsFixedRate(double[] x, float winSizeInSeconds,
float skipSizeInSeconds, float deltaInHz, int spectralEnvelopeType) {
return extractSinusoidsFixedRate(x, winSizeInSeconds, skipSizeInSeconds, deltaInHz, spectralEnvelopeType, null, -1.0f,
-1.0f);
}
// ws_f0: Window size in pitch extraction in seconds
// ss_f0: Skip size in pitch extraction in seconds
public NonharmonicSinusoidalSpeechSignal extractSinusoidsFixedRate(double[] x, float winSizeInSeconds,
float skipSizeInSeconds, float deltaInHz, int spectralEnvelopeType, double[] f0s, float ws_f0, float ss_f0) {
int i, j;
int f0Ind;
params.absMax = MathUtils.getAbsMax(x);
params.totalEnergy = SignalProcUtils.energy(x);
params.ws = (int) Math.floor(winSizeInSeconds * params.fs + 0.5);
if (params.ws % 2 == 0) // Always use an odd window size to have a zero-phase analysis window
params.ws++;
// System.out.println("ws=" + String.valueOf(params.ws) + " minWindowSize=" + String.valueOf(params.minWindowSize));
params.ws = Math.max(params.ws, params.minWindowSize);
params.ss = (int) Math.floor(skipSizeInSeconds * params.fs + 0.5);
params.win = Window.get(params.windowType, params.ws);
params.win.normalize(1.0f); // Normalize to sum up to unity
int totalFrm = (int) ((x.length - 0.5 * params.ws) / params.ss);
// Extract frames and analyze them
double[] frm = new double[params.ws];
NonharmonicSinusoidalSpeechSignal sinSignal = new NonharmonicSinusoidalSpeechSignal(totalFrm);
boolean[] isSinusoidNulls = new boolean[totalFrm];
Arrays.fill(isSinusoidNulls, false);
int totalNonNull = 0;
int peakCounter;
float currentTime;
boolean isOutputToTextFile = false;
for (i = 0; i < totalFrm; i++) {
Arrays.fill(frm, 0.0);
for (j = i * params.ss; j < Math.min(i * params.ss + params.ws, x.length); j++)
frm[j - i * params.ss] = x[j];
params.win.applyInline(frm, 0, params.ws);
currentTime = (float) ((i * params.ss + 0.5 * params.ws) / params.fs); // Middle of analysis frame
/*
* if (currentTime>0.500 && currentTime<0.520) isOutputToTextFile = true; else isOutputToTextFile = false;
*/
if (spectralEnvelopeType == SinusoidalAnalysisParams.SEEVOC_SPEC && f0s != null) {
f0Ind = (int) Math.floor((currentTime - 0.5 * ws_f0) / ss_f0 + 0.5);
f0Ind = Math.min(f0Ind, f0s.length - 1);
f0Ind = Math.max(0, f0Ind);
boolean isVoiced = false;
if (f0s[f0Ind] > 10.0f)
isVoiced = true;
sinSignal.framesSins[i] = analyze_frame(frm, isOutputToTextFile, spectralEnvelopeType, isVoiced,
(float) f0s[f0Ind], params);
} else
sinSignal.framesSins[i] = analyze_frame(frm, isOutputToTextFile, spectralEnvelopeType, true, params);
if (sinSignal.framesSins[i] != null) {
for (j = 0; j < sinSignal.framesSins[i].sinusoids.length; j++)
sinSignal.framesSins[i].sinusoids[j].frameIndex = i;
}
int peakCount = 0;
if (sinSignal.framesSins[i] == null)
isSinusoidNulls[i] = true;
else {
isSinusoidNulls[i] = false;
totalNonNull++;
peakCount = sinSignal.framesSins[i].sinusoids.length;
}
if (sinSignal.framesSins[i] != null) {
sinSignal.framesSins[i].time = currentTime;
System.out.println("Analysis complete at " + String.valueOf(sinSignal.framesSins[i].time) + "s. for frame "
+ String.valueOf(i + 1) + " of " + String.valueOf(totalFrm) + "(found " + String.valueOf(peakCount)
+ " peaks)");
}
}
//
NonharmonicSinusoidalSpeechSignal sinSignal2 = null;
if (totalNonNull > 0) {
// Collect non-null sinusoids only
sinSignal2 = new NonharmonicSinusoidalSpeechSignal(totalNonNull);
int ind = 0;
for (i = 0; i < totalFrm; i++) {
if (!isSinusoidNulls[i]) {
sinSignal2.framesSins[ind] = new NonharmonicSinusoidalSpeechFrame(sinSignal.framesSins[i]);
ind++;
if (ind > totalNonNull - 1)
break;
}
}
//
sinSignal2.originalDurationInSeconds = ((float) x.length) / params.fs;
}
return sinSignal2;
}
public static void getGrossStatistics(SinusoidalTracks sinTracks) {
int totalSins = 0;
int i, j;
for (i = 0; i < sinTracks.totalTracks; i++) {
for (j = 0; j < sinTracks.tracks[i].totalSins; j++) {
if (sinTracks.tracks[i].states[j] == SinusoidalTrack.ACTIVE)
totalSins++;
}
}
System.out.println("Total sinusoids to model this file = " + String.valueOf(totalSins));
}
public static NonharmonicSinusoidalSpeechFrame analyze_frame(double[] frm, boolean isOutputToTextFile, boolean isVoiced,
SinusoidalAnalysisParams params) {
return analyze_frame(frm, isOutputToTextFile, SinusoidalAnalysisParams.LP_SPEC, isVoiced, -1.0f, params);
}
public static NonharmonicSinusoidalSpeechFrame analyze_frame(double[] frm, boolean isOutputToTextFile,
int spectralEnvelopeType, boolean isVoiced, SinusoidalAnalysisParams params) {
if (spectralEnvelopeType == SinusoidalAnalysisParams.SEEVOC_SPEC)
return analyze_frame(frm, isOutputToTextFile, spectralEnvelopeType, isVoiced, 100.0f, params);
else
return analyze_frame(frm, isOutputToTextFile, spectralEnvelopeType, isVoiced, -1.0f, params);
}
public static NonharmonicSinusoidalSpeechFrame analyze_frame(double[] frm, boolean isOutputToTextFile,
int spectralEnvelopeType, boolean isVoiced, float f0, SinusoidalAnalysisParams params) {
return analyze_frame(frm, isOutputToTextFile, spectralEnvelopeType, isVoiced, f0, -1.0f, false, params, null);
}
// Extract sinusoidal model parameter from a windowed speech frame using the DFT peak-picking algorithm
// frm: Windowed speech frame
// spectralEnvelopeType: Desired spectral envelope (See above, i.e LP_SPEC, SEEVOC_SPEC, REGULARIZED_CEPS)
//
public static NonharmonicSinusoidalSpeechFrame analyze_frame(double[] windowedFrm, boolean isOutputToTextFile,
int spectralEnvelopeType, boolean isVoiced, float f0, float maxFreqOfVoicing, boolean bEstimateHNMVoicing,
SinusoidalAnalysisParams params, float[] initialPeakLocationsInHz) {
VoicingAnalysisOutputData vo = null;
float maxVoicingFreqInHz = 0.0f;
NonharmonicSinusoidalSpeechFrame frameSins = null;
if (params.fftSize < windowedFrm.length)
params.fftSize = windowedFrm.length;
if (params.fftSize % 2 == 1)
params.fftSize++;
int[] freqSampNeighs = setNeighFreq(params.fftSize, params.bAdjustNeighFreqDependent, params.fs);
int maxFreqInd = SignalProcUtils.halfSpectrumSize(params.fftSize) - 1;
ComplexArray frameDft = new ComplexArray(params.fftSize);
int i;
// Perform circular buffering as described in (Quatieri, 2001) to provide correct phase estimates
int midPoint = (int) Math.floor(0.5 * windowedFrm.length + 0.5);
System.arraycopy(windowedFrm, midPoint, frameDft.real, 0, windowedFrm.length - midPoint);
System.arraycopy(windowedFrm, 0, frameDft.real, params.fftSize - midPoint, midPoint);
// System.arraycopy(windowedFrm, 0, frameDft.real, 0, windowedFrm.length); //No circular buffer
//
// Take windowed frame derivative´s FFT for spectral reassignment
ComplexArray windowedFrameDerivativeFFT = null;
if (params.bSpectralReassignment) {
windowedFrameDerivativeFFT = new ComplexArray(params.fftSize);
double[] dfrm = new double[windowedFrm.length];
dfrm[0] = windowedFrm[0];
for (i = 1; i < windowedFrm.length; i++)
dfrm[i] = windowedFrm[i] - windowedFrm[i - 1];
System.arraycopy(dfrm, midPoint, windowedFrameDerivativeFFT.real, 0, dfrm.length - midPoint);
System.arraycopy(dfrm, 0, windowedFrameDerivativeFFT.real, params.fftSize - midPoint, midPoint);
}
// Compute DFT
if (MathUtils.isPowerOfTwo(params.fftSize))
FFT.transform(frameDft.real, frameDft.imag, false);
else
frameDft = FFTMixedRadix.fftComplex(frameDft);
//
// Compute magnitude spectrum in dB as peak frequency estimates tend to be more accurate
double[] frameDftAbs = MathUtils.abs(frameDft, 0, maxFreqInd);
double[] frameDftDB = MathUtils.amp2db(frameDftAbs);
//
int[] freqIndsLow = null;
int[] freqIndsHigh = null;
int[] freqInds = null;
int[] freqIndsLowest = null;
FreqIndicesWithAmplitudes fiwa;
double frmEn = SignalProcUtils.getEnergy(windowedFrm);
if (frmEn > SinusoidalAnalysisParams.MIN_ENERGY_TH) {
// The following lines are for testing purposes: Fixed frequency tracks will be created
// Set bManualPeakPickingTest = true to enable this test.
// Otherwise, the peaks will be automatically estimated (i.e. normal operation mode)
// below by int [] freqInds = MathUtils.getExtrema...
boolean bManualPeakPickingTest = false;
if (bManualPeakPickingTest) {
int w;
int numSins = 0;
int numNoises = 0;
float noiseRange1 = 0.0f;
float noiseRange2 = 0.0f;
float deltaNoise = 1.0f;
numSins = 4;
float[] sinFreqs = new float[numSins];
sinFreqs[0] = 180.0f;
sinFreqs[1] = 1580.0f;
sinFreqs[2] = 2580.0f;
sinFreqs[3] = 4780.0f;
/*
* noiseRange1=2000.0f; noiseRange2=4000.0f; deltaNoise=50.0f; numNoises = Math.max(1,
* (int)Math.floor((noiseRange2-noiseRange1)/deltaNoise+0.5));
*/
freqInds = new int[numSins + numNoises];
for (w = 0; w < numSins; w++)
freqInds[w] = SignalProcUtils.freq2index(sinFreqs[w], params.fs, maxFreqInd);
for (w = numSins; w < numSins + numNoises; w++)
freqInds[w] = SignalProcUtils.freq2index(noiseRange1 + (w - numSins) * deltaNoise, params.fs, maxFreqInd);
}
//
/*
* int startInd = 0; int endInd = maxFreq-1; if (startFreq>=0) startInd = SignalProcUtils.freq2index(startFreq, fs,
* maxFreq); if (endFreq>=0) endInd = SignalProcUtils.freq2index(endFreq, fs, maxFreq);
*/
// Vocal tract magnitude spectrum (linear) & phase analysis
double[] vocalTractSpec = null;
// SpectrumWithPeakIndices swpi = SeevocAnalyser.calcSpecEnvelopeLinear(frameDftDB, fs, f0); //Note that frameDftDB is
// in dB but the computed envelope is returned as linear
if (spectralEnvelopeType == SinusoidalAnalysisParams.LP_SPEC)
vocalTractSpec = LpcAnalyser.calcSpecFrameLinear(windowedFrm, params.LPOrder, params.fftSize);
else if (spectralEnvelopeType == SinusoidalAnalysisParams.SEEVOC_SPEC) {
SpectrumWithPeakIndices swpi = SeevocAnalyser.calcSpecEnvelopeLinear(frameDftDB, params.fs, f0); // Note that
// frameDftDB
// is in dB
// but the
// computed
// envelope is
// returned as
// linear
vocalTractSpec = swpi.spec;
} else if (spectralEnvelopeType == SinusoidalAnalysisParams.REGULARIZED_CEPS) {
SpectrumWithPeakIndices swpi = SeevocAnalyser.calcSpecEnvelopeLinear(frameDftDB, params.fs, f0); // Note that
// frameDftDB
// is in dB
// but the
// computed
// envelope is
// returned as
// linear
int cepsOrderPre = 13;
int cepsOrder = 19;
int numPeaks = swpi.indices.length;
double[] linearAmps = new double[numPeaks];
double[] freqsInHz = new double[numPeaks];
for (i = 0; i < numPeaks; i++) {
linearAmps[i] = frameDftAbs[swpi.indices[i]];
freqsInHz[i] = SignalProcUtils.index2freq(swpi.indices[i], params.fs, maxFreqInd);
}
if (params.regularizedCepstrumWarpingMethod == RegularizedCepstrumEstimator.REGULARIZED_CEPSTRUM_WITH_PRE_BARK_WARPING)
vocalTractSpec = RegularizedPreWarpedCepstrumEstimator.spectralEnvelopeLinear(linearAmps, freqsInHz,
params.fs, cepsOrder, params.fftSize);
else if (params.regularizedCepstrumWarpingMethod == RegularizedCepstrumEstimator.REGULARIZED_CEPSTRUM_WITH_POST_MEL_WARPING)
vocalTractSpec = RegularizedPostWarpedCepstrumEstimator.spectralEnvelopeLinear(linearAmps, freqsInHz,
params.fs, cepsOrderPre, cepsOrder, params.fftSize);
}
/*
* double[] vocalTractSpecDB = MathUtils.amp2db(vocalTractSpec); double[] excSpecAbs = new double[maxFreq]; for (i=0;
* i<maxFreq; i++) excSpecAbs[i] = frameDftAbs[i]/vocalTractSpec[i]; double[] excSpecDB =
* MathUtils.amp2db(excSpecAbs); MaryUtils.plot(frameDftDB); MaryUtils.plot(vocalTractSpecDB);
* MaryUtils.plot(excSpecDB);
*/
if (maxFreqOfVoicing < 0.0f) {
// Use abs dft in db for maximum frequency of voicing estimation
vo = HnmPitchVoicingAnalyzer.estimateMaxFrequencyOfVoicingsFrame(frameDftDB, params.fs, (float) f0, isVoiced,
params.hnmPitchVoicingAnalyzerParams);
maxVoicingFreqInHz = (float) Math.min(vo.maxFreqOfVoicing, SinusoidalAnalysisParams.MAX_VOICED_FREQ_IN_HZ); // From
// hnm,
// not
// working
// very
// properly
// yet
if (maxVoicingFreqInHz > 0.0f)
maxVoicingFreqInHz = (float) Math.max(maxVoicingFreqInHz, SinusoidalAnalysisParams.MIN_VOICED_FREQ_IN_HZ); // From
// hnm,
// not
// working
// very
// properly
// yet
} else
maxVoicingFreqInHz = maxFreqOfVoicing;
// maxVoicingFreqInHz = 3600.0f; //manual
float upperFreqSamplingStepInHz = 100.0f;
int startIndLow = SignalProcUtils.freq2index(params.startFreq, params.fs, maxFreqInd);
int endIndLow = SignalProcUtils.freq2index(maxVoicingFreqInHz, params.fs, maxFreqInd);
int startIndHigh = endIndLow + 1;
int endIndHigh = SignalProcUtils.freq2index(params.endFreq, params.fs, maxFreqInd);
int halfF0Ind = SignalProcUtils.freq2index(0.5 * f0, params.fs, maxFreqInd);
// Determine peak amplitude indices and the corresponding amplitudes, frequencies, and phases
if (initialPeakLocationsInHz != null) // Some initial peak locations given, search peaks around them
{
/*
* freqInds = new int[initialPeakLocationsInHz.length]; for (i=0; i<initialPeakLocationsInHz.length; i++) { int
* startInd =
* SignalProcUtils.freq2index(initialPeakLocationsInHz[i]*(1.0-HntmAnalyzer.HARMONIC_DEVIATION_PERCENT/100.0),
* params.fs, maxFreqInd); int endInd =
* SignalProcUtils.freq2index(initialPeakLocationsInHz[i]*(1.0+HntmAnalyzer.HARMONIC_DEVIATION_PERCENT/100.0),
* params.fs, maxFreqInd); freqInds[i] = MathUtils.getAbsMaxInd(frameDftDB, startInd, endInd); if (freqInds[i]<0)
* freqInds[i] = SignalProcUtils.freq2index(initialPeakLocationsInHz[i], params.fs, maxFreqInd); }
*/
// Return directly at specified locations without peak search
freqInds = new int[initialPeakLocationsInHz.length];
for (i = 0; i < initialPeakLocationsInHz.length; i++)
freqInds[i] = SignalProcUtils.freq2index(initialPeakLocationsInHz[i], params.fs, maxFreqInd);
//
} else if (!bManualPeakPickingTest) {
// Method A: Conventional method gets local extrema from db spectrum
freqInds = MathUtils.getExtrema(frameDftDB, freqSampNeighs, freqSampNeighs, true, startIndLow, endIndHigh,
SinusoidalAnalysisParams.MIN_PEAK_IN_DB_LOW);
if (!ArrayUtils.isOneOf(freqInds, 0)) // Add zeroth index since it is the dc bias
freqInds = ArrayUtils.appendToStart(freqInds, 0);
/*
* //Method B: Peak picking in lower freqs + Peak picking in upper freqs but with different parameters freqIndsLow
* = MathUtils.getExtrema(frameDftDB, DEFAULT_FREQ_SAMP_NEIGHS_LOW, DEFAULT_FREQ_SAMP_NEIGHS_LOW, true,
* startIndLow, endIndLow, MIN_PEAK_IN_DB_LOW); freqIndsHigh = MathUtils.getExtrema(frameDftDB,
* DEFAULT_FREQ_SAMP_NEIGHS_HIGH, DEFAULT_FREQ_SAMP_NEIGHS_HIGH, true, startIndHigh, endIndHigh,
* MIN_PEAK_IN_DB_HIGH); //
*/
/*
* //Method C: Peak picking in lower freqs + Uniform sampling in upper freqs freqIndsLow =
* MathUtils.getExtrema(frameDftDB, DEFAULT_FREQ_SAMP_NEIGHS_LOW, DEFAULT_FREQ_SAMP_NEIGHS_LOW, true, startIndLow,
* endIndLow, MIN_PEAK_IN_DB_LOW); int totalHighs =
* (int)(Math.floor(0.5*fs-maxVoicingFreqInHz)/upperFreqSamplingStepInHz); if (totalHighs>0) { freqIndsHigh = new
* int[totalHighs]; for (i=0; i<totalHighs; i++) freqIndsHigh[i] =
* SignalProcUtils.freq2index(maxVoicingFreqInHz+(i-0.5)*upperFreqSamplingStepInHz, fs, maxFreqInd); }
*/
/*
* //Method D: Lower peaks from HNM analysis, upper peaks with peak picking freqIndsLow = vo.peakIndices; if
* (freqIndsLow!=null) { if (!ArrayUtils.isOneOf(freqIndsLow, 0)) //Add zeroth index since it is the dc bias
* freqIndsLow = ArrayUtils.appendToStart(freqIndsLow, 0); } freqIndsHigh = MathUtils.getExtrema(frameDftDB,
* DEFAULT_FREQ_SAMP_NEIGHS_HIGH, DEFAULT_FREQ_SAMP_NEIGHS_HIGH, true, startIndHigh, endIndHigh,
* MIN_PEAK_IN_DB_HIGH); //
*
* int numInds = 0; if (freqIndsLow!=null) numInds += freqIndsLow.length; if (freqIndsHigh!=null) numInds +=
* freqIndsHigh.length;
*
* if (numInds>0) { freqInds=new int[numInds]; int currentInd = 0; if (freqIndsLow!=null) {
* System.arraycopy(freqIndsLow, 0, freqInds, 0, freqIndsLow.length); currentInd+=freqIndsLow.length; } if
* (freqIndsHigh!=null) System.arraycopy(freqIndsHigh, 0, freqInds, currentInd, freqIndsHigh.length); } //
*/
/*
* //A hybrid method: // Gets lower freq peaks from SEEVOC, i.e. harmonic peaks // and upper freq peaks from local
* extrema int[] maxInds = MathUtils.getExtrema(frameDftDB, freqSampNeighs, freqSampNeighs, true, startInd,
* endInd, MIN_PEAK_IN_DB);
*
* int cutoffInd = SignalProcUtils.freq2index(5.5*f0, fs, maxFreqInd); int totalFromHarmonics = 0; int
* totalFromExtrema = 0; for (i=0; i<swpi.indices.length; i++) { if (swpi.indices[i]<cutoffInd)
* totalFromHarmonics++; }
*
* for (i=0; i<maxInds.length; i++) { if (maxInds[i]>=cutoffInd) totalFromExtrema++; }
*
* freqInds = new int[totalFromHarmonics+totalFromExtrema]; int counter=0; for (i=0; i<swpi.indices.length; i++) {
* if (swpi.indices[i]<cutoffInd) freqInds[counter++] = swpi.indices[i]; }
*
* for (i=0; i<maxInds.length; i++) { if (maxInds[i]>=cutoffInd) freqInds[counter++] = maxInds[i]; }
*/
}
if (freqInds != null) {
int numFrameSinusoids = freqInds.length;
frameSins = new NonharmonicSinusoidalSpeechFrame(numFrameSinusoids);
// Perform parabola fitting around peak estimates to refine frequency estimation (Ref. - PARSHL, see the function
// for more details)
fiwa = new FreqIndicesWithAmplitudes(numFrameSinusoids);
float[] ampsDB = new float[numFrameSinusoids];
for (i = 0; i < numFrameSinusoids; i++)
ampsDB[i] = (float) frameDftDB[freqInds[i]];
if (params.bRefinePeakEstimatesParabola) {
fiwa = refinePeakEstimatesParabola(frameDftDB, freqInds);
/*
* //Need to check this function, does not work properly if (params.bRefinePeakEstimatesBias) fiwa =
* refinePeakEstimatesBias(frameDftDB, fiwa.freqIndsRefined, params.windowType);
*/
} else {
System.arraycopy(ampsDB, 0, fiwa.ampsRefined, 0, numFrameSinusoids);
for (i = 0; i < numFrameSinusoids; i++)
fiwa.freqIndsRefined[i] = (float) freqInds[i];
}
if (params.bSpectralReassignment)
fiwa.freqIndsRefined = refineBySpectralReassignment(frameDft, windowedFrameDerivativeFFT,
fiwa.freqIndsRefined, params.fftSize, params.fs);
if (initialPeakLocationsInHz == null) {
for (i = 0; i < numFrameSinusoids; i++) {
frameSins.sinusoids[i] = new Sinusoid((float) (MathUtils.db2amp(fiwa.ampsRefined[i])), // amp in linear
// scale
(float) ((Math.PI * fiwa.freqIndsRefined[i]) / maxFreqInd), // freq in radians
// (float)((0.5*fs*freqIndsRefined[i])/maxFreqInd), //freq in Hz
(float) (Math.atan2(frameDft.imag[freqInds[i]], frameDft.real[freqInds[i]]))); // phase in radians
// Possible improvement: Refinement of phase values here...
}
} else {
for (i = 0; i < numFrameSinusoids; i++) {
frameSins.sinusoids[i] = new Sinusoid((float) frameDftAbs[freqInds[i]], // amp in linear scale
(float) ((Math.PI * fiwa.freqIndsRefined[i]) / maxFreqInd), // freq in radians
// (float)((0.5*fs*freqIndsRefined[i])/maxFreqInd), //freq in Hz
(float) (Math.atan2(frameDft.imag[freqInds[i]], frameDft.real[freqInds[i]]))); // phase in radians
}
}
// For visualization purposes:
if (isOutputToTextFile) {
// FileUtils.writeToTextFile(excDftAbs, "d:/out_exc.txt");
FileUtils.writeToTextFile(vocalTractSpec, "d:/out_vt.txt");
FileUtils.writeToTextFile(frameDftAbs, "d:/out_spec.txt");
}
//
double[] vocalTractPhase = null;
if (vocalTractSpec != null) {
double[] tmpSpec = null;
// tmpSpec = SignalProcUtils.cepstralSmoothedSpectrumInNeper(frm, fftSize, lifterOrder); //Use cepstral
// processing to find minimum phase system response
// Use LP spectrum to find a minimum phase system response
tmpSpec = new double[params.fftSize];
System.arraycopy(vocalTractSpec, 0, tmpSpec, 0, vocalTractSpec.length);
for (i = params.fftSize - 1; i >= vocalTractSpec.length; i--)
tmpSpec[i] = tmpSpec[params.fftSize - i];
tmpSpec = MathUtils.amp2neper(tmpSpec);
//
// MaryUtils.plot(vocalTractSpec);
// MaryUtils.plot(tmpSpec);
double[] tmpPhase = CepstrumSpeechAnalyser.minimumPhaseResponseInRadians(tmpSpec);
vocalTractPhase = new double[params.fftSize / 2 + 1];
System.arraycopy(tmpPhase, 0, vocalTractPhase, 0, vocalTractPhase.length);
// Arrays.fill(vocalTractSpec, 1.0); //Set system amps to one for testing purposes
// Arrays.fill(vocalTractPhase, 0.0); //Set system phase to zero for testing purposes
frameSins.setSystemAmps(vocalTractSpec);
frameSins.setSystemPhases(vocalTractPhase);
frameSins.setFrameDfts(frameDft);
// This is from Van Santen´s et.al.´s book - Chapter 5
// (van Santen, et. al., Progress in Speech Synthesis)
float[] ceps = SignalProcUtils.specLinear2cepstrum(vocalTractSpec, 32);
frameSins.setSystemCeps(ceps);
//
}
//
}
//
}
if (frameSins != null) {
// frameSins.voicing = (float)SignalProcUtils.getVoicingProbability(windowedFrm, params.fs);
frameSins.voicing = isVoiced ? 1.0f : 0.0f;
frameSins.maxFreqOfVoicing = SignalProcUtils.hz2radian(maxVoicingFreqInHz, params.fs);
}
return frameSins;
}
// Refine peak detection to get more accurate frequency and amplitude values as described in (Smith III and Serra, 1985)(*)
//
// (*) Julius O. Smith III and Xavier Serra, 1985, "PARSHL: An Analysis/Synthesis Program for Non-Harmonic Sounds
// Based on a Sinusoidal Representation", Technical Report, Stanford University, CCRMA STAN-M-43.
//
// The basic idea is to fit a parabola to each of the peak detected by simple peak picking from the dB spectrum
// The previous and next frequency bin is used along with each peak to fit the parabola
// Then, the peak of the parabola is returned as the peak amplitude estimate and
// its location as a floating point frequency index for refined peak location
//
// Parameters:
// powSpecdB: Power spectrum estimate in dB
// freqInds: Peak locations (frequency bins) which we want to refine
// freqIndsRefined: (OUTPUT) - Refined peak locations as floating point frequency bins
// ampsRefined: (OUTPUT) - Refined peak amplitude estimates corresponding to the peak value of the parabola fit to each
// amplitude triplet
public static FreqIndicesWithAmplitudes refinePeakEstimatesParabola(double[] powSpecdB, int[] freqInds) {
double alpha, beta, gamma, p;
FreqIndicesWithAmplitudes fiwa = new FreqIndicesWithAmplitudes(freqInds.length);
for (int i = 0; i < freqInds.length; i++) {
// Make sure the peak is not at the first or last freq bin
if (freqInds[i] > 0 && freqInds[i] < freqInds.length - 1) {
alpha = powSpecdB[freqInds[i] - 1];
beta = powSpecdB[freqInds[i]];
gamma = powSpecdB[freqInds[i] + 1];
p = 0.5 * (alpha - gamma) / (alpha - 2 * beta + gamma);
fiwa.freqIndsRefined[i] = (float) (freqInds[i] + p);
fiwa.ampsRefined[i] = (float) (beta - 0.25 * p * (alpha - gamma));
// fiwa.ampsRefined[i] = (float)((p*p*(alpha-beta)+p*2*(alpha-2*beta)-beta)/(2*(alpha-beta)));
} else // otherwise do not refine
{
fiwa.freqIndsRefined[i] = (float) freqInds[i];
fiwa.ampsRefined[i] = (float) powSpecdB[freqInds[i]];
}
}
return fiwa;
}
// Need to check this function, does not work properly!
// Further refine peak detection to get more accurate frequency and amplitude values as described in (Abe and Smith III,
// 2004)(**)
//
// (**) Mototsugu Abe and Julius O. Smith III, 2004, "CQIFFT: Correcting Bias in a Sinusoidal Parameter Estimator based
// on Quadratic Interpolation of FFT Magnitude Peaks", Technical Report, Center for Computer Research in Music
// and Acoustics, Department of Music, Stanford University, STAN-M-117.
//
// The basic idea is to measure and correct the window-dependent bias of the quadratic refinement method in the previous
// function
//
// Parameters:
// powSpecdB: Power spectrum estimate in dB
// freqInds: Peak locations (frequency bins) which we want to refine
// freqIndsRefined: (OUTPUT) - Refined peak locations as floating point frequency bins
// ampsRefined: (OUTPUT) - Refined peak amplitude estimates corresponding to the peak value of the parabola fit to each
// amplitude triplet
public static FreqIndicesWithAmplitudes refinePeakEstimatesBias(double[] powSpecdB, float[] freqInds, int windowType) {
FreqIndicesWithAmplitudes fiwa = new FreqIndicesWithAmplitudes(freqInds.length);
double delHat, Zpf, ZpA;
double[] c = new double[4];
// The Zp values are for a max bias of 0.01% in frequency and amplitude as given in Table 3 (Abe and Smith III, 2004)
switch (windowType) {
case Window.HANNING:
Zpf = 1.5;
ZpA = 1.9;
c[0] = 0.247560;
c[1] = 0.084372;
c[2] = -0.090608;
c[3] = -0.055781;
break;
case Window.HAMMING:
Zpf = 1.5;
ZpA = 2.0;
c[0] = 0.256498;
c[1] = 0.075977;
c[2] = -0.116927;
c[3] = -0.062882;
break;
case Window.BLACKMAN:
Zpf = 1.2;
ZpA = 1.7;
c[0] = 0.124188;
c[1] = 0.013752;
c[2] = -0.038073;
c[3] = -0.006195;
break;
default: // These are for rectangular window in fact
Zpf = 2.9;
ZpA = 3.5;
c[0] = 1.279369;
c[1] = 1.756245;
c[2] = -1.173273;
c[3] = -3.241966;
}
double EZpf = c[0] * Math.pow(Zpf, -2.0) + c[1] * Math.pow(Zpf, -4.0);
double nZpA = c[2] * Math.pow(ZpA, -4.0) + c[3] * Math.pow(ZpA, -6.0);
for (int i = 0; i < freqInds.length; i++) {
delHat = fiwa.freqIndsRefined[i] - freqInds[i];
fiwa.freqIndsRefined[i] = fiwa.freqIndsRefined[i]
+ (float) (delHat + EZpf * (delHat - 0.5) * (delHat + 0.5) * delHat);
fiwa.ampsRefined[i] = (float) (fiwa.ampsRefined[i] + nZpA * delHat * delHat);
}
return fiwa;
}
// References: Auger, F. and Flandrin, P., 1995,
// "Improving the readability of time-frequency and time-scale representations by the reassignment method",
// in IEEE Trans. on Signal Proc., Vol. 43, Issue 5, May 1995, pp. 1068-1089.
// Borum, S. and Jensen, K., 1999, "Additive analysis/synthesis using analytically derived windows",
// in Proc. of the 2nd COST G-6 Workshop on Digital Audio Effects (DAFx-99), NTNU, Trondheim,
public static float[] refineBySpectralReassignment(ComplexArray windowedFrameFFT, ComplexArray windowedFrameDerivativeFFT,
float[] freqInds, int fftSize, int fs) {
float[] freqIndsRefined = null;
if (freqInds != null) {
freqIndsRefined = new float[freqInds.length];
if (windowedFrameFFT != null && windowedFrameDerivativeFFT != null) {
int km;
double f0InRadians, f0RefinedInRadians, f0RefinedInHz, f0RefinedInd;
int maxFreqInd = (int) (fftSize / 2 + 1) - 1;
for (int i = 0; i < freqInds.length; i++) {
km = (int) Math.floor(freqInds[i] + 0.5);
f0InRadians = SignalProcUtils.hz2radian(SignalProcUtils.index2freq(km, fs, maxFreqInd), fs);
f0RefinedInRadians = f0InRadians
- (windowedFrameFFT.real[km] * windowedFrameDerivativeFFT.imag[km] - windowedFrameFFT.imag[km]
* windowedFrameDerivativeFFT.real[km])
/ (windowedFrameFFT.real[km] * windowedFrameFFT.real[km] + windowedFrameFFT.imag[km]
* windowedFrameFFT.imag[km]) / MathUtils.TWOPI;
f0RefinedInHz = SignalProcUtils.radian2hz(f0RefinedInRadians, fs);
freqIndsRefined[i] = (float) SignalProcUtils.freq2indexDouble(f0RefinedInHz, fs, maxFreqInd);
}
} else {
freqIndsRefined = new float[freqInds.length];
System.arraycopy(freqInds, 0, freqIndsRefined, 0, freqInds.length);
}
}
return freqIndsRefined;
}
public static void main(String[] args) throws UnsupportedAudioFileException, IOException {
AudioInputStream inputAudio = AudioSystem.getAudioInputStream(new File(args[0]));
int samplingRate = (int) inputAudio.getFormat().getSampleRate();
AudioDoubleDataSource signal = new AudioDoubleDataSource(inputAudio);
double[] x = signal.getAllData();
double startFreqInHz = 0.0;
double endFreqInHz = 0.5 * samplingRate;
int windowType = Window.HAMMING;
boolean bRefinePeakEstimatesParabola = false;
boolean bRefinePeakEstimatesBias = false;
boolean bSpectralReassignment = true;
boolean bAdjustNeighFreqDependent = false;
SinusoidalAnalysisParams params = new SinusoidalAnalysisParams(samplingRate, startFreqInHz, endFreqInHz, windowType,
bRefinePeakEstimatesParabola, bRefinePeakEstimatesBias, bSpectralReassignment, bAdjustNeighFreqDependent);
SinusoidalAnalyzer sa = new SinusoidalAnalyzer(params);
float winSizeInSeconds = 0.020f;
float skipSizeInSeconds = 0.010f;
float deltaInHz = 50.0f;
int spectralEnvelopeType = SinusoidalAnalysisParams.SEEVOC_SPEC;
String strPitchFile = args[0].substring(0, args[0].length() - 4) + ".ptc";
PitchReaderWriter f0 = new PitchReaderWriter(strPitchFile);
float ws_f0 = (float) f0.header.windowSizeInSeconds;
float ss_f0 = (float) f0.header.skipSizeInSeconds;
SinusoidalTracks st = sa.analyzeFixedRate(x, winSizeInSeconds, skipSizeInSeconds, deltaInHz, spectralEnvelopeType,
f0.contour, ws_f0, ss_f0);
}
}