/** * Copyright 2007 DFKI GmbH. * All Rights Reserved. Use is subject to license terms. * * This file is part of MARY TTS. * * MARY TTS is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, version 3 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package marytts.signalproc.sinusoidal; import java.io.File; import java.io.IOException; import java.util.Arrays; import javax.sound.sampled.AudioInputStream; import javax.sound.sampled.AudioSystem; import javax.sound.sampled.UnsupportedAudioFileException; import marytts.signalproc.analysis.CepstrumSpeechAnalyser; import marytts.signalproc.analysis.LpcAnalyser; import marytts.signalproc.analysis.PitchReaderWriter; import marytts.signalproc.analysis.RegularizedCepstrumEstimator; import marytts.signalproc.analysis.RegularizedPostWarpedCepstrumEstimator; import marytts.signalproc.analysis.RegularizedPreWarpedCepstrumEstimator; import marytts.signalproc.analysis.SeevocAnalyser; import marytts.signalproc.analysis.SpectrumWithPeakIndices; import marytts.signalproc.sinusoidal.hntm.analysis.pitch.HnmPitchVoicingAnalyzer; import marytts.signalproc.sinusoidal.hntm.analysis.pitch.VoicingAnalysisOutputData; import marytts.signalproc.window.Window; import marytts.util.data.audio.AudioDoubleDataSource; import marytts.util.io.FileUtils; import marytts.util.math.ArrayUtils; import marytts.util.math.ComplexArray; import marytts.util.math.FFT; import marytts.util.math.FFTMixedRadix; import marytts.util.math.MathUtils; import marytts.util.signal.SignalProcUtils; /** * 
* Sinusoidal Modeling Analysis Module.
 * 
 * Given a speech/audio signal, a set of amplitudes, frequencies and phases are estimated on a frame-by-frame basis.
 * Sinusoids that are close in frequency are then grouped together to form sinusoidal tracks. Optional amplitude and
 * phase continuity constraints can be employed during track generation. The implementation consists of ideas and
 * algorithms from various papers as described in function headers.
 * 
 * References:
 * 
 * Quatieri, T. F. Discrete-Time Speech Signal Processing: Principles and Practice. Prentice-Hall Inc. 2001. (Chapter 9
 * - Sinusoidal Analysis/Synthesis)
 * 
 * R.J. McAulay and T.F. Quatieri, "Speech Analysis/Synthesis Based on a Sinusoidal Representation," IEEE Transactions
 * on Acoustics, Speech and Signal Processing, vol. ASSP-34, no. 4, August 1986.
 * 
 * @author Oytun Türk
 */
public class SinusoidalAnalyzer extends BaseSinusoidalAnalyzer {
    /** Analysis settings; the constructor stores a defensive copy, so the caller's instance is not retained. */
    public SinusoidalAnalysisParams params;

    /**
     * Creates an analyzer working on a private copy of the given parameter set.
     * 
     * @param paramsIn analysis parameters to copy
     */
    public SinusoidalAnalyzer(SinusoidalAnalysisParams paramsIn) {
        params = new SinusoidalAnalysisParams(paramsIn);
    }

    /**
     * Returns, for each bin of the half spectrum, the number of neighbouring samples to search when deciding whether
     * that bin is a spectral peak.
     * 
     * The aim is to eliminate some of the peaks, especially in the high frequency region, to reduce phase mismatches at
     * frame boundaries. However, the range should not be set too broad either, as roughly one peak per 100 Hz is
     * required on the average for an accurate representation. The spectrum is divided into non-overlapping bands whose
     * edges follow groups of five Bark-scale bands; each group is assigned the same search range, and the ranges grow
     * with frequency, i.e. a higher-frequency candidate must exceed a larger number of neighbours to be selected as a
     * peak.
     * 
     * @param fftSize FFT length in samples
     * @param bAdjustNeighFreqDependent if false, a single default range is used for all bins
     * @param fs sampling rate in Hz
     * @return per-bin neighbour counts for peak search over the half spectrum
     */
    public static int[] setNeighFreq(int fftSize, boolean bAdjustNeighFreqDependent, float fs) {
        int maxFreq = (int) (Math.floor(0.5 * fftSize + 0.5) + 1);
        int[] freqSampNeighs = new int[maxFreq];
        int i;

        if (!bAdjustNeighFreqDependent) {
            // Frequency-independent mode: same search range everywhere.
            for (i = 0; i < maxFreq; i++)
                freqSampNeighs[i] = SinusoidalAnalysisParams.DEFAULT_FREQ_SAMP_NEIGHS_LOW;
        } else {
            float freq;
            int[] vals = new int[6];
            float maxSeparationInHz = 100.0f; // Maximum average separation allowed for peaks of noise

            // Cap each band's range so that, on average, one peak per maxSeparationInHz can still be found.
            for (i = 0; i < vals.length; i++)
                vals[i] = Math.min(i + 1, (int) Math.floor(0.5 * maxSeparationInHz / fs * fftSize + 0.5));

            for (i = 0; i < maxFreq; i++) {
                freq = ((float) i) / (maxFreq - 1.0f) * 0.5f * fs;

                // Band edges in Hz correspond to groups of five Bark bands.
                if (freq < 500.0f)
                    freqSampNeighs[i] = vals[0];
                else if (freq < 1270.0f)
                    freqSampNeighs[i] = vals[1];
                else if (freq < 2700.0f)
                    freqSampNeighs[i] = vals[2];
                else if (freq < 6400.0f)
                    freqSampNeighs[i] = vals[3];
                else if (freq < 15500.0f)
                    freqSampNeighs[i] = vals[4];
                else
                    freqSampNeighs[i] = vals[5];
            }
        }

        return freqSampNeighs;
    }

    /**
     * Fixed rate analysis using the default analysis window size.
     * 
     * @param x speech/audio signal to be analyzed
     * @return sinusoidal tracks, or null if no sinusoids could be extracted
     */
    public SinusoidalTracks analyzeFixedRate(double[] x) {
        return analyzeFixedRate(x, SinusoidalAnalysisParams.DEFAULT_ANALYSIS_WINDOW_SIZE);
    }

    /**
     * Fixed rate analysis using the default analysis skip size.
     * 
     * @param x speech/audio signal to be analyzed
     * @param winSizeInSeconds analysis window size in seconds
     * @return sinusoidal tracks, or null if no sinusoids could be extracted
     */
    public SinusoidalTracks analyzeFixedRate(double[] x, float winSizeInSeconds) {
        return analyzeFixedRate(x, winSizeInSeconds, SinusoidalAnalysisParams.DEFAULT_ANALYSIS_SKIP_SIZE);
    }

    /**
     * Fixed rate analysis using the default maximum frequency deviation for track generation.
     * 
     * @param x speech/audio signal to be analyzed
     * @param winSizeInSeconds analysis window size in seconds
     * @param skipSizeInSeconds analysis skip size in seconds
     * @return sinusoidal tracks, or null if no sinusoids could be extracted
     */
    public SinusoidalTracks analyzeFixedRate(double[] x, float winSizeInSeconds, float skipSizeInSeconds) {
return analyzeFixedRate(x, winSizeInSeconds, skipSizeInSeconds, SinusoidalAnalysisParams.DEFAULT_DELTA_IN_HZ); } /* * Fixed rate analysis * * x: Speech/Audio signal to be analyzed winSizeInSeconds: Integer array of sample indices for pitch period start instants * skipSizeInSeconds: Number of pitch periods to be used in analysis deltaInHz: Maximum allowed frequency deviance when * creating sinusoidal tracks */ public SinusoidalTracks analyzeFixedRate(double[] x, float winSizeInSeconds, float skipSizeInSeconds, float deltaInHz) { return analyzeFixedRate(x, winSizeInSeconds, skipSizeInSeconds, deltaInHz, SinusoidalAnalysisParams.LP_SPEC); } /* * Fixed rate analysis * * x: Speech/Audio signal to be analyzed winSizeInSeconds: Integer array of sample indices for pitch period start instants * skipSizeInSeconds: Number of pitch periods to be used in analysis deltaInHz: Maximum allowed frequency deviance when * creating sinusoidal tracks spectralEnvelopeType: Spectral envelope estimation method with possible values NO_SPEC (do not * compute spectral envelope) LP_SPEC (linear prediction based envelope) SEEVOC_SPEC (Spectral Envelope Estimation Vocoder * based envelope) REGULARIZED_CEPS (Regularized cepstrum based envelope) See below for details... 
*/ public SinusoidalTracks analyzeFixedRate(double[] x, float winSizeInSeconds, float skipSizeInSeconds, float deltaInHz, int spectralEnvelopeType) { return analyzeFixedRate(x, winSizeInSeconds, skipSizeInSeconds, deltaInHz, spectralEnvelopeType, null, -1.0f, -1.0f); } /* * Fixed rate analysis * * x: Speech/Audio signal to be analyzed winSizeInSeconds: Integer array of sample indices for pitch period start instants * skipSizeInSeconds: Number of pitch periods to be used in analysis deltaInHz: Maximum allowed frequency deviation when * creating sinusoidal tracks spectralEnvelopeType: Spectral envelope estimation method with possible values NO_SPEC (do not * compute spectral envelope) LP_SPEC (linear prediction based envelope) SEEVOC_SPEC (Spectral Envelope Estimation Vocoder * based envelope) REGULARIZED_CEPS (Regularized cepstrum based envelope) f0s: f0 values in Hz (optional, required for SEEVOC * based spectral envelope estimation. If not specified, SEEVOC based estimation will be performed at a fixed f0 value of * 100.0 Hz ws_f0s: Window size in seconds used for f0 extraction (Functional only for SEEVOC based envelope estimation and * when f0s are not null) ss_f0s: Skip size in seconds used for f0 extraction (Functional only for SEEVOC based envelope * estimation and when f0s are not null) */ public SinusoidalTracks analyzeFixedRate(double[] x, float winSizeInSeconds, float skipSizeInSeconds, float deltaInHz, int spectralEnvelopeType, double[] f0s, float ws_f0, float ss_f0) { NonharmonicSinusoidalSpeechSignal sinSignal = extractSinusoidsFixedRate(x, winSizeInSeconds, skipSizeInSeconds, deltaInHz, spectralEnvelopeType, f0s, ws_f0, ss_f0); // Extract sinusoidal tracks TrackGenerator tg = new TrackGenerator(); SinusoidalTracks sinTracks = tg.generateTracks(sinSignal, deltaInHz, params.fs); if (sinTracks != null) { sinTracks.getTrackStatistics(winSizeInSeconds, skipSizeInSeconds); getGrossStatistics(sinTracks); } // sinTracks.absMaxOriginal = (float) params.absMax; 
sinTracks.totalEnergy = (float) params.totalEnergy; return sinTracks; } public NonharmonicSinusoidalSpeechSignal extractSinusoidsFixedRate(double[] x, float winSizeInSeconds, float skipSizeInSeconds, float deltaInHz) { return extractSinusoidsFixedRate(x, winSizeInSeconds, skipSizeInSeconds, deltaInHz, SinusoidalAnalysisParams.LP_SPEC); } public NonharmonicSinusoidalSpeechSignal extractSinusoidsFixedRate(double[] x, float winSizeInSeconds, float skipSizeInSeconds, float deltaInHz, int spectralEnvelopeType) { return extractSinusoidsFixedRate(x, winSizeInSeconds, skipSizeInSeconds, deltaInHz, spectralEnvelopeType, null, -1.0f, -1.0f); } // ws_f0: Window size in pitch extraction in seconds // ss_f0: Skip size in pitch extraction in seconds public NonharmonicSinusoidalSpeechSignal extractSinusoidsFixedRate(double[] x, float winSizeInSeconds, float skipSizeInSeconds, float deltaInHz, int spectralEnvelopeType, double[] f0s, float ws_f0, float ss_f0) { int i, j; int f0Ind; params.absMax = MathUtils.getAbsMax(x); params.totalEnergy = SignalProcUtils.energy(x); params.ws = (int) Math.floor(winSizeInSeconds * params.fs + 0.5); if (params.ws % 2 == 0) // Always use an odd window size to have a zero-phase analysis window params.ws++; // System.out.println("ws=" + String.valueOf(params.ws) + " minWindowSize=" + String.valueOf(params.minWindowSize)); params.ws = Math.max(params.ws, params.minWindowSize); params.ss = (int) Math.floor(skipSizeInSeconds * params.fs + 0.5); params.win = Window.get(params.windowType, params.ws); params.win.normalize(1.0f); // Normalize to sum up to unity int totalFrm = (int) ((x.length - 0.5 * params.ws) / params.ss); // Extract frames and analyze them double[] frm = new double[params.ws]; NonharmonicSinusoidalSpeechSignal sinSignal = new NonharmonicSinusoidalSpeechSignal(totalFrm); boolean[] isSinusoidNulls = new boolean[totalFrm]; Arrays.fill(isSinusoidNulls, false); int totalNonNull = 0; int peakCounter; float currentTime; boolean 
isOutputToTextFile = false;

        for (i = 0; i < totalFrm; i++) {
            // Copy the current frame (zero-padded past the signal end) and window it in place.
            Arrays.fill(frm, 0.0);
            for (j = i * params.ss; j < Math.min(i * params.ss + params.ws, x.length); j++)
                frm[j - i * params.ss] = x[j];

            params.win.applyInline(frm, 0, params.ws);

            currentTime = (float) ((i * params.ss + 0.5 * params.ws) / params.fs); // Middle of analysis frame

            /*
             * if (currentTime>0.500 && currentTime<0.520) isOutputToTextFile = true; else isOutputToTextFile = false;
             */

            if (spectralEnvelopeType == SinusoidalAnalysisParams.SEEVOC_SPEC && f0s != null) {
                // Map the frame centre time to the nearest f0 analysis frame index, clamped to a valid range.
                f0Ind = (int) Math.floor((currentTime - 0.5 * ws_f0) / ss_f0 + 0.5);
                f0Ind = Math.min(f0Ind, f0s.length - 1);
                f0Ind = Math.max(0, f0Ind);

                // f0 values at or below 10 Hz are treated as unvoiced.
                boolean isVoiced = false;
                if (f0s[f0Ind] > 10.0f)
                    isVoiced = true;

                sinSignal.framesSins[i] = analyze_frame(frm, isOutputToTextFile, spectralEnvelopeType, isVoiced,
                        (float) f0s[f0Ind], params);
            } else
                sinSignal.framesSins[i] = analyze_frame(frm, isOutputToTextFile, spectralEnvelopeType, true, params);

            // Tag each extracted sinusoid with the index of the frame it came from.
            if (sinSignal.framesSins[i] != null) {
                for (j = 0; j < sinSignal.framesSins[i].sinusoids.length; j++)
                    sinSignal.framesSins[i].sinusoids[j].frameIndex = i;
            }

            int peakCount = 0;
            if (sinSignal.framesSins[i] == null)
                isSinusoidNulls[i] = true;
            else {
                isSinusoidNulls[i] = false;
                totalNonNull++;
                peakCount = sinSignal.framesSins[i].sinusoids.length;
            }

            if (sinSignal.framesSins[i] != null) {
                sinSignal.framesSins[i].time = currentTime;

                System.out.println("Analysis complete at " + String.valueOf(sinSignal.framesSins[i].time) + "s. for frame "
                        + String.valueOf(i + 1) + " of " + String.valueOf(totalFrm) + "(found " + String.valueOf(peakCount)
                        + " peaks)");
            }
        }
        //

        NonharmonicSinusoidalSpeechSignal sinSignal2 = null;
        if (totalNonNull > 0) {
            // Collect non-null sinusoids only
            sinSignal2 = new NonharmonicSinusoidalSpeechSignal(totalNonNull);
            int ind = 0;
            for (i = 0; i < totalFrm; i++) {
                if (!isSinusoidNulls[i]) {
                    sinSignal2.framesSins[ind] = new NonharmonicSinusoidalSpeechFrame(sinSignal.framesSins[i]);

                    ind++;
                    if (ind > totalNonNull - 1)
                        break;
                }
            }
            //

            sinSignal2.originalDurationInSeconds = ((float) x.length) / params.fs;
        }

        return sinSignal2;
    }

    /**
     * Prints the total number of ACTIVE sinusoids over all tracks in the given track set.
     * 
     * @param sinTracks sinusoidal tracks to summarize
     */
    public static void getGrossStatistics(SinusoidalTracks sinTracks) {
        int totalSins = 0;
        int i, j;
        for (i = 0; i < sinTracks.totalTracks; i++) {
            for (j = 0; j < sinTracks.tracks[i].totalSins; j++) {
                if (sinTracks.tracks[i].states[j] == SinusoidalTrack.ACTIVE)
                    totalSins++;
            }
        }

        System.out.println("Total sinusoids to model this file = " + String.valueOf(totalSins));
    }

    /**
     * Analyzes one windowed frame using the default LP-based spectral envelope and no f0 information.
     */
    public static NonharmonicSinusoidalSpeechFrame analyze_frame(double[] frm, boolean isOutputToTextFile, boolean isVoiced,
            SinusoidalAnalysisParams params) {
        return analyze_frame(frm, isOutputToTextFile, SinusoidalAnalysisParams.LP_SPEC, isVoiced, -1.0f, params);
    }

    /**
     * Analyzes one windowed frame; when the SEEVOC envelope is requested, a fixed f0 of 100 Hz is assumed.
     */
    public static NonharmonicSinusoidalSpeechFrame analyze_frame(double[] frm, boolean isOutputToTextFile,
            int spectralEnvelopeType, boolean isVoiced, SinusoidalAnalysisParams params) {
        if (spectralEnvelopeType == SinusoidalAnalysisParams.SEEVOC_SPEC)
            return analyze_frame(frm, isOutputToTextFile, spectralEnvelopeType, isVoiced, 100.0f, params);
        else
            return analyze_frame(frm, isOutputToTextFile, spectralEnvelopeType, isVoiced, -1.0f, params);
    }

    /**
     * Analyzes one windowed frame with a known f0; the maximum frequency of voicing is estimated internally.
     */
    public static NonharmonicSinusoidalSpeechFrame analyze_frame(double[] frm, boolean isOutputToTextFile,
            int spectralEnvelopeType, boolean isVoiced, float f0, SinusoidalAnalysisParams params) {
        return analyze_frame(frm, isOutputToTextFile, spectralEnvelopeType, isVoiced, f0, -1.0f, false, params, null);
    }
/**
     * Extracts sinusoidal model parameters (amplitude, frequency and phase per peak) from a windowed speech frame using
     * the DFT peak-picking algorithm.
     * 
     * @param windowedFrm windowed speech frame
     * @param isOutputToTextFile when true, dumps the vocal tract and frame spectra to text files for visualization
     * @param spectralEnvelopeType desired spectral envelope: LP_SPEC, SEEVOC_SPEC or REGULARIZED_CEPS (see above)
     * @param isVoiced voicing decision for this frame
     * @param f0 fundamental frequency in Hz (used by SEEVOC / regularized-cepstrum envelopes and voicing estimation)
     * @param maxFreqOfVoicing maximum frequency of voicing in Hz; if negative, it is estimated here
     * @param bEstimateHNMVoicing not referenced in this body -- NOTE(review): confirm before removing
     * @param params analysis parameters; note that params.fftSize may be enlarged in place to cover the frame
     * @param initialPeakLocationsInHz optional fixed peak locations in Hz; when given, no peak search is performed
     * @return the frame's sinusoids plus system amplitude/phase spectra, or null for low-energy frames with no peaks
     */
    public static NonharmonicSinusoidalSpeechFrame analyze_frame(double[] windowedFrm, boolean isOutputToTextFile,
            int spectralEnvelopeType, boolean isVoiced, float f0, float maxFreqOfVoicing, boolean bEstimateHNMVoicing,
            SinusoidalAnalysisParams params, float[] initialPeakLocationsInHz) {
        VoicingAnalysisOutputData vo = null;
        float maxVoicingFreqInHz = 0.0f;
        NonharmonicSinusoidalSpeechFrame frameSins = null;

        // Make sure the FFT covers the frame and has even length.
        if (params.fftSize < windowedFrm.length)
            params.fftSize = windowedFrm.length;

        if (params.fftSize % 2 == 1)
            params.fftSize++;

        int[] freqSampNeighs = setNeighFreq(params.fftSize, params.bAdjustNeighFreqDependent, params.fs);

        int maxFreqInd = SignalProcUtils.halfSpectrumSize(params.fftSize) - 1;
        ComplexArray frameDft = new ComplexArray(params.fftSize);
        int i;

        // Perform circular buffering as described in (Quatieri, 2001) to provide correct phase estimates
        int midPoint = (int) Math.floor(0.5 * windowedFrm.length + 0.5);
        System.arraycopy(windowedFrm, midPoint, frameDft.real, 0, windowedFrm.length - midPoint);
        System.arraycopy(windowedFrm, 0, frameDft.real, params.fftSize - midPoint, midPoint);
        // System.arraycopy(windowedFrm, 0, frameDft.real, 0, windowedFrm.length); //No circular buffer

        // Take windowed frame derivative's FFT for spectral reassignment
        ComplexArray windowedFrameDerivativeFFT = null;
        if (params.bSpectralReassignment) {
            windowedFrameDerivativeFFT = new ComplexArray(params.fftSize);
            double[] dfrm = new double[windowedFrm.length];

            // First difference of the windowed frame (same circular buffering as above).
            dfrm[0] = windowedFrm[0];
            for (i = 1; i < windowedFrm.length; i++)
                dfrm[i] = windowedFrm[i] - windowedFrm[i - 1];

            System.arraycopy(dfrm, midPoint, windowedFrameDerivativeFFT.real, 0, dfrm.length - midPoint);
            System.arraycopy(dfrm, 0, windowedFrameDerivativeFFT.real, params.fftSize - midPoint, midPoint);
        }

        // Compute DFT
        if (MathUtils.isPowerOfTwo(params.fftSize))
            FFT.transform(frameDft.real, frameDft.imag, false);
        else
            frameDft = FFTMixedRadix.fftComplex(frameDft);

        // Compute magnitude spectrum in dB as peak frequency estimates tend to be more accurate
        double[] frameDftAbs = MathUtils.abs(frameDft, 0, maxFreqInd);
        double[] frameDftDB = MathUtils.amp2db(frameDftAbs);
        //

        int[] freqIndsLow = null;
        int[] freqIndsHigh = null;
        int[] freqInds = null;
        int[] freqIndsLowest = null;
        FreqIndicesWithAmplitudes fiwa;

        double frmEn = SignalProcUtils.getEnergy(windowedFrm);
        if (frmEn > SinusoidalAnalysisParams.MIN_ENERGY_TH) {
            // Testing hook: when bManualPeakPickingTest is set to true, fixed frequency tracks are created
            // instead of estimating peaks automatically (normal operation mode).
            boolean bManualPeakPickingTest = false;
            if (bManualPeakPickingTest) {
                int w;
                int numSins = 0;
                int numNoises = 0;
                float noiseRange1 = 0.0f;
                float noiseRange2 = 0.0f;
                float deltaNoise = 1.0f;

                numSins = 4;
                float[] sinFreqs = new float[numSins];
                sinFreqs[0] = 180.0f;
                sinFreqs[1] = 1580.0f;
                sinFreqs[2] = 2580.0f;
                sinFreqs[3] = 4780.0f;

                /*
                 * noiseRange1=2000.0f; noiseRange2=4000.0f; deltaNoise=50.0f; numNoises = Math.max(1,
                 * (int)Math.floor((noiseRange2-noiseRange1)/deltaNoise+0.5));
                 */

                freqInds = new int[numSins + numNoises];

                for (w = 0; w < numSins; w++)
                    freqInds[w] = SignalProcUtils.freq2index(sinFreqs[w], params.fs, maxFreqInd);
                for (w = numSins; w < numSins + numNoises; w++)
                    freqInds[w] = SignalProcUtils.freq2index(noiseRange1 + (w - numSins) * deltaNoise, params.fs, maxFreqInd);
            }
            //

            // Vocal tract magnitude spectrum (linear) & phase analysis
            double[] vocalTractSpec = null;

            if (spectralEnvelopeType == SinusoidalAnalysisParams.LP_SPEC)
                vocalTractSpec = LpcAnalyser.calcSpecFrameLinear(windowedFrm, params.LPOrder, params.fftSize);
            else if (spectralEnvelopeType == SinusoidalAnalysisParams.SEEVOC_SPEC) {
                // Note that frameDftDB is in dB but the computed envelope is returned as linear.
                SpectrumWithPeakIndices swpi = SeevocAnalyser.calcSpecEnvelopeLinear(frameDftDB, params.fs, f0);
                vocalTractSpec = swpi.spec;
            } else if (spectralEnvelopeType == SinusoidalAnalysisParams.REGULARIZED_CEPS) {
                // SEEVOC peaks are used as anchor points for the regularized cepstral envelope fit.
                // Note that frameDftDB is in dB but the computed envelope is returned as linear.
                SpectrumWithPeakIndices swpi = SeevocAnalyser.calcSpecEnvelopeLinear(frameDftDB, params.fs, f0);

                int cepsOrderPre = 13;
                int cepsOrder = 19;
                int numPeaks = swpi.indices.length;
                double[] linearAmps = new double[numPeaks];
                double[] freqsInHz = new double[numPeaks];
                for (i = 0; i < numPeaks; i++) {
                    linearAmps[i] = frameDftAbs[swpi.indices[i]];
                    freqsInHz[i] = SignalProcUtils.index2freq(swpi.indices[i], params.fs, maxFreqInd);
                }

                if (params.regularizedCepstrumWarpingMethod == RegularizedCepstrumEstimator.REGULARIZED_CEPSTRUM_WITH_PRE_BARK_WARPING)
                    vocalTractSpec = RegularizedPreWarpedCepstrumEstimator.spectralEnvelopeLinear(linearAmps, freqsInHz,
                            params.fs, cepsOrder, params.fftSize);
                else if (params.regularizedCepstrumWarpingMethod == RegularizedCepstrumEstimator.REGULARIZED_CEPSTRUM_WITH_POST_MEL_WARPING)
                    vocalTractSpec = RegularizedPostWarpedCepstrumEstimator.spectralEnvelopeLinear(linearAmps, freqsInHz,
                            params.fs, cepsOrderPre, cepsOrder, params.fftSize);
            }

            if (maxFreqOfVoicing < 0.0f) {
                // Use abs dft in db for maximum frequency of voicing estimation
                // (from hnm, not working very properly yet).
                vo = HnmPitchVoicingAnalyzer.estimateMaxFrequencyOfVoicingsFrame(frameDftDB, params.fs, (float) f0, isVoiced,
                        params.hnmPitchVoicingAnalyzerParams);
                maxVoicingFreqInHz = (float) Math.min(vo.maxFreqOfVoicing, SinusoidalAnalysisParams.MAX_VOICED_FREQ_IN_HZ);
                if (maxVoicingFreqInHz > 0.0f)
                    maxVoicingFreqInHz = (float) Math.max(maxVoicingFreqInHz, SinusoidalAnalysisParams.MIN_VOICED_FREQ_IN_HZ);
            } else
                maxVoicingFreqInHz = maxFreqOfVoicing;

            // maxVoicingFreqInHz = 3600.0f; //manual

            float upperFreqSamplingStepInHz = 100.0f; // only used by commented-out method C below
            int startIndLow = SignalProcUtils.freq2index(params.startFreq, params.fs, maxFreqInd);
            int endIndLow = SignalProcUtils.freq2index(maxVoicingFreqInHz, params.fs, maxFreqInd);
            int startIndHigh = endIndLow + 1;
            int endIndHigh = SignalProcUtils.freq2index(params.endFreq, params.fs, maxFreqInd);
            int halfF0Ind = SignalProcUtils.freq2index(0.5 * f0, params.fs, maxFreqInd);

            // Determine peak amplitude indices and the corresponding amplitudes, frequencies, and phases
            if (initialPeakLocationsInHz != null) // Some initial peak locations given, search peaks around them
            {
                // (A windowed search around each given location was tried here; see version history.)
                // Return directly at specified locations without peak search
                freqInds = new int[initialPeakLocationsInHz.length];
                for (i = 0; i < initialPeakLocationsInHz.length; i++)
                    freqInds[i] = SignalProcUtils.freq2index(initialPeakLocationsInHz[i], params.fs, maxFreqInd);
                //
            } else if (!bManualPeakPickingTest) {
                // Method A: Conventional method gets local extrema from db spectrum
                freqInds = MathUtils.getExtrema(frameDftDB, freqSampNeighs, freqSampNeighs, true, startIndLow, endIndHigh,
                        SinusoidalAnalysisParams.MIN_PEAK_IN_DB_LOW);

                if (!ArrayUtils.isOneOf(freqInds, 0)) // Add zeroth index since it is the dc bias
                    freqInds = ArrayUtils.appendToStart(freqInds, 0);

                // Alternative strategies kept in version history: (B) separate peak-picking parameters for the low and
                // high bands, (C) peak picking in the low band plus uniform sampling above the voicing cutoff,
                // (D) low-band peaks taken from HNM voicing analysis plus high-band peak picking, and a hybrid that
                // combines SEEVOC harmonic peaks below 5.5*f0 with local extrema above.
            }

            if (freqInds != null) {
                int numFrameSinusoids = freqInds.length;
                frameSins = new NonharmonicSinusoidalSpeechFrame(numFrameSinusoids);

                // Perform parabola fitting around peak estimates to refine frequency estimation
                // (Ref. - PARSHL, see refinePeakEstimatesParabola for more details)
                fiwa = new FreqIndicesWithAmplitudes(numFrameSinusoids);

                float[] ampsDB = new float[numFrameSinusoids];
                for (i = 0; i < numFrameSinusoids; i++)
                    ampsDB[i] = (float) frameDftDB[freqInds[i]];

                if (params.bRefinePeakEstimatesParabola) {
                    fiwa = refinePeakEstimatesParabola(frameDftDB, freqInds);

                    /*
                     * //Need to check this function, does not work properly if (params.bRefinePeakEstimatesBias) fiwa =
                     * refinePeakEstimatesBias(frameDftDB, fiwa.freqIndsRefined, params.windowType);
                     */
                } else {
                    // No refinement: copy the raw dB amplitudes and integer bin indices.
                    System.arraycopy(ampsDB, 0, fiwa.ampsRefined, 0, numFrameSinusoids);
                    for (i = 0; i < numFrameSinusoids; i++)
                        fiwa.freqIndsRefined[i] = (float) freqInds[i];
                }

                if (params.bSpectralReassignment)
                    fiwa.freqIndsRefined = refineBySpectralReassignment(frameDft, windowedFrameDerivativeFFT,
                            fiwa.freqIndsRefined, params.fftSize, params.fs);

                if (initialPeakLocationsInHz == null) {
                    for (i = 0; i < numFrameSinusoids; i++) {
                        frameSins.sinusoids[i] = new Sinusoid((float) (MathUtils.db2amp(fiwa.ampsRefined[i])), // amp in linear scale
                                (float) ((Math.PI * fiwa.freqIndsRefined[i]) / maxFreqInd), // freq in radians
                                // (float)((0.5*fs*freqIndsRefined[i])/maxFreqInd), //freq in Hz
                                (float) (Math.atan2(frameDft.imag[freqInds[i]], frameDft.real[freqInds[i]]))); // phase in radians

                        // Possible improvement: Refinement of phase values here...
                    }
                } else {
                    for (i = 0; i < numFrameSinusoids; i++) {
                        frameSins.sinusoids[i] = new Sinusoid((float) frameDftAbs[freqInds[i]], // amp in linear scale
                                (float) ((Math.PI * fiwa.freqIndsRefined[i]) / maxFreqInd), // freq in radians
                                // (float)((0.5*fs*freqIndsRefined[i])/maxFreqInd), //freq in Hz
                                (float) (Math.atan2(frameDft.imag[freqInds[i]], frameDft.real[freqInds[i]]))); // phase in radians
                    }
                }

                // For visualization purposes:
                if (isOutputToTextFile) {
                    // FileUtils.writeToTextFile(excDftAbs, "d:/out_exc.txt");
                    FileUtils.writeToTextFile(vocalTractSpec, "d:/out_vt.txt");
                    FileUtils.writeToTextFile(frameDftAbs, "d:/out_spec.txt");
                }
                //

                double[] vocalTractPhase = null;
                if (vocalTractSpec != null) {
                    double[] tmpSpec = null;

                    // tmpSpec = SignalProcUtils.cepstralSmoothedSpectrumInNeper(frm, fftSize, lifterOrder); //Use cepstral
                    // processing to find minimum phase system response

                    // Use LP spectrum to find a minimum phase system response
                    tmpSpec = new double[params.fftSize];
                    System.arraycopy(vocalTractSpec, 0, tmpSpec, 0, vocalTractSpec.length);
                    // Mirror the half spectrum to get a full, symmetric spectrum before taking the log.
                    for (i = params.fftSize - 1; i >= vocalTractSpec.length; i--)
                        tmpSpec[i] = tmpSpec[params.fftSize - i];
                    tmpSpec = MathUtils.amp2neper(tmpSpec);
                    //

                    // MaryUtils.plot(vocalTractSpec);
                    // MaryUtils.plot(tmpSpec);

                    double[] tmpPhase = CepstrumSpeechAnalyser.minimumPhaseResponseInRadians(tmpSpec);
                    vocalTractPhase = new double[params.fftSize / 2 + 1];
                    System.arraycopy(tmpPhase, 0, vocalTractPhase, 0, vocalTractPhase.length);

                    // Arrays.fill(vocalTractSpec, 1.0); //Set system amps to one for testing purposes
                    // Arrays.fill(vocalTractPhase, 0.0); //Set system phase to zero for testing purposes

                    frameSins.setSystemAmps(vocalTractSpec);
                    frameSins.setSystemPhases(vocalTractPhase);
                    frameSins.setFrameDfts(frameDft);

                    // This is from Van Santen´s et.al.´s book - Chapter 5 (van Santen, et.
al., Progress in Speech Synthesis) float[] ceps = SignalProcUtils.specLinear2cepstrum(vocalTractSpec, 32); frameSins.setSystemCeps(ceps); // } // } // } if (frameSins != null) { // frameSins.voicing = (float)SignalProcUtils.getVoicingProbability(windowedFrm, params.fs); frameSins.voicing = isVoiced ? 1.0f : 0.0f; frameSins.maxFreqOfVoicing = SignalProcUtils.hz2radian(maxVoicingFreqInHz, params.fs); } return frameSins; } // Refine peak detection to get more accurate frequency and amplitude values as described in (Smith III and Serra, 1985)(*) // // (*) Julius O. Smith III and Xavier Serra, 1985, "PARSHL: An Analysis/Synthesis Program for Non-Harmonic Sounds // Based on a Sinusoidal Representation", Technical Report, Stanford University, CCRMA STAN-M-43. // // The basic idea is to fit a parabola to each of the peak detected by simple peak picking from the dB spectrum // The previous and next frequency bin is used along with each peak to fit the parabola // Then, the peak of the parabola is returned as the peak amplitude estimate and // its location as a floating point frequency index for refined peak location // // Parameters: // powSpecdB: Power spectrum estimate in dB // freqInds: Peak locations (frequency bins) which we want to refine // freqIndsRefined: (OUTPUT) - Refined peak locations as floating point frequency bins // ampsRefined: (OUTPUT) - Refined peak amplitude estimates corresponding to the peak value of the parabola fit to each // amplitude triplet public static FreqIndicesWithAmplitudes refinePeakEstimatesParabola(double[] powSpecdB, int[] freqInds) { double alpha, beta, gamma, p; FreqIndicesWithAmplitudes fiwa = new FreqIndicesWithAmplitudes(freqInds.length); for (int i = 0; i < freqInds.length; i++) { // Make sure the peak is not at the first or last freq bin if (freqInds[i] > 0 && freqInds[i] < freqInds.length - 1) { alpha = powSpecdB[freqInds[i] - 1]; beta = powSpecdB[freqInds[i]]; gamma = powSpecdB[freqInds[i] + 1]; p = 0.5 * (alpha - gamma) / 
(alpha - 2 * beta + gamma); fiwa.freqIndsRefined[i] = (float) (freqInds[i] + p); fiwa.ampsRefined[i] = (float) (beta - 0.25 * p * (alpha - gamma)); // fiwa.ampsRefined[i] = (float)((p*p*(alpha-beta)+p*2*(alpha-2*beta)-beta)/(2*(alpha-beta))); } else // otherwise do not refine { fiwa.freqIndsRefined[i] = (float) freqInds[i]; fiwa.ampsRefined[i] = (float) powSpecdB[freqInds[i]]; } } return fiwa; } // Need to check this function, does not work properly! // Further refine peak detection to get more accurate frequency and amplitude values as described in (Abe and Smith III, // 2004)(**) // // (**) Mototsugu Abe and Julius O. Smith III, 2004, "CQIFFT: Correcting Bias in a Sinusoidal Parameter Estimator based // on Quadratic Interpolation of FFT Magnitude Peaks", Technical Report, Center for Computer Research in Music // and Acoustics, Department of Music, Stanford University, STAN-M-117. // // The basic idea is to measure and correct the window-dependent bias of the quadratic refinement method in the previous // function // // Parameters: // powSpecdB: Power spectrum estimate in dB // freqInds: Peak locations (frequency bins) which we want to refine // freqIndsRefined: (OUTPUT) - Refined peak locations as floating point frequency bins // ampsRefined: (OUTPUT) - Refined peak amplitude estimates corresponding to the peak value of the parabola fit to each // amplitude triplet public static FreqIndicesWithAmplitudes refinePeakEstimatesBias(double[] powSpecdB, float[] freqInds, int windowType) { FreqIndicesWithAmplitudes fiwa = new FreqIndicesWithAmplitudes(freqInds.length); double delHat, Zpf, ZpA; double[] c = new double[4]; // The Zp values are for a max bias of 0.01% in frequency and amplitude as given in Table 3 (Abe and Smith III, 2004) switch (windowType) { case Window.HANNING: Zpf = 1.5; ZpA = 1.9; c[0] = 0.247560; c[1] = 0.084372; c[2] = -0.090608; c[3] = -0.055781; break; case Window.HAMMING: Zpf = 1.5; ZpA = 2.0; c[0] = 0.256498; c[1] = 0.075977; c[2] = -0.116927; 
c[3] = -0.062882; break; case Window.BLACKMAN: Zpf = 1.2; ZpA = 1.7; c[0] = 0.124188; c[1] = 0.013752; c[2] = -0.038073; c[3] = -0.006195; break; default: // These are for rectangular window in fact Zpf = 2.9; ZpA = 3.5; c[0] = 1.279369; c[1] = 1.756245; c[2] = -1.173273; c[3] = -3.241966; } double EZpf = c[0] * Math.pow(Zpf, -2.0) + c[1] * Math.pow(Zpf, -4.0); double nZpA = c[2] * Math.pow(ZpA, -4.0) + c[3] * Math.pow(ZpA, -6.0); for (int i = 0; i < freqInds.length; i++) { delHat = fiwa.freqIndsRefined[i] - freqInds[i]; fiwa.freqIndsRefined[i] = fiwa.freqIndsRefined[i] + (float) (delHat + EZpf * (delHat - 0.5) * (delHat + 0.5) * delHat); fiwa.ampsRefined[i] = (float) (fiwa.ampsRefined[i] + nZpA * delHat * delHat); } return fiwa; } // References: Auger, F. and Flandrin, P., 1995, // "Improving the readability of time-frequency and time-scale representations by the reassignment method", // in IEEE Trans. on Signal Proc., Vol. 43, Issue 5, May 1995, pp. 1068-1089. // Borum, S. and Jensen, K., 1999, "Additive analysis/synthesis using analytically derived windows", // in Proc. 
of the 2nd COST G-6 Workshop on Digital Audio Effects (DAFx-99), NTNU, Trondheim, public static float[] refineBySpectralReassignment(ComplexArray windowedFrameFFT, ComplexArray windowedFrameDerivativeFFT, float[] freqInds, int fftSize, int fs) { float[] freqIndsRefined = null; if (freqInds != null) { freqIndsRefined = new float[freqInds.length]; if (windowedFrameFFT != null && windowedFrameDerivativeFFT != null) { int km; double f0InRadians, f0RefinedInRadians, f0RefinedInHz, f0RefinedInd; int maxFreqInd = (int) (fftSize / 2 + 1) - 1; for (int i = 0; i < freqInds.length; i++) { km = (int) Math.floor(freqInds[i] + 0.5); f0InRadians = SignalProcUtils.hz2radian(SignalProcUtils.index2freq(km, fs, maxFreqInd), fs); f0RefinedInRadians = f0InRadians - (windowedFrameFFT.real[km] * windowedFrameDerivativeFFT.imag[km] - windowedFrameFFT.imag[km] * windowedFrameDerivativeFFT.real[km]) / (windowedFrameFFT.real[km] * windowedFrameFFT.real[km] + windowedFrameFFT.imag[km] * windowedFrameFFT.imag[km]) / MathUtils.TWOPI; f0RefinedInHz = SignalProcUtils.radian2hz(f0RefinedInRadians, fs); freqIndsRefined[i] = (float) SignalProcUtils.freq2indexDouble(f0RefinedInHz, fs, maxFreqInd); } } else { freqIndsRefined = new float[freqInds.length]; System.arraycopy(freqInds, 0, freqIndsRefined, 0, freqInds.length); } } return freqIndsRefined; } public static void main(String[] args) throws UnsupportedAudioFileException, IOException { AudioInputStream inputAudio = AudioSystem.getAudioInputStream(new File(args[0])); int samplingRate = (int) inputAudio.getFormat().getSampleRate(); AudioDoubleDataSource signal = new AudioDoubleDataSource(inputAudio); double[] x = signal.getAllData(); double startFreqInHz = 0.0; double endFreqInHz = 0.5 * samplingRate; int windowType = Window.HAMMING; boolean bRefinePeakEstimatesParabola = false; boolean bRefinePeakEstimatesBias = false; boolean bSpectralReassignment = true; boolean bAdjustNeighFreqDependent = false; SinusoidalAnalysisParams params = new 
SinusoidalAnalysisParams(samplingRate, startFreqInHz, endFreqInHz, windowType, bRefinePeakEstimatesParabola, bRefinePeakEstimatesBias, bSpectralReassignment, bAdjustNeighFreqDependent); SinusoidalAnalyzer sa = new SinusoidalAnalyzer(params); float winSizeInSeconds = 0.020f; float skipSizeInSeconds = 0.010f; float deltaInHz = 50.0f; int spectralEnvelopeType = SinusoidalAnalysisParams.SEEVOC_SPEC; String strPitchFile = args[0].substring(0, args[0].length() - 4) + ".ptc"; PitchReaderWriter f0 = new PitchReaderWriter(strPitchFile); float ws_f0 = (float) f0.header.windowSizeInSeconds; float ss_f0 = (float) f0.header.skipSizeInSeconds; SinusoidalTracks st = sa.analyzeFixedRate(x, winSizeInSeconds, skipSizeInSeconds, deltaInHz, spectralEnvelopeType, f0.contour, ws_f0, ss_f0); } }