// BaseSinusoidalPitchTracker.java example
//
// Explorer
// marytts-master
/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.signalproc.sinusoidal.pitch;

import marytts.signalproc.sinusoidal.NonharmonicSinusoidalSpeechFrame;
import marytts.signalproc.sinusoidal.NonharmonicSinusoidalSpeechSignal;
import marytts.util.math.MathUtils;

/**
 * Sinusoidal model based pitch tracker.
 * <p>
 * For each analysis frame, a grid of equally spaced f0 candidates is scored with
 * {@link #performanceCriterion(NonharmonicSinusoidalSpeechFrame, float, int)} and the best peak of the resulting score
 * curve is reported as the frame's f0. The scoring function in this base class is a placeholder that returns a constant;
 * derived classes are expected to override it.
 * 
 * @author Oytun Türk
 * 
 */
public class BaseSinusoidalPitchTracker {
	/** f0 values above this value (in Hz) count as voiced, values at or below it as unvoiced. */
	private static final float VOICING_THRESHOLD_HZ = 10.0f;

	/** Maximum distance (in Hz) from half/twice the context average for a frame to count as an octave error. */
	private static final float OCTAVE_ERROR_TOLERANCE_HZ = 10.0f;

	/** Frequency span (in Hz) that is converted into the neighbourhood size handed to the peak picker. */
	private static final float PEAK_NEIGHBOURHOOD_HZ = 10.0f;

	/** Frames whose best score is below this value are reported with f0 = 0. */
	private static final double MIN_VOICED_Q = 5.0e-5;

	/** Number of frames on each side of a frame that form its context in the halving/doubling check. */
	private static final int CONTEXT_COUNT = 1;

	float[] f0s; // f0 values in Hz, one per frame, as produced by the last call to pitchTrack
	double[] Qs; // Performance measure for each frame, as produced by the last call to pitchTrack

	/**
	 * Result of analysing a single frame: the selected f0 and the score it obtained.
	 */
	public class F0Value {
		/** Selected f0 in Hz; 0 when no acceptable candidate was found. */
		public float f0;
		/** Score of the selected candidate. */
		public double maxQ;

		public F0Value() {
			f0 = 0.0f;
			maxQ = 0.0;
		}
	}

	public BaseSinusoidalPitchTracker() {

	}

	/**
	 * Estimates an f0 value for every frame of a sinusoidal analysis result and post-processes the resulting track.
	 * <p>
	 * The raw per-frame scores are kept in {@code Qs} and the returned (post-processed) track in {@code f0s}.
	 * 
	 * @param sinSignal
	 *            sinusoidal analysis result; its {@code framesSins} array supplies the frames
	 * @param samplingRate
	 *            sampling rate, passed through unchanged to {@link #performanceCriterion}
	 * @param searchStepInHz
	 *            spacing of the f0 candidates in Hz, must be positive
	 * @param minFreqInHz
	 *            lowest f0 candidate in Hz
	 * @param maxFreqInHz
	 *            upper end of the candidate range in Hz
	 * @return one f0 value per frame, or {@code null} if {@code sinSignal} is {@code null} or contains no frames
	 * @throws IllegalArgumentException
	 *             if the search parameters do not describe a valid candidate grid (see
	 *             {@link #pitchAnalyzeFrame(NonharmonicSinusoidalSpeechFrame, int, float, float, float)})
	 */
	public float[] pitchTrack(NonharmonicSinusoidalSpeechSignal sinSignal, int samplingRate, float searchStepInHz,
			float minFreqInHz, float maxFreqInHz) {
		f0s = null;
		Qs = null;

		// A missing signal is handled like an empty one instead of failing with a NullPointerException.
		if (sinSignal == null || sinSignal.framesSins == null || sinSignal.framesSins.length == 0) {
			return null;
		}

		final int numFrames = sinSignal.framesSins.length;
		f0s = new float[numFrames];
		Qs = new double[numFrames];

		for (int i = 0; i < numFrames; i++) {
			F0Value v = pitchAnalyzeFrame(sinSignal.framesSins[i], samplingRate, searchStepInHz, minFreqInHz, maxFreqInHz);
			f0s[i] = v.f0;
			Qs[i] = v.maxQ;
		}

		f0s = postProcessTrack(f0s, Qs);

		return f0s;
	}

	/**
	 * Estimates the f0 of a single frame.
	 * <p>
	 * Candidates {@code minFreqInHz + i * searchStepInHz} are scored with {@link #performanceCriterion}. Among the
	 * extrema reported by {@code MathUtils.getExtrema} the one with the highest score is selected. If no extrema are
	 * reported, {@code maxQ} is the overall maximum score and {@code f0} stays 0. If the final {@code maxQ} is below
	 * 5.0e-5, {@code f0} is set to 0.
	 * 
	 * @param sinFrame
	 *            frame to analyse; for {@code null} a result with {@code f0 = 0} and {@code maxQ = 0} is returned
	 * @param samplingRate
	 *            sampling rate, passed through unchanged to {@link #performanceCriterion}
	 * @param searchStepInHz
	 *            spacing of the f0 candidates in Hz, must be positive
	 * @param minFreqInHz
	 *            lowest f0 candidate in Hz
	 * @param maxFreqInHz
	 *            upper end of the candidate range in Hz
	 * @return the selected f0 and its score
	 * @throws IllegalArgumentException
	 *             if {@code sinFrame} is not {@code null} and {@code searchStepInHz} is not positive, or the parameters
	 *             yield no candidate at all or more candidates than an array can hold
	 */
	public F0Value pitchAnalyzeFrame(NonharmonicSinusoidalSpeechFrame sinFrame, int samplingRate, float searchStepInHz,
			float minFreqInHz, float maxFreqInHz) {
		F0Value v = new F0Value();

		if (sinFrame != null) {
			double[] Q = computeCandidateScores(sinFrame, samplingRate, searchStepInHz, minFreqInHz, maxFreqInHz);

			// Search for distinct peaks in the Q-function
			v.maxQ = MathUtils.getMax(Q);
			int numNeighs = (int) Math.floor(PEAK_NEIGHBOURHOOD_HZ / searchStepInHz + 0.5);
			// NOTE(review): the last argument is presumably a minimum peak height (10% of the global maximum) and
			// numNeighs the neighbourhood size on either side -- confirm against MathUtils.getExtrema.
			int[] maxInds = MathUtils.getExtrema(Q, numNeighs, numNeighs, true, 0, Q.length - 1, 0.1 * v.maxQ);

			// Guard against an empty result as well as null, so that maxInds[0] is always a valid access.
			if (maxInds != null && maxInds.length > 0) {
				int bestCandidate = maxInds[0];
				for (int i = 1; i < maxInds.length; i++) {
					// Strict comparison: on ties the first reported extremum wins.
					if (Q[maxInds[i]] > Q[bestCandidate]) {
						bestCandidate = maxInds[i];
					}
				}

				v.maxQ = Q[bestCandidate];
				v.f0 = bestCandidate * searchStepInHz + minFreqInHz;
			}
		}

		if (v.maxQ < MIN_VOICED_Q) {
			v.f0 = 0.0f;
		}

		return v;
	}

	/**
	 * Scores every f0 candidate of the search grid for one frame.
	 * 
	 * @return one score per candidate; candidate {@code i} corresponds to {@code minFreqInHz + i * searchStepInHz}
	 * @throws IllegalArgumentException
	 *             if the parameters do not describe a valid, non-empty candidate grid
	 */
	private double[] computeCandidateScores(NonharmonicSinusoidalSpeechFrame sinFrame, int samplingRate,
			float searchStepInHz, float minFreqInHz, float maxFreqInHz) {
		// Written as a negated comparison so that NaN is rejected as well.
		if (!(searchStepInHz > 0.0f)) {
			throw new IllegalArgumentException("searchStepInHz must be positive, but was " + searchStepInHz);
		}

		// Check the count before narrowing to int: the cast would silently turn NaN into 0 and infinity into
		// Integer.MAX_VALUE, and a negative count would fail with NegativeArraySizeException.
		double candidateCount = Math.floor((maxFreqInHz - minFreqInHz) / searchStepInHz + 1 + 0.5);
		if (!(candidateCount >= 1.0 && candidateCount <= Integer.MAX_VALUE)) {
			throw new IllegalArgumentException("Invalid f0 search range: minFreqInHz=" + minFreqInHz + ", maxFreqInHz="
					+ maxFreqInHz + ", searchStepInHz=" + searchStepInHz);
		}

		double[] Q = new double[(int) candidateCount];
		for (int i = 0; i < Q.length; i++) {
			float w0 = i * searchStepInHz + minFreqInHz;
			Q[i] = performanceCriterion(sinFrame, w0, samplingRate);
		}

		return Q;
	}

	/**
	 * Scores a single f0 candidate for a frame.
	 * <p>
	 * Baseline version that does nothing and always returns -1; implement functionality in derived classes.
	 * 
	 * @param sinFrame
	 *            frame being analysed
	 * @param f0Candidate
	 *            f0 candidate in Hz
	 * @param samplingRate
	 *            sampling rate as given to the tracker
	 * @return score of the candidate; {@link #pitchAnalyzeFrame} selects the candidate with the highest score
	 */
	public double performanceCriterion(NonharmonicSinusoidalSpeechFrame sinFrame, float f0Candidate, int samplingRate) {
		return -1.0f;
	}

	/**
	 * Post-processes f0 values to eliminate obvious pitch halving errors.
	 * <p>
	 * Note that comb filter based sinusoidal pitch tracker solves pitch doubling automatically, but there is always a
	 * pitch halving possibility. This function tries to eliminate obvious halving errors, i.e. isolated f0 values that
	 * are approximately half of the neighboring f0 values. It also checks for isolated voiced or unvoiced f0 values and
	 * tries to correct them.
	 * <p>
	 * Both passes run front to back on the output array, so a corrected frame already serves as context for the frames
	 * that follow it. The input array is not modified.
	 * 
	 * @param f0sIn
	 *            f0 values in Hz, one per frame
	 * @param QsIn
	 *            per-frame scores; currently not used
	 * @return a corrected copy of {@code f0sIn}, or {@code null} if {@code f0sIn} is {@code null}
	 */
	public float[] postProcessTrack(float[] f0sIn, double[] QsIn) {
		if (f0sIn == null) {
			return null;
		}

		final int numFrames = f0sIn.length;
		float[] f0sOut = f0sIn.clone();

		// Search for isolated unvoiceds & voiceds:
		for (int i = 1; i < numFrames - 1; i++) {
			float previous = f0sOut[i - 1];
			float next = f0sOut[i + 1];

			if (isUnvoiced(f0sOut[i]) && isVoiced(previous) && isVoiced(next)) {
				// Isolated unvoiced frame: bridge the gap with the mean of its neighbours
				f0sOut[i] = 0.5f * (previous + next);
			} else if (isVoiced(f0sOut[i]) && isUnvoiced(previous) && isUnvoiced(next)) {
				// Isolated voiced frame: drop it
				f0sOut[i] = 0.0f;
			}
		}

		// Search for isolated halvings & doublings:
		// (Doubling check should not be necessary for sinusoidal based pitch trackers!)
		for (int i = CONTEXT_COUNT; i < numFrames - CONTEXT_COUNT; i++) {
			if (containsUnvoiced(f0sOut, i - CONTEXT_COUNT, i + CONTEXT_COUNT)) {
				continue;
			}

			// Average of the context frames only; the frame under test is left out.
			float avgContextF0 = 0.0f;
			for (int j = -CONTEXT_COUNT; j < 0; j++) {
				avgContextF0 += f0sOut[i + j];
			}
			for (int j = 1; j <= CONTEXT_COUNT; j++) {
				avgContextF0 += f0sOut[i + j];
			}
			avgContextF0 = avgContextF0 / (2.0f * CONTEXT_COUNT);

			if (Math.abs(f0sOut[i] - 0.5 * avgContextF0) < OCTAVE_ERROR_TOLERANCE_HZ) {
				// Isolated halving
				f0sOut[i] *= 2.0f;
			} else if (Math.abs(f0sOut[i] - 2.0 * avgContextF0) < OCTAVE_ERROR_TOLERANCE_HZ) {
				// Isolated doubling
				f0sOut[i] *= 0.5f;
			}
		}

		return f0sOut;
	}

	/** Returns true if the f0 value lies above the voicing threshold. */
	private static boolean isVoiced(float f0) {
		return f0 > VOICING_THRESHOLD_HZ;
	}

	/**
	 * Returns true if the f0 value lies at or below the voicing threshold. Kept separate from {@link #isVoiced(float)}
	 * because NaN is neither voiced nor unvoiced under these comparisons.
	 */
	private static boolean isUnvoiced(float f0) {
		return f0 <= VOICING_THRESHOLD_HZ;
	}

	/** Returns true if any value in {@code f0Values[from..to]} (both inclusive) is unvoiced. */
	private static boolean containsUnvoiced(float[] f0Values, int from, int to) {
		for (int k = from; k <= to; k++) {
			if (isUnvoiced(f0Values[k])) {
				return true;
			}
		}
		return false;
	}
}