/**
* Copyright 2007 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.signalproc.analysis;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Random;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.UnsupportedAudioFileException;
import marytts.signalproc.display.FunctionGraph;
import marytts.signalproc.filter.BandPassFilter;
import marytts.signalproc.filter.FIRFilter;
import marytts.signalproc.filter.LowPassFilter;
import marytts.util.data.DoubleDataSource;
import marytts.util.data.audio.AudioDoubleDataSource;
import marytts.util.io.FileUtils;
import marytts.util.math.MathUtils;
import marytts.util.signal.SignalProcUtils;
import marytts.util.string.StringUtils;
/**
* Autocorrelation based F0 tracker with heuristic rules based on statistics for smoothing and halving/doubling prevention
*
* @author Oytun Türk
*/
public class F0TrackerAutocorrelationHeuristic {
    /**
     * F0 contour produced by the most recent analysis, one value per frame (sampling rate divided by the detected period
     * in samples; 0.0 marks an unvoiced frame). It is <code>null</code> before any analysis has run and when the signal
     * is too short to hold a single analysis frame.
     */
    public double[] f0s;

    protected PitchFileHeader params; // Pitch detection parameters
    protected int totalVoicedFrames; // Total number of voiced frames
    protected double[] voicingProbabilities; // Probability of voicing for each frame
    protected int minT0Index; // Minimum period length in samples (i.e. corresponding to maximum f0)
    protected int maxT0Index; // Maximum period length in samples (i.e. corresponding to minimum f0)
    protected double[] prevF0s;
    protected double[] voicedF0s; // Voiced frames' f0 values
    protected double longTermAverageF0; // Long term average f0 in voiced frames
    protected double shortTermAverageF0; // Short term average f0 in voiced frames

    public static double MAX_SAMPLE = 32767.0; // Max 16-bit absolute sample value
    public static double MINIMUM_SPEECH_ENERGY = 50.0; // Minimum average sample energy for detecting unvoiced parts

    protected double averageSampleEnergy; // Keeps average sample energy for the current analysis frame

    // The following are used in internal computations only and are not accessible for the user
    private double[] pitchFrm; // A buffer for analysis speech frames
    private int frameIndex; // Current frame index
    private int ws; // Window size in samples
    private int ss; // Skip size in samples

    /**
     * Analyse the given wav file with default parameters and write the contour to a pitch file that has the same name as
     * the wav file but the extension "ptc".
     *
     * @param wavFile
     *            path of the wav file to analyse
     * @throws Exception
     *             if the wav file does not exist, or if reading the audio or writing the pitch file fails
     */
    public F0TrackerAutocorrelationHeuristic(String wavFile) throws Exception {
        if (!FileUtils.exists(wavFile))
            throw new Exception("Wav file not found!");
        initAndAnalyze(wavFile, StringUtils.modifyExtension(wavFile, "ptc"));
    }

    /**
     * Analyse the given wav file with default parameters and write the contour to the given pitch file.
     *
     * @param wavFile
     *            path of the wav file to analyse
     * @param ptcFile
     *            path of the pitch file to write; nothing is written when this is <code>null</code>
     * @throws Exception
     *             if the wav file does not exist, or if reading the audio or writing the pitch file fails
     */
    public F0TrackerAutocorrelationHeuristic(String wavFile, String ptcFile) throws Exception {
        if (!FileUtils.exists(wavFile))
            throw new Exception("Wav file not found!");
        initAndAnalyze(wavFile, ptcFile);
    }

    /**
     * Create a tracker that uses a copy of the given parameters. No analysis is run.
     *
     * @param paramsIn
     *            pitch detection parameters
     */
    public F0TrackerAutocorrelationHeuristic(PitchFileHeader paramsIn) {
        params = new PitchFileHeader(paramsIn);
        init();
    }

    /**
     * Shared body of the wav-file constructors: set up default parameters and run the analysis. Audio and I/O failures
     * are propagated to the caller instead of being printed and dropped, so a failed analysis can no longer yield a
     * seemingly valid tracker without a contour.
     */
    private void initAndAnalyze(String wavFile, String ptcFile) throws UnsupportedAudioFileException, IOException {
        params = new PitchFileHeader();
        init();
        pitchAnalyzeWavFile(wavFile, ptcFile);
    }

    /**
     * Reset all tracking state and derive the frame geometry (window size, skip size and the searched period range, all
     * in samples) from the current parameters. It is called at the start of every analysis.
     */
    public void init() {
        // Freshly allocated double arrays are zero-filled, so no explicit clearing is needed.
        voicingProbabilities = new double[2];
        prevF0s = new double[5];
        voicedF0s = new double[20];

        // The voiced-frame counter is tracking state too; without this reset it would leak into the next analysis.
        totalVoicedFrames = 0;

        longTermAverageF0 = 0.5 * (params.maximumF0 + params.minimumF0);
        shortTermAverageF0 = longTermAverageF0;

        frameIndex = 0;

        ws = (int) Math.floor(params.windowSizeInSeconds * params.fs + 0.5);
        ss = (int) Math.floor(params.skipSizeInSeconds * params.fs + 0.5);
        pitchFrm = new double[ws];

        minT0Index = (int) Math.floor(params.fs / params.maximumF0 + 0.5);
        maxT0Index = (int) Math.floor(params.fs / params.minimumF0 + 0.5);

        // Keep the searched lags inside the analysis window and correctly ordered.
        if (minT0Index < 0)
            minT0Index = 0;
        if (minT0Index > ws - 1)
            minT0Index = ws - 1;
        if (maxT0Index < minT0Index)
            maxT0Index = minT0Index;
        if (maxT0Index > ws - 1)
            maxT0Index = ws - 1;
    }

    /**
     * Analyse a wav file without writing a pitch file.
     *
     * @param wavFileIn
     *            path of the wav file to analyse
     * @return the contour together with a copy of the analysis parameters
     * @throws UnsupportedAudioFileException
     *             if the audio format of the file is not supported
     * @throws IOException
     *             if the file cannot be read
     */
    public PitchReaderWriter pitchAnalyzeWavFile(String wavFileIn) throws UnsupportedAudioFileException, IOException {
        return pitchAnalyzeWavFile(wavFileIn, null);
    }

    /**
     * Analyse a wav file and optionally write the resulting contour to a pitch file.
     *
     * @param wavFileIn
     *            path of the wav file to analyse
     * @param ptcFileOut
     *            path of the pitch file to write, or <code>null</code> to skip writing; nothing is written either when
     *            the analysis produced no contour
     * @return the contour together with a copy of the analysis parameters
     * @throws UnsupportedAudioFileException
     *             if the audio format of the file is not supported
     * @throws IOException
     *             if the wav file cannot be read or the pitch file cannot be written
     */
    public PitchReaderWriter pitchAnalyzeWavFile(String wavFileIn, String ptcFileOut) throws UnsupportedAudioFileException,
            IOException {
        PitchReaderWriter f0 = new PitchReaderWriter();

        pitchAnalyzeWav(wavFileIn);

        if (f0s != null) {
            params.numfrm = f0s.length;
            if (ptcFileOut != null)
                PitchReaderWriter.write_pitch_file(ptcFileOut, f0s, (float) (params.windowSizeInSeconds),
                        (float) (params.skipSizeInSeconds), params.fs);
        } else
            params.numfrm = 0;

        f0.header = new PitchFileHeader(params);
        f0.setContour(f0s);

        return f0;
    }

    /**
     * Read a wav file, take over its sampling rate and analyse its f0 contour. The result is available in {@link #f0s}.
     *
     * @param wavFile
     *            path of the wav file to analyse
     * @throws UnsupportedAudioFileException
     *             if the audio format of the file is not supported
     * @throws IOException
     *             if the file cannot be read
     */
    public void pitchAnalyzeWav(String wavFile) throws UnsupportedAudioFileException, IOException {
        AudioInputStream inputAudio = AudioSystem.getAudioInputStream(new File(wavFile));
        try {
            params.fs = (int) inputAudio.getFormat().getSampleRate();
            AudioDoubleDataSource signal = new AudioDoubleDataSource(inputAudio);
            // pitchAnalyze() pulls the complete signal via getAllData(), so the stream can be released afterwards.
            pitchAnalyze(signal);
        } finally {
            inputAudio.close();
        }
    }

    /**
     * Analyse the f0 contour of the given audio signal.
     *
     * @param signal
     *            signal
     */
    public void pitchAnalyze(DoubleDataSource signal) {
        pitchAnalyze(signal.getAllData());
        if (f0s != null)
            params.numfrm = f0s.length;
        else
            params.numfrm = 0;
    }

    /**
     * Frame-wise f0 analysis of a complete signal. Fills {@link #f0s} with one value per frame, or leaves it
     * <code>null</code> when not even one frame fits into the signal.
     *
     * @param x
     *            signal samples
     */
    private void pitchAnalyze(double[] x) {
        init();

        if (params.cutOff1 > 0.0 || params.cutOff2 > 0.0) {
            FIRFilter f = null;
            if (params.cutOff2 <= 0.0)
                f = new LowPassFilter(params.cutOff1 / params.fs);
            else
                f = new BandPassFilter(params.cutOff1 / params.fs, params.cutOff2 / params.fs);

            // NOTE(review): the return value of apply() is ignored; confirm that FIRFilter.apply(double[]) filters x in
            // place, otherwise the filtered signal is discarded here.
            if (f != null)
                f.apply(x);
        }

        f0s = null;

        // A zero skip size would turn the frame count below into an infinite value (Integer.MAX_VALUE after the cast).
        if (ss <= 0 || ws <= 0)
            return;

        int numfrm = (int) Math.floor(((double) x.length - ws) / ss + 0.5);
        if (numfrm <= 0)
            return;

        double maxSample = MathUtils.getAbsMax(x);

        f0s = new double[numfrm]; // zero-filled, i.e. every frame starts out as unvoiced
        frameIndex = 0;

        // Without a positive normalisation divisor (e.g. digital silence) the scaling below would produce NaN samples
        // and the frame analysis would report an infinite f0, so the contour is left completely unvoiced instead.
        if (!(maxSample > 0.0))
            return;

        Random random = new Random();
        for (int i = 0; i < numfrm; i++) {
            System.arraycopy(x, i * ss, pitchFrm, 0, Math.min(ws, x.length - i * ss));

            // Scale to the 16-bit range that MINIMUM_SPEECH_ENERGY refers to and add a negligible amount of noise.
            for (int j = 0; j < ws; j++)
                pitchFrm[j] = (pitchFrm[j] / maxSample) * MAX_SAMPLE + 1e-50 * random.nextDouble();

            f0s[i] = pitchFrameAutocorrelation(pitchFrm);

            frameIndex++;
        }
    }

    /**
     * Estimate the f0 of one analysis frame from the maximum of its autocorrelation within the configured period range,
     * then apply the voicing decision and the doubling/halving heuristics. Updates the tracker state (voicing history,
     * running f0 averages, voiced-frame counter).
     *
     * @param frmIn
     *            the analysis frame; must have the same length as the internal frame buffer
     * @return the f0 estimate for this frame, or 0.0 if the frame is considered unvoiced
     */
    private double pitchFrameAutocorrelation(double[] frmIn) {
        assert pitchFrm.length == frmIn.length;

        // NOTE(review): this copies the internal buffer into the argument, not the other way round. The only caller in
        // this class passes pitchFrm itself, so it is a self-copy; kept unchanged.
        System.arraycopy(pitchFrm, 0, frmIn, 0, frmIn.length);

        // The energy is measured before centre clipping changes the samples.
        averageSampleEnergy = SignalProcUtils.getAverageSampleEnergy(pitchFrm);

        if (params.centerClippingRatio > 0.0)
            SignalProcUtils.centerClip(pitchFrm, params.centerClippingRatio);

        // Zero-lag autocorrelation, used to normalise the peak value.
        double r0 = 0.0;
        for (int i = 0; i < pitchFrm.length; i++)
            r0 += pitchFrm[i] * pitchFrm[i];

        // Find the lag with the largest autocorrelation inside the admissible period range.
        int maxIndex = 0;
        double maxR = -1.0e10;
        for (int lag = minT0Index; lag <= maxT0Index; lag++) {
            double r = 0.0;
            for (int j = 0; j < pitchFrm.length - lag; j++)
                r += pitchFrm[j] * pitchFrm[j + lag];

            if (r > maxR) {
                maxIndex = lag;
                maxR = r;
            }
        }

        // A maximum at either end of the search range is not accepted as a pitch peak.
        double probabilityOfVoicing;
        if (maxIndex == minT0Index || maxIndex == maxT0Index)
            probabilityOfVoicing = 0.0;
        else
            probabilityOfVoicing = maxR / r0;

        // NaN compares false against every threshold and would slip through the unvoiced test below.
        if (Double.isNaN(probabilityOfVoicing))
            probabilityOfVoicing = 0.0;

        // A lag of zero samples carries no period information; never divide by it.
        double f0 = maxIndex > 0 ? ((double) params.fs) / maxIndex : 0.0;

        // look at previous two frame voicing decision to correct F0 estimate
        if (probabilityOfVoicing > params.voicingThreshold) {
            if (voicingProbabilities[0] < params.voicingThreshold && voicingProbabilities[1] > params.voicingThreshold)
                voicingProbabilities[0] = params.voicingThreshold + 0.01;
        } else if (probabilityOfVoicing > params.voicingThreshold - 0.1) {
            // A near miss directly after two voiced frames is promoted to voiced.
            if (voicingProbabilities[0] > params.voicingThreshold && voicingProbabilities[1] > params.voicingThreshold)
                probabilityOfVoicing = params.voicingThreshold + 0.01;
        }

        if (probabilityOfVoicing < params.voicingThreshold)
            f0 = 0.0;

        if (averageSampleEnergy < MINIMUM_SPEECH_ENERGY)
            f0 = 0.0;

        // Shift the voicing history by one frame and store the current decision at the front.
        for (int i = voicingProbabilities.length - 1; i > 0; i--)
            voicingProbabilities[i] = voicingProbabilities[i - 1];
        voicingProbabilities[0] = probabilityOfVoicing;

        // Count every voiced frame exactly once, whether or not the doubling/halving checks are enabled.
        boolean voiced = f0 > 10.0;
        if (voiced)
            totalVoicedFrames++;

        // The doubling/halving correction only starts after more voiced frames than voicedF0s has entries, and only
        // when the whole voicing history (including the current frame) is above the threshold.
        if (voiced && (params.isDoublingCheck || params.isHalvingCheck) && totalVoicedFrames > voicedF0s.length) {
            boolean bNeighVoiced = true;
            for (int i = 0; i < voicingProbabilities.length; i++) {
                if (voicingProbabilities[i] < params.voicingThreshold) {
                    bNeighVoiced = false;
                    break;
                }
            }

            if (bNeighVoiced) {
                if (params.isDoublingCheck && f0 > 1.25 * longTermAverageF0 && f0 > 1.33 * shortTermAverageF0)
                    f0 *= 0.5;

                if (params.isHalvingCheck && f0 < 0.80 * longTermAverageF0 && f0 < 0.66 * shortTermAverageF0)
                    f0 *= 2.0;
            }
        }

        // Update the running averages with the (possibly corrected) estimate.
        if (f0 > 10.0) {
            longTermAverageF0 = 0.99 * longTermAverageF0 + 0.01 * f0;
            shortTermAverageF0 = 0.90 * shortTermAverageF0 + 0.10 * f0;
        }

        // Smooth the F0 contour both with a median and linear filter
        // NOTE(review): init() zeroes prevF0s and nothing in this class ever assigns prevF0s[3] or prevF0s[4], so
        // bAllVoiced cannot become true here and the smoothing below currently never runs. voicedF0s is not filled
        // anywhere in this class either, so its median would be 0. Left unchanged because enabling the smoothing
        // would alter every contour; confirm the intended history length before fixing.
        prevF0s[2] = f0;

        boolean bAllVoiced = true;
        for (int i = 0; i < prevF0s.length; i++) {
            if (prevF0s[i] < 10.0) {
                bAllVoiced = false;
                break;
            }
        }

        if (bAllVoiced) {
            f0 = MathUtils.median(prevF0s);

            double tmp = 0.5 * prevF0s[2] + 0.25 * prevF0s[1] + 0.25 * prevF0s[0];
            if (Math.abs(tmp - f0) < 10.0)
                f0 = tmp;

            prevF0s[0] = prevF0s[1];
            prevF0s[1] = prevF0s[2];

            if (totalVoicedFrames == voicedF0s.length) {
                longTermAverageF0 = MathUtils.median(voicedF0s);
                shortTermAverageF0 = longTermAverageF0;
            }
        }

        // System.out.println("Frame=" + String.valueOf(frameIndex) + " " + String.valueOf(averageSampleEnergy) + " " +
        // String.valueOf(probabilityOfVoicing) + " " + String.valueOf(f0));

        return f0;
    }

    /**
     * The frame shift time, in seconds.
     *
     * @return params.skipSizeInSeconds
     */
    public double getSkipSizeInSeconds() {
        return params.skipSizeInSeconds;
    }

    /**
     * The size of the analysis window, in seconds.
     *
     * @return params.windowSizeInSeconds
     */
    public double getWindowSizeInSeconds() {
        return params.windowSizeInSeconds;
    }

    /**
     * The contour of the most recent analysis.
     *
     * @return the f0 value of every frame, or <code>null</code> if no contour is available
     */
    public double[] getF0Contour() {
        return f0s;
    }

    /**
     * Analyse the wav file given as the first argument with default parameters and display its f0 curve.
     *
     * @param args
     *            args[0] is the path of the wav file
     * @throws Exception
     *             if the file cannot be read or its audio format is not supported
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: F0TrackerAutocorrelationHeuristic <wavFile>");
            return;
        }

        F0TrackerAutocorrelationHeuristic tracker = new F0TrackerAutocorrelationHeuristic(new PitchFileHeader());
        tracker.pitchAnalyzeWavFile(args[0]);

        if (tracker.f0s == null) {
            System.err.println("Signal is too short for pitch analysis: " + args[0]);
            return;
        }

        FunctionGraph f0Graph = new FunctionGraph(0, tracker.params.skipSizeInSeconds, tracker.f0s);
        f0Graph.showInJFrame("F0 curve for " + args[0], false, true);
    }
}