/**
* Copyright 2007 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.signalproc.analysis.distance;
import java.io.File;
import java.io.IOException;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.UnsupportedAudioFileException;
import marytts.signalproc.analysis.LpcAnalyser;
import marytts.signalproc.analysis.LsfAnalyser;
import marytts.signalproc.analysis.LpcAnalyser.LpCoeffs;
import marytts.signalproc.window.HammingWindow;
import marytts.util.data.audio.AudioDoubleDataSource;
import marytts.util.signal.SignalProcUtils;
/**
* A class for general purpose distance metrics to compare speech/audio signals objectively
*
* @author Oytun Türk
*/
public class SpectralDistanceMeasures {
public static double MAX_SPECTRAL_DISTANCE = 1e+10;
/*
* Inverse harmonic weighting based LSF distance Performs LP analysis by first preemphasizing and windowing speech frames
* <speechFrame1> and <speechFrame2> Then, computes LSFs for the two speech frames Finally, the perceptual distance between
* two lsf vectors are estimated. Note that the input speech frames are not changed during distance computation LP order is
* automatically decided depending on the sampling rate
*
* speechFrame1: First speech frame (not windowed) speechFrame2: Second speech frame (not windowed) samplingRate: Sampling
* rate in Hz
*/
public static double lsfDist(double[] speechFrame1, double[] speechFrame2, int samplingRate) {
return lsfDist(speechFrame1, speechFrame2, samplingRate, SignalProcUtils.getLPOrder(samplingRate));
}
/*
* Inverse harmonic weighting based LSF distance Performs LP analysis by first preemphasizing and windowing speech frames
* <speechFrame1> and <speechFrame2> Then, computes LSFs for the two speech frames Finally, the perceptual distance between
* two lsf vectors are estimated. Note that the input speech frames are not changed during distance computation
*
* speechFrame1: First speech frame (not windowed) speechFrame2: Second speech frame (not windowed) samplingRate: Sampling
* rate in Hz lpOrder: Desired LP analysis order
*/
public static double lsfDist(double[] speechFrame1, double[] speechFrame2, int samplingRate, int lpOrder) {
// Preemphasis
double[] windowedSpeechFrame1 = new double[speechFrame1.length];
System.arraycopy(speechFrame1, 0, windowedSpeechFrame1, 0, speechFrame1.length);
windowedSpeechFrame1 = SignalProcUtils.applyPreemphasis(windowedSpeechFrame1, 0.97);
double[] windowedSpeechFrame2 = new double[speechFrame2.length];
System.arraycopy(speechFrame2, 0, windowedSpeechFrame2, 0, speechFrame2.length);
windowedSpeechFrame2 = SignalProcUtils.applyPreemphasis(windowedSpeechFrame2, 0.97);
//
// Windowing
HammingWindow w1 = new HammingWindow(speechFrame1.length);
w1.apply(windowedSpeechFrame1, 0);
HammingWindow w2 = new HammingWindow(speechFrame2.length);
w2.apply(windowedSpeechFrame2, 0);
//
// LP analysis
LpCoeffs lpcs1 = LpcAnalyser.calcLPC(windowedSpeechFrame1, lpOrder);
LpCoeffs lpcs2 = LpcAnalyser.calcLPC(windowedSpeechFrame2, lpOrder);
//
// LPC to LSF conversion
double[] lsfs1 = LsfAnalyser.lpc2lsfInHz(lpcs1.getOneMinusA(), samplingRate);
double[] lsfs2 = LsfAnalyser.lpc2lsfInHz(lpcs2.getOneMinusA(), samplingRate);
//
return getLsfDist(lsfs1, lsfs2, samplingRate);
}
public static double getLsfDist(double[] lsfs1, double[] lsfs2, int samplingRate) {
double[] lsfWgt = getInverseHarmonicLSFWeights(lsfs1);
return getLsfDist(lsfs1, lsfs2, samplingRate, lsfWgt);
}
public static double getLsfDist(double[] lsfs1, double[] lsfs2, int samplingRate, double[] lsfWgt) {
assert lsfs1.length == lsfs2.length;
assert lsfs1.length == lsfWgt.length;
double dist = 0.0;
for (int i = 0; i < lsfs1.length; i++)
dist += 0.1 * lsfWgt[i] * Math.abs(lsfs1[i] - lsfs2[i]);
dist = dist * 16000 / samplingRate;
dist = Math.min(20.0, dist / lsfs1.length);
dist = 10 * (dist + 1e-36);
return dist;
}
public static double[] getInverseHarmonicLSFWeights(double[] lsfs) {
assert lsfs.length > 1;
int P = lsfs.length;
int i;
double[] lsfWgt = new double[P];
lsfWgt[0] = 1.0 / Math.abs(lsfs[1] - lsfs[0]);
lsfWgt[P - 1] = 0.5 / Math.abs(lsfs[P - 1] - lsfs[P - 2]);
for (i = 1; i <= P - 2; i++)
lsfWgt[i] = 1.0 / Math.min(Math.abs(lsfs[i] - lsfs[i - 1]), Math.abs(lsfs[i + 1] - lsfs[i]));
// Emphasize low frequency LSFs
double tmp = 0.0;
for (i = 0; i < P; i++) {
lsfWgt[i] = Math.exp(-0.05 * i) * lsfWgt[i];
tmp += lsfWgt[i];
}
// Normalize
for (i = 0; i < P; i++)
lsfWgt[i] /= tmp;
// Compress dynamic range
tmp = 0.0;
for (i = 0; i <= P - 1; i++) {
lsfWgt[i] = Math.sqrt(lsfWgt[i]);
tmp += lsfWgt[i];
}
// Normalize
for (i = 0; i < P; i++)
lsfWgt[i] /= tmp;
return lsfWgt;
}
public static double rmsLogSpectralDist(double[] speechFrame1, double[] speechFrame2, int fftSize, int lpOrder) {
// Windowing
double[] windowedSpeechFrame1 = new double[speechFrame1.length];
System.arraycopy(speechFrame1, 0, windowedSpeechFrame1, 0, speechFrame1.length);
double[] windowedSpeechFrame2 = new double[speechFrame2.length];
System.arraycopy(speechFrame2, 0, windowedSpeechFrame2, 0, speechFrame2.length);
HammingWindow w1 = new HammingWindow(speechFrame1.length);
w1.apply(windowedSpeechFrame1, 0);
HammingWindow w2 = new HammingWindow(speechFrame2.length);
w2.apply(windowedSpeechFrame2, 0);
//
double[] Xabs1 = LpcAnalyser.calcSpecFrameLinear(speechFrame1, lpOrder, fftSize);
double[] Xabs2 = LpcAnalyser.calcSpecFrameLinear(speechFrame2, lpOrder, fftSize);
int w;
double dist = 0.0;
for (w = 0; w < Xabs1.length; w++)
dist += 10 * Math.log10(Xabs1[w] * Xabs1[w]) - 10 * Math.log10(Xabs2[w] * Xabs2[w]);
return Math.min(dist, MAX_SPECTRAL_DISTANCE);
}
public static double kullbackLeiblerSpectralDist(double[] speechFrame1, double[] speechFrame2, int fftSize, int lpOrder) {
// Windowing
double[] windowedSpeechFrame1 = new double[speechFrame1.length];
System.arraycopy(speechFrame1, 0, windowedSpeechFrame1, 0, speechFrame1.length);
double[] windowedSpeechFrame2 = new double[speechFrame2.length];
System.arraycopy(speechFrame2, 0, windowedSpeechFrame2, 0, speechFrame2.length);
HammingWindow w1 = new HammingWindow(speechFrame1.length);
w1.apply(windowedSpeechFrame1, 0);
HammingWindow w2 = new HammingWindow(speechFrame2.length);
w2.apply(windowedSpeechFrame2, 0);
//
double[] Xabs1 = LpcAnalyser.calcSpecFrameLinear(speechFrame1, lpOrder, fftSize);
double[] Xabs2 = LpcAnalyser.calcSpecFrameLinear(speechFrame2, lpOrder, fftSize);
int w;
double klDist = 0.0;
for (w = 0; w < Xabs1.length; w++)
klDist += Xabs1[w] * Math.log(Xabs1[w] / (Xabs2[w] + 1e-20) + 1e-20);
return Math.min(klDist, MAX_SPECTRAL_DISTANCE);
}
public static double kullbackLeiblerSymmetricSpectralDist(double[] speechFrame1, double[] speechFrame2, int fftSize,
int lpOrder) {
// Windowing
double[] windowedSpeechFrame1 = new double[speechFrame1.length];
System.arraycopy(speechFrame1, 0, windowedSpeechFrame1, 0, speechFrame1.length);
double[] windowedSpeechFrame2 = new double[speechFrame2.length];
System.arraycopy(speechFrame2, 0, windowedSpeechFrame2, 0, speechFrame2.length);
HammingWindow w1 = new HammingWindow(speechFrame1.length);
w1.apply(windowedSpeechFrame1, 0);
HammingWindow w2 = new HammingWindow(speechFrame2.length);
w2.apply(windowedSpeechFrame2, 0);
//
double[] Xabs1 = LpcAnalyser.calcSpecFrameLinear(speechFrame1, lpOrder, fftSize);
double[] Xabs2 = LpcAnalyser.calcSpecFrameLinear(speechFrame2, lpOrder, fftSize);
int w;
double klDist12 = 0.0;
double klDist21 = 0.0;
for (w = 0; w < Xabs1.length; w++)
klDist12 += Xabs1[w] * Math.log(Xabs1[w] / (Xabs2[w] + 1e-20) + 1e-20);
for (w = 0; w < Xabs2.length; w++)
klDist21 += Xabs2[w] * Math.log(Xabs2[w] / (Xabs1[w] + 1e-20) + 1e-20);
return Math.min(0.5 * (klDist12 + klDist21), MAX_SPECTRAL_DISTANCE);
}
public static double itakuraSaitoDistSymmetric(double[] speechFrame1, double[] speechFrame2, int fftSize, int lpOrder) {
return 0.5 * itakuraSaitoDist(speechFrame1, speechFrame2, fftSize, lpOrder) + 0.5
* itakuraSaitoDist(speechFrame2, speechFrame1, fftSize, lpOrder);
}
public static double itakuraSaitoDist(double[] speechFrame1, double[] speechFrame2, int fftSize, int lpOrder) {
double[] preemphasizedFrame1 = SignalProcUtils.applyPreemphasis(speechFrame1, 0.97);
double[] preemphasizedFrame2 = SignalProcUtils.applyPreemphasis(speechFrame2, 0.97);
// Windowing
double[] windowedSpeechFrame1 = new double[preemphasizedFrame1.length];
System.arraycopy(preemphasizedFrame1, 0, windowedSpeechFrame1, 0, preemphasizedFrame1.length);
double[] windowedSpeechFrame2 = new double[preemphasizedFrame2.length];
System.arraycopy(preemphasizedFrame2, 0, windowedSpeechFrame2, 0, preemphasizedFrame2.length);
HammingWindow w1 = new HammingWindow(speechFrame1.length);
w1.apply(windowedSpeechFrame1, 0);
HammingWindow w2 = new HammingWindow(speechFrame2.length);
w2.apply(windowedSpeechFrame2, 0);
//
int w;
double[] Xabs1 = LpcAnalyser.calcSpecFrameLinear(speechFrame1, lpOrder, fftSize);
double[] Xabs2 = LpcAnalyser.calcSpecFrameLinear(speechFrame2, lpOrder, fftSize);
// Itakura-Saito distance using power spectrum: pf1/pf2 - log(pf1/pf2) - 1
double dist = 0.0;
for (w = 0; w < Xabs1.length; w++)
Xabs1[w] = Xabs1[w] * Xabs1[w];
for (w = 0; w < Xabs2.length; w++)
Xabs2[w] = Xabs2[w] * Xabs2[w];
for (w = 0; w < Xabs1.length; w++)
dist += Xabs1[w] / Math.max(Xabs2[w], 1e-20) - Math.log(Math.max(Xabs1[w], 1e-20))
+ Math.log(Math.max(Xabs2[w], 1e-20)) - 1.0;
return Math.min(dist, MAX_SPECTRAL_DISTANCE);
}
// This is in fact the symmetrical version of Itakura-Saito distance
public static double coshDist(double[] speechFrame1, double[] speechFrame2, int fftSize, int lpOrder) {
double[] preemphasizedFrame1 = SignalProcUtils.applyPreemphasis(speechFrame1, 0.97);
double[] preemphasizedFrame2 = SignalProcUtils.applyPreemphasis(speechFrame2, 0.97);
// Windowing
double[] windowedSpeechFrame1 = new double[preemphasizedFrame1.length];
System.arraycopy(preemphasizedFrame1, 0, windowedSpeechFrame1, 0, preemphasizedFrame1.length);
double[] windowedSpeechFrame2 = new double[preemphasizedFrame2.length];
System.arraycopy(preemphasizedFrame2, 0, windowedSpeechFrame2, 0, preemphasizedFrame2.length);
HammingWindow w1 = new HammingWindow(speechFrame1.length);
w1.apply(windowedSpeechFrame1, 0);
HammingWindow w2 = new HammingWindow(speechFrame2.length);
w2.apply(windowedSpeechFrame2, 0);
//
int w;
double[] Xabs1 = LpcAnalyser.calcSpecFrameLinear(speechFrame1, lpOrder, fftSize);
double[] Xabs2 = LpcAnalyser.calcSpecFrameLinear(speechFrame2, lpOrder, fftSize);
// COSH distance using power spectrum:
// dcosh = 0.5(d12+d21) where
// dij = Xi/Xj - log(Xi/Xj) - 1 and Xi, Xj are the power spectra under comparison
double dist12 = 0.0;
double dist21 = 0.0;
for (w = 0; w < Xabs1.length; w++)
Xabs1[w] = Xabs1[w] * Xabs1[w];
for (w = 0; w < Xabs2.length; w++)
Xabs2[w] = Xabs2[w] * Xabs2[w];
for (w = 0; w < Xabs1.length; w++)
dist12 += Xabs1[w] / (Xabs2[w] + 1e-20) - Math.log(Xabs1[w] / (Xabs2[w] + 1e-20) + 1e-20) - 1;
for (w = 0; w < Xabs2.length; w++)
dist21 += Xabs2[w] / (Xabs1[w] + 1e-20) - Math.log(Xabs2[w] / (Xabs1[w] + 1e-20) + 1e-20) - 1;
return Math.min(0.5 * (dist12 + dist21), MAX_SPECTRAL_DISTANCE);
}
public static void main(String[] args) {
AudioInputStream inputAudio = null;
try {
inputAudio = AudioSystem.getAudioInputStream(new File(args[0]));
} catch (UnsupportedAudioFileException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
if (inputAudio != null) {
int samplingRate = (int) inputAudio.getFormat().getSampleRate();
int ws = (int) (samplingRate * 0.020);
int ss = (int) (samplingRate * 0.010);
AudioDoubleDataSource signal = new AudioDoubleDataSource(inputAudio);
double[] x = signal.getAllData();
int numfrm = (int) ((x.length - (double) ws) / ss - 2.0);
double[] frm1 = new double[ws];
double[] frm2 = new double[ws];
double lsfDist;
for (int i = 0; i < numfrm; i++) {
System.arraycopy(x, i * ss, frm1, 0, ws);
System.arraycopy(x, (i + 1) * ss, frm2, 0, ws);
lsfDist = SpectralDistanceMeasures.lsfDist(frm1, frm2, samplingRate);
System.out.println("Distance between frame " + String.valueOf(i + 1) + " and frame " + String.valueOf(i + 2)
+ " = " + String.valueOf(lsfDist));
}
}
}
}