/**
* Copyright 2007 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* Permission is hereby granted, free of charge, to use and distribute
* this software and its documentation without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of this work, and to
* permit persons to whom this work is furnished to do so, subject to
* the following conditions:
*
* 1. The code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* 2. Any modifications must be clearly marked as such.
* 3. Original authors' names are not deleted.
* 4. The authors' names are not used to endorse or promote products
* derived from this software without specific prior written
* permission.
*
* DFKI GMBH AND THE CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES WITH
* REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL DFKI GMBH NOR THE
* CONTRIBUTORS BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
* DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
* PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
* THIS SOFTWARE.
*/
package marytts.signalproc.process;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Set;
import javax.sound.sampled.AudioFileFormat;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.UnsupportedAudioFileException;
import marytts.exceptions.MaryConfigurationException;
import marytts.modules.phonemiser.Allophone;
import marytts.modules.phonemiser.AllophoneSet;
import marytts.signalproc.analysis.Labels;
import marytts.signalproc.analysis.LpcAnalyser;
import marytts.signalproc.analysis.LsfAnalyser;
import marytts.signalproc.analysis.LpcAnalyser.LpCoeffs;
import marytts.signalproc.filter.HighPassFilter;
import marytts.signalproc.window.HammingWindow;
import marytts.signalproc.window.Window;
import marytts.util.data.BufferedDoubleDataSource;
import marytts.util.data.audio.AudioDoubleDataSource;
import marytts.util.data.audio.DDSAudioInputStream;
import marytts.util.io.FileUtils;
import marytts.util.math.ArrayUtils;
import marytts.util.math.MathUtils;
import marytts.util.signal.SignalProcUtils;
import marytts.util.string.StringUtils;
/**
* This class implements post-processing of TTS output to make it sound more intelligible when used in a telephone channel.
*
* Several simple ideas are implemented:
*
* (Step 1) Increasing relative energy of consonants (requires labels along with TTS wav outputs, with the same filename and
* folder but with .lab extension) (Step 2) Sharpening formants explicitly using LSFs (Step 3) Increasing relative energy of
* higher formants implicitly by adding highpass filtered version to original (Step 4) Finally, the output is gain adjusted to
* prevent clipping
*
* @author Oytun Türk
*
*/
public class Blizzard09PostProcessor {
public static final boolean LABELS_FROM_REALISED_DURATIONS_FILE = true; // If true reads from realised durations file instead
// of label file
public static final String LABEL_FILE_EXTENSION = ".realised_durations";
// Window and skip sizes for gain processing
public static final double WINDOW_SIZE_IN_SECONDS_LSF = 0.020;
public static final double SKIP_SIZE_IN_SECONDS_LSF = 0.005;
//
// Step 1. Modify LSFs to sharpen formants explicitly
public static final boolean SHARPEN_FORMANTS = true; // Apply explicit formant sharpening using LSFs
public static final double FORMANT_SHARPENING_START_FREQ = 1000.0; // 1000.0; (postproc2) //Lowest frequency in Hz to search
// for LSF pairs
public static final double FORMANT_SHARPENING_END_FREQ = 2500; // 2500.0; (postproc2)//Highest frequency in Hz to search for
// LSF pairs
public static final double RELATIVE_DECREASE_IN_LSF_PAIR_SEPARATION = 15.0; // 5.0; (postproc2)//(0.0-95.0) decrease in LSF
// pair distances in percentage of the original
// distance in frequency
public static final double MAX_LSF_PAIR_SEPARATION_IN_HZ = 300.0; // 300.0; (postproc2)//Maximum LSF pair separation for
// formants
//
// Window and skip sizes for gain processing
public static final double WINDOW_SIZE_IN_SECONDS_GAIN = 0.020;
public static final double SKIP_SIZE_IN_SECONDS_GAIN = 0.001;
//
// Step 2. Modify relative gain of consonants
public static final boolean INCREASE_CONSONANT_GAINS = true; // Apply consonant gain adjustment (increasing)?
// Fixed settings for consonant gain adjustment
public static final double CONSONANT_MAX_GAIN_FACTOR = 1.5; // 1.8; (postproc2)//Peak gain factor to multiply samples at the
// consonant´s center (1.0-Infinity)
public static final double CONSONANT_MAX_GAIN_RELATIVE_DURATION = 50.0; // 60.0; (postproc2)//Relative duration of maximum
// gain at the center of the consonant (0.0-100.0),
// values close to 100.0 will result in
// discontinuities
//
// Step 3. Reduce vowel nuclie energy to reduce reverbaration effects
public static final boolean REDUCE_VOWEL_GAINS = true; // Apply consonant gain adjustment (increasing)?
// Fixed settings for consonant gain adjustment
public static final double VOWEL_MIN_GAIN_FACTOR = 0.7; // 0.6; (postproc2)//Peak gain factor to multiply samples at the
// vowel´s center (0.0-1.0, 1.0 means no change)
public static final double VOWEL_MIN_GAIN_RELATIVE_DURATION = 50.0; // 60.0; (postproc2)//Relative duration of maximum gain at
// the center of the vowel (0.0-100.0), values close to
// 100.0 will result in discontinuities
//
// Step 4. Add highpass filtered version to boost higher frequency formants implicitly
public static final boolean APPLY_HIGHPASS_FILTER = false; // true; (postproc2) v//Apply highpass filtering?
// Fixed settings for higher formant gain adjustment
public static final double HIGHPASS_FILTER_CUTOFF = 2000.0; // Cut-off of highpass filter in Hz
public static final double HIGHPASS_FILTER_RELATIVE_GAIN = 0.05; // (0.0-1.0) Relative gain of the highpass filtered signal
// when it´s being added with the original
// output = (1-relativeGain)*original +
// relativeGain*highpassFilterOutput
//
public static double[] process(double[] x, Labels labels, Allophone[] allophones, int samplingRateInHz, double absMaxOrig) {
boolean[] isConsonants = new boolean[labels.items.length];
boolean[] isVowels = new boolean[labels.items.length];
boolean[] isPauses = new boolean[labels.items.length];
for (int i = 0; i < labels.items.length; i++) {
isConsonants[i] = false;
int allophoneIndex = -1;
for (int j = 0; j < allophones.length; j++) {
if (allophones[j].name().compareTo(labels.items[i].phn) == 0) {
if (allophones[j].isConsonant() && !allophones[j].isPlosive())
isConsonants[i] = true;
break;
}
}
isVowels[i] = false;
allophoneIndex = -1;
for (int j = 0; j < allophones.length; j++) {
if (allophones[j].name().compareTo(labels.items[i].phn) == 0) {
if (allophones[j].isVowel())
isVowels[i] = true;
break;
}
}
isPauses[i] = false;
allophoneIndex = -1;
for (int j = 0; j < allophones.length; j++) {
if (allophones[j].name().compareTo(labels.items[i].phn) == 0) {
if (allophones[j].isPause())
isPauses[i] = true;
break;
}
}
}
double[] y = ArrayUtils.copy(x);
// Step 1
if (SHARPEN_FORMANTS)
y = processLSFs(y, samplingRateInHz, labels, isVowels, isPauses);
// Step 2
if (INCREASE_CONSONANT_GAINS)
y = processGains(y, samplingRateInHz, labels, isConsonants, CONSONANT_MAX_GAIN_FACTOR,
CONSONANT_MAX_GAIN_RELATIVE_DURATION);
// Step 3
if (REDUCE_VOWEL_GAINS)
y = processGains(y, samplingRateInHz, labels, isVowels, VOWEL_MIN_GAIN_FACTOR, VOWEL_MIN_GAIN_RELATIVE_DURATION);
// Step 3
if (APPLY_HIGHPASS_FILTER)
y = processHigherFormantGains(y, samplingRateInHz, labels, isPauses);
//
// Step 4
double absMaxNew = MathUtils.absMax(y);
int startIndex = 0;
int endIndex;
int i, j;
for (i = 0; i < labels.items.length; i++) {
if (!isPauses[i]) {
endIndex = SignalProcUtils.time2sample(labels.items[i].time, samplingRateInHz) - 1;
endIndex = Math.min(endIndex, x.length - 1);
for (j = startIndex; j <= endIndex; j++)
y[j] *= absMaxOrig / absMaxNew;
startIndex = endIndex + 1;
}
}
//
return y;
}
// Multiplies consonant gains with a window to increase their relative energy level
// The window is 1.0 at both ends to ensure continuity
// Maximum gain occurs in the middle of the window
public static double[] processGains(double[] x, int samplingRateInHz, Labels labels, boolean[] toBeProcesseds,
double extremumGainFactor, double extremumGainRelativeDuration) {
assert labels.items.length == toBeProcesseds.length;
boolean isIncreasing = true;
if (extremumGainFactor < 1.0)
isIncreasing = false;
double[] y = null;
double[] w = null;
int startIndex = 0;
int endIndex;
int ws = SignalProcUtils.time2sample(WINDOW_SIZE_IN_SECONDS_GAIN, samplingRateInHz);
int ss = SignalProcUtils.time2sample(SKIP_SIZE_IN_SECONDS_GAIN, samplingRateInHz);
Window wfrm = new HammingWindow(ws);
wfrm.normalizePeakValue(1.0f);
double[] frmWgt = wfrm.getCoeffs();
if (x != null && x.length > 0) {
y = new double[x.length];
w = new double[x.length];
Arrays.fill(y, 0.0);
Arrays.fill(w, 0.0);
double[] frm = new double[ws];
int i, j, k;
for (i = 0; i < labels.items.length; i++) {
boolean bProcessed = false;
endIndex = SignalProcUtils.time2sample(labels.items[i].time, samplingRateInHz) - 1;
endIndex = Math.min(endIndex, x.length - 1);
int numfrm = (int) Math.floor((endIndex - startIndex + 1.0) / (double) ss + 0.5) + 1;
if (numfrm > 0) {
int windowLen = (int) Math.floor(numfrm * (1.0 - extremumGainRelativeDuration / 100.0) + 0.5);
double[] wgt = new double[numfrm];
if (toBeProcesseds[i])
Arrays.fill(wgt, extremumGainFactor);
else
Arrays.fill(wgt, 1.0);
if (windowLen > 0 && toBeProcesseds[i]) {
Window wConsonant = new HammingWindow(windowLen);
if (isIncreasing)
wConsonant.normalizeRange(1.0f, (float) extremumGainFactor);
else
wConsonant.normalizeRange((float) extremumGainFactor, 1.0f);
double[] lWgt = null;
double[] rWgt = null;
if (isIncreasing) {
lWgt = wConsonant.getCoeffsLeftHalf();
rWgt = wConsonant.getCoeffsRightHalf();
} else {
lWgt = wConsonant.getCoeffsRightHalf();
rWgt = wConsonant.getCoeffsLeftHalf();
}
if (lWgt != null) {
for (j = 0; j < lWgt.length; j++)
wgt[j] = lWgt[j];
}
if (rWgt != null) {
for (j = 0; j < rWgt.length; j++)
wgt[j + numfrm - rWgt.length] = rWgt[j];
}
// MaryUtils.plot(wgt);
for (j = 0; j < numfrm; j++) {
System.arraycopy(x, j * ss + startIndex, frm, 0, Math.min(ws, x.length - (j * ss + startIndex)));
for (k = 0; k < Math.min(ws, x.length - (j * ss + startIndex)); k++) {
y[j * ss + startIndex + k] += x[j * ss + startIndex + k] * frmWgt[k] * wgt[j];
w[j * ss + startIndex + k] += frmWgt[k];
}
}
} else {
Window wShort = new HammingWindow(endIndex - startIndex + 1);
double[] wShortWgt = wShort.getCoeffs();
for (k = startIndex; k <= endIndex; k++) {
y[k] += x[k] * wShortWgt[k - startIndex];
w[k] += wShortWgt[k - startIndex];
}
}
} else {
Window wShort = new HammingWindow(endIndex - startIndex + 1);
double[] wShortWgt = wShort.getCoeffs();
for (k = startIndex; k <= endIndex; k++) {
y[k] += x[k] * wShortWgt[k - startIndex];
w[k] += wShortWgt[k - startIndex];
}
}
startIndex = endIndex + 1;
}
for (i = 0; i < x.length; i++) {
if (w[i] > 0.0)
y[i] /= w[i];
}
}
return y;
}
// Detects closest LSF pairs within a frequency range
// Makes these LSF pairs closer
// Re-synthesizes the output using modified LSFs and frequency domain AR filtering
public static double[] processLSFs(double[] x, int samplingRateInHz, Labels labels, boolean[] isVowels, boolean[] isPauses) {
assert labels.items.length == isVowels.length;
assert labels.items.length == isPauses.length;
double[] y = null;
double[] w = null;
int startIndex = 0;
int endIndex;
int ws = SignalProcUtils.time2sample(WINDOW_SIZE_IN_SECONDS_LSF, samplingRateInHz);
int ss = SignalProcUtils.time2sample(SKIP_SIZE_IN_SECONDS_LSF, samplingRateInHz);
Window wfrm = new HammingWindow(ws);
wfrm.normalizePeakValue(1.0f);
double[] frmWgt = wfrm.getCoeffs();
if (x != null && x.length > 0) {
int lpOrder = SignalProcUtils.getLPOrder(samplingRateInHz);
y = new double[x.length];
w = new double[x.length];
Arrays.fill(y, 0.0);
Arrays.fill(w, 0.0);
double[] frm = new double[ws];
int i, j, k;
int fftSize = SignalProcUtils.getDFTSize(samplingRateInHz);
for (i = 0; i < labels.items.length; i++) {
boolean bProcessed = false;
endIndex = SignalProcUtils.time2sample(labels.items[i].time + WINDOW_SIZE_IN_SECONDS_LSF, samplingRateInHz) - 1;
int numfrm = (int) Math.floor((endIndex - startIndex + 1.0) / (double) ss + 0.5) + 1;
// if (isVowels[i] && numfrm>0)
if (numfrm > 0) {
for (j = 0; j < numfrm; j++) {
Arrays.fill(frm, 0.0);
if (j * ss + startIndex < x.length) {
System.arraycopy(x, j * ss + startIndex, frm, 0, Math.min(ws, x.length - (j * ss + startIndex)));
double[] frmOrig = ArrayUtils.copy(frm);
double origEn = SignalProcUtils.energy(frmOrig);
wfrm.apply(frm, 0);
LpCoeffs lpcs = LpcAnalyser.calcLPC(frm, lpOrder, 0.0f);
double[] lsfs = LsfAnalyser.lpc2lsfInHz(lpcs.getOneMinusA(), samplingRateInHz);
double[] lsfsMod = ArrayUtils.copy(lsfs);
if (isVowels[i]) {
double[] dists = new double[lsfs.length - 1];
for (k = 0; k < lsfsMod.length - 1; k++)
dists[k] = lsfs[k + 1] - lsfs[k];
for (k = 1; k < dists.length - 1; k++) {
if (dists[k] < Math.min(dists[k + 1], MAX_LSF_PAIR_SEPARATION_IN_HZ)) // lsfs[k] and lsfs[k+1]
// might be pairs
{
double meanFreq = 0.5 * (lsfs[k] + lsfs[k + 1]);
if (meanFreq >= FORMANT_SHARPENING_START_FREQ && meanFreq < FORMANT_SHARPENING_END_FREQ) {
double shift = 0.5 * RELATIVE_DECREASE_IN_LSF_PAIR_SEPARATION / 100.0 * dists[k];
lsfsMod[k] = lsfs[k - 1] + shift;
lsfsMod[k + 1] = lsfs[k - 1] - shift;
k += 2;
}
} else if (dists[k + 1] < Math.min(dists[k], MAX_LSF_PAIR_SEPARATION_IN_HZ)) // lsfs[k+1] and
// lsfs[k+2]
// might be
// pairs
{
double meanFreq = 0.5 * (lsfs[k + 1] + lsfs[k + 2]);
if (meanFreq >= FORMANT_SHARPENING_START_FREQ && meanFreq < FORMANT_SHARPENING_END_FREQ) {
double shift = 0.5 * RELATIVE_DECREASE_IN_LSF_PAIR_SEPARATION / 100.0 * dists[k];
lsfsMod[k + 1] = lsfs[k + 1] + shift;
lsfsMod[k + 2] = lsfs[k + 2] - shift;
k += 2;
}
}
}
}
double[] newOneMinusAs = LsfAnalyser.lsfInHz2lpc(lsfsMod, samplingRateInHz);
double[] newLpcs = ArrayUtils.subarray(newOneMinusAs, 1, lpOrder);
newLpcs = MathUtils.multiply(newLpcs, -1.0);
double[] H = LpcAnalyser.calcSpecLinear(lpcs.getA(), lpcs.getGain(), fftSize);
double[] HNew = LpcAnalyser.calcSpecLinear(newLpcs, lpcs.getGain(), fftSize);
// MaryUtils.plot(MathUtils.amp2db(H));
// MaryUtils.plot(MathUtils.amp2db(HNew));
double[] HT = MathUtils.divide(HNew, H);
// SignalProcUtils.displayDFTSpectrumInDB(frmOrig);
frm = SignalProcUtils.filterfd(HT, frmOrig, samplingRateInHz);
// SignalProcUtils.displayDFTSpectrumInDB(frm);
double newEn = SignalProcUtils.energy(frm);
double gain = Math.sqrt(origEn) / Math.sqrt(newEn);
for (k = 0; k < Math.min(ws, x.length - (j * ss + startIndex)); k++) {
y[j * ss + startIndex + k] += gain * frm[k] * frmWgt[k];
w[j * ss + startIndex + k] += frmWgt[k];
}
}
}
} else {
Window wShort = new HammingWindow(endIndex - startIndex + 1);
double[] wShortWgt = wShort.getCoeffs();
for (k = startIndex; k <= endIndex; k++) {
y[k] += x[k] * wShortWgt[k - startIndex];
w[k] += wShortWgt[k - startIndex];
}
}
startIndex = endIndex - ws;
}
for (i = 0; i < x.length; i++) {
if (w[i] > 0.0)
y[i] /= w[i];
}
}
return y;
}
public static double[] processHigherFormantGains(double[] x, int samplingRateInHz, Labels labels, boolean[] isPauses) {
assert labels.items.length == isPauses.length;
double[] y = null;
if (x != null && x.length > 0) {
int i, j;
HighPassFilter hpf = new HighPassFilter(HIGHPASS_FILTER_CUTOFF / samplingRateInHz);
double[] xhpf = hpf.apply(x);
for (i = 0; i < x.length; i++)
xhpf[i] = (1.0 - HIGHPASS_FILTER_RELATIVE_GAIN) * x[i] + HIGHPASS_FILTER_RELATIVE_GAIN * xhpf[i];
y = new double[x.length];
int startIndex = 0;
int endIndex;
for (i = 0; i < labels.items.length; i++) {
endIndex = SignalProcUtils.time2sample(labels.items[i].time, samplingRateInHz) - 1;
endIndex = Math.min(endIndex, x.length - 1);
if (isPauses[i])
System.arraycopy(x, startIndex, y, startIndex, endIndex - startIndex + 1);
else
System.arraycopy(xhpf, startIndex, y, startIndex, endIndex - startIndex + 1);
startIndex = endIndex + 1;
}
}
return y;
}
public static void mainSingleFile(String inputWavFile, String outputWavFile, Allophone[] allophones)
throws UnsupportedAudioFileException, IOException {
// File input
AudioInputStream inputAudio = AudioSystem.getAudioInputStream(new File(inputWavFile));
int samplingRate = (int) inputAudio.getFormat().getSampleRate();
AudioDoubleDataSource signal = new AudioDoubleDataSource(inputAudio);
double[] x = signal.getAllData();
double absMaxOrig = MathUtils.absMax(x);
String strLabFile = StringUtils.modifyExtension(inputWavFile, LABEL_FILE_EXTENSION);
if (!FileUtils.exists(strLabFile)) // Labels required for transients analysis (unless we design an automatic algorithm)
{
System.out.println("Label file not found: " + strLabFile + "...skipping...");
} else {
Labels labels = new Labels(strLabFile);
//
double[] y = Blizzard09PostProcessor.process(x, labels, allophones, samplingRate, absMaxOrig);
DDSAudioInputStream outputAudio = new DDSAudioInputStream(new BufferedDoubleDataSource(y), inputAudio.getFormat());
AudioSystem.write(outputAudio, AudioFileFormat.Type.WAVE, new File(outputWavFile));
}
}
public static void main(String[] args) throws UnsupportedAudioFileException, IOException, MaryConfigurationException {
if (args.length < 3) {
System.out.println("Missing parameters:");
System.out.println("<input wav file or directory> <output wav file or directory> <full path of phone set file>");
System.out.println("Example phone set file: .../lib/modules/en/us/lexicon/allophones.en_US.xml");
} else {
String phoneSetFile = args[2];
AllophoneSet allophoneSet = AllophoneSet.getAllophoneSet(phoneSetFile);
Set<String> tmpPhonemes = allophoneSet.getAllophoneNames();
int count = 0;
Allophone[] allophones = new Allophone[tmpPhonemes.size()];
for (Iterator<String> it = tmpPhonemes.iterator(); it.hasNext();) {
allophones[count] = allophoneSet.getAllophone(it.next());
count++;
if (count >= tmpPhonemes.size())
break;
}
if (FileUtils.isDirectory(args[0])) // Process folder
{
if (!FileUtils.exists(args[1]))
FileUtils.createDirectory(args[1]);
String[] fileList = FileUtils.getFileList(args[0], "wav");
String outputFolder = StringUtils.checkLastSlash(args[1]);
if (fileList != null) {
for (int i = 0; i < fileList.length; i++) {
String baseFileName = StringUtils.getFileName(fileList[i], true);
String outputFile = outputFolder + baseFileName + ".wav";
mainSingleFile(fileList[i], outputFile, allophones);
System.out.println("Processing completed for file " + String.valueOf(i + 1) + " of "
+ String.valueOf(fileList.length));
}
} else
System.out.println("No wav files found!");
} else
// Process file
mainSingleFile(args[0], args[1], allophones);
System.out.println("Processing completed...");
}
}
}