// Blizzard09PostProcessor.java example
//
// Explorer
// marytts-master
/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 * 
 * Permission is hereby granted, free of charge, to use and distribute
 * this software and its documentation without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of this work, and to
 * permit persons to whom this work is furnished to do so, subject to
 * the following conditions:
 * 
 * 1. The code must retain the above copyright notice, this list of
 *    conditions and the following disclaimer.
 * 2. Any modifications must be clearly marked as such.
 * 3. Original authors' names are not deleted.
 * 4. The authors' names are not used to endorse or promote products
 *    derived from this software without specific prior written
 *    permission.
 *
 * DFKI GMBH AND THE CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES WITH
 * REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL DFKI GMBH NOR THE
 * CONTRIBUTORS BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
 * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
 * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
 * THIS SOFTWARE.
 */

package marytts.signalproc.process;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Set;

import javax.sound.sampled.AudioFileFormat;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.UnsupportedAudioFileException;

import marytts.exceptions.MaryConfigurationException;
import marytts.modules.phonemiser.Allophone;
import marytts.modules.phonemiser.AllophoneSet;
import marytts.signalproc.analysis.Labels;
import marytts.signalproc.analysis.LpcAnalyser;
import marytts.signalproc.analysis.LsfAnalyser;
import marytts.signalproc.analysis.LpcAnalyser.LpCoeffs;
import marytts.signalproc.filter.HighPassFilter;
import marytts.signalproc.window.HammingWindow;
import marytts.signalproc.window.Window;
import marytts.util.data.BufferedDoubleDataSource;
import marytts.util.data.audio.AudioDoubleDataSource;
import marytts.util.data.audio.DDSAudioInputStream;
import marytts.util.io.FileUtils;
import marytts.util.math.ArrayUtils;
import marytts.util.math.MathUtils;
import marytts.util.signal.SignalProcUtils;
import marytts.util.string.StringUtils;

/**
 * This class implements post-processing of TTS output to make it sound more intelligible when used in a telephone channel.
 * 
 * Several simple ideas are implemented:
 * 
 * <ol>
 * <li>Sharpening formants explicitly using LSFs</li>
 * <li>Increasing the relative energy of consonants (requires labels along with the TTS wav outputs, with the same filename
 * and folder but with the extension given by {@link #LABEL_FILE_EXTENSION})</li>
 * <li>Reducing the relative energy of vowel nuclei</li>
 * <li>Increasing the relative energy of higher formants implicitly by adding a highpass filtered version to the original
 * (disabled by default, see {@link #APPLY_HIGHPASS_FILTER})</li>
 * <li>Finally, the output is gain adjusted to prevent clipping</li>
 * </ol>
 * 
 * @author Oytun Türk
 * 
 */
public class Blizzard09PostProcessor {

	// NOTE(review): this flag is not read anywhere in the code of this class; confirm whether it is still needed.
	public static final boolean LABELS_FROM_REALISED_DURATIONS_FILE = true; // If true, reads from the realised durations file
																			// instead of a label file
	// Extension that replaces the wav extension to locate the label file belonging to an input wav file
	public static final String LABEL_FILE_EXTENSION = ".realised_durations";

	// Window and skip sizes for LSF processing
	public static final double WINDOW_SIZE_IN_SECONDS_LSF = 0.020;
	public static final double SKIP_SIZE_IN_SECONDS_LSF = 0.005;
	//

	// Step 1. Modify LSFs to sharpen formants explicitly
	public static final boolean SHARPEN_FORMANTS = true; // Apply explicit formant sharpening using LSFs
	public static final double FORMANT_SHARPENING_START_FREQ = 1000.0; // 1000.0 (postproc2); lowest frequency in Hz to search
																		// for LSF pairs
	public static final double FORMANT_SHARPENING_END_FREQ = 2500; // 2500.0 (postproc2); highest frequency in Hz to search for
																	// LSF pairs
	public static final double RELATIVE_DECREASE_IN_LSF_PAIR_SEPARATION = 15.0; // 5.0 (postproc2); (0.0-95.0) decrease in LSF
																				// pair distances in percentage of the original
																				// distance in frequency
	public static final double MAX_LSF_PAIR_SEPARATION_IN_HZ = 300.0; // 300.0 (postproc2); maximum LSF pair separation for
																		// formants
	//

	// Window and skip sizes for gain processing
	public static final double WINDOW_SIZE_IN_SECONDS_GAIN = 0.020;
	public static final double SKIP_SIZE_IN_SECONDS_GAIN = 0.001;
	//

	// Step 2. Modify relative gain of consonants
	public static final boolean INCREASE_CONSONANT_GAINS = true; // Apply consonant gain adjustment (increasing)?
	// Fixed settings for consonant gain adjustment
	public static final double CONSONANT_MAX_GAIN_FACTOR = 1.5; // 1.8 (postproc2); peak gain factor to multiply samples at the
																// consonant's center (1.0-Infinity)
	public static final double CONSONANT_MAX_GAIN_RELATIVE_DURATION = 50.0; // 60.0 (postproc2); relative duration of maximum
																			// gain at the center of the consonant (0.0-100.0),
																			// values close to 100.0 will result in
																			// discontinuities
	//

	// Step 3. Reduce vowel nuclei energy to reduce reverberation effects
	public static final boolean REDUCE_VOWEL_GAINS = true; // Apply vowel gain adjustment (decreasing)?
	// Fixed settings for vowel gain adjustment
	public static final double VOWEL_MIN_GAIN_FACTOR = 0.7; // 0.6 (postproc2); minimum gain factor to multiply samples at the
															// vowel's center (0.0-1.0, 1.0 means no change)
	public static final double VOWEL_MIN_GAIN_RELATIVE_DURATION = 50.0; // 60.0 (postproc2); relative duration of minimum gain at
																		// the center of the vowel (0.0-100.0), values close to
																		// 100.0 will result in discontinuities
	//

	// Step 4. Add highpass filtered version to boost higher frequency formants implicitly
	public static final boolean APPLY_HIGHPASS_FILTER = false; // true (postproc2); apply highpass filtering?
	// Fixed settings for higher formant gain adjustment
	public static final double HIGHPASS_FILTER_CUTOFF = 2000.0; // Cut-off of highpass filter in Hz
	public static final double HIGHPASS_FILTER_RELATIVE_GAIN = 0.05; // (0.0-1.0) Relative gain of the highpass filtered signal
																		// when it's being added with the original
																		// output = (1-relativeGain)*original +
																		// relativeGain*highpassFilterOutput

	//

	/**
	 * Applies the complete post-processing chain to a signal and rescales the non-pause segments of the result so that the
	 * overall peak matches <code>absMaxOrig</code>.
	 * <p>
	 * Every label is classified by looking up its phone name in <code>allophones</code>. A label whose phone name is not
	 * found is treated as neither consonant, vowel nor pause. Plosives are deliberately not counted as consonants here. The
	 * enabled steps are applied to a copy of <code>x</code> in this order: formant sharpening ({@link #SHARPEN_FORMANTS}),
	 * consonant gain increase ({@link #INCREASE_CONSONANT_GAINS}), vowel gain reduction ({@link #REDUCE_VOWEL_GAINS}) and
	 * highpass boosting ({@link #APPLY_HIGHPASS_FILTER}).
	 *
	 * @param x
	 *            input samples
	 * @param labels
	 *            labels whose <code>time</code> values are used as segment end times and whose <code>phn</code> values name
	 *            the phone of each segment
	 * @param allophones
	 *            allophones used to classify the labels
	 * @param samplingRateInHz
	 *            sampling rate of <code>x</code> in Hz
	 * @param absMaxOrig
	 *            absolute peak value the output is normalised to
	 * @return the processed samples; <code>x</code> itself when it is <code>null</code> or empty
	 */
	public static double[] process(double[] x, Labels labels, Allophone[] allophones, int samplingRateInHz, double absMaxOrig) {
		// Nothing to process; the individual steps would return null for such input
		if (x == null || x.length == 0)
			return x;

		int numLabels = labels.items.length;
		boolean[] isConsonants = new boolean[numLabels];
		boolean[] isVowels = new boolean[numLabels];
		boolean[] isPauses = new boolean[numLabels];

		// One lookup per label is enough to derive all three flags
		for (int i = 0; i < numLabels; i++) {
			Allophone allophone = findAllophone(allophones, labels.items[i].phn);
			if (allophone != null) {
				isConsonants[i] = allophone.isConsonant() && !allophone.isPlosive();
				isVowels[i] = allophone.isVowel();
				isPauses[i] = allophone.isPause();
			}
		}

		double[] y = ArrayUtils.copy(x);

		// Step 1: explicit formant sharpening
		if (SHARPEN_FORMANTS)
			y = processLSFs(y, samplingRateInHz, labels, isVowels, isPauses);

		// Step 2: consonant gain increase
		if (INCREASE_CONSONANT_GAINS)
			y = processGains(y, samplingRateInHz, labels, isConsonants, CONSONANT_MAX_GAIN_FACTOR,
					CONSONANT_MAX_GAIN_RELATIVE_DURATION);

		// Step 3: vowel gain reduction
		if (REDUCE_VOWEL_GAINS)
			y = processGains(y, samplingRateInHz, labels, isVowels, VOWEL_MIN_GAIN_FACTOR, VOWEL_MIN_GAIN_RELATIVE_DURATION);

		// Step 4: implicit boosting of higher formants
		if (APPLY_HIGHPASS_FILTER)
			y = processHigherFormantGains(y, samplingRateInHz, labels, isPauses);

		// Final step: bring the peak level back to the original one to prevent clipping
		double absMaxNew = MathUtils.absMax(y);
		if (absMaxNew > 0.0) // an all-zero result cannot be rescaled; dividing would only produce NaN/Infinity
		{
			double scale = absMaxOrig / absMaxNew;
			int startIndex = 0;
			for (int i = 0; i < numLabels; i++) {
				int endIndex = SignalProcUtils.time2sample(labels.items[i].time, samplingRateInHz) - 1;
				endIndex = Math.min(endIndex, y.length - 1);

				if (!isPauses[i]) {
					for (int j = startIndex; j <= endIndex; j++)
						y[j] *= scale;
				}

				// Advance past every label, pauses included, so that a pause is never rescaled as part of the
				// following segment; never move backwards so that no sample is scaled twice
				startIndex = Math.max(startIndex, endIndex + 1);
			}
		}

		return y;
	}

	// Returns the first allophone whose name equals phoneName, or null if there is none
	private static Allophone findAllophone(Allophone[] allophones, String phoneName) {
		for (int j = 0; j < allophones.length; j++) {
			if (allophones[j].name().equals(phoneName))
				return allophones[j];
		}
		return null;
	}

	/**
	 * Applies a time-varying gain to the labelled segments selected by <code>toBeProcesseds</code>.
	 * <p>
	 * A selected segment is cut into frames of {@link #WINDOW_SIZE_IN_SECONDS_GAIN} taken every
	 * {@link #SKIP_SIZE_IN_SECONDS_GAIN}. Each frame is weighted with a Hamming window and with a per-frame gain, and the
	 * frames are overlap-added. The per-frame gain equals <code>extremumGainFactor</code> in the middle of the segment and is
	 * tapered at both segment edges with the halves of a Hamming window to ensure continuity. Segments that are not selected
	 * are overlap-added without any gain change. The accumulated signal is finally divided by the accumulated window weights.
	 *
	 * @param x
	 *            input samples
	 * @param samplingRateInHz
	 *            sampling rate of <code>x</code> in Hz
	 * @param labels
	 *            labels whose <code>time</code> values are used as segment end times
	 * @param toBeProcesseds
	 *            one flag per label; <code>true</code> if the gain is to be applied to that segment
	 * @param extremumGainFactor
	 *            gain at the centre of a processed segment; values below 1.0 decrease the level, all others increase it
	 * @param extremumGainRelativeDuration
	 *            percentage (0.0-100.0) of the segment's frames that is not covered by the edge tapers
	 * @return the gain-adjusted samples, or <code>null</code> if <code>x</code> is <code>null</code> or empty
	 */
	public static double[] processGains(double[] x, int samplingRateInHz, Labels labels, boolean[] toBeProcesseds,
			double extremumGainFactor, double extremumGainRelativeDuration) {
		assert labels.items.length == toBeProcesseds.length;

		if (x == null || x.length == 0)
			return null;

		// Everything that is not strictly below 1.0 counts as an increasing gain
		boolean isIncreasing = !(extremumGainFactor < 1.0);

		int ws = SignalProcUtils.time2sample(WINDOW_SIZE_IN_SECONDS_GAIN, samplingRateInHz);
		// A skip size of zero samples (very low sampling rates) would make the frame count below meaningless
		int ss = Math.max(1, SignalProcUtils.time2sample(SKIP_SIZE_IN_SECONDS_GAIN, samplingRateInHz));
		Window wfrm = new HammingWindow(ws);
		wfrm.normalizePeakValue(1.0f);
		double[] frmWgt = wfrm.getCoeffs();

		double[] y = new double[x.length]; // weighted overlap-add accumulator
		double[] w = new double[x.length]; // sum of the window weights applied to each sample

		int startIndex = 0;
		for (int i = 0; i < labels.items.length; i++) {
			int endIndex = SignalProcUtils.time2sample(labels.items[i].time, samplingRateInHz) - 1;
			endIndex = Math.min(endIndex, x.length - 1);
			int segmentLen = endIndex - startIndex + 1;

			int numfrm = (int) Math.floor(segmentLen / (double) ss + 0.5) + 1;
			int windowLen = 0;
			if (numfrm > 0)
				windowLen = (int) Math.floor(numfrm * (1.0 - extremumGainRelativeDuration / 100.0) + 0.5);

			if (toBeProcesseds[i] && windowLen > 0) {
				// Per-frame gains: extremum in the middle, tapered towards 1.0 at both edges
				double[] wgt = new double[numfrm];
				Arrays.fill(wgt, extremumGainFactor);

				Window wSegment = new HammingWindow(windowLen);
				double[] lWgt;
				double[] rWgt;
				if (isIncreasing) {
					wSegment.normalizeRange(1.0f, (float) extremumGainFactor);
					lWgt = wSegment.getCoeffsLeftHalf();
					rWgt = wSegment.getCoeffsRightHalf();
				} else {
					// For a decreasing gain the halves are swapped
					wSegment.normalizeRange((float) extremumGainFactor, 1.0f);
					lWgt = wSegment.getCoeffsRightHalf();
					rWgt = wSegment.getCoeffsLeftHalf();
				}

				if (lWgt != null)
					System.arraycopy(lWgt, 0, wgt, 0, lWgt.length);
				if (rWgt != null)
					System.arraycopy(rWgt, 0, wgt, numfrm - rWgt.length, rWgt.length);

				for (int j = 0; j < numfrm; j++) {
					int frmStart = j * ss + startIndex;
					// The last frames may start at or beyond the end of the signal; frmLen is then <= 0 and the
					// frame is skipped
					int frmLen = Math.min(ws, x.length - frmStart);
					for (int k = 0; k < frmLen; k++) {
						y[frmStart + k] += x[frmStart + k] * frmWgt[k] * wgt[j];
						w[frmStart + k] += frmWgt[k];
					}
				}
			} else if (segmentLen > 0) {
				// Pass the segment through unchanged; an empty segment needs no window at all
				Window wShort = new HammingWindow(segmentLen);
				double[] wShortWgt = wShort.getCoeffs();
				for (int k = startIndex; k <= endIndex; k++) {
					y[k] += x[k] * wShortWgt[k - startIndex];
					w[k] += wShortWgt[k - startIndex];
				}
			}

			startIndex = endIndex + 1;
		}

		for (int i = 0; i < x.length; i++) {
			if (w[i] > 0.0)
				y[i] /= w[i];
			else
				y[i] = x[i]; // not covered by any label: keep the input sample instead of silencing it
		}

		return y;
	}

	/**
	 * Sharpens formants by moving close LSF pairs closer together.
	 * <p>
	 * The signal is analysed in frames of {@link #WINDOW_SIZE_IN_SECONDS_LSF} taken every {@link #SKIP_SIZE_IN_SECONDS_LSF}.
	 * For each frame the LP coefficients and the corresponding LSFs are computed. In frames that belong to a vowel label,
	 * close LSF pairs within the configured frequency range are made closer. The frame is then filtered in the frequency
	 * domain with the ratio of the modified and the original LP spectrum, scaled back to the energy of the original frame and
	 * overlap-added into the output.
	 *
	 * @param x
	 *            input samples
	 * @param samplingRateInHz
	 *            sampling rate of <code>x</code> in Hz
	 * @param labels
	 *            labels whose <code>time</code> values are used as segment end times
	 * @param isVowels
	 *            one flag per label; LSFs are only modified in segments flagged <code>true</code>
	 * @param isPauses
	 *            one flag per label; only its length is checked here
	 * @return the processed samples, or <code>null</code> if <code>x</code> is <code>null</code> or empty
	 */
	public static double[] processLSFs(double[] x, int samplingRateInHz, Labels labels, boolean[] isVowels, boolean[] isPauses) {
		assert labels.items.length == isVowels.length;
		assert labels.items.length == isPauses.length;

		if (x == null || x.length == 0)
			return null;

		int ws = SignalProcUtils.time2sample(WINDOW_SIZE_IN_SECONDS_LSF, samplingRateInHz);
		// A skip size of zero samples (very low sampling rates) would make the frame count below meaningless
		int ss = Math.max(1, SignalProcUtils.time2sample(SKIP_SIZE_IN_SECONDS_LSF, samplingRateInHz));
		Window wfrm = new HammingWindow(ws);
		wfrm.normalizePeakValue(1.0f);
		double[] frmWgt = wfrm.getCoeffs();

		int lpOrder = SignalProcUtils.getLPOrder(samplingRateInHz);
		int fftSize = SignalProcUtils.getDFTSize(samplingRateInHz);

		double[] y = new double[x.length]; // weighted overlap-add accumulator
		double[] w = new double[x.length]; // sum of the window weights applied to each sample
		double[] frm = new double[ws]; // analysis buffer, always ws samples long

		int startIndex = 0;
		for (int i = 0; i < labels.items.length; i++) {
			// Frames are taken up to one window length beyond the end of the label
			int endIndex = SignalProcUtils.time2sample(labels.items[i].time + WINDOW_SIZE_IN_SECONDS_LSF, samplingRateInHz) - 1;
			int numfrm = (int) Math.floor((endIndex - startIndex + 1.0) / (double) ss + 0.5) + 1;

			// numfrm <= 0 only happens for a negative segment length; there is nothing to add in that case
			for (int j = 0; j < numfrm; j++) {
				int frmStart = j * ss + startIndex;
				if (frmStart >= x.length)
					break; // all further frames start even later

				int frmLen = Math.min(ws, x.length - frmStart);
				Arrays.fill(frm, 0.0);
				System.arraycopy(x, frmStart, frm, 0, frmLen);
				double[] frmOrig = ArrayUtils.copy(frm);
				double origEn = SignalProcUtils.energy(frmOrig);

				double[] frmOut = frmOrig;
				double gain = 1.0;
				if (origEn > 0.0) // a silent frame is passed through; analysing it could only yield NaN/Infinity gains
				{
					wfrm.apply(frm, 0);
					LpCoeffs lpcs = LpcAnalyser.calcLPC(frm, lpOrder, 0.0f);
					double[] lsfs = LsfAnalyser.lpc2lsfInHz(lpcs.getOneMinusA(), samplingRateInHz);
					double[] lsfsMod = isVowels[i] ? sharpenLsfPairs(lsfs) : ArrayUtils.copy(lsfs);

					double[] newOneMinusAs = LsfAnalyser.lsfInHz2lpc(lsfsMod, samplingRateInHz);
					double[] newLpcs = ArrayUtils.subarray(newOneMinusAs, 1, lpOrder);
					newLpcs = MathUtils.multiply(newLpcs, -1.0);
					double[] H = LpcAnalyser.calcSpecLinear(lpcs.getA(), lpcs.getGain(), fftSize);
					double[] HNew = LpcAnalyser.calcSpecLinear(newLpcs, lpcs.getGain(), fftSize);
					double[] HT = MathUtils.divide(HNew, H);

					// The unwindowed frame is filtered; the synthesis window is applied during overlap-add below
					frmOut = SignalProcUtils.filterfd(HT, frmOrig, samplingRateInHz);

					double newEn = SignalProcUtils.energy(frmOut);
					if (newEn > 0.0)
						gain = Math.sqrt(origEn) / Math.sqrt(newEn);
				}

				for (int k = 0; k < frmLen; k++) {
					y[frmStart + k] += gain * frmOut[k] * frmWgt[k];
					w[frmStart + k] += frmWgt[k];
				}
			}

			// The next segment starts one window length before the extended end; never before the first sample
			startIndex = Math.max(0, endIndex - ws);
		}

		for (int i = 0; i < x.length; i++) {
			if (w[i] > 0.0)
				y[i] /= w[i];
			else
				y[i] = x[i]; // not covered by any frame: keep the input sample instead of silencing it
		}

		return y;
	}

	// Returns a copy of lsfs (in Hz) in which close LSF pairs inside the sharpening frequency range are moved towards
	// each other by RELATIVE_DECREASE_IN_LSF_PAIR_SEPARATION percent of their distance
	private static double[] sharpenLsfPairs(double[] lsfs) {
		double[] lsfsMod = ArrayUtils.copy(lsfs);
		if (lsfs.length < 2)
			return lsfsMod;

		double[] dists = new double[lsfs.length - 1];
		for (int k = 0; k < dists.length; k++)
			dists[k] = lsfs[k + 1] - lsfs[k];

		// Each member of a pair moves by half of the requested decrease
		double relativeShift = 0.5 * RELATIVE_DECREASE_IN_LSF_PAIR_SEPARATION / 100.0;

		for (int k = 1; k < dists.length - 1; k++) {
			if (dists[k] < Math.min(dists[k + 1], MAX_LSF_PAIR_SEPARATION_IN_HZ)) {
				// lsfs[k] and lsfs[k+1] might be pairs
				double meanFreq = 0.5 * (lsfs[k] + lsfs[k + 1]);
				if (meanFreq >= FORMANT_SHARPENING_START_FREQ && meanFreq < FORMANT_SHARPENING_END_FREQ) {
					double shift = relativeShift * dists[k];
					lsfsMod[k] = lsfs[k] + shift;
					lsfsMod[k + 1] = lsfs[k + 1] - shift;
					// NOTE(review): together with the loop increment this also skips the candidate pair
					// (k+2, k+3); kept as in the original, confirm whether k += 1 was intended
					k += 2;
				}
			} else if (dists[k + 1] < Math.min(dists[k], MAX_LSF_PAIR_SEPARATION_IN_HZ)) {
				// lsfs[k+1] and lsfs[k+2] might be pairs
				double meanFreq = 0.5 * (lsfs[k + 1] + lsfs[k + 2]);
				if (meanFreq >= FORMANT_SHARPENING_START_FREQ && meanFreq < FORMANT_SHARPENING_END_FREQ) {
					// The shift is relative to the distance of this pair, so the two LSFs can never cross
					double shift = relativeShift * dists[k + 1];
					lsfsMod[k + 1] = lsfs[k + 1] + shift;
					lsfsMod[k + 2] = lsfs[k + 2] - shift;
					k += 2;
				}
			}
		}

		return lsfsMod;
	}

	/**
	 * Boosts higher frequencies outside of pauses by mixing a highpass filtered version into the signal.
	 * <p>
	 * The mix is <code>(1 - g) * x + g * highpass(x)</code> with <code>g</code> = {@link #HIGHPASS_FILTER_RELATIVE_GAIN} and a
	 * filter cut-off of {@link #HIGHPASS_FILTER_CUTOFF} Hz. Segments flagged as pauses, and samples that are not covered by
	 * any label, are copied from the input unchanged.
	 *
	 * @param x
	 *            input samples
	 * @param samplingRateInHz
	 *            sampling rate of <code>x</code> in Hz
	 * @param labels
	 *            labels whose <code>time</code> values are used as segment end times
	 * @param isPauses
	 *            one flag per label; <code>true</code> if the segment is to be left unchanged
	 * @return the processed samples, or <code>null</code> if <code>x</code> is <code>null</code> or empty
	 */
	public static double[] processHigherFormantGains(double[] x, int samplingRateInHz, Labels labels, boolean[] isPauses) {
		assert labels.items.length == isPauses.length;

		if (x == null || x.length == 0)
			return null;

		HighPassFilter hpf = new HighPassFilter(HIGHPASS_FILTER_CUTOFF / samplingRateInHz);
		double[] xhpf = hpf.apply(x);

		// Turn the filter output into the final mix of original and highpass filtered signal
		for (int i = 0; i < x.length; i++)
			xhpf[i] = (1.0 - HIGHPASS_FILTER_RELATIVE_GAIN) * x[i] + HIGHPASS_FILTER_RELATIVE_GAIN * xhpf[i];

		// Start from the unmodified input so that pauses and unlabelled samples pass through
		double[] y = new double[x.length];
		System.arraycopy(x, 0, y, 0, x.length);

		int startIndex = 0;
		for (int i = 0; i < labels.items.length; i++) {
			int endIndex = SignalProcUtils.time2sample(labels.items[i].time, samplingRateInHz) - 1;
			endIndex = Math.min(endIndex, x.length - 1);

			// A label that ends before the current position has nothing to copy; a negative length would throw
			int segmentLen = endIndex - startIndex + 1;
			if (segmentLen > 0 && !isPauses[i])
				System.arraycopy(xhpf, startIndex, y, startIndex, segmentLen);

			// Never move backwards, otherwise a later label could overwrite an earlier pause
			startIndex = Math.max(startIndex, endIndex + 1);
		}

		return y;
	}

	/**
	 * Post-processes a single wav file and writes the result as a wav file.
	 * <p>
	 * The label file is expected next to the input file, with the extension replaced by {@link #LABEL_FILE_EXTENSION}. If
	 * it does not exist, a message is printed and no output file is written.
	 *
	 * @param inputWavFile
	 *            path of the wav file to process
	 * @param outputWavFile
	 *            path of the wav file to write
	 * @param allophones
	 *            allophones used to classify the labels
	 * @throws UnsupportedAudioFileException
	 *             if the input file is not a supported audio file
	 * @throws IOException
	 *             if reading the input or writing the output fails
	 */
	public static void mainSingleFile(String inputWavFile, String outputWavFile, Allophone[] allophones)
			throws UnsupportedAudioFileException, IOException {
		// File input
		AudioInputStream inputAudio = AudioSystem.getAudioInputStream(new File(inputWavFile));
		try {
			int samplingRate = (int) inputAudio.getFormat().getSampleRate();
			AudioDoubleDataSource signal = new AudioDoubleDataSource(inputAudio);
			double[] x = signal.getAllData();
			double absMaxOrig = MathUtils.absMax(x);

			String strLabFile = StringUtils.modifyExtension(inputWavFile, LABEL_FILE_EXTENSION);
			if (!FileUtils.exists(strLabFile)) // Labels required for transients analysis (unless we design an automatic
												// algorithm)
			{
				System.out.println("Label file not found: " + strLabFile + "...skipping...");
				return;
			}

			Labels labels = new Labels(strLabFile);

			double[] y = Blizzard09PostProcessor.process(x, labels, allophones, samplingRate, absMaxOrig);

			// The output keeps the audio format of the input
			DDSAudioInputStream outputAudio = new DDSAudioInputStream(new BufferedDoubleDataSource(y), inputAudio.getFormat());
			AudioSystem.write(outputAudio, AudioFileFormat.Type.WAVE, new File(outputWavFile));
		} finally {
			// Release the file handle on every path, including the skip and the error paths
			inputAudio.close();
		}
	}

	/**
	 * Command line entry point.
	 * <p>
	 * Expects three arguments: the input wav file or directory, the output wav file or directory, and the full path of the
	 * phone set file. With fewer arguments only a usage message is printed. If the first argument is a directory, every wav
	 * file found in it is processed into the output directory (created if missing) under the same base name; otherwise the
	 * first argument is processed as a single file.
	 *
	 * @param args
	 *            command line arguments as described above
	 * @throws UnsupportedAudioFileException
	 *             if an input file is not a supported audio file
	 * @throws IOException
	 *             if reading or writing a file fails
	 * @throws MaryConfigurationException
	 *             if the phone set cannot be loaded
	 */
	public static void main(String[] args) throws UnsupportedAudioFileException, IOException, MaryConfigurationException {
		if (args.length < 3) {
			System.out.println("Missing parameters:");
			System.out.println("<input wav file or directory> <output wav file or directory> <full path of phone set file>");
			System.out.println("Example phone set file: .../lib/modules/en/us/lexicon/allophones.en_US.xml");
			return;
		}

		// Collect all allophones of the phone set in an array
		AllophoneSet allophoneSet = AllophoneSet.getAllophoneSet(args[2]);
		Set<String> allophoneNames = allophoneSet.getAllophoneNames();
		Allophone[] allophones = new Allophone[allophoneNames.size()];
		int next = 0;
		for (String allophoneName : allophoneNames) {
			allophones[next] = allophoneSet.getAllophone(allophoneName);
			next++;
		}

		String input = args[0];
		String output = args[1];

		if (!FileUtils.isDirectory(input)) {
			// Process file
			mainSingleFile(input, output, allophones);
		} else {
			// Process folder
			if (!FileUtils.exists(output)) {
				FileUtils.createDirectory(output);
			}

			String[] wavFiles = FileUtils.getFileList(input, "wav");
			String outputFolder = StringUtils.checkLastSlash(output);
			if (wavFiles == null) {
				System.out.println("No wav files found!");
			} else {
				int total = wavFiles.length;
				for (int fileIndex = 0; fileIndex < total; fileIndex++) {
					String wavFile = wavFiles[fileIndex];
					String outputFile = outputFolder + StringUtils.getFileName(wavFile, true) + ".wav";
					mainSingleFile(wavFile, outputFile, allophones);
					System.out.println("Processing completed for file " + (fileIndex + 1) + " of " + total);
				}
			}
		}

		System.out.println("Processing completed...");
	}
}