/**
* Copyright 2000-2009 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.signalproc.process;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.util.Arrays;
import javax.sound.sampled.AudioFileFormat;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.UnsupportedAudioFileException;
import marytts.signalproc.analysis.F0TrackerAutocorrelationHeuristic;
import marytts.signalproc.analysis.PitchMarks;
import marytts.signalproc.analysis.PitchReaderWriter;
import marytts.signalproc.window.DynamicWindow;
import marytts.signalproc.window.Window;
import marytts.util.data.BufferedDoubleDataSource;
import marytts.util.data.Datagram;
import marytts.util.data.DatagramDoubleDataSource;
import marytts.util.data.DoubleDataSource;
import marytts.util.data.audio.AudioDoubleDataSource;
import marytts.util.data.audio.DDSAudioInputStream;
import marytts.util.io.FileUtils;
import marytts.util.io.LEDataInputStream;
import marytts.util.io.LEDataOutputStream;
import marytts.util.math.ComplexArray;
import marytts.util.math.FFTMixedRadix;
import marytts.util.math.MathUtils;
import marytts.util.signal.SignalProcUtils;
public class FDPSOLAProcessor extends VocalTractModifier {
// Initialisation modes understood by init(...): WAVEFORM_MODIFICATION prepares file-based processing
// (input wav, pitch file, output file), TTS_MODIFICATION prepares processing without any files.
public static int WAVEFORM_MODIFICATION = 1;
public static int TTS_MODIFICATION = 2;
// Input signal and the audio stream it is read from; both remain null unless init(...) runs in
// WAVEFORM_MODIFICATION mode.
protected DoubleDataSource input;
protected AudioInputStream inputAudio;
protected DDSAudioInputStream outputAudio;
// Preprocessed modification parameters (per-frame scale arrays); only created in WAVEFORM_MODIFICATION mode.
protected VoiceModificationParametersPreprocessor modParams;
protected int numfrm; // Total pitch synchronous frames (the actual number of frames to be processed)
protected int numfrmFixed; // Total frames if the analysis was fixed skip-rate
protected int lpOrder; // Linear prediction analysis order
protected String outputFile;
protected String tempOutBinaryFile; // outputFile + ".bin" when an output file is set
protected int origLen; // Length of the original signal in samples
protected PitchMarks pm; // Pitch marks derived from the pitch contour of the input file
protected double[] f0s; // Fixed-rate f0 values computed from the pitch marks
protected PsolaFrameProvider psFrm; // Supplies input frames to fdpsolaOnline(); null when there is no wav file output
// Set to 0.02 and 0.01 in init(...); presumably window size and skip size of the fixed-rate analysis - TODO confirm
protected double wsFixedInSeconds;
protected double ssFixedInSeconds;
protected int numPeriods; // Set to NUM_PITCH_SYNC_PERIODS in init(...)
protected static int NUM_PITCH_SYNC_PERIODS = 3;
// Where the process(...) methods take pitch and duration scales from: a fixed value in the code, a text file,
// or the arrays handed in by the caller.
protected static int FROM_CODE = 0;
protected static int FROM_FILE = 1;
protected static int FROM_TARGET = 2;
public boolean bSilent = true;
// NOTE(review): the comments below say "big-endian", but the LE prefix of the stream classes suggests
// little-endian - confirm against marytts.util.io.
protected LEDataOutputStream dout; // Output stream for big-endian wav tests
protected LEDataInputStream din; // Input stream for big-endian wav tests
// Analysis and synthesis windows; both are created as Hanning windows in init(...).
protected DynamicWindow windowIn;
protected DynamicWindow windowOut;
protected double[] wgt;
protected double[] wgty;
// Per-frame working state, (re)computed by processFrame(...).
protected int frmSize;
protected int newFrmSize;
protected int newPeriod;
protected int synthFrmInd;
protected double localDurDiff;
protected int repeatSkipCount; // -1:skip frame, 0:no repetition (use synthesized frame as it is), >0: number of repetitions
// for synthesized frame
protected double localDurDiffSaved;
protected double sumLocalDurDiffs;
protected double nextAdd;
protected int synthSt;
protected int synthTotal;
protected int maxFrmSize;
protected int maxNewFrmSize;
protected int synthFrameInd;
protected boolean bLastFrame;
protected boolean bBroke;
protected int newFftSize;
protected int newMaxFreq;
// Output buffer shared by the processing methods; init(...) allocates outBuffLen zeros and sets outBuffStart to 1
// (the code treats the start index as 1-based, see the "- 1" adjustments in processDecrufted).
protected int outBuffLen;
protected double[] outBuff;
protected int outBuffStart;
protected int totalWrittenToFile;
protected double[] ySynthBuff;
protected double[] wSynthBuff;
protected int ySynthInd;
protected double[] frm;
protected boolean bWarp; // Set by processFrame(...): true when vscale differs from 1.0
protected double[] inputVT;
protected double[] py2;
protected ComplexArray hy;
protected double[] frmy;
protected double frmEn;
protected double frmyEn;
protected double gain;
protected int newSkipSize;
protected int halfWin;
protected double[] newVScales;
protected double[] tmpvsc;
protected boolean isWavFileOutput; // True only in WAVEFORM_MODIFICATION mode
protected int inputFrameIndex;
// Limits that processFrame(...) clamps the pitch and time scale factors to.
protected static double MIN_PSCALE = 0.1;
protected static double MAX_PSCALE = 5.0;
protected static double MIN_TSCALE = 0.1;
protected static double MAX_TSCALE = 5.0;
// The common time scale when the process(...) methods find all time scales equal, -1 when they differ.
protected double tscaleSingle;
/**
 * Creates a processor for file-based (waveform) modification. Delegates to the eight-argument constructor with
 * <code>isFixedRate</code> set to false.
 *
 * @param strInputFile
 *            path of the input audio file
 * @param strPitchFile
 *            path of the pitch file belonging to the input file
 * @param strOutputFile
 *            path of the output file
 * @param pscales
 *            pitch scale factors
 * @param tscales
 *            time (duration) scale factors
 * @param escales
 *            scale factors passed through as "escales" (presumably energy scaling - TODO confirm)
 * @param vscales
 *            scale factors passed through as "vscales" (presumably vocal tract scaling - TODO confirm)
 * @throws UnsupportedAudioFileException
 *             declared by the signature
 * @throws IOException
 *             declared by the signature
 */
public FDPSOLAProcessor(String strInputFile, String strPitchFile, String strOutputFile, double[] pscales, double[] tscales,
double[] escales, double[] vscales) throws UnsupportedAudioFileException, IOException {
this(strInputFile, strPitchFile, strOutputFile, pscales, tscales, escales, vscales, false);
}
/**
 * Creates a processor for file-based (waveform) modification by calling init(...) in WAVEFORM_MODIFICATION mode.
 *
 * @param strInputFile
 *            path of the input audio file
 * @param strPitchFile
 *            path of the pitch file belonging to the input file
 * @param strOutputFile
 *            path of the output file
 * @param pscales
 *            pitch scale factors
 * @param tscales
 *            time (duration) scale factors
 * @param escales
 *            scale factors passed through as "escales" (presumably energy scaling - TODO confirm)
 * @param vscales
 *            scale factors passed through as "vscales" (presumably vocal tract scaling - TODO confirm)
 * @param isFixedRate
 *            if true, frames are taken at a fixed rate; otherwise they follow the pitch marks
 * @throws UnsupportedAudioFileException
 *             declared by the signature
 * @throws IOException
 *             declared by the signature
 */
public FDPSOLAProcessor(String strInputFile, String strPitchFile, String strOutputFile, double[] pscales, double[] tscales,
double[] escales, double[] vscales, boolean isFixedRate) throws UnsupportedAudioFileException, IOException {
super();
init(WAVEFORM_MODIFICATION, strInputFile, strPitchFile, strOutputFile, pscales, tscales, escales, vscales, isFixedRate);
}
/**
 * Creates a processor that works without input or output files by calling init(...) in TTS_MODIFICATION mode.
 */
public FDPSOLAProcessor() {
super();
init(TTS_MODIFICATION);
}
/**
 * Initialises the processor without any files or scale arrays: all file names and scale arrays are passed on as
 * null and fixed-rate analysis is switched off.
 *
 * @param initialisationType
 *            one of WAVEFORM_MODIFICATION or TTS_MODIFICATION
 */
protected void init(int initialisationType) {
init(initialisationType, null, null, null, null, null, null, null, false);
}
/**
 * Resets the processor to its default state and prepares it for the requested mode.
 * <p>
 * In WAVEFORM_MODIFICATION mode the input audio file is opened, the pitch file is read (and computed first if it does
 * not exist), pitch marks and fixed-rate f0 values are derived, the modification parameters are preprocessed and a
 * temporary binary output stream (<code>strOutputFile + ".bin"</code>) is opened. In TTS_MODIFICATION mode only the
 * linear prediction order is set. In both modes the windows, counters and buffers used by
 * <code>processFrame</code> are then (re)created.
 * <p>
 * Problems are reported on the console in the same way for all failure cases: if the input file is missing, the
 * output file name is null or empty, or the input audio cannot be opened, the remaining initialisation is skipped
 * and the processor is left without input, pitch marks and buffers.
 *
 * @param initialisationType
 *            one of WAVEFORM_MODIFICATION or TTS_MODIFICATION
 * @param strInputFile
 *            path of the input audio file (WAVEFORM_MODIFICATION only)
 * @param strPitchFile
 *            path of the pitch file (WAVEFORM_MODIFICATION only)
 * @param strOutputFile
 *            path of the output file (WAVEFORM_MODIFICATION only)
 * @param pscales
 *            pitch scale factors
 * @param tscales
 *            time (duration) scale factors
 * @param escales
 *            scale factors handed to the parameter preprocessor as "escales"
 * @param vscales
 *            scale factors handed to the parameter preprocessor as "vscales"
 * @param isFixedRate
 *            if true, frames are taken at a fixed rate; otherwise they follow the pitch marks
 */
protected void init(int initialisationType, String strInputFile, String strPitchFile, String strOutputFile, double[] pscales,
        double[] tscales, double[] escales, double[] vscales, boolean isFixedRate) {
    isWavFileOutput = false;
    inputAudio = null;
    input = null;
    pm = null;
    f0s = null;
    wsFixedInSeconds = 0.02;
    ssFixedInSeconds = 0.01;
    numPeriods = NUM_PITCH_SYNC_PERIODS;
    origLen = 0;
    fs = 16000;
    numfrm = 0; // Total pitch synchronous frames (This is the actual number of frames to be processed)
    numfrmFixed = 0; // Total frames if the analysis was fixed skip-rate
    modParams = null;
    outputFile = null;
    tscaleSingle = 1.0;
    boolean bContinue = true;
    if (initialisationType == WAVEFORM_MODIFICATION) {
        isWavFileOutput = true;
        if (!FileUtils.exists(strInputFile)) {
            // This check is about the input audio file, not the pitch file
            System.out.println("Error! Input file " + strInputFile + " not found.");
            bContinue = false;
        }
        // Compare the content, not the reference: "==" is only true for the very same String object
        if (strOutputFile == null || strOutputFile.isEmpty()) {
            System.out.println("Invalid output file...");
            bContinue = false;
        }
        if (bContinue) {
            try {
                inputAudio = AudioSystem.getAudioInputStream(new File(strInputFile));
            } catch (UnsupportedAudioFileException e) {
                e.printStackTrace();
                // Without an audio stream nothing below can work, so stop instead of dereferencing null
                bContinue = false;
            } catch (IOException e) {
                e.printStackTrace();
                bContinue = false;
            }
        }
        if (bContinue) {
            input = new AudioDoubleDataSource(inputAudio);
            origLen = (int) input.getDataLength();
            fs = (int) inputAudio.getFormat().getSampleRate();
            if (!FileUtils.exists(strPitchFile)) {
                System.out.println("Pitch file cannot be found, computing... " + strPitchFile);
                try {
                    // Only the side effect of constructing the tracker is needed; the instance itself is not used
                    new F0TrackerAutocorrelationHeuristic(strInputFile, strPitchFile);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            PitchReaderWriter f0 = new PitchReaderWriter(strPitchFile);
            pm = SignalProcUtils.pitchContour2pitchMarks(f0.contour, fs, origLen, f0.header.windowSizeInSeconds,
                    f0.header.skipSizeInSeconds, true, 0);
            numfrmFixed = (int) (Math.floor(((double) (origLen + pm.totalZerosToPadd) / fs - 0.5 * wsFixedInSeconds)
                    / ssFixedInSeconds + 0.5) + 2); // Total frames if the analysis was fixed skip-rate
            if (!isFixedRate)
                numfrm = pm.pitchMarks.length - numPeriods; // Total pitch synchronous frames (This is the actual number
                                                            // of frames to be processed)
            else
                numfrm = numfrmFixed;
            f0s = SignalProcUtils.fixedRateF0Values(pm, wsFixedInSeconds, ssFixedInSeconds, numfrmFixed, fs);
            lpOrder = SignalProcUtils.getLPOrder(fs);
            modParams = new VoiceModificationParametersPreprocessor(fs, lpOrder, pscales, tscales, escales, vscales,
                    pm.pitchMarks, wsFixedInSeconds, ssFixedInSeconds, numfrm, numfrmFixed, numPeriods, isFixedRate);
            tscaleSingle = modParams.tscaleSingle;
            outputFile = strOutputFile;
        }
    } else if (initialisationType == TTS_MODIFICATION) {
        // For test purposes, remove this line if you do not need additional wav file output
        // outputFile = "d:/tts_out.wav";
        lpOrder = SignalProcUtils.getLPOrder(fs);
    }
    if (bContinue) {
        tmpvsc = new double[1];
        // bSilent = false;
        if (outputFile != null)
            tempOutBinaryFile = outputFile + ".bin";
        if (isWavFileOutput) {
            if (!isFixedRate)
                psFrm = new PsolaFrameProvider(input, pm, modParams.fs, modParams.numPeriods);
            else
                psFrm = new PsolaFrameProvider(input, wsFixedInSeconds, ssFixedInSeconds, modParams.fs, numfrm);
            try {
                dout = new LEDataOutputStream(tempOutBinaryFile);
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            }
        } else {
            psFrm = null;
            dout = null;
        }
        windowIn = new DynamicWindow(Window.HANNING);
        windowOut = new DynamicWindow(Window.HANNING);
        frmSize = 0;
        newFrmSize = 0;
        newPeriod = 0;
        synthFrmInd = 0;
        localDurDiff = 0.0;
        repeatSkipCount = 0; // -1:skip frame, 0:no repetition (use synthesized frame as it is), >0: number of
                             // repetitions for synthesized frame
        localDurDiffSaved = 0.0;
        sumLocalDurDiffs = 0.0;
        nextAdd = 0.0;
        if (isWavFileOutput)
            synthSt = pm.pitchMarks[0];
        else
            synthSt = 0;
        synthTotal = 0;
        maxFrmSize = (int) (numPeriods * fs / 40.0);
        if ((maxFrmSize % 2) != 0)
            maxFrmSize++;
        maxNewFrmSize = (int) (Math.floor(maxFrmSize / MIN_PSCALE + 0.5));
        if ((maxNewFrmSize % 2) != 0)
            maxNewFrmSize++;
        synthFrameInd = 0;
        bLastFrame = false;
        bBroke = false;
        // Smallest power of two that holds the largest expected frame
        fftSize = (int) Math.pow(2, (Math.ceil(Math.log((double) maxFrmSize) / Math.log(2.0))));
        maxFreq = fftSize / 2 + 1;
        outBuffLen = 500000;
        outBuff = MathUtils.zeros(outBuffLen);
        outBuffStart = 1;
        totalWrittenToFile = 0;
        ySynthBuff = MathUtils.zeros(maxNewFrmSize);
        wSynthBuff = MathUtils.zeros(maxNewFrmSize);
        ySynthInd = 1;
    }
}
/**
 * Functionally equivalent to {@link #process} (but with most of the cruft removed, which should make this easier to modify)
 * <p>
 * As a side effect, the duration of every processed Datagram is overwritten with the number of samples that were
 * produced for it.
 *
 * @param datagrams
 *            array of Datagram arrays, one element per SelectedUnit
 * @param rightContexts
 *            array of Datagrams, one element per SelectedUnit; may be null, and individual elements may be null
 * @param audioformat
 *            audioformat
 * @param voicings
 *            array of boolean arrays, matching <b>datagrams</b>
 * @param pitchScales
 *            array of double arrays, matching <b>datagrams</b>, pitch modification factors
 * @param timeScales
 *            array of double arrays, matching <b>datagrams</b>, duration modification factors
 * @return modified audio as a DoubleDataSource audio stream
 * @throws IOException
 *             if frames cannot be processed or the final samples cannot be written
 */
public DDSAudioInputStream processDecrufted(Datagram[][] datagrams, Datagram[] rightContexts, AudioFormat audioformat,
        boolean[][] voicings, double[][] pitchScales, double[][] timeScales) throws IOException {
    // obscure dependency on several fields:
    tscaleSingle = -1;
    origLen = 0;
    numfrm = 0;
    for (int i = 0; i < datagrams.length; i++) {
        for (int j = 0; j < datagrams[i].length; j++) {
            origLen += datagrams[i][j].getDuration();
            if (j == datagrams[i].length - 1 && rightContexts != null && rightContexts[i] != null) {
                origLen += rightContexts[i].getDuration();
            }
        }
        numfrm += datagrams[i].length;
    }
    // for each unit:
    for (int i = 0; i < datagrams.length; i++) {
        // for each datagram in that unit:
        for (int j = 0; j < datagrams[i].length; j++) {
            // determine the next Datagram, which defaults to silence as long as this Datagram...
            int length = datagrams[i][j].getLength();
            Datagram nextDatagram = new Datagram(length, new byte[2 * length]);
            if (j < datagrams[i].length - 1) {
                // ...unless it's not the last in this unit...
                nextDatagram = datagrams[i][j + 1];
            } else if (rightContexts != null && rightContexts[i] != null) {
                // ...or we have a right context (guarded the same way as in the length computation above)...
                nextDatagram = rightContexts[i];
            } else if (i < datagrams.length - 1) {
                // ...or we have a next unit
                // TODO but what if that unit has no frames?
                nextDatagram = datagrams[i + 1][0];
            }
            assert nextDatagram.getDuration() > 0;
            // ARG #1, actual frame data for this and the next Datagram:
            Datagram[] sourceDatagrams = { datagrams[i][j], nextDatagram };
            DatagramDoubleDataSource dataSource = new DatagramDoubleDataSource(sourceDatagrams);
            double[] frmIn = dataSource.getAllData();
            // ARG #2, voicing:
            boolean symbolicVoicing = voicings[i][j];
            boolean acousticVoicing = SignalProcUtils.getVoicing(frmIn, (int) (audioformat.getSampleRate()));
            // inflexible hard-coded toggle between symbolic (phonology) and signal based voicing:
            boolean isVoiced = symbolicVoicing; // one of: symbolicVoicing, acousticVoicing
            // ARGs #5-6, some obscure variables:
            double escale = 1.0;
            double vscale = 1.0;
            // ARG #7, is this the last Datagram?
            boolean bLastInputFrame = (i == datagrams.length - 1) && (j == datagrams[i].length - 1);
            // ARG #8, duration of this Datagram:
            int currentPeriod = (int) datagrams[i][j].getDuration();
            // ARG #9, number of frames in this and the next Datagram:
            int inputFrameSize = currentPeriod + (int) nextDatagram.getDuration();
            // actually process the data using the ARGs:
            try {
                int bufferStartIndex = outBuffStart;
                processFrame(frmIn, isVoiced, pitchScales[i][j], timeScales[i][j], escale, vscale, bLastInputFrame,
                        currentPeriod, inputFrameSize);
                int bufferEndIndex = outBuffStart;
                int bufferLength = bufferEndIndex - bufferStartIndex;
                // extract processed samples for this datagram from buffer (outBuffStart is 1-based):
                double[] samples = new double[bufferLength];
                System.arraycopy(outBuff, bufferStartIndex - 1, samples, 0, bufferLength);
                // overwrite datagram duration:
                datagrams[i][j].setDuration(samples.length);
            } catch (IOException e) {
                throw new IOException("Frames could not be processed!", e);
            }
        }
    }
    int bufferStartIndex = outBuffStart - 1;
    // Flush the remaining samples. A failure is propagated to the caller (the method declares IOException) instead
    // of being printed and followed by the construction of an audio stream around a null array.
    double[] output = writeFinal();
    // final processed samples (windowed):
    int bufferEndIndex = outBuffLen;
    int bufferLength = bufferEndIndex - bufferStartIndex;
    double[] samples = new double[bufferLength];
    System.arraycopy(outBuff, bufferStartIndex, samples, 0, bufferLength);
    // update final datagram duration:
    Datagram finalDatagram = datagrams[datagrams.length - 1][datagrams[datagrams.length - 1].length - 1];
    finalDatagram.setDuration(finalDatagram.getDuration() + samples.length);
    BufferedDoubleDataSource buffer = new BufferedDoubleDataSource(output);
    DDSAudioInputStream stream = new DDSAudioInputStream(buffer, audioformat);
    return stream;
}
/**
 * FD-PSOLA using all concatenation units.
 * <p>
 * Every Datagram is processed together with its successor: the next Datagram of the same unit, else the right
 * context of the unit, else the first Datagram of the next unit, else silence of the same duration. Voicing is
 * determined from the signal, so <code>voicings</code> is currently not consulted. If the absolute maximum of the
 * result exceeds 32700, the result is scaled down to that maximum.
 * <p>
 * IOExceptions raised while processing a frame or flushing the final samples are printed and the affected output
 * is left out.
 *
 * @param datagrams
 *            array of Datagram arrays, one element per unit
 * @param rightContexts
 *            array of Datagrams, one element per unit; may be null, and individual elements may be null
 * @param audioformat
 *            audio format of the data; its sample rate is used for the voicing decision
 * @param voicings
 *            array of boolean arrays, matching <b>datagrams</b> (currently unused)
 * @param pitchScales
 *            array of double arrays, matching <b>datagrams</b>, pitch modification factors
 * @param timeScales
 *            array of double arrays, matching <b>datagrams</b>, duration modification factors
 * @return modified audio as an audio stream
 */
public DDSAudioInputStream process(Datagram[][] datagrams, Datagram[] rightContexts, AudioFormat audioformat,
        boolean[][] voicings, double[][] pitchScales, double[][] timeScales) {
    int pitchSpecs = FROM_TARGET;
    // int pitchSpecs = FROM_FILE;
    // int pitchSpecs = FROM_CODE;
    int durationSpecs = FROM_TARGET;
    // int durationSpecs = FROM_FILE;
    // int durationSpecs = FROM_CODE;
    double pscale = 1.0; // if pitchSpecs==FROM_CODE flag, this value will be used for pitch scaling
    double tscale = 1.0; // if durationSpecs==FROM_CODE flag, this value will be used for duration scaling
    double escale = 1.0;
    double vscale = 1.0;
    // Read pscale, tscale, escale and vscale from a text file.
    // (For quick testing purposes. It resets the input pitchScales and timeScales to the fixed values in the text file.)
    if (pitchSpecs == FROM_FILE || durationSpecs == FROM_FILE) {
        double[] scales = getScalesFromTextFile("d:/psolaParam.txt");
        if (pitchSpecs == FROM_FILE)
            pscale = scales[0];
        if (durationSpecs == FROM_FILE)
            tscale = scales[1];
        escale = scales[2];
        vscale = scales[3];
    }
    boolean overridePitch = pitchSpecs == FROM_FILE || pitchSpecs == FROM_CODE;
    boolean overrideDuration = durationSpecs == FROM_FILE || durationSpecs == FROM_CODE;
    if (overridePitch || overrideDuration) {
        for (int i = 0; i < timeScales.length; i++) {
            if (overridePitch)
                Arrays.fill(pitchScales[i], pscale);
            if (overrideDuration)
                Arrays.fill(timeScales[i], tscale);
        }
    }
    // tscaleSingle is the common time scale if all entries are equal, -1 otherwise. Every entry has to be compared:
    // skipping the first row and the first column would miss differences there.
    double firstTScale = timeScales[0][0];
    tscaleSingle = firstTScale;
    scan: for (int i = 0; i < timeScales.length; i++) {
        for (int j = 0; j < timeScales[i].length; j++) {
            if (timeScales[i][j] != firstTScale) {
                tscaleSingle = -1.0;
                break scan;
            }
        }
    }
    // Total input length (including the right context of each unit) and total number of frames
    origLen = 0;
    numfrm = 0;
    for (int i = 0; i < datagrams.length; i++) {
        for (int j = 0; j < datagrams[i].length; j++) {
            origLen += datagrams[i][j].getDuration();
            if (j == datagrams[i].length - 1 && rightContexts != null && rightContexts[i] != null)
                origLen += rightContexts[i].getDuration();
            numfrm++;
        }
    }
    double[] yOut = null;
    for (int i = 0; i < datagrams.length; i++) {
        for (int j = 0; j < datagrams[i].length; j++) {
            boolean bLastInputFrame = i == datagrams.length - 1 && j == datagrams[i].length - 1;
            Datagram current = datagrams[i][j];
            // Successor of the current Datagram; null means "pad with silence"
            Datagram next = null;
            if (j < datagrams[i].length - 1)
                next = datagrams[i][j + 1];
            else if (rightContexts != null && rightContexts[i] != null)
                next = rightContexts[i];
            else if (i < datagrams.length - 1)
                next = datagrams[i + 1][0];
            int currentPeriod = (int) current.getDuration();
            int inputFrameSize = currentPeriod + (next != null ? (int) next.getDuration() : currentPeriod);
            // Freshly allocated arrays are zero-filled, which provides the silence padding
            double[] frmIn = new double[inputFrameSize];
            double[] currentData = new DatagramDoubleDataSource(new Datagram[] { current }).getAllData();
            System.arraycopy(currentData, 0, frmIn, 0, currentData.length);
            if (next != null) {
                double[] nextData = new DatagramDoubleDataSource(new Datagram[] { next }).getAllData();
                System.arraycopy(nextData, 0, frmIn, currentData.length, nextData.length);
            }
            // isVoiced = voicings[i][j];
            boolean isVoiced = SignalProcUtils.getVoicing(frmIn, (int) (audioformat.getSampleRate()), 0.35f);
            // Start from null for every frame so that a failed frame does not append the previous output again
            double[] output = null;
            try {
                output = processFrame(frmIn, isVoiced, pitchScales[i][j], timeScales[i][j], escale, vscale,
                        bLastInputFrame, currentPeriod, inputFrameSize);
            } catch (IOException e) {
                e.printStackTrace();
            }
            if (output != null) {
                if (yOut == null) {
                    yOut = output.clone();
                } else {
                    double[] joined = new double[yOut.length + output.length];
                    System.arraycopy(yOut, 0, joined, 0, yOut.length);
                    System.arraycopy(output, 0, joined, yOut.length, output.length);
                    yOut = joined;
                }
            }
        }
    }
    // Flush what is still buffered
    double[] remaining = null;
    try {
        remaining = writeFinal();
    } catch (IOException e) {
        e.printStackTrace();
    }
    if (remaining != null) {
        if (yOut == null) {
            yOut = remaining.clone();
        } else {
            double[] joined = new double[yOut.length + remaining.length];
            System.arraycopy(yOut, 0, joined, 0, yOut.length);
            System.arraycopy(remaining, 0, joined, yOut.length, remaining.length);
            yOut = joined;
        }
    }
    // Scale down if the signal would exceed the limit of 32700
    double absMax = MathUtils.absMax(yOut);
    if (absMax > 32700) {
        for (int i = 0; i < yOut.length; i++)
            yOut[i] = yOut[i] / absMax * 32700;
    }
    return new DDSAudioInputStream(new BufferedDoubleDataSource(yOut), audioformat);
}
/**
 * FD-PSOLA on the whole signal with specified pitch marks.
 * <p>
 * Frame <code>i</code> covers the samples from pitch mark <code>i</code> up to and including pitch mark
 * <code>i + numPeriods</code>. Voicing is determined from the signal, so <code>voicings</code> is currently not
 * consulted. If the absolute maximum of the result exceeds 32700, the result is scaled down to that maximum.
 * <p>
 * IOExceptions raised while processing a frame or flushing the final samples are printed and the affected output
 * is left out.
 *
 * @param x
 *            the input signal
 * @param pitchMarks
 *            sample indices of the pitch marks in <b>x</b>
 * @param audioformat
 *            audio format of the data; its sample rate is used for the voicing decision
 * @param voicings
 *            voicing flags, one per frame (currently unused)
 * @param pitchScales
 *            pitch modification factors, one per frame
 * @param timeScales
 *            duration modification factors, one per frame
 * @return modified audio as an audio stream
 */
public DDSAudioInputStream process(double[] x, int[] pitchMarks, AudioFormat audioformat, boolean[] voicings,
        double[] pitchScales, double[] timeScales) {
    int pitchSpecs = FROM_TARGET;
    // int pitchSpecs = FROM_FILE;
    // int pitchSpecs = FROM_CODE;
    int durationSpecs = FROM_TARGET;
    // int durationSpecs = FROM_FILE;
    // int durationSpecs = FROM_CODE;
    double pscale = 1.0; // if pitchSpecs==FROM_CODE flag, this value will be used for pitch scaling
    double tscale = 1.0; // if durationSpecs==FROM_CODE flag, this value will be used for duration scaling
    double escale = 1.0;
    double vscale = 1.0;
    // Read pscale, tscale, escale and vscale from a text file.
    // (For quick testing purposes. It resets the input pitchScales and timeScales to the fixed values in the text file.)
    if (pitchSpecs == FROM_FILE || durationSpecs == FROM_FILE) {
        double[] scales = getScalesFromTextFile("d:/psolaParam.txt");
        if (pitchSpecs == FROM_FILE)
            pscale = scales[0];
        if (durationSpecs == FROM_FILE)
            tscale = scales[1];
        escale = scales[2];
        vscale = scales[3];
    }
    if (pitchSpecs == FROM_FILE || pitchSpecs == FROM_CODE)
        Arrays.fill(pitchScales, pscale);
    if (durationSpecs == FROM_FILE || durationSpecs == FROM_CODE)
        Arrays.fill(timeScales, tscale);
    // tscaleSingle is the common time scale if all entries are equal, -1 otherwise
    double firstTScale = timeScales[0];
    tscaleSingle = firstTScale;
    for (int i = 1; i < timeScales.length; i++) {
        if (timeScales[i] != firstTScale) {
            tscaleSingle = -1.0;
            break;
        }
    }
    origLen = x.length;
    numfrm = pitchMarks.length - numPeriods;
    double[] yOut = null;
    for (int i = 0; i < pitchMarks.length - numPeriods; i++) {
        boolean bLastInputFrame = i == pitchMarks.length - numPeriods - 1;
        int inputFrameSize = pitchMarks[i + numPeriods] - pitchMarks[i] + 1;
        double[] frmIn = new double[inputFrameSize];
        System.arraycopy(x, pitchMarks[i], frmIn, 0, inputFrameSize);
        int currentPeriod = pitchMarks[i + 1] - pitchMarks[i] + 1;
        // isVoiced = voicings[i];
        boolean isVoiced = SignalProcUtils.getVoicing(frmIn, (int) (audioformat.getSampleRate()), 0.35f);
        // Start from null for every frame so that a failed frame does not append the previous output again
        double[] output = null;
        try {
            output = processFrame(frmIn, isVoiced, pitchScales[i], timeScales[i], escale, vscale, bLastInputFrame,
                    currentPeriod, inputFrameSize);
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (output != null) {
            if (yOut == null) {
                yOut = output.clone();
            } else {
                double[] joined = new double[yOut.length + output.length];
                System.arraycopy(yOut, 0, joined, 0, yOut.length);
                System.arraycopy(output, 0, joined, yOut.length, output.length);
                yOut = joined;
            }
        }
    }
    // Flush what is still buffered
    double[] remaining = null;
    try {
        remaining = writeFinal();
    } catch (IOException e) {
        e.printStackTrace();
    }
    if (remaining != null) {
        if (yOut == null) {
            yOut = remaining.clone();
        } else {
            double[] joined = new double[yOut.length + remaining.length];
            System.arraycopy(yOut, 0, joined, 0, yOut.length);
            System.arraycopy(remaining, 0, joined, yOut.length, remaining.length);
            yOut = joined;
        }
    }
    // Scale down if the signal would exceed the limit of 32700
    double absMax = MathUtils.absMax(yOut);
    if (absMax > 32700) {
        for (int i = 0; i < yOut.length; i++)
            yOut[i] = yOut[i] / absMax * 32700;
    }
    return new DDSAudioInputStream(new BufferedDoubleDataSource(yOut), audioformat);
}
/**
 * FD-PSOLA on a single concatenation unit.
 * <p>
 * Every Datagram is processed together with its successor: the next Datagram of the unit, else the right context,
 * else silence of the same duration. Voicing is determined from the signal, so <code>voicings</code> is currently
 * not consulted.
 * <p>
 * IOExceptions raised while processing a frame or flushing the final samples are printed and the affected output
 * is left out.
 *
 * @param datagrams
 *            the Datagrams of the unit
 * @param rightContext
 *            the Datagram following the unit, or null if there is none
 * @param audioformat
 *            audio format of the data; its sample rate is used for the voicing decision
 * @param voicings
 *            voicing flags, matching <b>datagrams</b> (currently unused)
 * @param pitchScales
 *            pitch modification factors, matching <b>datagrams</b>
 * @param timeScales
 *            duration modification factors, matching <b>datagrams</b>
 * @param bLastDatagram
 *            true if this unit is the last one, so that its last Datagram is treated as the last input frame
 * @return the modified samples, or null if no output was produced
 */
public double[] processDatagram(Datagram[] datagrams, Datagram rightContext, AudioFormat audioformat, boolean[] voicings,
        double[] pitchScales, double[] timeScales, boolean bLastDatagram) {
    int pitchSpecs = FROM_TARGET;
    // int pitchSpecs = FROM_FILE;
    // int pitchSpecs = FROM_CODE;
    int durationSpecs = FROM_TARGET;
    // int durationSpecs = FROM_FILE;
    // int durationSpecs = FROM_CODE;
    double pscale = 1.0; // if pitchSpecs==FROM_CODE flag, this value will be used for pitch scaling
    double tscale = 1.0; // if durationSpecs==FROM_CODE flag, this value will be used for duration scaling
    double escale = 1.0;
    double vscale = 1.0;
    // Read pscale, tscale, escale and vscale from a text file.
    // (For quick testing purposes. It resets the input pitchScales and timeScales to the fixed values in the text file.)
    if (pitchSpecs == FROM_FILE || durationSpecs == FROM_FILE) {
        double[] scales = getScalesFromTextFile("d:/psolaParam.txt");
        if (pitchSpecs == FROM_FILE)
            pscale = scales[0];
        if (durationSpecs == FROM_FILE)
            tscale = scales[1];
        escale = scales[2];
        vscale = scales[3];
    }
    if (pitchSpecs == FROM_FILE || pitchSpecs == FROM_CODE)
        Arrays.fill(pitchScales, pscale);
    if (durationSpecs == FROM_FILE || durationSpecs == FROM_CODE)
        Arrays.fill(timeScales, tscale);
    // tscaleSingle is the common time scale if all entries are equal, -1 otherwise
    double firstTScale = timeScales[0];
    tscaleSingle = firstTScale;
    for (int j = 1; j < timeScales.length; j++) {
        if (timeScales[j] != firstTScale) {
            tscaleSingle = -1.0;
            break;
        }
    }
    // Total input length (including the right context) and number of frames
    origLen = 0;
    numfrm = 0;
    for (int j = 0; j < datagrams.length; j++) {
        origLen += datagrams[j].getDuration();
        if (j == datagrams.length - 1 && rightContext != null)
            origLen += rightContext.getDuration();
        numfrm++;
    }
    double[] yOut = null;
    for (int j = 0; j < datagrams.length; j++) {
        boolean isFinalOfUnit = j == datagrams.length - 1;
        boolean bLastInputFrame = bLastDatagram && isFinalOfUnit;
        Datagram current = datagrams[j];
        // Successor of the current Datagram; null means "pad with silence"
        Datagram next = isFinalOfUnit ? rightContext : datagrams[j + 1];
        int currentPeriod = (int) current.getDuration();
        int inputFrameSize = currentPeriod + (next != null ? (int) next.getDuration() : currentPeriod);
        // Freshly allocated arrays are zero-filled, which provides the silence padding
        double[] frmIn = new double[inputFrameSize];
        double[] currentData = new DatagramDoubleDataSource(new Datagram[] { current }).getAllData();
        System.arraycopy(currentData, 0, frmIn, 0, currentData.length);
        if (next != null) {
            double[] nextData = new DatagramDoubleDataSource(new Datagram[] { next }).getAllData();
            System.arraycopy(nextData, 0, frmIn, currentData.length, nextData.length);
        }
        // isVoiced = voicings[j];
        boolean isVoiced = SignalProcUtils.getVoicing(frmIn, (int) (audioformat.getSampleRate()), 0.35f);
        // Start from null for every frame so that a failed frame does not append the previous output again
        double[] output = null;
        try {
            output = processFrame(frmIn, isVoiced, pitchScales[j], timeScales[j], escale, vscale, bLastInputFrame,
                    currentPeriod, inputFrameSize);
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (output != null) {
            if (yOut == null) {
                yOut = output.clone();
            } else {
                double[] joined = new double[yOut.length + output.length];
                System.arraycopy(yOut, 0, joined, 0, yOut.length);
                System.arraycopy(output, 0, joined, yOut.length, output.length);
                yOut = joined;
            }
        }
    }
    // Flush what is still buffered
    double[] remaining = null;
    try {
        remaining = writeFinal();
    } catch (IOException e) {
        e.printStackTrace();
    }
    if (remaining != null) {
        if (yOut == null) {
            yOut = remaining.clone();
        } else {
            double[] joined = new double[yOut.length + remaining.length];
            System.arraycopy(yOut, 0, joined, 0, yOut.length);
            System.arraycopy(remaining, 0, joined, yOut.length, remaining.length);
            yOut = joined;
        }
    }
    return yOut;
}
/**
 * Reads up to four scale factors from a text file (for quick testing).
 * <p>
 * The first four tokens of the file are examined in order; a numeric token is stored at the position of that token,
 * any other token leaves the corresponding entry at 0.0. If the file cannot be opened or read, the problem is
 * printed and the entries read so far (all 0.0 if nothing could be read) are returned.
 *
 * @param strScaleFile
 *            path of the text file containing the scale factors
 * @return an array of four values; the callers in this class interpret them as pscale, tscale, escale and vscale
 */
public double[] getScalesFromTextFile(String strScaleFile) {
    double[] scales = new double[4];
    Reader reader = null;
    try {
        reader = new BufferedReader(new FileReader(strScaleFile));
        StreamTokenizer tokenizer = new StreamTokenizer(reader);
        tokenizer.parseNumbers();
        for (int i = 0; i < scales.length; i++) {
            if (tokenizer.nextToken() == StreamTokenizer.TT_NUMBER)
                scales[i] = tokenizer.nval;
        }
    } catch (IOException e) {
        // Also covers FileNotFoundException: report it and fall through to return what we have
        e.printStackTrace();
    } finally {
        // Close on every path; the reader is null if the file could not be opened
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return scales;
}
/**
 * Runs the complete modification for the file-based setup: fetches every input frame from the frame provider,
 * processes it with the per-frame scale factors of the preprocessed modification parameters, flushes the remaining
 * samples and converts the result to a wav file in the format of the input audio.
 * <p>
 * The input audio stream is closed when the method ends, whether processing succeeded or failed.
 *
 * @throws IOException
 *             if processing a frame, writing the output or closing the input fails
 */
public void fdpsolaOnline() throws IOException {
    inputFrameIndex = 0;
    try {
        for (int i = 0; i < numfrm; i++) {
            double[] frmIn = psFrm.getNextFrame();
            if (bBroke)
                break;
            boolean isLastInputFrame = i == numfrm - 1;
            int currentPeriod = pm.pitchMarks[i + 1] - pm.pitchMarks[i];
            int inputFrameSize = pm.pitchMarks[i + modParams.numPeriods] - pm.pitchMarks[i] + 1;
            // A frame counts as voiced when its f0 value is above 10.0
            boolean isVoiced = pm.f0s[i] > 10.0;
            processFrame(frmIn, isVoiced, modParams.pscalesVar[i], modParams.tscalesVar[i], modParams.escalesVar[i],
                    modParams.vscalesVar[i], isLastInputFrame, currentPeriod, inputFrameSize);
        }
        writeFinal();
        convertToWav(inputAudio.getFormat());
    } finally {
        // Release the input stream even if processing failed part-way
        if (inputAudio != null)
            inputAudio.close();
    }
}
/**
* Processes a single input frame: decides whether the frame is skipped, used once or repeated in order
* to reach the requested duration, optionally modifies its spectrum (when the frame is voiced with
* {@code pscale != 1.0}, or when {@code vscale != 1.0}), and overlap-adds the resulting frame into the
* synthesis buffers {@code ySynthBuff}/{@code wSynthBuff}. Finished samples are moved to {@code outBuff};
* each time {@code outBuff} fills up it is either written to {@code dout} (when {@code isWavFileOutput}
* is set) or appended to the array returned by this method.
* <p>
* The method updates a number of instance fields (frame sizes, duration bookkeeping, buffer positions,
* {@code synthFrameInd}, {@code totalWrittenToFile}, ...), sets {@code bBroke} once the last synthesized
* frame has been produced, and always increments {@code inputFrameIndex} before returning.
*
* @param frmIn
*            samples of the input frame; at most {@code min(frmIn.length, frame size)} samples are used
* @param isVoiced
*            whether the frame is voiced; pitch scaling of the frame size and skip size is applied to
*            voiced frames only
* @param pscale
*            pitch scale factor, clamped to [MIN_PSCALE, MAX_PSCALE]
* @param tscale
*            time scale factor, clamped to [MIN_TSCALE, MAX_TSCALE]
* @param escale
*            energy scale factor, multiplied into the frame gain
* @param vscale
*            vocal tract scale factor; any value other than 1.0 enables warping of the vocal tract
*            spectrum
* @param isLastInputFrame
*            whether this is the last input frame
* @param currentPeriod
*            pitch period of the frame in samples; used to derive the synthesis skip size
* @param inputFrameSize
*            size of the input frame in samples; rounded up to an even value of at least 4
* @return the samples flushed from {@code outBuff} during this call when output is not written to
*         {@code dout}; {@code null} if nothing was flushed (including when the frame was skipped or
*         output goes to {@code dout})
* @throws IOException
*             if writing to {@code dout} fails
*/
public double[] processFrame(double[] frmIn, boolean isVoiced, double pscale, double tscale, double escale, double vscale,
boolean isLastInputFrame, int currentPeriod, int inputFrameSize) throws IOException {
// Clamp pitch and time scale factors to the supported ranges
if (pscale < MIN_PSCALE)
pscale = MIN_PSCALE;
if (pscale > MAX_PSCALE)
pscale = MAX_PSCALE;
if (tscale < MIN_TSCALE)
tscale = MIN_TSCALE;
if (tscale > MAX_TSCALE)
tscale = MAX_TSCALE;
// Samples flushed during this call when not writing to dout; stays null if no flush happens
double[] output = null;
double[] outputTmp = null;
int j, k, wInd, kMax;
int tmpFix, tmpAdd, tmpMul;
int remain;
int kInd;
repeatSkipCount = 0; // -1:skip frame, 0:no repetition (use synthesized frame as it is), >0: number of repetitions for
// synthesized frame
// Compute new frame sizes, change in durations due to pitch scaling, and required compensation amount in samples
// &
// Find out which pitch-scaled frames to repeat/skip for overall duration
// compensation
// Input frame size is forced to be even and at least 4 samples
frmSize = inputFrameSize;
if ((frmSize % 2) != 0)
frmSize++;
if (frmSize < 4)
frmSize = 4;
// Only voiced frames change size with pitch scaling; the new size is again forced to be even and >= 4
if (isVoiced) {
newFrmSize = (int) (Math.floor(frmSize / pscale + 0.5));
if ((newFrmSize % 2) != 0)
newFrmSize++;
if (newFrmSize < 4)
newFrmSize = 4;
} else
newFrmSize = frmSize;
newPeriod = (int) Math.floor(((double) newFrmSize) / NUM_PITCH_SYNC_PERIODS + 0.5);
// Compute duration compensation required:
// localDurDiffs(i) = (DESIRED)-(AFTER PITCHSCALING)
// (-) if expansion occurred, (+) if compression occurred
// We aim to make this as close to zero as possible in the following duration compensation step
// nextAdd carries over the part of the difference that the previous frame could not compensate
localDurDiff = nextAdd + (frmSize * tscale - newFrmSize) / NUM_PITCH_SYNC_PERIODS;
nextAdd = 0;
if (localDurDiff < -0.1 * newPeriod) // Expansion occurred so skip this frame
{
repeatSkipCount--;
if (!isLastInputFrame) {
nextAdd = localDurDiff + newPeriod;
localDurDiff = 0;
}
} else if (localDurDiff > 0.1 * newPeriod) // Compression occurred so repeat this frame
{
// One repetition per missing period until the remaining difference is within tolerance
while (localDurDiff > 0.1 * newPeriod) {
repeatSkipCount++;
localDurDiff -= newPeriod;
}
if (!isLastInputFrame) {
nextAdd = localDurDiff;
localDurDiff = 0;
}
}
sumLocalDurDiffs += localDurDiff;
if (isLastInputFrame) {
// Check the final length and perform additional repetitions if necessary
localDurDiff = sumLocalDurDiffs;
while (localDurDiff > 0) {
repeatSkipCount++;
localDurDiff -= newPeriod;
}
//
}
// The last input frame always gets one more repetition, so a skip (-1) becomes a single use (0)
if (isLastInputFrame) {
repeatSkipCount++;
bLastFrame = true;
}
if (repeatSkipCount > -1) {
// Zero-padded copy of the input frame at the (even) analysis frame size
frm = MathUtils.zeros(frmSize);
System.arraycopy(frmIn, 0, frm, 0, Math.min(frmIn.length, frmSize));
wgt = windowIn.values(frmSize);
if (vscale != 1.0)
bWarp = true;
else
bWarp = false;
// if (isVoiced || bWarp) //For forcing FDPSOLA to be applied even when pscale=1.0
if ((isVoiced && pscale != 1.0) || bWarp) {
// Grow the FFT size to the next power of two that fits the frame (it is never reduced here)
if (fftSize < frmSize) {
fftSize = (int) Math.pow(2, (Math.ceil(Math.log((double) frmSize) / Math.log(2.0))));
maxFreq = fftSize / 2 + 1;
}
// Number of frequency bins of the modified spectrum: at least 3 and always odd
newMaxFreq = (int) Math.floor(maxFreq / pscale + 0.5);
if (newMaxFreq < 3)
newMaxFreq = 3;
if ((newMaxFreq % 2) != 1)
newMaxFreq++;
// This is for being able to use the FFT algorithm that works only with buffers of length power of two
// If you have an FFT algorithm that works with any buffer size, simply remove this line
// newMaxFreq = (int)Math.floor(0.5*MathUtils.closestPowerOfTwoAbove(2*(newMaxFreq-1))+1.5);
//
newFftSize = 2 * (newMaxFreq - 1);
// Energy of the unmodified frame, used below for gain compensation
frmEn = SignalProcUtils.getEnergy(frm);
// Compute LP and excitation spectrum
super.initialise(lpOrder, fs, fftSize, true); // Perform only analysis
windowIn.applyInline(frm, 0, frmSize); // Windowing
applyInline(frm, 0, frmSize); // LP analysis
// NOTE(review): vtSpectrum and this.h used below are presumably filled in by applyInline above - confirm
// Expand/Compress the vocal tract spectrum in inverse manner
inputVT = MathUtils.interpolate(vtSpectrum, newMaxFreq); // Interpolated vocal tract spectrum
// Perform vocal tract scaling
if (bWarp) {
tmpvsc[0] = vscale;
newVScales = MathUtils.modifySize(tmpvsc, newMaxFreq); // Modify length to match current length of spectrum
for (k = 0; k < newVScales.length; k++) {
if (newVScales[k] < 0.05) // Put a floor to avoid divide by zero
newVScales[k] = 0.05;
}
py2 = new double[newMaxFreq];
for (k = 0; k < newMaxFreq; k++) {
wInd = (int) Math.floor((k + 1) / newVScales[k] + 0.5); // Find new indices
// Keep the 1-based source index inside [1, newMaxFreq]
if (wInd < 1)
wInd = 1;
if (wInd > newMaxFreq)
wInd = newMaxFreq;
py2[k] = inputVT[wInd - 1];
}
System.arraycopy(py2, 0, inputVT, 0, newMaxFreq);
}
// Create output DFT spectrum
hy = new ComplexArray(newFftSize);
hy.real = MathUtils.zeros(newFftSize);
hy.imag = MathUtils.zeros(newFftSize);
System.arraycopy(this.h.real, 0, hy.real, 0, Math.min(maxFreq, newFftSize));
System.arraycopy(this.h.imag, 0, hy.imag, 0, Math.min(maxFreq, newFftSize));
// Copy & paste samples if required (COMPLEX VERSION TO SUPPORT PSCALE<=0.5)
// This version fills the spectrum by flipping and pasting the original freq bins as many times as required.
kMax = 1;
while (newMaxFreq > (kMax + 1) * (maxFreq - 2))
kMax++;
for (k = 1; k <= kMax; k++) {
tmpFix = (maxFreq - 2) * k;
// Odd passes read the original bins in descending order (flipped), even passes in ascending order
if (k % 2 == 1) // Odd mode
{
tmpAdd = maxFreq + 2;
tmpMul = 1;
} else {
tmpAdd = -1;
tmpMul = -1;
}
for (j = tmpFix + 3; j <= Math.min(newMaxFreq, maxFreq + tmpFix); j++) {
hy.real[j - 1] = this.h.real[tmpMul * (tmpFix - j) + tmpAdd - 1];
hy.imag[j - 1] = this.h.imag[tmpMul * (tmpFix - j) + tmpAdd - 1];
}
}
// Replace the highest bin by its magnitude so that it is purely real
hy.real[newMaxFreq - 1] = Math.sqrt(hy.real[newMaxFreq - 1] * hy.real[newMaxFreq - 1] + hy.imag[newMaxFreq - 1]
* hy.imag[newMaxFreq - 1]);
hy.imag[newMaxFreq - 1] = 0.0;
// Convolution
// (carried out as a bin-wise multiplication with the vocal tract spectrum)
for (k = 1; k <= newMaxFreq; k++) {
hy.real[k - 1] *= inputVT[k - 1];
hy.imag[k - 1] *= inputVT[k - 1];
}
// Fill the upper half of the spectrum with the complex conjugate mirror image of the lower half
for (k = newMaxFreq + 1; k <= newFftSize; k++) {
hy.real[k - 1] = hy.real[2 * newMaxFreq - 1 - k];
hy.imag[k - 1] = -hy.imag[2 * newMaxFreq - 1 - k];
}
// Convert back to time domain
// FFT.transform(hy.real, hy.imag, true);
// hy = FFTArbitraryLength.ifft(hy);
hy = FFTMixedRadix.ifft(hy);
// Keep the first newFrmSize samples of the real part as the synthesized frame
frmy = new double[newFrmSize];
System.arraycopy(hy.real, 0, frmy, 0, newFrmSize);
frmyEn = SignalProcUtils.getEnergy(frmy);
// Ratio of length-normalized input and output frame energies, times the requested energy scale
gain = (frmEn / Math.sqrt(frmSize)) / (frmyEn / Math.sqrt(newFrmSize)) * escale;
} else {
// No spectral modification: the output frame is the windowed input frame
// NOTE(review): the loop below runs up to frmSize, so it relies on newFrmSize == frmSize in this branch
if (frmSize < newFrmSize)
newFrmSize = frmSize;
frmy = new double[newFrmSize];
for (k = 0; k < frmSize; k++)
frmy[k] = frm[k] * wgt[k];
gain = escale;
}
// Energy scale compensation + modification
for (k = 0; k < newFrmSize; k++) {
frmy[k] *= gain;
}
// Synthesize the frame once plus repeatSkipCount repetitions
for (j = 1; j <= repeatSkipCount + 1; j++) {
// Number of samples the synthesis position advances (and that are finalized) per synthesized frame
if (isVoiced)
newSkipSize = (int) Math.floor(currentPeriod / pscale + 0.5);
else
newSkipSize = (int) Math.floor(currentPeriod + 0.5);
// Only the final repetition of the last input frame counts as the last synthesized frame
if ((isLastInputFrame && j == repeatSkipCount + 1)) // | (i~=numfrm & all(repeatSkipCounts(i+1:numfrm)==-1)))
bLastFrame = true;
else
bLastFrame = false;
synthFrameInd++;
wgty = windowOut.values(newFrmSize);
if (synthFrameInd == 1) // First frame: Do not window the first half of output speech frame to prevent overflow in
// normalization with hanning coeffs
{
halfWin = (int) Math.floor(newFrmSize / 2.0 + 0.5);
synthTotal = synthSt + newFrmSize;
// Keep output in an overlap-add buffer
if (ySynthInd + newFrmSize - 1 <= maxNewFrmSize) {
// The frame fits without wrapping: first half is copied unwindowed, second half is overlap-added
for (k = ySynthInd; k <= ySynthInd + halfWin - 1; k++) {
ySynthBuff[k - 1] = frmy[k - ySynthInd];
wSynthBuff[k - 1] = 1.0;
}
for (k = ySynthInd + halfWin; k <= ySynthInd + newFrmSize - 1; k++) {
ySynthBuff[k - 1] += frmy[k - ySynthInd] * wgty[k - ySynthInd];
wSynthBuff[k - 1] += wgty[k - ySynthInd] * wgty[k - ySynthInd];
}
} else {
// The frame wraps around the end of the buffer: fill up to maxNewFrmSize first ...
for (k = ySynthInd; k <= maxNewFrmSize; k++) {
if (k - ySynthInd < halfWin) {
ySynthBuff[k - 1] = frmy[k - ySynthInd];
wSynthBuff[k - 1] = 1.0;
} else {
ySynthBuff[k - 1] += frmy[k - ySynthInd] * wgty[k - ySynthInd];
wSynthBuff[k - 1] += wgty[k - ySynthInd] * wgty[k - ySynthInd];
}
}
// ... then continue with the remaining samples from the start of the buffer
for (k = 1; k <= newFrmSize - 1 - maxNewFrmSize + ySynthInd; k++) {
if (maxNewFrmSize - ySynthInd + k < halfWin) {
ySynthBuff[k - 1] = frmy[maxNewFrmSize - ySynthInd + k];
wSynthBuff[k - 1] = 1.0;
} else {
ySynthBuff[k - 1] += frmy[maxNewFrmSize - ySynthInd + k] * wgty[maxNewFrmSize - ySynthInd + k];
wSynthBuff[k - 1] += wgty[maxNewFrmSize - ySynthInd + k] * wgty[maxNewFrmSize - ySynthInd + k];
}
}
}
//
if (!bSilent)
System.out.println("Synthesized using frame " + String.valueOf(inputFrameIndex + 1));
} else if (bLastFrame) // Last frame: Do not window the second half of output speech frame to prevent overflow in
// normalization with hanning coeffs
{
halfWin = (int) Math.floor(newFrmSize / 2.0 + 0.5);
remain = newFrmSize - halfWin;
synthTotal = synthSt + halfWin + remain - 1;
// Keep output in an overlap-add buffer
if (ySynthInd + newFrmSize - 1 <= maxNewFrmSize) {
// The frame fits without wrapping: first half is overlap-added, second half is added unwindowed
for (k = ySynthInd; k <= ySynthInd + halfWin - 1; k++) {
ySynthBuff[k - 1] += frmy[k - ySynthInd] * wgty[k - ySynthInd];
wSynthBuff[k - 1] += wgty[k - ySynthInd] * wgty[k - ySynthInd];
}
for (k = ySynthInd + halfWin; k <= ySynthInd + newFrmSize - 1; k++) {
ySynthBuff[k - 1] += frmy[k - ySynthInd];
wSynthBuff[k - 1] = 1.0;
}
} else {
// The frame wraps around the end of the buffer: fill up to maxNewFrmSize first ...
for (k = ySynthInd; k <= maxNewFrmSize; k++) {
if (k - ySynthInd < halfWin) {
ySynthBuff[k - 1] += frmy[k - ySynthInd] * wgty[k - ySynthInd];
wSynthBuff[k - 1] += wgty[k - ySynthInd] * wgty[k - ySynthInd];
} else {
ySynthBuff[k - 1] += frmy[k - ySynthInd];
wSynthBuff[k - 1] = 1.0;
}
}
// ... then continue with the remaining samples from the start of the buffer
for (k = 1; k <= newFrmSize - 1 - maxNewFrmSize + ySynthInd; k++) {
if (maxNewFrmSize - ySynthInd + k < halfWin) {
ySynthBuff[k - 1] += frmy[maxNewFrmSize - ySynthInd + k] * wgty[maxNewFrmSize - ySynthInd + k];
wSynthBuff[k - 1] += wgty[maxNewFrmSize - ySynthInd + k] * wgty[maxNewFrmSize - ySynthInd + k];
} else {
ySynthBuff[k - 1] += frmy[maxNewFrmSize - ySynthInd + k];
wSynthBuff[k - 1] = 1.0;
}
}
}
//
if (!bSilent)
System.out.println("Synthesized using frame " + String.valueOf(inputFrameIndex + 1));
} else // Normal frame
{
if (!isVoiced && ((repeatSkipCount % 2) == 1)) // Reverse unvoiced repeated frames once in two consecutive
// repetitions to reduce distortion
frmy = SignalProcUtils.reverse(frmy);
synthTotal = synthSt + newFrmSize;
// Keep output in an overlap-add buffer
if (ySynthInd + newFrmSize - 1 <= maxNewFrmSize) {
// The frame fits without wrapping: overlap-add the whole windowed frame
for (k = ySynthInd; k <= ySynthInd + newFrmSize - 1; k++) {
ySynthBuff[k - 1] += frmy[k - ySynthInd] * wgty[k - ySynthInd];
wSynthBuff[k - 1] += wgty[k - ySynthInd] * wgty[k - ySynthInd];
}
} else {
// The frame wraps around the end of the buffer: fill up to maxNewFrmSize first ...
for (k = ySynthInd; k <= maxNewFrmSize; k++) {
ySynthBuff[k - 1] += frmy[k - ySynthInd] * wgty[k - ySynthInd];
wSynthBuff[k - 1] += wgty[k - ySynthInd] * wgty[k - ySynthInd];
}
// ... then continue with the remaining samples from the start of the buffer
for (k = 1; k <= newFrmSize - 1 - maxNewFrmSize + ySynthInd; k++) {
ySynthBuff[k - 1] += frmy[k + maxNewFrmSize - ySynthInd] * wgty[k + maxNewFrmSize - ySynthInd];
wSynthBuff[k - 1] += wgty[k + maxNewFrmSize - ySynthInd] * wgty[k + maxNewFrmSize - ySynthInd];
}
}
//
if (!bSilent) {
if (j == 1)
System.out.println("Synthesized using frame " + String.valueOf(inputFrameIndex + 1));
else
System.out.println("Repeated using frame " + String.valueOf(inputFrameIndex + 1));
}
}
// Write to output buffer
// The first newSkipSize samples starting at ySynthInd are moved to outBuff, divided by the
// accumulated window weight when it is positive, and their overlap-add slots are cleared
for (k = 0; k <= newSkipSize - 1; k++) {
// 1-based position in the overlap-add buffer, wrapping around at maxNewFrmSize
kInd = (k + ySynthInd) % maxNewFrmSize;
if (kInd == 0)
kInd = maxNewFrmSize;
if (wSynthBuff[kInd - 1] > 0.0)
outBuff[outBuffStart - 1] = ySynthBuff[kInd - 1] / wSynthBuff[kInd - 1];
else
outBuff[outBuffStart - 1] = ySynthBuff[kInd - 1];
ySynthBuff[kInd - 1] = 0.0;
wSynthBuff[kInd - 1] = 0.0;
outBuffStart++;
// outBuff is full: flush it to dout or append it to the returned array
if (outBuffStart > outBuffLen) {
// Without time scaling (tscaleSingle == 1.0) the total output is limited to origLen samples
if (tscaleSingle != 1.0 || totalWrittenToFile + outBuffLen <= origLen) {
if (isWavFileOutput)
dout.writeDouble(outBuff, 0, outBuffLen);
else {
if (output == null) {
output = new double[outBuffLen];
System.arraycopy(outBuff, 0, output, 0, outBuffLen);
} else {
// Grow the returned array and append the whole buffer
outputTmp = new double[output.length];
System.arraycopy(output, 0, outputTmp, 0, output.length);
output = new double[outputTmp.length + outBuffLen];
System.arraycopy(outputTmp, 0, output, 0, outputTmp.length);
System.arraycopy(outBuff, 0, output, outputTmp.length, outBuffLen);
}
}
totalWrittenToFile += outBuffLen;
} else {
// Only the samples still missing up to origLen are written
if (isWavFileOutput)
dout.writeDouble(outBuff, 0, origLen - totalWrittenToFile);
else {
if (output == null) {
output = new double[origLen - totalWrittenToFile];
System.arraycopy(outBuff, 0, output, 0, origLen - totalWrittenToFile);
} else {
outputTmp = new double[output.length];
System.arraycopy(output, 0, outputTmp, 0, output.length);
output = new double[outputTmp.length + origLen - totalWrittenToFile];
System.arraycopy(outputTmp, 0, output, 0, outputTmp.length);
System.arraycopy(outBuff, 0, output, outputTmp.length, origLen - totalWrittenToFile);
}
}
totalWrittenToFile = origLen;
}
outBuffStart = 1;
}
}
//
// Advance the synthesis position; the buffer index wraps around at maxNewFrmSize
synthSt += newSkipSize;
// if (!bLastFrame)
// {
if (ySynthInd + newSkipSize <= maxNewFrmSize)
ySynthInd += newSkipSize;
else
ySynthInd += newSkipSize - maxNewFrmSize;
// }
// ///////
// After the last synthesized frame, flag completion (bBroke) and stop repeating
if (bLastFrame) {
bBroke = true;
break;
}
}
} else {
if (!bSilent)
System.out.println("Skipped frame " + String.valueOf(inputFrameIndex + 1));
}
inputFrameIndex++;
return output;
}
/**
* Writes the samples that are still held in the overlap-add buffers (positions {@code synthSt} to
* {@code synthTotal}) and closes the temporary output stream {@code dout} if there is one.
* <p>
* Each remaining sample is divided by its accumulated window weight when that weight is positive, moved
* to {@code outBuff}, and its overlap-add slot is cleared. Whenever {@code outBuff} fills up, and once
* more at the end for a partially filled buffer, the samples are flushed either to {@code dout} (when
* {@code isWavFileOutput} is set) or to the returned array. When {@code tscaleSingle == 1.0} the total
* number of samples is limited to {@code origLen}.
* <p>
* {@code dout} is closed even if writing fails part-way.
*
* @return the samples flushed during this call when output is not written to {@code dout}; {@code null}
*         if nothing was flushed or output goes to {@code dout}
* @throws IOException
*             if writing to or closing {@code dout} fails
*/
public double[] writeFinal() throws IOException {
    double[] output = null;
    try {
        if (tscaleSingle == 1.0)
            synthTotal = origLen;
        if (outBuffLen > synthTotal)
            outBuffLen = synthTotal;

        // Write the final segment
        for (int k = synthSt; k <= synthTotal; k++) {
            // 1-based position in the overlap-add buffer, wrapping around at maxNewFrmSize
            int kInd = (k - synthSt + ySynthInd) % maxNewFrmSize;
            if (kInd == 0)
                kInd = maxNewFrmSize;

            if (wSynthBuff[kInd - 1] > 0.0)
                outBuff[outBuffStart - 1] = ySynthBuff[kInd - 1] / wSynthBuff[kInd - 1];
            else
                outBuff[outBuffStart - 1] = ySynthBuff[kInd - 1];
            ySynthBuff[kInd - 1] = 0.0;
            wSynthBuff[kInd - 1] = 0.0;

            outBuffStart++;
            if (outBuffStart > outBuffLen) {
                output = flushFinalSamples(output, outBuffLen);
                outBuffStart = 1;
            }
        }

        // Flush whatever is left in a partially filled output buffer
        if (outBuffStart > 1)
            output = flushFinalSamples(output, outBuffStart - 1);
    } finally {
        if (dout != null)
            dout.close();
    }
    return output;
}

/**
* Flushes the first {@code pending} samples of {@code outBuff}, truncated so that the total never exceeds
* {@code origLen} when no time scaling is applied, to {@code dout} or to the end of {@code collected}.
*
* @param collected
*            samples gathered so far, or {@code null} if none
* @param pending
*            number of valid samples at the start of {@code outBuff}
* @return {@code collected} with the flushed samples appended when output is not written to {@code dout};
*         otherwise {@code collected} unchanged
* @throws IOException
*             if writing to {@code dout} fails
*/
private double[] flushFinalSamples(double[] collected, int pending) throws IOException {
    int count;
    if (tscaleSingle != 1.0 || totalWrittenToFile + pending <= origLen)
        count = pending;
    else
        count = origLen - totalWrittenToFile; // only the samples still missing up to origLen

    if (isWavFileOutput) {
        dout.writeDouble(outBuff, 0, count);
    } else if (collected == null) {
        collected = new double[count];
        System.arraycopy(outBuff, 0, collected, 0, count);
    } else {
        int oldLength = collected.length;
        collected = Arrays.copyOf(collected, oldLength + count);
        System.arraycopy(outBuff, 0, collected, oldLength, count);
    }
    totalWrittenToFile += count;
    return collected;
}
/**
* Converts the temporary binary output file into the final wav file and removes the temporary file.
* Does nothing when no temporary file is in use ({@code tempOutBinaryFile == null}).
* <p>
* {@code totalWrittenToFile} samples are read back from the temporary file. If the largest absolute
* sample value exceeds 1.0, all samples are divided by it before the wav file {@code outputFile} is
* written.
*
* @param audioformat
*            audio format of the wav file to create
* @throws IOException
*             if the temporary file cannot be read or the wav file cannot be written
*/
public void convertToWav(AudioFormat audioformat) throws IOException {
    if (tempOutBinaryFile == null)
        return;

    // Read the temp binary file; the stream is closed even if reading fails
    double[] yOut;
    din = new LEDataInputStream(tempOutBinaryFile);
    try {
        yOut = din.readDouble(totalWrittenToFile);
    } finally {
        din.close();
    }

    // Scale down only when the signal would otherwise exceed the [-1, 1] range
    double tmpMax = MathUtils.getAbsMax(yOut);
    if (tmpMax > 1.0) {
        for (int n = 0; n < yOut.length; n++)
            yOut[n] /= tmpMax;
    }

    outputAudio = new DDSAudioInputStream(new BufferedDoubleDataSource(yOut), audioformat);
    AudioSystem.write(outputAudio, AudioFileFormat.Type.WAVE, new File(outputFile));

    // Delete the temp binary file; if that is not possible right now, retry when the JVM exits
    File tmpFile = new File(tempOutBinaryFile);
    if (!tmpFile.delete())
        tmpFile.deleteOnExit();
}
/**
* Runs FD-PSOLA on a wav file and writes the result next to it.
* <p>
* The output file name is the input name without its extension, followed by {@code "_fd"}, a suffix
* describing the modification and {@code ".wav"}. With a single pitch and a single time scale the suffix
* is {@code "_p<ddd>"} and/or {@code "_d<ddd>"} for every factor different from 1.0 (for example
* {@code 1.2} gives {@code "120"}), or {@code "_none"} when both are 1.0; otherwise it is
* {@code "_pvar_dvar"}. The pitch file is expected next to the input file with the extension
* {@code ".ptc"}.
*
* @param inputWavFile
*            path of the input wav file
* @param pscales
*            pitch scale factor(s)
* @param tscales
*            time scale factor(s)
* @param escales
*            energy scale factor(s)
* @param vscales
*            vocal tract scale factor(s)
* @throws UnsupportedAudioFileException
*             if the input file is not a supported audio file
* @throws IOException
*             if reading the input or writing the output fails
*/
public static void mainParametric(String inputWavFile, double[] pscales, double[] tscales, double[] escales, double[] vscales)
        throws UnsupportedAudioFileException, IOException {
    String strExt;
    if (pscales.length == 1 && tscales.length == 1) {
        StringBuilder tag = new StringBuilder();
        if (pscales[0] != 1.0)
            tag.append("_p").append(scaleTagForFileName(pscales[0]));
        if (tscales[0] != 1.0)
            tag.append("_d").append(scaleTagForFileName(tscales[0]));
        strExt = tag.length() == 0 ? "_none" : tag.toString();
    } else {
        strExt = "_pvar_dvar";
    }

    String baseName = stripFileExtension(inputWavFile);
    String strOutputFile = baseName + "_fd" + strExt + ".wav";
    String strPitchFile = baseName + ".ptc";

    FDPSOLAProcessor fd = new FDPSOLAProcessor(inputWavFile, strPitchFile, strOutputFile, pscales, tscales, escales, vscales);
    fd.fdpsolaOnline();
}

/**
* Builds the three-character file name tag of a scale factor: the first character of its decimal
* representation followed by the two characters after the second one, padded with zeros (1.2 gives
* "120", 0.55 gives "055"). Intended for factors below 10 that print in plain decimal notation.
*/
private static String scaleTagForFileName(double scale) {
    StringBuilder digits = new StringBuilder(String.valueOf(scale));
    while (digits.length() < 4)
        digits.append('0');
    return digits.substring(0, 1) + digits.substring(2, 4);
}

/**
* Returns the path without the extension of its last name element; a path whose last element has no
* dot is returned unchanged.
*/
private static String stripFileExtension(String path) {
    int lastSeparator = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
    int lastDot = path.lastIndexOf('.');
    // A dot that belongs to a directory name is not an extension separator
    return lastDot > lastSeparator ? path.substring(0, lastDot) : path;
}
/**
* Command line test driver. Runs FD-PSOLA on the wav file given as the first argument, using a single
* fixed setting (pitch scale 1.0, time scale 1.2). The disabled alternative branch runs a series of
* fixed and time-varying settings instead.
*
* @param args
*            {@code args[0]} is the path of the input wav file; further arguments are ignored
* @throws IllegalArgumentException
*             if no input file is given
* @throws Exception
*             if processing fails
*/
@SuppressWarnings("unused")
public static void main(String[] args) throws Exception {
    if (args == null || args.length < 1)
        throw new IllegalArgumentException("Usage: FDPSOLAProcessor <inputWavFile>");

    final String inputWavFile = args[0];
    final double[] escales = { 1.0 };
    final double[] vscales = { 1.0 };

    if (true) // Test with only one setting
    {
        double[] pscales = { 1.0 };
        double[] tscales = { 1.2 };
        mainParametric(inputWavFile, pscales, tscales, escales, vscales);
    } else // Test with multiple settings
    {
        // { pitch scale, time scale } pairs, processed in this order
        final double[][] fixedSettings = {
                { 1.0, 1.0 },
                { 0.55, 1.0 }, { 0.80, 1.0 }, { 1.50, 1.0 }, { 2.50, 1.0 },
                { 1.0, 0.55 }, { 1.0, 0.80 }, { 1.0, 1.50 }, { 1.0, 2.50 },
                { 0.55, 0.80 }, { 0.80, 2.50 }, { 1.50, 0.55 }, { 2.50, 1.50 } };
        for (double[] setting : fixedSettings)
            mainParametric(inputWavFile, new double[] { setting[0] }, new double[] { setting[1] }, escales, vscales);

        // Time-varying pitch and duration modification
        double[] pscalesVar = { 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 1.05, 1.10, 1.15, 1.20, 1.25, 1.30, 1.35,
                1.40, 1.45, 1.50 };
        double[] tscalesVar = { 1.50, 1.45, 1.40, 1.35, 1.30, 1.25, 1.20, 1.15, 1.10, 1.05, 0.95, 0.90, 0.85, 0.80, 0.75,
                0.70, 0.65, 0.60 };
        mainParametric(inputWavFile, pscalesVar, tscalesVar, escales, vscales);
    }
    System.out.println("FDPSOLA test completed...");
}
}