/** * Copyright 2000-2009 DFKI GmbH. * All Rights Reserved. Use is subject to license terms. * * This file is part of MARY TTS. * * MARY TTS is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, version 3 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package marytts.signalproc.process; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.io.StreamTokenizer; import java.util.Arrays; import javax.sound.sampled.AudioFileFormat; import javax.sound.sampled.AudioFormat; import javax.sound.sampled.AudioInputStream; import javax.sound.sampled.AudioSystem; import javax.sound.sampled.UnsupportedAudioFileException; import marytts.signalproc.analysis.F0TrackerAutocorrelationHeuristic; import marytts.signalproc.analysis.PitchMarks; import marytts.signalproc.analysis.PitchReaderWriter; import marytts.signalproc.window.DynamicWindow; import marytts.signalproc.window.Window; import marytts.util.data.BufferedDoubleDataSource; import marytts.util.data.Datagram; import marytts.util.data.DatagramDoubleDataSource; import marytts.util.data.DoubleDataSource; import marytts.util.data.audio.AudioDoubleDataSource; import marytts.util.data.audio.DDSAudioInputStream; import marytts.util.io.FileUtils; import marytts.util.io.LEDataInputStream; import marytts.util.io.LEDataOutputStream; import marytts.util.math.ComplexArray; import marytts.util.math.FFTMixedRadix; import 
marytts.util.math.MathUtils; import marytts.util.signal.SignalProcUtils; public class FDPSOLAProcessor extends VocalTractModifier { public static int WAVEFORM_MODIFICATION = 1; public static int TTS_MODIFICATION = 2; protected DoubleDataSource input; protected AudioInputStream inputAudio; protected DDSAudioInputStream outputAudio; protected VoiceModificationParametersPreprocessor modParams; protected int numfrm; protected int numfrmFixed; protected int lpOrder; // Linear prediction analysis order protected String outputFile; protected String tempOutBinaryFile; protected int origLen; protected PitchMarks pm; protected double[] f0s; protected PsolaFrameProvider psFrm; protected double wsFixedInSeconds; protected double ssFixedInSeconds; protected int numPeriods; protected static int NUM_PITCH_SYNC_PERIODS = 3; protected static int FROM_CODE = 0; protected static int FROM_FILE = 1; protected static int FROM_TARGET = 2; public boolean bSilent = true; protected LEDataOutputStream dout; // Output stream for big-endian wav tests protected LEDataInputStream din; // Input stream for big-endian wav tests protected DynamicWindow windowIn; protected DynamicWindow windowOut; protected double[] wgt; protected double[] wgty; protected int frmSize; protected int newFrmSize; protected int newPeriod; protected int synthFrmInd; protected double localDurDiff; protected int repeatSkipCount; // -1:skip frame, 0:no repetition (use synthesized frame as it is), >0: number of repetitions // for synthesized frame protected double localDurDiffSaved; protected double sumLocalDurDiffs; protected double nextAdd; protected int synthSt; protected int synthTotal; protected int maxFrmSize; protected int maxNewFrmSize; protected int synthFrameInd; protected boolean bLastFrame; protected boolean bBroke; protected int newFftSize; protected int newMaxFreq; protected int outBuffLen; protected double[] outBuff; protected int outBuffStart; protected int totalWrittenToFile; protected double[] ySynthBuff; 
protected double[] wSynthBuff; protected int ySynthInd; protected double[] frm; protected boolean bWarp; protected double[] inputVT; protected double[] py2; protected ComplexArray hy; protected double[] frmy; protected double frmEn; protected double frmyEn; protected double gain; protected int newSkipSize; protected int halfWin; protected double[] newVScales; protected double[] tmpvsc; protected boolean isWavFileOutput; protected int inputFrameIndex; protected static double MIN_PSCALE = 0.1; protected static double MAX_PSCALE = 5.0; protected static double MIN_TSCALE = 0.1; protected static double MAX_TSCALE = 5.0; protected double tscaleSingle; public FDPSOLAProcessor(String strInputFile, String strPitchFile, String strOutputFile, double[] pscales, double[] tscales, double[] escales, double[] vscales) throws UnsupportedAudioFileException, IOException { this(strInputFile, strPitchFile, strOutputFile, pscales, tscales, escales, vscales, false); } public FDPSOLAProcessor(String strInputFile, String strPitchFile, String strOutputFile, double[] pscales, double[] tscales, double[] escales, double[] vscales, boolean isFixedRate) throws UnsupportedAudioFileException, IOException { super(); init(WAVEFORM_MODIFICATION, strInputFile, strPitchFile, strOutputFile, pscales, tscales, escales, vscales, isFixedRate); } public FDPSOLAProcessor() { super(); init(TTS_MODIFICATION); } protected void init(int initialisationType) { init(initialisationType, null, null, null, null, null, null, null, false); } protected void init(int initialisationType, String strInputFile, String strPitchFile, String strOutputFile, double[] pscales, double[] tscales, double[] escales, double[] vscales, boolean isFixedRate) { isWavFileOutput = false; inputAudio = null; input = null; pm = null; f0s = null; wsFixedInSeconds = 0.02; ssFixedInSeconds = 0.01; numPeriods = NUM_PITCH_SYNC_PERIODS; origLen = 0; fs = 16000; numfrm = 0; // Total pitch synchronous frames (This is the actual number of frames to be 
processed) numfrmFixed = 0; // Total frames if the analysis was fixed skip-rate modParams = null; outputFile = null; tscaleSingle = 1.0; boolean bContinue = true; if (initialisationType == WAVEFORM_MODIFICATION) { isWavFileOutput = true; if (!FileUtils.exists(strInputFile)) { System.out.println("Error! Pitch file " + strInputFile + " not found."); bContinue = false; } if (strOutputFile == null || strOutputFile == "") { System.out.println("Invalid output file..."); bContinue = false; } if (bContinue) { try { inputAudio = AudioSystem.getAudioInputStream(new File(strInputFile)); } catch (UnsupportedAudioFileException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } input = new AudioDoubleDataSource(inputAudio); origLen = (int) input.getDataLength(); fs = (int) inputAudio.getFormat().getSampleRate(); if (!FileUtils.exists(strPitchFile)) { System.out.println("Pitch file cannot be found, computing... 
" + strPitchFile); try { F0TrackerAutocorrelationHeuristic f0Tracker = new F0TrackerAutocorrelationHeuristic(strInputFile, strPitchFile); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } PitchReaderWriter f0 = new PitchReaderWriter(strPitchFile); pm = SignalProcUtils.pitchContour2pitchMarks(f0.contour, fs, origLen, f0.header.windowSizeInSeconds, f0.header.skipSizeInSeconds, true, 0); numfrmFixed = (int) (Math.floor(((double) (origLen + pm.totalZerosToPadd) / fs - 0.5 * wsFixedInSeconds) / ssFixedInSeconds + 0.5) + 2); // Total frames if the analysis was fixed skip-rate if (!isFixedRate) numfrm = pm.pitchMarks.length - numPeriods; // Total pitch synchronous frames (This is the actual number of // frames to be processed) else numfrm = numfrmFixed; f0s = SignalProcUtils.fixedRateF0Values(pm, wsFixedInSeconds, ssFixedInSeconds, numfrmFixed, fs); lpOrder = SignalProcUtils.getLPOrder(fs); modParams = new VoiceModificationParametersPreprocessor(fs, lpOrder, pscales, tscales, escales, vscales, pm.pitchMarks, wsFixedInSeconds, ssFixedInSeconds, numfrm, numfrmFixed, numPeriods, isFixedRate); tscaleSingle = modParams.tscaleSingle; outputFile = strOutputFile; } } else if (initialisationType == TTS_MODIFICATION) { // For test purposes, remove this line if you do not need additional wav file output // outputFile = "d:/tts_out.wav"; lpOrder = SignalProcUtils.getLPOrder(fs); } if (bContinue) { tmpvsc = new double[1]; // bSilent = false; if (outputFile != null) tempOutBinaryFile = outputFile + ".bin"; if (isWavFileOutput) { if (!isFixedRate) psFrm = new PsolaFrameProvider(input, pm, modParams.fs, modParams.numPeriods); else psFrm = new PsolaFrameProvider(input, wsFixedInSeconds, ssFixedInSeconds, modParams.fs, numfrm); try { dout = new LEDataOutputStream(tempOutBinaryFile); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } } else { psFrm = null; dout = null; } windowIn = new 
DynamicWindow(Window.HANNING); windowOut = new DynamicWindow(Window.HANNING); frmSize = 0; newFrmSize = 0; newPeriod = 0; synthFrmInd = 0; localDurDiff = 0.0; repeatSkipCount = 0; // -1:skip frame, 0:no repetition (use synthesized frame as it is), >0: number of repetitions for // synthesized frame localDurDiffSaved = 0.0; sumLocalDurDiffs = 0.0; nextAdd = 0.0; if (isWavFileOutput) synthSt = pm.pitchMarks[0]; else synthSt = 0; synthTotal = 0; maxFrmSize = (int) (numPeriods * fs / 40.0); if ((maxFrmSize % 2) != 0) maxFrmSize++; maxNewFrmSize = (int) (Math.floor(maxFrmSize / MIN_PSCALE + 0.5)); if ((maxNewFrmSize % 2) != 0) maxNewFrmSize++; synthFrameInd = 0; bLastFrame = false; bBroke = false; fftSize = (int) Math.pow(2, (Math.ceil(Math.log((double) maxFrmSize) / Math.log(2.0)))); maxFreq = fftSize / 2 + 1; outBuffLen = 500000; outBuff = MathUtils.zeros(outBuffLen); outBuffStart = 1; totalWrittenToFile = 0; ySynthBuff = MathUtils.zeros(maxNewFrmSize); wSynthBuff = MathUtils.zeros(maxNewFrmSize); ySynthInd = 1; // } } /** * Functionally equivalent to {@link #process} (but with most of the cruft removed, which should make this easier to modify) * * @param datagrams * array of Datagram arrays, one element per SelectedUnit * @param rightContexts * array of Datagrams, one element per SelectedUnit * @param audioformat * audioformat * @param voicings * array of boolean arrays, matching <b>datagrams</b> * @param pitchScales * array of double arrays, matching <b>datagrams</b>, pitch modification factors * @param timeScales * array of double arrays, matching <b>datagrams</b>, duration modification factors * @return modified audio as a DoubleDataSource audio stream * @throws IOException * if frames cannot be processed */ public DDSAudioInputStream processDecrufted(Datagram[][] datagrams, Datagram[] rightContexts, AudioFormat audioformat, boolean[][] voicings, double[][] pitchScales, double[][] timeScales) throws IOException { // obscure dependency on several fields: 
tscaleSingle = -1; origLen = 0; numfrm = 0; for (int i = 0; i < datagrams.length; i++) { for (int j = 0; j < datagrams[i].length; j++) { origLen += datagrams[i][j].getDuration(); if (j == datagrams[i].length - 1 && rightContexts != null && rightContexts[i] != null) { origLen += rightContexts[i].getDuration(); } } numfrm += datagrams[i].length; } // for each unit: for (int i = 0; i < datagrams.length; i++) { // for each datagram in that unit: for (int j = 0; j < datagrams[i].length; j++) { // awkwardly determine next Datagram, which defaults to silence as long as this Datagram... int length = datagrams[i][j].getLength(); Datagram nextDatagram = new Datagram(length, new byte[2 * length]); // ...unless it's not the last in this unit... if (j < datagrams[i].length - 1) { nextDatagram = datagrams[i][j + 1]; } else // ...or we have a right context... if (rightContexts[i] != null) { nextDatagram = rightContexts[i]; } else // ...or we have a next unit // TODO but what if that unit has no frames? if (i < datagrams.length - 1) { nextDatagram = datagrams[i + 1][0]; } assert nextDatagram.getDuration() > 0; // ARG #1, actual frame data for this and the next Datagram: Datagram[] sourceDatagrams = { datagrams[i][j], nextDatagram }; DatagramDoubleDataSource dataSource = new DatagramDoubleDataSource(sourceDatagrams); double[] frmIn = dataSource.getAllData(); // ARG #2, voicing: boolean symbolicVoicing = voicings[i][j]; boolean acousticVoicing = SignalProcUtils.getVoicing(frmIn, (int) (audioformat.getSampleRate())); // inflexible hard-coded toggle between symbolic (phonology) and signal based voicing: boolean isVoiced = symbolicVoicing; // one of: symbolicVoicing, acousticVoicing // ARGs #5-6, some obscure variables: double escale = 1.0; double vscale = 1.0; // ARG #7, is this the last Datagram? 
boolean bLastInputFrame = (i == datagrams.length - 1) && (j == datagrams[i].length - 1); // ARG #8, duration of this Datagram: int currentPeriod = (int) datagrams[i][j].getDuration(); // ARG #9, number of frames in this and the next Datagram: int inputFrameSize = currentPeriod + (int) nextDatagram.getDuration(); // actually process the data using the ARGs: try { int bufferStartIndex = outBuffStart; processFrame(frmIn, isVoiced, pitchScales[i][j], timeScales[i][j], escale, vscale, bLastInputFrame, currentPeriod, inputFrameSize); int bufferEndIndex = outBuffStart; int bufferLength = bufferEndIndex - bufferStartIndex; // extract processed samples for this datagram from buffer: double[] samples = new double[bufferLength]; System.arraycopy(outBuff, bufferStartIndex - 1, samples, 0, bufferLength); // overwrite datagram duration: datagrams[i][j].setDuration(samples.length); } catch (IOException e) { // TODO how can we throw just e, but attach our message? throw new IOException("Frames could not be processed!", e); } } } int bufferStartIndex = outBuffStart - 1; // initialize the output array: double[] output = null; try { output = writeFinal(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } // final processed samples (windowed): int bufferEndIndex = outBuffLen; int bufferLength = bufferEndIndex - bufferStartIndex; double[] samples = new double[bufferLength]; System.arraycopy(outBuff, bufferStartIndex, samples, 0, bufferLength); // update final datagram duration: Datagram finalDatagram = datagrams[datagrams.length - 1][datagrams[datagrams.length - 1].length - 1]; finalDatagram.setDuration(finalDatagram.getDuration() + samples.length); BufferedDoubleDataSource buffer = new BufferedDoubleDataSource(output); DDSAudioInputStream stream = new DDSAudioInputStream(buffer, audioformat); return stream; } // FD-PSOLA using all concatenation units public DDSAudioInputStream process(Datagram[][] datagrams, Datagram[] rightContexts, AudioFormat 
audioformat, boolean[][] voicings, double[][] pitchScales, double[][] timeScales) { int pitchSpecs = FROM_TARGET; // int pitchSpecs = FROM_FILE; // int pitchSpecs = FROM_CODE; int durationSpecs = FROM_TARGET; // int durationSpecs = FROM_FILE; // int durationSpecs = FROM_CODE; int i, j, k; double[] output = null; boolean isVoiced = true; double pscale = 1.0; // if pitchSpecs==FROM_CODE flag, this value will be used for pitch scaling double tscale = 1.0; // if durationSpecs==FROM_CODE flag, this value will be used for duration scaling double escale = 1.0; double vscale = 1.0; // Read pscale, tscale, escale and vscale from a text file. // (For quick testing purposes. It resets the input pichScales and timeScales to the fixed values in the text file.) if (pitchSpecs == FROM_FILE || durationSpecs == FROM_FILE) { double[] scales = getScalesFromTextFile("d:/psolaParam.txt"); if (pitchSpecs == FROM_FILE) pscale = scales[0]; if (durationSpecs == FROM_FILE) tscale = scales[1]; escale = scales[2]; vscale = scales[3]; } // if (pitchSpecs == FROM_FILE || pitchSpecs == FROM_CODE || durationSpecs == FROM_FILE || durationSpecs == FROM_CODE) { for (i = 0; i < timeScales.length; i++) { if (pitchSpecs == FROM_FILE || pitchSpecs == FROM_CODE) { for (j = 0; j < pitchScales[i].length; j++) pitchScales[i][j] = pscale; } if (durationSpecs == FROM_FILE || durationSpecs == FROM_CODE) { for (j = 0; j < timeScales[i].length; j++) timeScales[i][j] = tscale; } } } double firstTScale = timeScales[0][0]; tscaleSingle = firstTScale; for (i = 0; i < timeScales.length; i++) { for (j = 0; j < timeScales[i].length; j++) { if (i != 0 && j != 0 && timeScales[i][j] != firstTScale) { tscaleSingle = -1.0; break; } } } boolean bLastInputFrame = false; int currentPeriod; int inputFrameSize; double[] frmIn = null; double[] frmTmp = null; int tmpLen; double[] yOut = null; double[] yOutTmp = null; Datagram[] tmpDatagram = new Datagram[1]; origLen = 0; numfrm = 0; for (i = 0; i < datagrams.length; i++) { for (j 
= 0; j < datagrams[i].length; j++) { if (j == datagrams[i].length - 1) { if (rightContexts != null && rightContexts[i] != null) origLen += datagrams[i][j].getDuration() + rightContexts[i].getDuration(); else origLen += datagrams[i][j].getDuration(); } else origLen += datagrams[i][j].getDuration(); numfrm++; } } int yCounter = -1; for (i = 0; i < datagrams.length; i++) { for (j = 0; j < datagrams[i].length; j++) { if (i == datagrams.length - 1 && j == datagrams[i].length - 1) bLastInputFrame = true; else bLastInputFrame = false; frmIn = null; inputFrameSize = 0; currentPeriod = (int) datagrams[i][j].getDuration(); if (j < datagrams[i].length - 1) { inputFrameSize = (int) datagrams[i][j].getDuration() + (int) datagrams[i][j + 1].getDuration(); frmIn = new double[inputFrameSize]; tmpDatagram[0] = datagrams[i][j]; frmTmp = new DatagramDoubleDataSource(tmpDatagram).getAllData(); tmpLen = frmTmp.length; System.arraycopy(frmTmp, 0, frmIn, 0, tmpLen); tmpDatagram[0] = datagrams[i][j + 1]; frmTmp = new DatagramDoubleDataSource(tmpDatagram).getAllData(); System.arraycopy(frmTmp, 0, frmIn, tmpLen, frmTmp.length); } else { if (rightContexts[i] != null) { inputFrameSize = (int) datagrams[i][j].getDuration() + (int) rightContexts[i].getDuration(); frmIn = new double[inputFrameSize]; tmpDatagram[0] = datagrams[i][j]; frmTmp = new DatagramDoubleDataSource(tmpDatagram).getAllData(); tmpLen = frmTmp.length; System.arraycopy(frmTmp, 0, frmIn, 0, tmpLen); tmpDatagram[0] = rightContexts[i]; frmTmp = new DatagramDoubleDataSource(tmpDatagram).getAllData(); System.arraycopy(frmTmp, 0, frmIn, tmpLen, frmTmp.length); } else { if (i < datagrams.length - 1) { inputFrameSize = (int) datagrams[i][j].getDuration() + (int) datagrams[i + 1][0].getDuration(); frmIn = new double[inputFrameSize]; tmpDatagram[0] = datagrams[i][j]; frmTmp = new DatagramDoubleDataSource(tmpDatagram).getAllData(); tmpLen = frmTmp.length; System.arraycopy(frmTmp, 0, frmIn, 0, tmpLen); tmpDatagram[0] = datagrams[i + 1][0]; 
frmTmp = new DatagramDoubleDataSource(tmpDatagram).getAllData(); System.arraycopy(frmTmp, 0, frmIn, tmpLen, frmTmp.length); } else { inputFrameSize = 2 * (int) datagrams[i][j].getDuration(); frmIn = MathUtils.zeros(inputFrameSize); tmpDatagram[0] = datagrams[i][j]; frmTmp = new DatagramDoubleDataSource(tmpDatagram).getAllData(); tmpLen = frmTmp.length; System.arraycopy(frmTmp, 0, frmIn, 0, tmpLen); } } } if (frmIn != null) // We have a frame to be processed { // isVoiced = voicings[i][j]; isVoiced = SignalProcUtils.getVoicing(frmIn, (int) (audioformat.getSampleRate()), 0.35f); try { output = processFrame(frmIn, isVoiced, pitchScales[i][j], timeScales[i][j], escale, vscale, bLastInputFrame, currentPeriod, inputFrameSize); } catch (IOException e) { e.printStackTrace(); } boolean bBroken = false; if (output != null) { if (yOut == null) { yOut = new double[output.length]; System.arraycopy(output, 0, yOut, 0, output.length); } else { yOutTmp = new double[yOut.length]; System.arraycopy(yOut, 0, yOutTmp, 0, yOut.length); yOut = new double[yOutTmp.length + output.length]; System.arraycopy(yOutTmp, 0, yOut, 0, yOutTmp.length); System.arraycopy(output, 0, yOut, yOutTmp.length, output.length); } } if (bBroken) break; } } } try { output = writeFinal(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } if (output != null) { if (yOut == null) { yOut = new double[output.length]; System.arraycopy(output, 0, yOut, 0, output.length); } else { yOutTmp = new double[yOut.length]; System.arraycopy(yOut, 0, yOutTmp, 0, yOut.length); yOut = new double[yOutTmp.length + output.length]; System.arraycopy(yOutTmp, 0, yOut, 0, yOutTmp.length); System.arraycopy(output, 0, yOut, yOutTmp.length, output.length); } } double absMax = MathUtils.absMax(yOut); if (absMax > 32700) { for (i = 0; i < yOut.length; i++) yOut[i] = yOut[i] / absMax * 32700; } return new DDSAudioInputStream(new BufferedDoubleDataSource(yOut), audioformat); } // FD-PDSOLA on the whole signal 
with specified pitch marks public DDSAudioInputStream process(double[] x, int[] pitchMarks, AudioFormat audioformat, boolean[] voicings, double[] pitchScales, double[] timeScales) { int pitchSpecs = FROM_TARGET; // int pitchSpecs = FROM_FILE; // int pitchSpecs = FROM_CODE; int durationSpecs = FROM_TARGET; // int durationSpecs = FROM_FILE; // int durationSpecs = FROM_CODE; int i, j, k; double[] output = null; boolean isVoiced = true; double pscale = 1.0; // if pitchSpecs==FROM_CODE flag, this value will be used for pitch scaling double tscale = 1.0; // if durationSpecs==FROM_CODE flag, this value will be used for duration scaling double escale = 1.0; double vscale = 1.0; // Read pscale, tscale, escale and vscale from a text file. // (For quick testing purposes. It resest the input pichScales and timeScales to the fixed values in the text file.) if (pitchSpecs == FROM_FILE || durationSpecs == FROM_FILE) { double[] scales = getScalesFromTextFile("d:/psolaParam.txt"); if (pitchSpecs == FROM_FILE) pscale = scales[0]; if (durationSpecs == FROM_FILE) tscale = scales[1]; escale = scales[2]; vscale = scales[3]; } // if (pitchSpecs == FROM_FILE || pitchSpecs == FROM_CODE || durationSpecs == FROM_FILE || durationSpecs == FROM_CODE) { if (pitchSpecs == FROM_FILE || pitchSpecs == FROM_CODE) { for (i = 0; i < pitchScales.length; i++) pitchScales[i] = pscale; } if (durationSpecs == FROM_FILE || durationSpecs == FROM_CODE) { for (i = 0; i < timeScales.length; i++) timeScales[i] = tscale; } } double firstTScale = timeScales[0]; tscaleSingle = firstTScale; for (i = 0; i < timeScales.length; i++) { if (i != 0 && timeScales[i] != firstTScale) { tscaleSingle = -1.0; break; } } boolean bLastInputFrame = false; int currentPeriod; int inputFrameSize; double[] frmIn = null; double[] frmTmp = null; int tmpLen; double[] yOut = null; double[] yOutTmp = null; origLen = x.length; numfrm = pitchMarks.length - numPeriods; int yCounter = -1; for (i = 0; i < pitchMarks.length - numPeriods; i++) { 
if (i == pitchMarks.length - numPeriods - 1) bLastInputFrame = true; else bLastInputFrame = false; inputFrameSize = pitchMarks[i + numPeriods] - pitchMarks[i] + 1; frmIn = new double[inputFrameSize]; System.arraycopy(x, pitchMarks[i], frmIn, 0, inputFrameSize); currentPeriod = pitchMarks[i + 1] - pitchMarks[i] + 1; if (frmIn != null) // We have a frame to be processed { // isVoiced = voicings[i][j]; isVoiced = SignalProcUtils.getVoicing(frmIn, (int) (audioformat.getSampleRate()), 0.35f); try { output = processFrame(frmIn, isVoiced, pitchScales[i], timeScales[i], escale, vscale, bLastInputFrame, currentPeriod, inputFrameSize); } catch (IOException e) { e.printStackTrace(); } boolean bBroken = false; if (output != null) { if (yOut == null) { yOut = new double[output.length]; System.arraycopy(output, 0, yOut, 0, output.length); } else { yOutTmp = new double[yOut.length]; System.arraycopy(yOut, 0, yOutTmp, 0, yOut.length); yOut = new double[yOutTmp.length + output.length]; System.arraycopy(yOutTmp, 0, yOut, 0, yOutTmp.length); System.arraycopy(output, 0, yOut, yOutTmp.length, output.length); } } if (bBroken) break; } } try { output = writeFinal(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } if (output != null) { if (yOut == null) { yOut = new double[output.length]; System.arraycopy(output, 0, yOut, 0, output.length); } else { yOutTmp = new double[yOut.length]; System.arraycopy(yOut, 0, yOutTmp, 0, yOut.length); yOut = new double[yOutTmp.length + output.length]; System.arraycopy(yOutTmp, 0, yOut, 0, yOutTmp.length); System.arraycopy(output, 0, yOut, yOutTmp.length, output.length); } } double absMax = MathUtils.absMax(yOut); if (absMax > 32700) { for (i = 0; i < yOut.length; i++) yOut[i] = yOut[i] / absMax * 32700; } return new DDSAudioInputStream(new BufferedDoubleDataSource(yOut), audioformat); } // FD-PSOLA on a single concatenation unit public double[] processDatagram(Datagram[] datagrams, Datagram rightContext, AudioFormat 
audioformat, boolean[] voicings, double[] pitchScales, double[] timeScales, boolean bLastDatagram) { int pitchSpecs = FROM_TARGET; // int pitchSpecs = FROM_FILE; // int pitchSpecs = FROM_CODE; int durationSpecs = FROM_TARGET; // int durationSpecs = FROM_FILE; // int durationSpecs = FROM_CODE; int j, k; double[] output = null; boolean isVoiced = true; double pscale = 1.0; // if pitchSpecs==FROM_CODE flag, this value will be used for pitch scaling double tscale = 1.0; // if durationSpecs==FROM_CODE flag, this value will be used for duration scaling double escale = 1.0; double vscale = 1.0; // Read pscale, tscale, escale and vscale from a text file. // (For quick testing purposes. It resest the input pichScales and timeScales to the fixed values in the text file.) if (pitchSpecs == FROM_FILE || durationSpecs == FROM_FILE) { double[] scales = getScalesFromTextFile("d:/psolaParam.txt"); if (pitchSpecs == FROM_FILE) pscale = scales[0]; if (durationSpecs == FROM_FILE) tscale = scales[1]; escale = scales[2]; vscale = scales[3]; } // if (pitchSpecs == FROM_FILE || pitchSpecs == FROM_CODE || durationSpecs == FROM_FILE || durationSpecs == FROM_CODE) { if (pitchSpecs == FROM_FILE || pitchSpecs == FROM_CODE) { for (j = 0; j < pitchScales.length; j++) pitchScales[j] = pscale; } if (durationSpecs == FROM_FILE || durationSpecs == FROM_CODE) { for (j = 0; j < timeScales.length; j++) timeScales[j] = tscale; } } double firstTScale = timeScales[0]; tscaleSingle = firstTScale; for (j = 0; j < timeScales.length; j++) { if (j != 0 && timeScales[j] != firstTScale) { tscaleSingle = -1.0; break; } } boolean bLastInputFrame = false; int currentPeriod; int inputFrameSize; double[] frmIn = null; double[] frmTmp = null; int tmpLen; double[] yOut = null; double[] yOutTmp = null; Datagram[] tmpDatagram = new Datagram[1]; origLen = 0; numfrm = 0; for (j = 0; j < datagrams.length; j++) { if (j == datagrams.length - 1) { if (rightContext != null) origLen += datagrams[j].getDuration() + 
rightContext.getDuration(); else origLen += datagrams[j].getDuration(); } else origLen += datagrams[j].getDuration(); numfrm++; } int yCounter = -1; for (j = 0; j < datagrams.length; j++) { frmIn = null; inputFrameSize = 0; /* * if (j==datagrams.length-1) bLastInputFrame = true; */ if (bLastDatagram && j == datagrams.length - 1) bLastInputFrame = true; currentPeriod = (int) datagrams[j].getDuration(); if (j < datagrams.length - 1) { inputFrameSize = (int) datagrams[j].getDuration() + (int) datagrams[j + 1].getDuration(); frmIn = new double[inputFrameSize]; tmpDatagram[0] = datagrams[j]; frmTmp = new DatagramDoubleDataSource(tmpDatagram).getAllData(); tmpLen = frmTmp.length; System.arraycopy(frmTmp, 0, frmIn, 0, tmpLen); tmpDatagram[0] = datagrams[j + 1]; frmTmp = new DatagramDoubleDataSource(tmpDatagram).getAllData(); System.arraycopy(frmTmp, 0, frmIn, tmpLen, frmTmp.length); } else { if (rightContext != null) { inputFrameSize = (int) datagrams[j].getDuration() + (int) rightContext.getDuration(); frmIn = new double[inputFrameSize]; tmpDatagram[0] = datagrams[j]; frmTmp = new DatagramDoubleDataSource(tmpDatagram).getAllData(); tmpLen = frmTmp.length; System.arraycopy(frmTmp, 0, frmIn, 0, tmpLen); tmpDatagram[0] = rightContext; frmTmp = new DatagramDoubleDataSource(tmpDatagram).getAllData(); System.arraycopy(frmTmp, 0, frmIn, tmpLen, frmTmp.length); } else { inputFrameSize = 2 * (int) datagrams[j].getDuration(); frmIn = new double[inputFrameSize]; Arrays.fill(frmIn, 0.0); tmpDatagram[0] = datagrams[j]; frmTmp = new DatagramDoubleDataSource(tmpDatagram).getAllData(); tmpLen = frmTmp.length; System.arraycopy(frmTmp, 0, frmIn, 0, tmpLen); } } if (frmIn != null) // We have a frame to be processed { // isVoiced = voicings[j]; isVoiced = SignalProcUtils.getVoicing(frmIn, (int) (audioformat.getSampleRate()), 0.35f); try { output = processFrame(frmIn, isVoiced, pitchScales[j], timeScales[j], escale, vscale, bLastInputFrame, currentPeriod, inputFrameSize); } catch 
(IOException e) { e.printStackTrace(); } boolean bBroken = false; if (output != null) { if (yOut == null) { yOut = new double[output.length]; System.arraycopy(output, 0, yOut, 0, output.length); } else { yOutTmp = new double[yOut.length]; System.arraycopy(yOut, 0, yOutTmp, 0, yOut.length); yOut = new double[yOutTmp.length + output.length]; System.arraycopy(yOutTmp, 0, yOut, 0, yOutTmp.length); System.arraycopy(output, 0, yOut, yOutTmp.length, output.length); } } if (bBroken) break; } } try { output = writeFinal(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } if (output != null) { if (yOut == null) { yOut = new double[output.length]; System.arraycopy(output, 0, yOut, 0, output.length); } else { yOutTmp = new double[yOut.length]; System.arraycopy(yOut, 0, yOutTmp, 0, yOut.length); yOut = new double[yOutTmp.length + output.length]; System.arraycopy(yOutTmp, 0, yOut, 0, yOutTmp.length); System.arraycopy(output, 0, yOut, yOutTmp.length, output.length); } } return yOut; } // Read scale factors from a text file for quick testing public double[] getScalesFromTextFile(String strScaleFile) { int i; double[] scales = new double[4]; Reader r = null; try { r = new BufferedReader(new FileReader(strScaleFile)); } catch (FileNotFoundException e2) { // TODO Auto-generated catch block e2.printStackTrace(); } StreamTokenizer stok = new StreamTokenizer(r); stok.parseNumbers(); try { stok.nextToken(); } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } for (i = 0; i < scales.length; i++) { if (stok.ttype == StreamTokenizer.TT_NUMBER) scales[i] = stok.nval; try { stok.nextToken(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } try { r.close(); } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } return scales; } public void fdpsolaOnline() throws IOException { int i; double[] frmIn; boolean isLastInputFrame; int inputFrameSize; int 
currentPeriod; // NOTE(review): completes a declaration begun on the previous (unseen) line
        boolean isVoiced;

        inputFrameIndex = 0;
        // Pitch-synchronous analysis/modification loop: one frame per pitch mark.
        for (i = 0; i < numfrm; i++) {
            frmIn = psFrm.getNextFrame();

            if (bBroke)
                break;

            if (i == numfrm - 1)
                isLastInputFrame = true;
            else
                isLastInputFrame = false;

            // Local pitch period and analysis frame length (numPeriods pitch periods long).
            currentPeriod = pm.pitchMarks[i + 1] - pm.pitchMarks[i];
            inputFrameSize = pm.pitchMarks[i + modParams.numPeriods] - pm.pitchMarks[i] + 1;

            // Frames with f0 <= 10 Hz are treated as unvoiced.
            isVoiced = pm.f0s[i] > 10.0 ? true : false;

            processFrame(frmIn, isVoiced, modParams.pscalesVar[i], modParams.tscalesVar[i], modParams.escalesVar[i],
                    modParams.vscalesVar[i], isLastInputFrame, currentPeriod, inputFrameSize);
        }

        writeFinal();

        convertToWav(inputAudio.getFormat());

        inputAudio.close();
    }

    /**
     * Applies FD-PSOLA modification to a single pitch-synchronous frame and overlap-adds the result into the output
     * buffers, flushing full buffers to the output stream (or an in-memory array when not writing a wav file).
     *
     * @param pscale pitch scale factor (clamped to [MIN_PSCALE, MAX_PSCALE])
     * @param tscale time scale factor (clamped to [MIN_TSCALE, MAX_TSCALE])
     * @param escale energy scale factor
     * @param vscale vocal tract (spectral warp) scale factor
     * @param currentPeriod current pitch period in samples
     * @param inputFrameSize analysis frame length in samples
     * @return accumulated output samples when not writing to file, otherwise null
     * @throws IOException if writing to the temporary output stream fails
     */
    public double[] processFrame(double[] frmIn, boolean isVoiced, double pscale, double tscale, double escale,
            double vscale, boolean isLastInputFrame, int currentPeriod, int inputFrameSize) throws IOException {
        // Clamp the scale factors to the supported modification range.
        if (pscale < MIN_PSCALE)
            pscale = MIN_PSCALE;
        if (pscale > MAX_PSCALE)
            pscale = MAX_PSCALE;
        if (tscale < MIN_TSCALE)
            tscale = MIN_TSCALE;
        if (tscale > MAX_TSCALE)
            tscale = MAX_TSCALE;

        double[] output = null;
        double[] outputTmp = null;
        int j, k, wInd, kMax;
        int tmpFix, tmpAdd, tmpMul;
        int remain;
        int kInd;

        repeatSkipCount = 0; // -1:skip frame, 0:no repetition (use synthesized frame as it is), >0: number of repetitions for
                             // synthesized frame

        // Compute new frame sizes, change in durations due to pitch scaling, and required compensation amount in samples
        // &
        // Find out which pitch-scaled frames to repeat/skip for overall duration
        // compensation
        frmSize = inputFrameSize;
        if ((frmSize % 2) != 0)
            frmSize++;
        if (frmSize < 4)
            frmSize = 4;

        if (isVoiced) {
            // Pitch scaling changes the synthesis frame length (also kept even and >= 4).
            newFrmSize = (int) (Math.floor(frmSize / pscale + 0.5));
            if ((newFrmSize % 2) != 0)
                newFrmSize++;
            if (newFrmSize < 4)
                newFrmSize = 4;
        } else
            newFrmSize = frmSize;

        newPeriod = (int) Math.floor(((double) newFrmSize) / NUM_PITCH_SYNC_PERIODS + 0.5);
        // Compute duration compensation required:
        // localDurDiffs(i) = (DESIRED)-(AFTER PITCHSCALING)
        // (-) if expansion occurred, (+) if compression occurred
        // We aim to make this as close to zero as possible in the following duration compensation step
        localDurDiff = nextAdd + (frmSize * tscale - newFrmSize) / NUM_PITCH_SYNC_PERIODS;

        nextAdd = 0;
        if (localDurDiff < -0.1 * newPeriod) // Expansion occurred so skip this frame
        {
            repeatSkipCount--;
            if (!isLastInputFrame) {
                nextAdd = localDurDiff + newPeriod;
                localDurDiff = 0;
            }
        } else if (localDurDiff > 0.1 * newPeriod) // Compression occurred so repeat this frame
        {
            while (localDurDiff > 0.1 * newPeriod) {
                repeatSkipCount++;
                localDurDiff -= newPeriod;
            }

            if (!isLastInputFrame) {
                nextAdd = localDurDiff;
                localDurDiff = 0;
            }
        }

        sumLocalDurDiffs += localDurDiff;

        if (isLastInputFrame) {
            // Check the final length and perform additional repetitions if necessary
            localDurDiff = sumLocalDurDiffs;
            while (localDurDiff > 0) {
                repeatSkipCount++;
                localDurDiff -= newPeriod;
            }
            //
        }

        if (isLastInputFrame) {
            repeatSkipCount++;
            bLastFrame = true;
        }

        if (repeatSkipCount > -1) {
            // Copy the input into a zero-padded, even-length working frame.
            frm = MathUtils.zeros(frmSize);
            System.arraycopy(frmIn, 0, frm, 0, Math.min(frmIn.length, frmSize));

            wgt = windowIn.values(frmSize);

            if (vscale != 1.0)
                bWarp = true;
            else
                bWarp = false;

            // if (isVoiced || bWarp) //For forcing FDPSOLA to be applied even when pscale=1.0
            if ((isVoiced && pscale != 1.0) || bWarp) {
                // Frequency-domain path: pitch scale and/or vocal tract warp via spectral resampling.
                if (fftSize < frmSize) {
                    fftSize = (int) Math.pow(2, (Math.ceil(Math.log((double) frmSize) / Math.log(2.0))));
                    maxFreq = fftSize / 2 + 1;
                }

                newMaxFreq = (int) Math.floor(maxFreq / pscale + 0.5);
                if (newMaxFreq < 3)
                    newMaxFreq = 3;

                if ((newMaxFreq % 2) != 1)
                    newMaxFreq++;

                // This is for being able to use the FFT algorithm that works only with buffers of length power of two
                // If you have an FFT algorithm that works with any buffer size, simply remove this line
                // newMaxFreq = (int)Math.floor(0.5*MathUtils.closestPowerOfTwoAbove(2*(newMaxFreq-1))+1.5);
                //
                // NOTE(review): reconstructed as live code — newFftSize is used below to allocate hy; confirm against upstream.
                newFftSize = 2 * (newMaxFreq - 1);

                frmEn = SignalProcUtils.getEnergy(frm);

                // Compute LP and excitation spectrum
                super.initialise(lpOrder, fs, fftSize, true); // Perform only analysis
                windowIn.applyInline(frm, 0, frmSize); // Windowing
                applyInline(frm, 0, frmSize); // LP analysis

                // Expand/Compress the vocal tract spectrum in inverse manner
                inputVT = MathUtils.interpolate(vtSpectrum, newMaxFreq); // Interpolated vocal tract spectrum

                // Perform vocal tract scaling
                if (bWarp) {
                    tmpvsc[0] = vscale;
                    newVScales = MathUtils.modifySize(tmpvsc, newMaxFreq); // Modify length to match current length of spectrum

                    for (k = 0; k < newVScales.length; k++) {
                        if (newVScales[k] < 0.05) // Put a floor to avoid divide by zero
                            newVScales[k] = 0.05;
                    }

                    py2 = new double[newMaxFreq];

                    for (k = 0; k < newMaxFreq; k++) {
                        wInd = (int) Math.floor((k + 1) / newVScales[k] + 0.5); // Find new indices
                        if (wInd < 1)
                            wInd = 1;
                        if (wInd > newMaxFreq)
                            wInd = newMaxFreq;

                        py2[k] = inputVT[wInd - 1];
                    }

                    System.arraycopy(py2, 0, inputVT, 0, newMaxFreq);
                }

                // Create output DFT spectrum
                hy = new ComplexArray(newFftSize);
                hy.real = MathUtils.zeros(newFftSize);
                hy.imag = MathUtils.zeros(newFftSize);

                System.arraycopy(this.h.real, 0, hy.real, 0, Math.min(maxFreq, newFftSize));
                System.arraycopy(this.h.imag, 0, hy.imag, 0, Math.min(maxFreq, newFftSize));

                // Copy & paste samples if required (COMPLEX VERSION TO SUPPORT PSCALE<=0.5)
                // This version fills the spectrum by flipping and pasting the original freq bins as many times as required.
                kMax = 1;
                while (newMaxFreq > (kMax + 1) * (maxFreq - 2))
                    kMax++;

                for (k = 1; k <= kMax; k++) {
                    tmpFix = (maxFreq - 2) * k;
                    if (k % 2 == 1) // Odd mode
                    {
                        tmpAdd = maxFreq + 2;
                        tmpMul = 1;
                    } else {
                        tmpAdd = -1;
                        tmpMul = -1;
                    }

                    for (j = tmpFix + 3; j <= Math.min(newMaxFreq, maxFreq + tmpFix); j++) {
                        hy.real[j - 1] = this.h.real[tmpMul * (tmpFix - j) + tmpAdd - 1];
                        hy.imag[j - 1] = this.h.imag[tmpMul * (tmpFix - j) + tmpAdd - 1];
                    }
                }

                // Nyquist bin of the new spectrum must be real-valued: fold its magnitude into the real part.
                hy.real[newMaxFreq - 1] = Math.sqrt(hy.real[newMaxFreq - 1] * hy.real[newMaxFreq - 1]
                        + hy.imag[newMaxFreq - 1] * hy.imag[newMaxFreq - 1]);
                hy.imag[newMaxFreq - 1] = 0.0;

                // Convolution
                for (k = 1; k <= newMaxFreq; k++) {
                    hy.real[k - 1] *= inputVT[k - 1];
                    hy.imag[k - 1] *= inputVT[k - 1];
                }

                // Rebuild the conjugate-symmetric upper half so the inverse FFT yields a real signal.
                for (k = newMaxFreq + 1; k <= newFftSize; k++) {
                    hy.real[k - 1] = hy.real[2 * newMaxFreq - 1 - k];
                    hy.imag[k - 1] = -hy.imag[2 * newMaxFreq - 1 - k];
                }

                // Convert back to time domain
                // FFT.transform(hy.real, hy.imag, true);
                // hy = FFTArbitraryLength.ifft(hy);
                hy = FFTMixedRadix.ifft(hy);

                frmy = new double[newFrmSize];
                System.arraycopy(hy.real, 0, frmy, 0, newFrmSize);

                frmyEn = SignalProcUtils.getEnergy(frmy);
                // RMS-match synthesized frame energy to the input frame, then apply the energy scale.
                gain = (frmEn / Math.sqrt(frmSize)) / (frmyEn / Math.sqrt(newFrmSize)) * escale;
            } else {
                // Time-domain path: no spectral modification, just window and (optionally) energy-scale.
                if (frmSize < newFrmSize)
                    newFrmSize = frmSize;

                frmy = new double[newFrmSize];

                for (k = 0; k < frmSize; k++)
                    frmy[k] = frm[k] * wgt[k];

                gain = escale;
            }

            // Energy scale compensation + modification
            for (k = 0; k < newFrmSize; k++) {
                frmy[k] *= gain;
            }

            // Overlap-add the frame repeatSkipCount+1 times (duration compensation by repetition).
            for (j = 1; j <= repeatSkipCount + 1; j++) {
                if (isVoiced)
                    newSkipSize = (int) Math.floor(currentPeriod / pscale + 0.5);
                else
                    newSkipSize = (int) Math.floor(currentPeriod + 0.5);

                if ((isLastInputFrame && j == repeatSkipCount + 1)) // | (i~=numfrm & all(repeatSkipCounts(i+1:numfrm)==-1)))
                    bLastFrame = true;
                else
                    bLastFrame = false;

                synthFrameInd++;

                wgty = windowOut.values(newFrmSize);

                if (synthFrameInd == 1) // First frame: Do not window the first half of output speech frame to prevent overflow in
                                        // normalization with hanning coeffs
                {
                    halfWin = (int) Math.floor(newFrmSize / 2.0 + 0.5);
                    synthTotal = synthSt + newFrmSize;

                    // Keep output in an overlap-add buffer
                    if (ySynthInd + newFrmSize - 1 <= maxNewFrmSize) {
                        // Frame fits without wrapping around the circular buffer.
                        for (k = ySynthInd; k <= ySynthInd + halfWin - 1; k++) {
                            ySynthBuff[k - 1] = frmy[k - ySynthInd];
                            wSynthBuff[k - 1] = 1.0;
                        }

                        for (k = ySynthInd + halfWin; k <= ySynthInd + newFrmSize - 1; k++) {
                            ySynthBuff[k - 1] += frmy[k - ySynthInd] * wgty[k - ySynthInd];
                            wSynthBuff[k - 1] += wgty[k - ySynthInd] * wgty[k - ySynthInd];
                        }
                    } else {
                        // Frame wraps: fill to the end of the buffer, then continue from the start.
                        for (k = ySynthInd; k <= maxNewFrmSize; k++) {
                            if (k - ySynthInd < halfWin) {
                                ySynthBuff[k - 1] = frmy[k - ySynthInd];
                                wSynthBuff[k - 1] = 1.0;
                            } else {
                                ySynthBuff[k - 1] += frmy[k - ySynthInd] * wgty[k - ySynthInd];
                                wSynthBuff[k - 1] += wgty[k - ySynthInd] * wgty[k - ySynthInd];
                            }
                        }

                        for (k = 1; k <= newFrmSize - 1 - maxNewFrmSize + ySynthInd; k++) {
                            if (maxNewFrmSize - ySynthInd + k < halfWin) {
                                ySynthBuff[k - 1] = frmy[maxNewFrmSize - ySynthInd + k];
                                wSynthBuff[k - 1] = 1.0;
                            } else {
                                ySynthBuff[k - 1] += frmy[maxNewFrmSize - ySynthInd + k] * wgty[maxNewFrmSize - ySynthInd + k];
                                wSynthBuff[k - 1] += wgty[maxNewFrmSize - ySynthInd + k] * wgty[maxNewFrmSize - ySynthInd + k];
                            }
                        }
                    }
                    //
                    if (!bSilent)
                        System.out.println("Synthesized using frame " + String.valueOf(inputFrameIndex + 1));
                } else if (bLastFrame) // Last frame: Do not window the second half of output speech frame to prevent overflow in
                                       // normalization with hanning coeffs
                {
                    halfWin = (int) Math.floor(newFrmSize / 2.0 + 0.5);
                    remain = newFrmSize - halfWin;
                    synthTotal = synthSt + halfWin + remain - 1;

                    // Keep output in an overlap-add buffer
                    if (ySynthInd + newFrmSize - 1 <= maxNewFrmSize) {
                        // Frame fits without wrapping around the circular buffer.
                        for (k = ySynthInd; k <= ySynthInd + halfWin - 1; k++) {
                            ySynthBuff[k - 1] += frmy[k - ySynthInd] * wgty[k - ySynthInd];
                            wSynthBuff[k - 1] += wgty[k - ySynthInd] * wgty[k - ySynthInd];
                        }

                        for (k = ySynthInd + halfWin; k <= ySynthInd + newFrmSize - 1; k++) {
                            ySynthBuff[k - 1] += frmy[k - ySynthInd];
                            wSynthBuff[k - 1] = 1.0;
                        }
                    } else {
                        // Frame wraps: fill to the end of the buffer, then continue from the start.
                        for (k = ySynthInd; k <= maxNewFrmSize; k++) {
                            if (k - ySynthInd < halfWin) {
                                ySynthBuff[k - 1] += frmy[k - ySynthInd] * wgty[k - ySynthInd];
                                wSynthBuff[k - 1] += wgty[k - ySynthInd] * wgty[k - ySynthInd];
                            } else {
                                ySynthBuff[k - 1] += frmy[k - ySynthInd];
                                wSynthBuff[k - 1] = 1.0;
                            }
                        }

                        for (k = 1; k <= newFrmSize - 1 - maxNewFrmSize + ySynthInd; k++) {
                            if (maxNewFrmSize - ySynthInd + k < halfWin) {
                                ySynthBuff[k - 1] += frmy[maxNewFrmSize - ySynthInd + k] * wgty[maxNewFrmSize - ySynthInd + k];
                                wSynthBuff[k - 1] += wgty[maxNewFrmSize - ySynthInd + k] * wgty[maxNewFrmSize - ySynthInd + k];
                            } else {
                                ySynthBuff[k - 1] += frmy[maxNewFrmSize - ySynthInd + k];
                                wSynthBuff[k - 1] = 1.0;
                            }
                        }
                    }
                    //
                    if (!bSilent)
                        System.out.println("Synthesized using frame " + String.valueOf(inputFrameIndex + 1));
                } else // Normal frame
                {
                    if (!isVoiced && ((repeatSkipCount % 2) == 1)) // Reverse unvoiced repeated frames once in two consecutive
                                                                   // repetitions to reduce distortion
                        frmy = SignalProcUtils.reverse(frmy);

                    synthTotal = synthSt + newFrmSize;

                    // Keep output in an overlap-add buffer
                    if (ySynthInd + newFrmSize - 1 <= maxNewFrmSize) {
                        // Frame fits without wrapping around the circular buffer.
                        for (k = ySynthInd; k <= ySynthInd + newFrmSize - 1; k++) {
                            ySynthBuff[k - 1] += frmy[k - ySynthInd] * wgty[k - ySynthInd];
                            wSynthBuff[k - 1] += wgty[k - ySynthInd] * wgty[k - ySynthInd];
                        }
                    } else {
                        // Frame wraps: fill to the end of the buffer, then continue from the start.
                        for (k = ySynthInd; k <= maxNewFrmSize; k++) {
                            ySynthBuff[k - 1] += frmy[k - ySynthInd] * wgty[k - ySynthInd];
                            wSynthBuff[k - 1] += wgty[k - ySynthInd] * wgty[k - ySynthInd];
                        }

                        for (k = 1; k <= newFrmSize - 1 - maxNewFrmSize + ySynthInd; k++) {
                            ySynthBuff[k - 1] += frmy[k + maxNewFrmSize - ySynthInd] * wgty[k + maxNewFrmSize - ySynthInd];
                            wSynthBuff[k - 1] += wgty[k + maxNewFrmSize - ySynthInd] * wgty[k + maxNewFrmSize - ySynthInd];
                        }
                    }
                    //
                    if (!bSilent) {
                        if (j == 1)
                            System.out.println("Synthesized using frame " + String.valueOf(inputFrameIndex + 1));
                        else
                            System.out.println("Repeated using frame " + String.valueOf(inputFrameIndex + 1));
                    }
                }

                // Write to output buffer
                for (k = 0; k <= newSkipSize - 1; k++) {
                    // Map linear index onto the circular synthesis buffer (1-based wrap).
                    kInd = (k + ySynthInd) % maxNewFrmSize;
                    if (kInd == 0)
                        kInd = maxNewFrmSize;

                    // Normalize by the accumulated squared window weights (overlap-add normalization).
                    if (wSynthBuff[kInd - 1] > 0.0)
                        outBuff[outBuffStart - 1] = ySynthBuff[kInd - 1] / wSynthBuff[kInd - 1];
                    else
                        outBuff[outBuffStart - 1] = ySynthBuff[kInd - 1];

                    ySynthBuff[kInd - 1] = 0.0;
                    wSynthBuff[kInd - 1] = 0.0;

                    outBuffStart++;

                    if (outBuffStart > outBuffLen) {
                        // Output buffer full: flush to file (or grow the in-memory output array),
                        // truncating at origLen when no time scaling is applied.
                        if (tscaleSingle != 1.0 || totalWrittenToFile + outBuffLen <= origLen) {
                            if (isWavFileOutput)
                                dout.writeDouble(outBuff, 0, outBuffLen);
                            else {
                                if (output == null) {
                                    output = new double[outBuffLen];
                                    System.arraycopy(outBuff, 0, output, 0, outBuffLen);
                                } else {
                                    outputTmp = new double[output.length];
                                    System.arraycopy(output, 0, outputTmp, 0, output.length);
                                    output = new double[outputTmp.length + outBuffLen];
                                    System.arraycopy(outputTmp, 0, output, 0, outputTmp.length);
                                    System.arraycopy(outBuff, 0, output, outputTmp.length, outBuffLen);
                                }
                            }

                            totalWrittenToFile += outBuffLen;
                        } else {
                            if (isWavFileOutput)
                                dout.writeDouble(outBuff, 0, origLen - totalWrittenToFile);
                            else {
                                if (output == null) {
                                    output = new double[origLen - totalWrittenToFile];
                                    System.arraycopy(outBuff, 0, output, 0, origLen - totalWrittenToFile);
                                } else {
                                    outputTmp = new double[output.length];
                                    System.arraycopy(output, 0, outputTmp, 0, output.length);
                                    output = new double[outputTmp.length + origLen - totalWrittenToFile];
                                    System.arraycopy(outputTmp, 0, output, 0, outputTmp.length);
                                    System.arraycopy(outBuff, 0, output, outputTmp.length, origLen - totalWrittenToFile);
                                }
                            }

                            totalWrittenToFile = origLen;
                        }

                        outBuffStart = 1;
                    }
                }
                //
                // NOTE(review): reconstructed as live code — writeFinal's flush loop starts at synthSt; confirm against upstream.
                synthSt += newSkipSize;

                // if (!bLastFrame)
                // {
                if (ySynthInd + newSkipSize <= maxNewFrmSize)
                    ySynthInd += newSkipSize;
                else
                    ySynthInd += newSkipSize - maxNewFrmSize;
                // }
                // ///////
                if (bLastFrame) {
                    bBroke = true;
                    break;
                }
            }
        } else {
            if (!bSilent)
                System.out.println("Skipped frame " + String.valueOf(inputFrameIndex + 1));
        }

        inputFrameIndex++;
return output;
    }

    /**
     * Flushes the remaining samples in the overlap-add buffers to the output after the last frame has been processed,
     * applying the same per-sample window-energy normalization as processFrame.
     *
     * @return accumulated output samples when not writing to file, otherwise null
     * @throws IOException if writing to the temporary output stream fails
     */
    public double[] writeFinal() throws IOException {
        double[] output = null;
        double[] outputTmp = null;

        int k, kInd;

        if (tscaleSingle == 1.0)
            synthTotal = origLen;

        if (outBuffLen > synthTotal)
            outBuffLen = synthTotal;

        // Write the final segment
        for (k = synthSt; k <= synthTotal; k++) {
            // Map linear index onto the circular synthesis buffer (1-based wrap).
            kInd = (k - synthSt + ySynthInd) % maxNewFrmSize;
            if (kInd == 0)
                kInd = maxNewFrmSize;

            if (wSynthBuff[kInd - 1] > 0.0)
                outBuff[outBuffStart - 1] = ySynthBuff[kInd - 1] / wSynthBuff[kInd - 1];
            else
                outBuff[outBuffStart - 1] = ySynthBuff[kInd - 1];

            ySynthBuff[kInd - 1] = 0.0;
            wSynthBuff[kInd - 1] = 0.0;

            outBuffStart++;

            if (outBuffStart > outBuffLen) {
                // Output buffer full: flush (same policy as processFrame — truncate at origLen when tscale is 1.0).
                if (tscaleSingle != 1.0 || totalWrittenToFile + outBuffLen <= origLen) {
                    if (isWavFileOutput)
                        dout.writeDouble(outBuff, 0, outBuffLen);
                    else {
                        if (output == null) {
                            output = new double[outBuffLen];
                            System.arraycopy(outBuff, 0, output, 0, outBuffLen);
                        } else {
                            outputTmp = new double[output.length];
                            System.arraycopy(output, 0, outputTmp, 0, output.length);
                            output = new double[outputTmp.length + outBuffLen];
                            System.arraycopy(outputTmp, 0, output, 0, outputTmp.length);
                            System.arraycopy(outBuff, 0, output, outputTmp.length, outBuffLen);
                        }
                    }

                    totalWrittenToFile += outBuffLen;
                } else {
                    if (isWavFileOutput)
                        dout.writeDouble(outBuff, 0, origLen - totalWrittenToFile);
                    else {
                        if (output == null) {
                            output = new double[origLen - totalWrittenToFile];
                            System.arraycopy(outBuff, 0, output, 0, origLen - totalWrittenToFile);
                        } else {
                            outputTmp = new double[output.length];
                            System.arraycopy(output, 0, outputTmp, 0, output.length);
                            output = new double[outputTmp.length + origLen - totalWrittenToFile];
                            System.arraycopy(outputTmp, 0, output, 0, outputTmp.length);
                            System.arraycopy(outBuff, 0, output, outputTmp.length, origLen - totalWrittenToFile);
                        }
                    }

                    totalWrittenToFile = origLen;
                }

                outBuffStart = 1;
            }
        }

        // Flush any partially-filled output buffer.
        if (outBuffStart > 1) {
            if (tscaleSingle != 1.0 || totalWrittenToFile + outBuffStart - 1 <= origLen) {
                if (isWavFileOutput)
                    dout.writeDouble(outBuff, 0, outBuffStart - 1);
                else {
                    if (output == null) {
                        output = new double[outBuffStart - 1];
                        System.arraycopy(outBuff, 0, output, 0, outBuffStart - 1);
                    } else {
                        outputTmp = new double[output.length];
                        System.arraycopy(output, 0, outputTmp, 0, output.length);
                        output = new double[outputTmp.length + outBuffStart - 1];
                        System.arraycopy(outputTmp, 0, output, 0, outputTmp.length);
                        System.arraycopy(outBuff, 0, output, outputTmp.length, outBuffStart - 1);
                    }
                }

                totalWrittenToFile += outBuffStart - 1;
            } else {
                if (isWavFileOutput)
                    dout.writeDouble(outBuff, 0, origLen - totalWrittenToFile);
                else {
                    if (output == null) {
                        output = new double[origLen - totalWrittenToFile];
                        System.arraycopy(outBuff, 0, output, 0, origLen - totalWrittenToFile);
                    } else {
                        outputTmp = new double[output.length];
                        System.arraycopy(output, 0, outputTmp, 0, output.length);
                        output = new double[outputTmp.length + origLen - totalWrittenToFile];
                        System.arraycopy(outputTmp, 0, output, 0, outputTmp.length);
                        System.arraycopy(outBuff, 0, output, outputTmp.length, origLen - totalWrittenToFile);
                    }
                }

                totalWrittenToFile = origLen;
            }
        }
        //
        // NOTE(review): reconstructed as live code — the temp binary stream must be closed before convertToWav reads it;
        // confirm against upstream.
        if (dout != null)
            dout.close();

        return output;
    }

    /**
     * Converts the temporary big-endian binary output into a wav file at outputFile, normalizing to [-1, 1] if the
     * absolute maximum exceeds 1.0, and deletes the temporary file afterwards.
     *
     * @param audioformat format to use for the written wav file
     * @throws IOException if reading the temp file or writing the wav file fails
     */
    public void convertToWav(AudioFormat audioformat) throws IOException {
        // Read the temp binary file into a wav file and delete the temp binary file
        if (tempOutBinaryFile != null) {
            double[] yOut = null;

            din = new LEDataInputStream(tempOutBinaryFile);
            yOut = din.readDouble(totalWrittenToFile);
            din.close();

            double tmpMax = MathUtils.getAbsMax(yOut);
            if (tmpMax > 1.0) {
                // Peak-normalize to avoid clipping in the written wav.
                for (int n = 0; n < yOut.length; n++)
                    yOut[n] /= tmpMax;
            }

            outputAudio = new DDSAudioInputStream(new BufferedDoubleDataSource(yOut), audioformat);
            AudioSystem.write(outputAudio, AudioFileFormat.Type.WAVE, new File(outputFile));

            File tmpFile = new File(tempOutBinaryFile);
            tmpFile.delete();
            //
        }
    }

    /**
     * Runs FD-PSOLA on a wav file with the given pitch/time/energy/vocal-tract scale factors; the output file name is
     * derived from the input name plus a suffix encoding the scale settings. Expects a matching ".ptc" pitch file next
     * to the input wav.
     */
    public static void mainParametric(String inputWavFile, double[] pscales, double[] tscales, double[] escales,
            double[] vscales) throws
UnsupportedAudioFileException, IOException {
        String strExt = "";
        String strTmp;

        // Build an output-file suffix encoding the (single) pitch/duration scale settings,
        // e.g. pscale 1.50 -> "_p150"; variable-length scale arrays get "_pvar_dvar".
        if (pscales.length == 1 && tscales.length == 1) {
            if (pscales[0] != 1.0) {
                strTmp = String.valueOf(pscales[0]);
                while (strTmp.length() < 4)
                    strTmp += "0";
                // Drop the decimal point: "1.50" -> "150".
                strTmp = strTmp.substring(0, 1) + strTmp.substring(2, 3) + strTmp.substring(3, 4);
                strExt += "_p" + strTmp;
            }

            if (tscales[0] != 1.0) {
                strTmp = String.valueOf(tscales[0]);
                while (strTmp.length() < 4)
                    strTmp += "0";
                // Drop the decimal point: "1.50" -> "150".
                strTmp = strTmp.substring(0, 1) + strTmp.substring(2, 3) + strTmp.substring(3, 4);
                strExt += "_d" + strTmp;
            }

            if (pscales[0] == 1.0 && tscales[0] == 1.0)
                strExt = "_none";
        } else
            strExt = "_pvar_dvar";

        String strOutputFile = inputWavFile.substring(0, inputWavFile.length() - 4) + "_fd" + strExt + ".wav";
        String strPitchFile = inputWavFile.substring(0, inputWavFile.length() - 4) + ".ptc";

        FDPSOLAProcessor fd = new FDPSOLAProcessor(inputWavFile, strPitchFile, strOutputFile, pscales, tscales, escales,
                vscales);

        fd.fdpsolaOnline();
    }

    /**
     * Command-line test driver: args[0] is the input wav file. The hard-coded {@code if (true)} switch selects a
     * single-setting test; the dead branch sweeps many pitch/time scale combinations.
     */
    @SuppressWarnings("unused")
    public static void main(String[] args) throws Exception {
        if (true) // Test with only one setting
        {
            // double [] pscales = {0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 1.05, 1.10, 1.15, 1.20, 1.25, 1.30, 1.35,
            // 1.40, 1.45, 1.50};
            // double [] tscales = {1.50, 1.45, 1.40, 1.35, 1.30, 1.25, 1.20, 1.15, 1.10, 1.05, 0.95, 0.90, 0.85, 0.80, 0.75,
            // 0.70, 0.65, 0.60};
            double[] pscales = { 1.0 };
            double[] tscales = { 1.2 };
            double[] escales = { 1.0 };
            double[] vscales = { 1.0 };
            mainParametric(args[0], pscales, tscales, escales, vscales);
        } else // Test with multiple settings
        {
            double[] escales = { 1.0 };
            double[] vscales = { 1.0 };
            double[] pscales = { 1.0 };
            double[] tscales = { 1.0 };
            mainParametric(args[0], pscales, tscales, escales, vscales);

            // Pitch-scale sweep at tscale = 1.0.
            pscales[0] = 0.55;
            tscales[0] = 1.0;
            mainParametric(args[0], pscales, tscales, escales, vscales);
            pscales[0] = 0.80;
            mainParametric(args[0], pscales, tscales, escales, vscales);
            pscales[0] = 1.50;
            mainParametric(args[0], pscales, tscales, escales, vscales);
            pscales[0] = 2.50;
            mainParametric(args[0], pscales, tscales, escales, vscales);

            // Time-scale sweep at pscale = 1.0.
            pscales[0] = 1.0;
            tscales[0] = 0.55;
            mainParametric(args[0], pscales, tscales, escales, vscales);
            tscales[0] = 0.80;
            mainParametric(args[0], pscales, tscales, escales, vscales);
            tscales[0] = 1.50;
            mainParametric(args[0], pscales, tscales, escales, vscales);
            tscales[0] = 2.50;
            mainParametric(args[0], pscales, tscales, escales, vscales);

            // Combined pitch and time scaling.
            pscales[0] = 0.55;
            tscales[0] = 0.80;
            mainParametric(args[0], pscales, tscales, escales, vscales);
            pscales[0] = 0.80;
            tscales[0] = 2.50;
            mainParametric(args[0], pscales, tscales, escales, vscales);
            pscales[0] = 1.50;
            tscales[0] = 0.55;
            mainParametric(args[0], pscales, tscales, escales, vscales);
            pscales[0] = 2.50;
            tscales[0] = 1.50;
            mainParametric(args[0], pscales, tscales, escales, vscales);

            // Frame-varying scale factors.
            double[] pscalesVar = { 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 1.05, 1.10, 1.15, 1.20, 1.25, 1.30,
                    1.35, 1.40, 1.45, 1.50 };
            double[] tscalesVar = { 1.50, 1.45, 1.40, 1.35, 1.30, 1.25, 1.20, 1.15, 1.10, 1.05, 0.95, 0.90, 0.85, 0.80,
                    0.75, 0.70, 0.65, 0.60 };
            mainParametric(args[0], pscalesVar, tscalesVar, escales, vscales);
        }

        System.out.println("FDPSOLA test completed...");
    }
}