/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved. Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.signalproc.adaptation;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;

import javax.sound.sampled.AudioFileFormat;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.UnsupportedAudioFileException;

import marytts.signalproc.adaptation.codebook.WeightedCodebook;
import marytts.signalproc.adaptation.codebook.WeightedCodebookLsfMatch;
import marytts.signalproc.adaptation.codebook.WeightedCodebookMapper;
import marytts.signalproc.adaptation.codebook.WeightedCodebookTransformerParams;
import marytts.signalproc.adaptation.gmm.jointgmm.JointGMMMapper;
import marytts.signalproc.adaptation.gmm.jointgmm.JointGMMMatch;
import marytts.signalproc.adaptation.gmm.jointgmm.JointGMMSet;
import marytts.signalproc.adaptation.gmm.jointgmm.JointGMMTransformerParams;
import marytts.signalproc.adaptation.prosody.PitchMapping;
import marytts.signalproc.adaptation.prosody.PitchStatistics;
import marytts.signalproc.adaptation.prosody.PitchTransformationData;
import marytts.signalproc.adaptation.prosody.PitchTransformer;
import marytts.signalproc.adaptation.prosody.ProsodyTransformerParams;
import marytts.signalproc.adaptation.smoothing.SmoothingDefinitions;
import marytts.signalproc.adaptation.smoothing.SmoothingFile;
import marytts.signalproc.adaptation.smoothing.TemporalSmoother;
import marytts.signalproc.analysis.F0ReaderWriter;
import marytts.signalproc.analysis.Labels;
import marytts.signalproc.analysis.LpcAnalyser;
import marytts.signalproc.analysis.LpcAnalyser.LpCoeffs;
import marytts.signalproc.analysis.LsfAnalyser;
import marytts.signalproc.analysis.PitchMarks;
import marytts.signalproc.analysis.PitchReaderWriter;
import marytts.signalproc.process.PsolaFrameProvider;
import marytts.signalproc.process.VoiceModificationParametersPreprocessor;
import marytts.signalproc.window.DynamicWindow;
import marytts.signalproc.window.Window;
import marytts.util.data.BufferedDoubleDataSource;
import marytts.util.data.DoubleDataSource;
import marytts.util.data.audio.AudioDoubleDataSource;
import marytts.util.data.audio.DDSAudioInputStream;
import marytts.util.display.DisplayUtils;
import marytts.util.io.FileUtils;
import marytts.util.io.LEDataInputStream;
import marytts.util.io.LEDataOutputStream;
import marytts.util.math.ArrayUtils;
import marytts.util.math.ComplexArray;
import marytts.util.math.FFTMixedRadix;
import marytts.util.math.MathUtils;
import marytts.util.signal.SignalProcUtils;

/**
 * A class that supports voice conversion through weighted codebook mapping or joint GMMs, combined with FD-PSOLA based
 * prosody and vocal tract modifications.
 *
 * Reference: Moulines, E. and W. Verhelst, 1995, “Time-Domain and Frequency-Domain Techniques for Prosodic Modification
 * of Speech”, in Kleijn and Paliwal (eds.), Speech Coding And Synthesis, pp. 519-555, Elsevier Science B.V., Netherlands.
 *
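 * <p>
 * A minimal usage sketch (hedged: the file names are hypothetical and the scale arrays are illustrative; real callers
 * usually derive them from transformer parameters):
 *
 * <pre>{@code
 * BaselineAdaptationItem item = new BaselineAdaptationItem();
 * item.setFromWavFilename("source.wav"); // derives audioFile, f0File, labelFile, ... from one base name
 *
 * WeightedCodebookTransformerParams params = new WeightedCodebookTransformerParams();
 * double[] pscales = { 1.0 }; // pitch scale factors
 * double[] tscales = { 1.0 }; // time scale factors
 * double[] escales = { 1.0 }; // energy scale factors
 * double[] vscales = { 1.0 }; // vocal tract (frequency warping) scale factors
 *
 * FdpsolaAdapter adapter = new FdpsolaAdapter(item, "converted.wav", params, pscales, tscales, escales, vscales);
 * // adapter.fdpsolaOnline(mapper, codebook, pitchMapping); // see fdpsolaOnline(...)
 * }</pre>
 *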
 * @author Oytun Türk
 */
public class FdpsolaAdapter {
    protected DoubleDataSource input;
    protected AudioInputStream inputAudio;
    protected DDSAudioInputStream outputAudio;
    protected VoiceModificationParametersPreprocessor modParams;
    protected int numfrm;
    protected int numfrmFixed;
    protected int lpOrder; // LP analysis order
    protected String outputFile;
    protected String tempOutBinaryFile;
    protected int origLen;
    protected PitchMarks pm;
    protected double[] f0s;
    protected PsolaFrameProvider psFrm;
    protected double wsFixedInSeconds;
    protected double ssFixedInSeconds;
    protected int numPeriods;
    protected static int NUM_PITCH_SYNC_PERIODS = 3;

    public boolean bSilent;
    protected LEDataOutputStream dout; // Output stream for the little-endian binary temp file
    protected LEDataInputStream din; // Input stream for the little-endian binary temp file
    protected DynamicWindow windowIn;
    protected DynamicWindow windowOut;
    protected double[] wgt;
    protected double[] wgty;

    protected int frmSize;
    protected int newFrmSize;
    protected int newPeriod;
    protected int synthFrmInd;
    protected double localDurDiff;
    // -1: skip frame, 0: no repetition (use synthesized frame as it is), >0: number of repetitions
    // for synthesized frame
    protected int repeatSkipCount;
    protected double localDurDiffSaved;
    protected double sumLocalDurDiffs;
    protected double nextAdd;
    protected int synthSt;
    protected int synthTotal;
    protected int maxFrmSize;
    protected int maxNewFrmSize;
    protected int synthFrameInd;
    protected boolean bLastFrame;
    protected boolean bBroke;

    protected int outBuffLen;
    protected double[] outBuff;
    protected int outBuffStart;
    protected int totalWrittenToFile;

    protected double[] ySynthBuff;
    protected double[] wSynthBuff;
    protected int ySynthInd;
    protected double[] frm;
    protected boolean bWarp;

    protected double[] inputVT;
    protected double[] py2;
    protected ComplexArray hy;
    protected double[] frmy;
    protected double frmEn;
    protected double frmyEn;
    protected double gain;
    protected int newSkipSize;
    protected int halfWin;
    protected double[] newVScales;
    protected double[] tmpvsc;
    // protected boolean isWavFileOutput;
    protected int inputFrameIndex;
    protected static double MIN_PSCALE = 0.1;
    protected static double MAX_PSCALE = 5.0;
    protected static double MIN_TSCALE = 0.1;
    protected static double MAX_TSCALE = 5.0;
    protected int fs;

    protected double tscaleSingle;

    private double desiredFrameTime;
    private boolean bShowSpectralPlots;

    private PitchTransformer pitchTransformer;

    private SmoothingFile smoothingFile;
    private double[][] smoothedVocalTract;
    private int smoothedInd;
    private int[] preselectedIndices;
    private int[] allIndices;
    private Labels labels;
    private Labels targetLabels;
    private int currentLabelIndex;
    private double[][] targetLsfs;

    private BaselineTransformerParams baseParams;

    public FdpsolaAdapter(BaselineAdaptationItem inputItem, String strOutputFile,
            WeightedCodebookTransformerParams wctParamsIn, double[] pscales, double[] tscales, double[] escales,
            double[] vscales) throws UnsupportedAudioFileException, IOException {
        baseParams = new WeightedCodebookTransformerParams(wctParamsIn);

        init(inputItem, strOutputFile, pscales, tscales, escales, vscales);
    }

    public FdpsolaAdapter(BaselineAdaptationItem inputItem, String strOutputFile,
            JointGMMTransformerParams jgmmParamsIn, double[] pscales, double[] tscales, double[] escales,
            double[] vscales) throws UnsupportedAudioFileException, IOException {
        baseParams = new JointGMMTransformerParams(jgmmParamsIn);

        init(inputItem, strOutputFile, pscales, tscales, escales, vscales);
    }
    public void init(BaselineAdaptationItem inputItem, String strOutputFile, double[] pscales, double[] tscales,
            double[] escales, double[] vscales) throws IOException {
        // Smoothing
        smoothingFile = null;
        if (baseParams.smoothingState == SmoothingDefinitions.NONE)
            baseParams.smoothedVocalTractFile = "";

        if (baseParams.smoothingState == SmoothingDefinitions.ESTIMATING_SMOOTHED_VOCAL_TRACT) {
            if ("".equals(baseParams.smoothedVocalTractFile))
                throw new IllegalArgumentException("smoothedVocalTractFile not valid");
            else {
                smoothingFile = new SmoothingFile(baseParams.smoothedVocalTractFile, SmoothingFile.OPEN_FOR_WRITE);
                smoothingFile.smoothingMethod = baseParams.smoothingMethod;
                smoothingFile.writeHeader();
            }
        }

        if (baseParams.smoothingState == SmoothingDefinitions.TRANSFORMING_TO_SMOOTHED_VOCAL_TRACT
                && baseParams.smoothingMethod != SmoothingDefinitions.NO_SMOOTHING) {
            if (!FileUtils.exists(baseParams.smoothedVocalTractFile))
                throw new IllegalArgumentException("smoothedVocalTractFile not found");
            else {
                smoothingFile = new SmoothingFile(baseParams.smoothedVocalTractFile, SmoothingFile.OPEN_FOR_READ);
                smoothedVocalTract = smoothingFile.readAll();
                smoothedInd = 0;
            }
        }
        //

        pitchTransformer = new PitchTransformer();

        inputAudio = null;
        input = null;
        pm = null;
        f0s = null;

        wsFixedInSeconds = 0.02;
        ssFixedInSeconds = 0.01;
        numPeriods = NUM_PITCH_SYNC_PERIODS;

        origLen = 0;
        fs = 16000;

        numfrm = 0; // Total pitch synchronous frames (this is the actual number of frames to be processed)
        numfrmFixed = 0; // Total frames if the analysis were fixed skip-rate

        modParams = null;

        outputFile = null;

        tscaleSingle = 1.0;

        boolean bContinue = true;

        if (!FileUtils.exists(inputItem.audioFile)) {
            System.out.println("Error! Audio file " + inputItem.audioFile + " not found.");
            bContinue = false;
        }

        if (!FileUtils.exists(inputItem.f0File) && !FileUtils.exists(inputItem.pitchFile)) {
            System.out.println("Error! No F0 or pitch file found: " + inputItem.f0File + " " + inputItem.pitchFile);
            bContinue = false;
        }

        if (strOutputFile == null || strOutputFile.isEmpty()) {
            System.out.println("Invalid output file...");
            bContinue = false;
        }

        if (bContinue) {
            boolean isF0File = false;
            if (FileUtils.exists(inputItem.f0File))
                isF0File = true;

            try {
                inputAudio = AudioSystem.getAudioInputStream(new File(inputItem.audioFile));
            } catch (UnsupportedAudioFileException e) {
                throw new IOException("Cannot open audio", e);
            }

            input = new AudioDoubleDataSource(inputAudio);
            origLen = (int) input.getDataLength();
            fs = (int) inputAudio.getFormat().getSampleRate();

            PitchReaderWriter f0 = null;
            if (FileUtils.exists(inputItem.f0File))
                f0 = new F0ReaderWriter(inputItem.f0File);
            else
                f0 = new PitchReaderWriter(inputItem.pitchFile);
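            /*
             * Frame bookkeeping sketch (illustrative numbers, not taken from the code): with fs = 16000 Hz, a
             * one-second input (origLen = 16000 samples, no zero padding), wsFixedInSeconds = 0.02 and
             * ssFixedInSeconds = 0.01, the fixed-rate frame count below evaluates to
             * floor((1.0 - 0.01) / 0.01 + 0.5) + 2 = 99 + 2 = 101 frames. In the pitch-synchronous case, numfrm is
             * instead derived from the pitch marks: one analysis frame per pitch mark, each spanning
             * numPeriods = 3 periods, so the last numPeriods marks cannot start a full frame.
             */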
            pm = SignalProcUtils.pitchContour2pitchMarks(f0.contour, fs, origLen, f0.header.windowSizeInSeconds,
                    f0.header.skipSizeInSeconds, true, 0);

            // Total frames if the analysis were fixed skip-rate
            numfrmFixed = (int) (Math.floor(((double) (origLen + pm.totalZerosToPadd) / fs - 0.5 * wsFixedInSeconds)
                    / ssFixedInSeconds + 0.5) + 2);

            if (!baseParams.isFixedRateVocalTractConversion)
                // Total pitch synchronous frames (this is the actual number of frames to be processed)
                numfrm = pm.pitchMarks.length - numPeriods;
            else
                numfrm = numfrmFixed;

            f0s = SignalProcUtils.fixedRateF0Values(pm, wsFixedInSeconds, ssFixedInSeconds, numfrmFixed, fs);

            lpOrder = SignalProcUtils.getLPOrder(fs);

            // Estimation of time varying pitch, duration, energy, and vocal tract scaling amounts using either:
            // - pscales, tscales, escales, vscales
            // - or FESTIVAL_UTT (for pitch and duration) and target wav file (for energy)
            if (!baseParams.isPitchFromTargetFile && !baseParams.isDurationFromTargetFile
                    && !baseParams.isEnergyFromTargetFile) {
                modParams = new VoiceModificationParametersPreprocessor(fs, lpOrder, pscales, tscales, escales, vscales,
                        pm.pitchMarks, wsFixedInSeconds, ssFixedInSeconds, numfrm, numfrmFixed, numPeriods,
                        baseParams.isFixedRateVocalTractConversion);
                tscaleSingle = modParams.tscaleSingle;
            } else {
                // inputItem.targetEnergyFile should be computed from inputItem.targetWavFile at this point
                // inputItem.energyFile should be computed from inputItem.audioFile at this point
                String targetAlignmentFile = null;
                if (baseParams.targetAlignmentFileType == BaselineTransformerParams.LABELS)
                    targetAlignmentFile = inputItem.targetLabelFile;
                else if (baseParams.targetAlignmentFileType == BaselineTransformerParams.FESTIVAL_UTT)
                    targetAlignmentFile = inputItem.targetFestivalUttFile;

                String sourcePitchContourFile;
                if (isF0File)
                    sourcePitchContourFile = inputItem.f0File;
                else
                    sourcePitchContourFile = inputItem.pitchFile;

                String targetPitchContourFile;
                if (isF0File)
                    targetPitchContourFile = inputItem.targetF0File;
                else
                    targetPitchContourFile = inputItem.targetPitchFile;

                modParams = new VoiceModificationParametersPreprocessor(sourcePitchContourFile, isF0File,
                        inputItem.labelFile, inputItem.audioFile, targetPitchContourFile, inputItem.targetWavFile,
                        baseParams.isPitchFromTargetFile, baseParams.pitchFromTargetMethod,
                        baseParams.isDurationFromTargetFile, baseParams.durationFromTargetMethod,
                        baseParams.isEnergyFromTargetFile, baseParams.targetAlignmentFileType, targetAlignmentFile,
                        pm.pitchMarks, wsFixedInSeconds, ssFixedInSeconds, numfrm, numfrmFixed, numPeriods,
                        baseParams.isFixedRateVocalTractConversion);
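                /*
                 * tscaleSingle acts as a sentinel for the time-scaling mode (illustrative values): if every entry of
                 * modParams.tscalesVar equals 1.0, tscaleSingle stays 1.0 and the output can later be truncated to the
                 * input length; a single entry such as tscalesVar = {1.0, 1.2, 1.0} flips it to -1.0, marking variable
                 * time scaling so the synthesized length must be kept as-is.
                 */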
                tscaleSingle = 1.0;
                for (int i = 0; i < modParams.tscalesVar.length; i++) {
                    if (modParams.tscalesVar[i] != 1.0) {
                        tscaleSingle = -1.0;
                        break;
                    }
                }
            }

            outputFile = strOutputFile;

            if (!"".equals(inputItem.labelFile) && FileUtils.exists(inputItem.labelFile))
                labels = new Labels(inputItem.labelFile);
            else
                labels = null;

            if (!"".equals(inputItem.targetLabelFile) && FileUtils.exists(inputItem.targetLabelFile))
                targetLabels = new Labels(inputItem.targetLabelFile);
            else
                targetLabels = null;

            if (!"".equals(inputItem.targetWavFile) && FileUtils.exists(inputItem.targetWavFile)
                    && baseParams.isLsfsFromTargetFile) {
                try {
                    targetLsfs = LsfAnalyser.lsfAnalyzeWavFile(inputItem.targetWavFile, baseParams.lsfParams);
                } catch (UnsupportedAudioFileException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            } else
                targetLsfs = null;
        }

        if (bContinue) {
            tmpvsc = new double[1];
            bSilent = false;

            if (outputFile != null)
                tempOutBinaryFile = outputFile + ".bin";

            if (!baseParams.isFixedRateVocalTractConversion)
                psFrm = new PsolaFrameProvider(input, pm, modParams.fs, modParams.numPeriods);
            else
                psFrm = new PsolaFrameProvider(input, wsFixedInSeconds, ssFixedInSeconds, modParams.fs, numfrm);

            try {
                dout = new LEDataOutputStream(tempOutBinaryFile);
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            }

            windowIn = new DynamicWindow(Window.HANNING);
            windowOut = new DynamicWindow(Window.HANNING);

            frmSize = 0;
            newFrmSize = 0;
            newPeriod = 0;
            synthFrmInd = 0;
            localDurDiff = 0.0;
            // -1: skip frame, 0: no repetition (use synthesized frame as it is), >0: number of repetitions for
            // synthesized frame
            repeatSkipCount = 0;
            localDurDiffSaved = 0.0;
            sumLocalDurDiffs = 0.0;
            nextAdd = 0.0;

            synthSt = pm.pitchMarks[0];

            synthTotal = 0;

            maxFrmSize = (int) (numPeriods * fs / 40.0);
            if ((maxFrmSize % 2) != 0)
                maxFrmSize++;

            maxNewFrmSize = (int) (Math.floor(maxFrmSize / MIN_PSCALE + 0.5));
            if ((maxNewFrmSize % 2) != 0)
                maxNewFrmSize++;

            synthFrameInd = 0;
            bLastFrame = false;
            bBroke = false;

            outBuffLen = 500000;
            outBuff = MathUtils.zeros(outBuffLen);
            outBuffStart = 1;
            totalWrittenToFile = 0;

            ySynthBuff = MathUtils.zeros(maxNewFrmSize);
            wSynthBuff = MathUtils.zeros(maxNewFrmSize);
            ySynthInd = 1;
        }
    }
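    /**
     * Runs the analysis-modification-synthesis loop over all frames and writes the converted signal to the output wav
     * file. A hedged calling sketch (the mapper/data names are illustrative; any matching
     * {@link VocalTractTransformationFunction} / {@link VocalTractTransformationData} pair can be passed):
     *
     * <pre>{@code
     * // Hypothetical names; a codebook and a pitch mapping are normally loaded from trained model files.
     * WeightedCodebookMapper mapper = new WeightedCodebookMapper(codebookTransformerParams.mapperParams);
     * adapter.fdpsolaOnline(mapper, codebook, pitchMapping);
     * }</pre>
     */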
    public void fdpsolaOnline(VocalTractTransformationFunction vtMapper, VocalTractTransformationData vtData,
            PitchTransformationData ptData) throws IOException {
        int i;
        double[] frmIn;
        boolean isLastInputFrame;
        int inputFrameSize;
        int currentPeriod;

        desiredFrameTime = 1.05;
        bShowSpectralPlots = false;

        fs = (int) inputAudio.getFormat().getSampleRate();

        PitchStatistics inputF0Statistics = new PitchStatistics(baseParams.prosodyParams.pitchStatisticsType, f0s);

        double[] targetF0s = null;

        if (!baseParams.isPitchFromTargetFile) {
            if (ptData instanceof PitchMapping) {
                targetF0s = pitchTransformer.transform(baseParams.prosodyParams,
                        ((PitchMapping) ptData).f0StatisticsMapping, inputF0Statistics, f0s, modParams.pscalesVar);
            } else {
                baseParams.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.USE_ONLY_PSCALES;
                targetF0s = pitchTransformer.transform(baseParams.prosodyParams,
                        ((PitchMapping) ptData).f0StatisticsMapping, inputF0Statistics, f0s, modParams.pscalesVar);
            }
        } else {
            targetF0s = new double[numfrm];
        }

        preselectedIndices = null;
        allIndices = null;
        if (vtData instanceof WeightedCodebook && baseParams instanceof WeightedCodebookTransformerParams
                && !((WeightedCodebookTransformerParams) baseParams).isContextBasedPreselection) {
            // Whole codebook
            allIndices = new int[((WeightedCodebook) vtData).entries.length];
            for (i = 0; i < allIndices.length; i++)
                allIndices[i] = i;
        }

        inputFrameIndex = 0;
        for (i = 0; i < numfrm; i++) {
            frmIn = psFrm.getNextFrame();

            if (bBroke)
                break;

            if (i == numfrm - 1)
                isLastInputFrame = true;
            else
                isLastInputFrame = false;

            if (!baseParams.isFixedRateVocalTractConversion) {
                currentPeriod = pm.pitchMarks[i + 1] - pm.pitchMarks[i];
                inputFrameSize = pm.pitchMarks[i + modParams.numPeriods] - pm.pitchMarks[i] + 1;
            } else {
                currentPeriod = -1;
                inputFrameSize = frmIn.length;
            }

            double targetF0Value = 0.0;
            double currentF0Value;
            int index = (int) (Math.floor((psFrm.getCurrentTime() - 0.5 * wsFixedInSeconds) / ssFixedInSeconds + 0.5));
            if (!baseParams.isPitchFromTargetFile) {
                index = MathUtils.CheckLimits(index, 0, targetF0s.length - 1);
                targetF0Value = targetF0s[index];
            }

            currentF0Value = f0s[index];

            if (baseParams.isPitchFromTargetFile) {
                targetF0Value = currentF0Value * modParams.pscalesVar[i];
                targetF0s[i] = targetF0Value;
            }

            boolean isVoiced;
            if (!baseParams.isFixedRateVocalTractConversion) {
                if (pm.f0s[i] > 10.0)
                    isVoiced = true;
                else
                    isVoiced = false;
            } else {
                if (currentF0Value > 10.0)
                    isVoiced = true;
                else
                    isVoiced = false;
            }

            double currentF0;
            if (isVoiced)
                currentF0 = ((double) fs) / currentPeriod;
            else
                currentF0 = 0.0;

            if (labels != null)
                currentLabelIndex = SignalProcUtils.time2LabelIndex(psFrm.getCurrentTime(), labels);
            else
                currentLabelIndex = -1;

            processFrame(frmIn, isVoiced, currentF0, targetF0Value, modParams.tscalesVar[i], modParams.escalesVar[i],
                    modParams.vscalesVar[i], isLastInputFrame, currentPeriod, inputFrameSize, vtMapper, vtData);

            if (baseParams.isVocalTractTransformation
                    && baseParams.smoothingState == SmoothingDefinitions.TRANSFORMING_TO_SMOOTHED_VOCAL_TRACT
                    && baseParams.smoothingMethod != SmoothingDefinitions.NO_SMOOTHING) {
                smoothedInd++;
                if (smoothedInd > smoothedVocalTract.length - 1)
                    smoothedInd = smoothedVocalTract.length - 1;
            }
        }

        writeFinal();

        convertToWav(inputAudio.getFormat());

        inputAudio.close();

        // Perform smoothing on the vocal tract parameter file
        if (baseParams.smoothingState == SmoothingDefinitions.ESTIMATING_SMOOTHED_VOCAL_TRACT) {
            if (baseParams.isVocalTractTransformation) {
                System.out.println("Temporal smoothing started using "
                        + String.valueOf(baseParams.smoothingNumNeighbours) + " neighbours...");
                smoothingFile.close();
                smoothingFile = new SmoothingFile(baseParams.smoothedVocalTractFile, SmoothingFile.OPEN_FOR_READ);
                double[][] vts = smoothingFile.readAll();

                double[] tmp1 = new double[vts.length];
                for (i = 0; i < vts.length; i++)
                    tmp1[i] = vts[i][20];

                vts = TemporalSmoother.smooth(vts, baseParams.smoothingNumNeighbours);

                double[] tmp2 = new double[vts.length];
                for (i = 0; i < vts.length; i++)
                    tmp2[i] = vts[i][20];

                smoothingFile = new SmoothingFile(baseParams.smoothedVocalTractFile, SmoothingFile.OPEN_FOR_WRITE,
                        baseParams.smoothingMethod);
                smoothingFile.writeAll(vts);
                System.out.println("Temporal smoothing completed...");
            }
        } else if (baseParams.smoothingState == SmoothingDefinitions.TRANSFORMING_TO_SMOOTHED_VOCAL_TRACT)
            FileUtils.delete(baseParams.smoothedVocalTractFile);
        //

        /*
         * MaryUtils.plot(f0s); MaryUtils.plot(targetF0s);
         */
    }

    /**
     * Voice conversion version of the FD-PSOLA algorithm. The original FD-PSOLA is described in:
     *
     * Moulines, E. and W. Verhelst, 1995, “Time-Domain and Frequency-Domain Techniques for Prosodic Modification of
     * Speech”, in Kleijn and Paliwal (eds.), Speech Coding And Synthesis, pp. 519-555, Elsevier Science B.V.,
     * Netherlands.
     *
     * and the spectral conversion using codebook mapping in:
     *
     * Arslan, L. M., 1999, “Speaker Transformation Algorithm using Segmental Codebooks”, Speech Communication, 28,
     * pp. 211-226.
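     *
     * <p>
     * Worked example of the per-frame scale bookkeeping below (illustrative numbers): with currentF0 = 100 Hz and
     * targetF0 = 140 Hz, pscale = 1.4; a voiced frame of frmSize = 480 samples is resampled to
     * newFrmSize = floor(480 / 1.4 + 0.5) = 343, rounded up to 344 to stay even. With tscale = 1.0 the duration
     * deficit is (480 * 1.0 - 344) / 3 = 45.3 samples per period; since this exceeds 0.1 * newPeriod
     * (newPeriod = floor(344 / 3 + 0.5) = 115), the frame is marked for one repetition and the negative remainder is
     * carried over in nextAdd.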
     */
    public double[] processFrame(double[] frmIn, boolean isVoiced, double currentF0, double targetF0, double tscale,
            double escale, double vscale, boolean isLastInputFrame, int currentPeriod, int inputFrameSize,
            VocalTractTransformationFunction mapper, VocalTractTransformationData data) throws IOException {
        double pscale;
        if (currentF0 > 10.0)
            pscale = targetF0 / currentF0;
        else
            pscale = 1.0;

        if (pscale < MIN_PSCALE)
            pscale = MIN_PSCALE;
        if (pscale > MAX_PSCALE)
            pscale = MAX_PSCALE;
        if (tscale < MIN_TSCALE)
            tscale = MIN_TSCALE;
        if (tscale > MAX_TSCALE)
            tscale = MAX_TSCALE;

        double[] output = null;
        double[] outputTmp = null;
        int j, k, wInd, kMax;
        int tmpFix, tmpAdd, tmpMul;
        int remain;
        int kInd;

        VocalTractMatch match = null;

        windowIn = new DynamicWindow(baseParams.lsfParams.windowType);
        windowOut = new DynamicWindow(baseParams.lsfParams.windowType);

        // -1: skip frame, 0: no repetition (use synthesized frame as it is), >0: number of repetitions for
        // synthesized frame
        repeatSkipCount = 0;

        // Compute new frame sizes, change in durations due to pitch scaling, and required compensation amount in
        // samples
        // &
        // Find out which pitch-scaled frames to repeat/skip for overall duration compensation
        frmSize = inputFrameSize;
        if ((frmSize % 2) != 0)
            frmSize++;
        if (frmSize < 4)
            frmSize = 4;

        if (isVoiced) {
            newFrmSize = (int) (Math.floor(frmSize / pscale + 0.5));
            if ((newFrmSize % 2) != 0)
                newFrmSize++;
            if (newFrmSize < 4)
                newFrmSize = 4;
        } else
            newFrmSize = frmSize;

        newPeriod = (int) Math.floor(((double) newFrmSize) / NUM_PITCH_SYNC_PERIODS + 0.5);

        // Compute duration compensation required:
        // localDurDiffs(i) = (DESIRED) - (AFTER PITCH SCALING)
        // (-) if expansion occurred, (+) if compression occurred
        // We aim to make this as close to zero as possible in the following duration compensation step
        localDurDiff = nextAdd + (frmSize * tscale - newFrmSize) / NUM_PITCH_SYNC_PERIODS;

        nextAdd = 0;
        if (localDurDiff < -0.1 * newPeriod) // Expansion occurred, so skip this frame
        {
            repeatSkipCount--;
            if (!isLastInputFrame) {
                nextAdd = localDurDiff + newPeriod;
                localDurDiff = 0;
            }
        } else if (localDurDiff > 0.1 * newPeriod) // Compression occurred, so repeat this frame
        {
            while (localDurDiff > 0.1 * newPeriod) {
                repeatSkipCount++;
                localDurDiff -= newPeriod;
            }

            if (!isLastInputFrame) {
                nextAdd = localDurDiff;
                localDurDiff = 0;
            }
        }

        sumLocalDurDiffs += localDurDiff;

        if (isLastInputFrame) {
            // Check the final length and perform additional repetitions if necessary
            localDurDiff = sumLocalDurDiffs;
            while (localDurDiff > 0) {
                repeatSkipCount++;
                localDurDiff -= newPeriod;
            }
            //
        }

        if (isLastInputFrame) {
            repeatSkipCount++;
            bLastFrame = true;
        }

        double[] tmpSpec;
        ComplexArray tmpComp;

        LpCoeffs inputLPCoeffs = null;
        double[] inputLpcs = null;
        double[] inputLsfs = null;
        double sqrtInputGain;
        double[] targetLpcs = null;

        ComplexArray inputDft = null;
        ComplexArray inputExpTerm = null;
        ComplexArray outputExpTerm = null;
        ComplexArray inputResidual = null;
        ComplexArray outputResidual = null;
        ComplexArray outputDft = null;

        double[] inputVocalTractSpectrum = null;
        double[] interpolatedInputLpcs = null;
        double[] sourceVocalTractSpectrumEstimate = null;
        double[] targetVocalTractSpectrumEstimate = null;
        double[] interpolatedInputVocalTractSpectrum = null;
        double[] outputVocalTractSpectrum = null;
        double[] warpedOutputVocalTractSpectrum = null;
        double[] transformationFilter = null;
        Context currentContext = null;
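        /*
         * Frames that survive the skip test below go through the analysis chain: Hanning windowing, preemphasis
         * (y[n] = x[n] - preCoef * x[n-1], with preCoef taken from baseParams.lsfParams), LPC analysis, and
         * LPC-to-LSF conversion. LSFs are used as the mapping domain because they are bounded, ordered, and
         * interpolate more safely than raw LPC coefficients.
         */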
        if (repeatSkipCount > -1) {
            frm = MathUtils.zeros(frmSize);
            System.arraycopy(frmIn, 0, frm, 0, Math.min(frmIn.length, frmSize));

            wgt = windowIn.values(frmSize);

            if (vscale != 1.0)
                bWarp = true;
            else
                bWarp = false;

            boolean isTransformUnvoiced = true;

            if ((isVoiced && pscale != 1.0) || bWarp || isTransformUnvoiced) {
                frmEn = SignalProcUtils.getEnergy(frm);

                wgt = windowIn.values(frmSize);

                // Windowing
                for (j = 0; j < frmSize; j++)
                    frm[j] = frm[j] * wgt[j];

                // Preemphasis
                frm = SignalProcUtils.applyPreemphasis(frm, baseParams.lsfParams.preCoef);

                // Compute LPC coefficients
                inputLPCoeffs = LpcAnalyser.calcLPC(frm, baseParams.lsfParams.dimension);
                inputLpcs = inputLPCoeffs.getOneMinusA();

                if (!baseParams.lsfParams.isBarkScaled)
                    inputLsfs = LsfAnalyser.lpc2lsfInHz(inputLpcs, fs);
                else
                    inputLsfs = LsfAnalyser.lpc2lsfInBark(inputLpcs, fs);

                sqrtInputGain = inputLPCoeffs.getGain();

                // Find target estimate from codebook
                if (baseParams.isVocalTractTransformation) {
                    if (mapper instanceof WeightedCodebookMapper) {
                        if (baseParams instanceof WeightedCodebookTransformerParams
                                && ((WeightedCodebookTransformerParams) baseParams).isContextBasedPreselection) {
                            currentContext = new Context(labels, currentLabelIndex,
                                    ((WeightedCodebookTransformerParams) baseParams).totalContextNeighbours);
                            preselectedIndices = ((WeightedCodebookMapper) mapper).preselect(currentContext,
                                    (WeightedCodebook) data,
                                    ((WeightedCodebookTransformerParams) baseParams).isVocalTractMatchUsingTargetModel,
                                    ((WeightedCodebookTransformerParams) baseParams).mapperParams.numBestMatches);
                        }

                        if (preselectedIndices != null)
                            match = ((WeightedCodebookMapper) mapper).transform(inputLsfs, (WeightedCodebook) data,
                                    baseParams.isVocalTractMatchUsingTargetModel, preselectedIndices);
                        else
                            match = ((WeightedCodebookMapper) mapper).transform(inputLsfs, (WeightedCodebook) data,
                                    baseParams.isVocalTractMatchUsingTargetModel, allIndices);

                        // Use source for testing things. Don't forget to set isSourceVocalTractSpectrumFromModel=false
                        // codebookMatch = new WeightedCodebookMatch(inputLsfs, inputLsfs);
                    } else if (mapper instanceof JointGMMMapper) {
                        // Different weighting strategies can be tested here, i.e. doing a fuzzy phone classification
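                        // The code below implements the simplest strategy: a one-hot weight vector. For example, with
                        // four GMM classes and the current phone mapped to class 2, gmmWeights becomes
                        // {0.0, 0.0, 1.0, 0.0}; a fuzzy classifier could instead return something like
                        // {0.1, 0.2, 0.6, 0.1} (weights are normalized to sum to 1.0 either way). The fuzzy variant
                        // is hypothetical and not implemented here.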
                        double[] gmmWeights = new double[((JointGMMSet) data).gmms.length];
                        Arrays.fill(gmmWeights, 0.0);
                        int currentGmmIndex = -1;
                        if (currentLabelIndex >= 0 && currentLabelIndex < labels.items.length)
                            currentGmmIndex = ((JointGMMSet) data).cgParams
                                    .getClassIndex(labels.items[currentLabelIndex].phn);
                        if (currentGmmIndex > 0)
                            gmmWeights[currentGmmIndex] = 1.0;
                        gmmWeights = MathUtils.normalizeToSumUpTo(gmmWeights, 1.0);

                        match = ((JointGMMMapper) mapper).transform(inputLsfs, (JointGMMSet) data, gmmWeights,
                                baseParams.isVocalTractMatchUsingTargetModel);
                    } else if (mapper instanceof TargetLsfCopyMapper) {
                        match = ((TargetLsfCopyMapper) mapper).transform(psFrm.getCurrentTime(), labels, targetLabels,
                                targetLsfs, baseParams.lsfParams.winsize, baseParams.lsfParams.skipsize);
                    }
                }
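                // DFT sizing note: only bins 0 .. frmSize/2 are independent for a real-valued signal, hence
                // maxFreq = frmSize/2 + 1 below. E.g. a 480-sample frame at 16 kHz yields 241 usable bins spanning
                // 0 Hz .. 8 kHz (illustrative numbers).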
                inputDft = new ComplexArray(frmSize);
                int maxFreq = frmSize / 2 + 1;

                System.arraycopy(frm, 0, inputDft.real, 0, Math.min(frmSize, inputDft.real.length));

                if (inputDft.real.length > frmSize)
                    Arrays.fill(inputDft.real, inputDft.real.length - frmSize, inputDft.real.length - 1, 0);

                Arrays.fill(inputDft.imag, 0, inputDft.imag.length - 1, 0);

                inputDft = FFTMixedRadix.fftComplex(inputDft);

                // For checking
                if (bShowSpectralPlots && psFrm.getCurrentTime() >= desiredFrameTime) {
                    tmpComp = new ComplexArray(inputDft);
                    tmpSpec = MathUtils.dft2ampdb(tmpComp, 0, maxFreq);
                    DisplayUtils.plot(tmpSpec, "1.Input DFT");
                }
                //

                inputExpTerm = LpcAnalyser.calcExpTerm(frmSize, baseParams.lsfParams.dimension);
                outputExpTerm = LpcAnalyser.calcExpTerm(newFrmSize, baseParams.lsfParams.dimension);

                inputVocalTractSpectrum = LpcAnalyser.calcSpecLinearFromOneMinusA(inputLPCoeffs.getOneMinusA(),
                        (float) sqrtInputGain, frmSize, inputExpTerm);

                // Use a weighted codebook estimate of the input vocal tract spectrum. This will result in a smoother
                // transformation filter
                if (baseParams.isSourceVocalTractSpectrumFromModel && baseParams.isVocalTractTransformation) {
                    if (mapper instanceof WeightedCodebookMapper) {
                        if (!baseParams.isResynthesizeVocalTractFromSourceModel) {
                            if (!baseParams.lsfParams.isBarkScaled)
                                interpolatedInputLpcs = LsfAnalyser.lsfInHz2lpc(
                                        ((WeightedCodebookLsfMatch) match).entry.sourceItem.lsfs, fs);
                            else
                                interpolatedInputLpcs = LsfAnalyser.lsfInBark2lpc(
                                        ((WeightedCodebookLsfMatch) match).entry.sourceItem.lsfs, fs);
                        } else {
                            if (!baseParams.lsfParams.isBarkScaled)
                                interpolatedInputLpcs = LsfAnalyser.lsfInHz2lpc(
                                        ((WeightedCodebookLsfMatch) match).entry.targetItem.lsfs, fs);
                            else
                                interpolatedInputLpcs = LsfAnalyser.lsfInBark2lpc(
                                        ((WeightedCodebookLsfMatch) match).entry.targetItem.lsfs, fs);
                        }
                    } else if (mapper instanceof JointGMMMapper) {
                        if (!baseParams.isResynthesizeVocalTractFromSourceModel) {
                            if (!baseParams.lsfParams.isBarkScaled)
                                interpolatedInputLpcs = LsfAnalyser
                                        .lsfInHz2lpc(((JointGMMMatch) match).mappedSourceFeatures, fs);
                            else
                                interpolatedInputLpcs = LsfAnalyser
                                        .lsfInBark2lpc(((JointGMMMatch) match).mappedSourceFeatures, fs);
                        } else {
                            if (!baseParams.lsfParams.isBarkScaled)
                                interpolatedInputLpcs = LsfAnalyser.lsfInHz2lpc(((JointGMMMatch) match).outputFeatures,
                                        fs);
                            else
                                interpolatedInputLpcs = LsfAnalyser
                                        .lsfInBark2lpc(((JointGMMMatch) match).outputFeatures, fs);
                        }
                    } else if (mapper instanceof TargetLsfCopyMapper)
                        interpolatedInputLpcs = ArrayUtils.copy(inputLpcs);

                    sourceVocalTractSpectrumEstimate = LpcAnalyser.calcSpecLinearFromOneMinusA(interpolatedInputLpcs,
                            1.0f, newFrmSize, outputExpTerm);
                }

                // For checking
                if (bShowSpectralPlots && psFrm.getCurrentTime() >= desiredFrameTime) {
                    tmpSpec = new double[maxFreq];
                    System.arraycopy(inputVocalTractSpectrum, 0, tmpSpec, 0, tmpSpec.length);
                    tmpSpec = MathUtils.amp2db(tmpSpec);
                    DisplayUtils.plot(tmpSpec, "2.Input Vocal Tract");
                    FileUtils.writeToTextFile(inputVocalTractSpectrum, "d:/hmmTest_inputVT.txt");
                }
                //

                // For checking
                if (bShowSpectralPlots && psFrm.getCurrentTime() >= desiredFrameTime
                        && baseParams.isSourceVocalTractSpectrumFromModel && baseParams.isVocalTractTransformation) {
                    tmpSpec = new double[maxFreq];
                    System.arraycopy(sourceVocalTractSpectrumEstimate, 0, tmpSpec, 0, tmpSpec.length);
                    tmpSpec = MathUtils.amp2db(tmpSpec);
                    DisplayUtils.plot(tmpSpec, "3.Source Vocal Tract Estimate");
                    FileUtils.writeToTextFile(sourceVocalTractSpectrumEstimate, "d:/hmmTest_estimateVT.txt");
                }
                //
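                // Residual extraction: R(k) = X(k) / H(k). inputVocalTractSpectrum holds the real-valued LPC
                // magnitude spectrum, so dividing the real and imaginary parts of the DFT separately is a genuine
                // complex division by a real number; the residual keeps the original phase.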
                inputResidual = new ComplexArray(frmSize);

                // Filter out vocal tract to obtain the input residual spectrum (note that this is the real residual
                // spectrum)
                for (k = 0; k < maxFreq; k++) {
                    inputResidual.real[k] = inputDft.real[k] / inputVocalTractSpectrum[k];
                    inputResidual.imag[k] = inputDft.imag[k] / inputVocalTractSpectrum[k];
                }

                // For checking
                if (bShowSpectralPlots && psFrm.getCurrentTime() >= desiredFrameTime) {
                    tmpComp = new ComplexArray(inputResidual);
                    tmpSpec = MathUtils.dft2ampdb(tmpComp, 0, maxFreq - 1);
                    DisplayUtils.plot(tmpSpec, "4.Input Residual");
                }
                //

                int newMaxFreq = newFrmSize / 2 + 1;

                if (baseParams.isVocalTractTransformation) {
                    // Smoothing
                    if (baseParams.smoothingMethod == SmoothingDefinitions.OUTPUT_LSFCONTOUR_SMOOTHING) {
                        if (baseParams.smoothingState == SmoothingDefinitions.ESTIMATING_SMOOTHED_VOCAL_TRACT) {
                            if (!baseParams.isResynthesizeVocalTractFromSourceModel)
                                smoothingFile.writeSingle(((WeightedCodebookLsfMatch) match).entry.targetItem.lsfs);
                            else
                                smoothingFile.writeSingle(((WeightedCodebookLsfMatch) match).entry.sourceItem.lsfs);
                        } else if (baseParams.smoothingState == SmoothingDefinitions.TRANSFORMING_TO_SMOOTHED_VOCAL_TRACT) {
                            if (!baseParams.isResynthesizeVocalTractFromSourceModel)
                                ((WeightedCodebookLsfMatch) match).entry.targetItem
                                        .setLsfs(smoothedVocalTract[smoothedInd]);
                            else
                                ((WeightedCodebookLsfMatch) match).entry.sourceItem
                                        .setLsfs(smoothedVocalTract[smoothedInd]);
                        }
                    }
                    //

                    if (match instanceof WeightedCodebookLsfMatch) {
                        if (!baseParams.isResynthesizeVocalTractFromSourceModel) {
                            if (!baseParams.lsfParams.isBarkScaled)
                                targetLpcs = LsfAnalyser
                                        .lsfInHz2lpc(((WeightedCodebookLsfMatch) match).entry.targetItem.lsfs, fs);
                            else
                                targetLpcs = LsfAnalyser
                                        .lsfInBark2lpc(((WeightedCodebookLsfMatch) match).entry.targetItem.lsfs, fs);
                        } else {
                            if (!baseParams.lsfParams.isBarkScaled)
                                targetLpcs = LsfAnalyser
                                        .lsfInHz2lpc(((WeightedCodebookLsfMatch) match).entry.sourceItem.lsfs, fs);
                            else
                                targetLpcs = LsfAnalyser
                                        .lsfInBark2lpc(((WeightedCodebookLsfMatch) match).entry.sourceItem.lsfs, fs);
                        }
                    } else if (match instanceof JointGMMMatch) {
                        if (!baseParams.isResynthesizeVocalTractFromSourceModel) {
                            if (!baseParams.lsfParams.isBarkScaled)
                                targetLpcs = LsfAnalyser.lsfInHz2lpc(((JointGMMMatch) match).outputFeatures, fs);
                            else
                                targetLpcs = LsfAnalyser.lsfInBark2lpc(((JointGMMMatch) match).outputFeatures, fs);
                        } else {
                            if (!baseParams.lsfParams.isBarkScaled)
                                targetLpcs = LsfAnalyser.lsfInHz2lpc(((JointGMMMatch) match).mappedSourceFeatures, fs);
                            else
                                targetLpcs = LsfAnalyser.lsfInBark2lpc(((JointGMMMatch) match).mappedSourceFeatures, fs);
                        }
                    } else if (match instanceof LsfMatch) {
                        if (!baseParams.lsfParams.isBarkScaled)
                            targetLpcs = LsfAnalyser.lsfInHz2lpc(((LsfMatch) match).lsfs, fs);
                        else
                            targetLpcs = LsfAnalyser.lsfInBark2lpc(((LsfMatch) match).lsfs, fs);
                    }

                    if (frmSize != newFrmSize) {
                        if (outputExpTerm == null
                                || newMaxFreq * baseParams.lsfParams.dimension != outputExpTerm.real.length)
                            outputExpTerm = LpcAnalyser.calcExpTerm(newFrmSize, baseParams.lsfParams.dimension);

                        targetVocalTractSpectrumEstimate = LpcAnalyser.calcSpecLinearFromOneMinusA(targetLpcs, 1.0f,
                                newFrmSize, outputExpTerm);
                    } else
                        targetVocalTractSpectrumEstimate = LpcAnalyser.calcSpecLinearFromOneMinusA(targetLpcs, 1.0f,
                                newFrmSize, inputExpTerm);

                    for (k = 0; k < newMaxFreq; k++)
                        targetVocalTractSpectrumEstimate[k] *= sqrtInputGain;
                }

                // For checking
                if (bShowSpectralPlots && psFrm.getCurrentTime() >= desiredFrameTime
                        && baseParams.isVocalTractTransformation) {
                    tmpSpec = new double[newMaxFreq];
                    System.arraycopy(targetVocalTractSpectrumEstimate, 0, tmpSpec, 0, tmpSpec.length);
                    tmpSpec = MathUtils.amp2db(tmpSpec);
                    DisplayUtils.plot(tmpSpec, "5.Target Vocal Tract Estimate");
                }
                //

                outputVocalTractSpectrum = new double[newMaxFreq];
                interpolatedInputVocalTractSpectrum = MathUtils.interpolate(inputVocalTractSpectrum, newMaxFreq);

                if (baseParams.isVocalTractTransformation) {
                    if (baseParams.isSourceVocalTractSpectrumFromModel) {
                        for (k = 0; k < newMaxFreq; k++)
                            outputVocalTractSpectrum[k] = targetVocalTractSpectrumEstimate[k]
                                    / sourceVocalTractSpectrumEstimate[k] * interpolatedInputVocalTractSpectrum[k];
                    } else {
                        for (k = 0; k < newMaxFreq; k++)
                            outputVocalTractSpectrum[k] = targetVocalTractSpectrumEstimate[k];
                    }
                } else {
                    for (k = 0; k < newMaxFreq; k++)
                        outputVocalTractSpectrum[k] = interpolatedInputVocalTractSpectrum[k];
                }

                // MaryUtils.plot(MathUtils.amp2db(inputVocalTractSpectrum));
                // MaryUtils.plot(MathUtils.amp2db(interpolatedInputVocalTractSpectrum));
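                // Transformation filter idea: T(k) = H_target(k) / H_source(k), so that applying T to the
                // (interpolated) input vocal tract spectrum replaces the source envelope with the target one while
                // leaving the residual untouched. When no source model estimate is available, the interpolated input
                // spectrum itself serves as the denominator.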
                // Estimate transformation filter
                if (baseParams.isVocalTractTransformation) {
                    transformationFilter = new double[newMaxFreq];

                    if (baseParams.isSourceVocalTractSpectrumFromModel) {
                        for (k = 0; k < newMaxFreq; k++)
                            transformationFilter[k] = targetVocalTractSpectrumEstimate[k]
                                    / sourceVocalTractSpectrumEstimate[k];
                    } else {
                        for (k = 0; k < newMaxFreq; k++)
                            transformationFilter[k] = targetVocalTractSpectrumEstimate[k]
                                    / interpolatedInputVocalTractSpectrum[k];
                    }
                    //

                    // Smoothing
                    if (baseParams.smoothingMethod == SmoothingDefinitions.TRANSFORMATION_FILTER_SMOOTHING) {
                        if (baseParams.smoothingState == SmoothingDefinitions.ESTIMATING_SMOOTHED_VOCAL_TRACT) {
                            smoothingFile.writeSingle(transformationFilter);

                            // For checking
                            if (bShowSpectralPlots && psFrm.getCurrentTime() >= desiredFrameTime) {
                                tmpSpec = new double[newMaxFreq];
                                System.arraycopy(transformationFilter, 0, tmpSpec, 0, tmpSpec.length);
                                tmpSpec = MathUtils.amp2db(tmpSpec);
                                DisplayUtils.plot(tmpSpec, "6.Transformation filter");
                            }
                        } else if (baseParams.smoothingState == SmoothingDefinitions.TRANSFORMING_TO_SMOOTHED_VOCAL_TRACT) {
                            if (baseParams.isSourceVocalTractSpectrumFromModel) {
                                for (k = 0; k < newMaxFreq; k++)
                                    outputVocalTractSpectrum[k] = smoothedVocalTract[smoothedInd][k]
                                            * sourceVocalTractSpectrumEstimate[k];
                            } else {
                                for (k = 0; k < newMaxFreq; k++)
                                    outputVocalTractSpectrum[k] = smoothedVocalTract[smoothedInd][k]
                                            * interpolatedInputVocalTractSpectrum[k];
                            }

                            // For checking
                            if (bShowSpectralPlots && psFrm.getCurrentTime() >= desiredFrameTime) {
                                tmpSpec = new double[newMaxFreq];
                                System.arraycopy(smoothedVocalTract[smoothedInd], 0, tmpSpec, 0, tmpSpec.length);
                                tmpSpec = MathUtils.amp2db(tmpSpec);
                                DisplayUtils.plot(tmpSpec, "6.Smoothed transformation filter");
                            }
                        } else {
                            // For checking
                            if (bShowSpectralPlots && psFrm.getCurrentTime() >= desiredFrameTime) {
                                tmpSpec = new double[newMaxFreq];
                                System.arraycopy(transformationFilter, 0, tmpSpec, 0, tmpSpec.length);
                                tmpSpec = MathUtils.amp2db(tmpSpec);
                                DisplayUtils.plot(tmpSpec, "6.Transformation filter");
                            }
                        }
                    } else {
                        // For checking
                        if (bShowSpectralPlots && psFrm.getCurrentTime() >= desiredFrameTime) {
                            tmpSpec = new double[newMaxFreq];
                            System.arraycopy(transformationFilter, 0, tmpSpec, 0, tmpSpec.length);
                            tmpSpec = MathUtils.amp2db(tmpSpec);
                            DisplayUtils.plot(tmpSpec, "6.Transformation filter");
                        }
                    }
                }
                //

                // Perform additional vocal tract scaling
                if (bWarp) {
                    tmpvsc[0] = vscale;
                    // Modify length to match current length of spectrum
                    newVScales = MathUtils.modifySize(tmpvsc, newMaxFreq);

                    for (k = 0; k < newVScales.length; k++) {
                        if (newVScales[k] < 0.05) // Put a floor to avoid divide by zero
                            newVScales[k] = 0.05;
                    }

                    warpedOutputVocalTractSpectrum = new double[newMaxFreq];

                    for (k = 0; k < newMaxFreq; k++) {
                        wInd = (int) Math.floor((k + 1) / newVScales[k] + 0.5); // Find new indices
                        if (wInd < 1)
                            wInd = 1;
                        if (wInd > newMaxFreq)
                            wInd = newMaxFreq;

                        warpedOutputVocalTractSpectrum[k] = outputVocalTractSpectrum[wInd - 1];
                    }

                    System.arraycopy(warpedOutputVocalTractSpectrum, 0, outputVocalTractSpectrum, 0, newMaxFreq);
                }

                // Create output DFT spectrum
                outputResidual = new ComplexArray(newFrmSize);
                outputResidual.real = MathUtils.zeros(newFrmSize);
                outputResidual.imag = MathUtils.zeros(newFrmSize);

                System.arraycopy(inputResidual.real, 0, outputResidual.real, 0, Math.min(maxFreq, newFrmSize));
                System.arraycopy(inputResidual.imag, 0, outputResidual.imag, 0, Math.min(maxFreq, newFrmSize));

                // Copy & paste samples if required (COMPLEX VERSION TO SUPPORT PSCALE <= 0.5)
                // This version fills the spectrum by flipping and pasting the original freq bins as many times as
                // required.
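                // Spectrum extension example (illustrative numbers): for frmSize = 480 (maxFreq = 241) and
                // pscale = 0.4, newFrmSize = floor(480 / 0.4 + 0.5) = 1200, so newMaxFreq = 601. The loop below then
                // mirrors the 239 interior bins back and forth (kMax = 2, since 601 > 2 * 239 but not > 3 * 239)
                // until all output bins are filled, alternating the flip direction on odd/even passes.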
                kMax = 1;
                while (newMaxFreq > (kMax + 1) * (maxFreq - 2))
                    kMax++;

                for (k = 1; k <= kMax; k++) {
                    tmpFix = (maxFreq - 2) * k;
                    if (k % 2 == 1) // Odd mode
                    {
                        tmpAdd = maxFreq + 2;
                        tmpMul = 1;
                    } else {
                        tmpAdd = -1;
                        tmpMul = -1;
                    }

                    for (j = tmpFix + 3; j <= Math.min(newMaxFreq, maxFreq + tmpFix); j++) {
                        outputResidual.real[j - 1] = inputResidual.real[tmpMul * (tmpFix - j) + tmpAdd - 1];
                        outputResidual.imag[j - 1] = inputResidual.imag[tmpMul * (tmpFix - j) + tmpAdd - 1];
                    }
                }

                outputResidual.real[newMaxFreq - 1] = Math.sqrt(outputResidual.real[newMaxFreq - 1]
                        * outputResidual.real[newMaxFreq - 1] + outputResidual.imag[newMaxFreq - 1]
                        * outputResidual.imag[newMaxFreq - 1]);
                outputResidual.imag[newMaxFreq - 1] = 0.0;

                // For checking
                if (bShowSpectralPlots && psFrm.getCurrentTime() >= desiredFrameTime) {
                    tmpComp = new ComplexArray(outputResidual);
                    tmpSpec = MathUtils.dft2ampdb(tmpComp, 0, newMaxFreq - 1);
                    DisplayUtils.plot(tmpSpec, "7.Output Residual");
                }
                //

                // Filter the output residual with the estimated target vocal tract spectrum
                outputDft = new ComplexArray(newFrmSize);

                // Smoothing
                if (baseParams.smoothingMethod == SmoothingDefinitions.OUTPUT_VOCALTRACTSPECTRUM_SMOOTHING) {
                    if (baseParams.smoothingState == SmoothingDefinitions.ESTIMATING_SMOOTHED_VOCAL_TRACT) {
                        smoothingFile.writeSingle(outputVocalTractSpectrum, newMaxFreq);
                    } else if (baseParams.smoothingState == SmoothingDefinitions.TRANSFORMING_TO_SMOOTHED_VOCAL_TRACT) {
                        for (k = 0; k < newMaxFreq; k++)
                            outputVocalTractSpectrum[k] = smoothedVocalTract[smoothedInd][k];
                    }
                }
                //

                // For checking
                if (bShowSpectralPlots && psFrm.getCurrentTime() >= desiredFrameTime) {
                    tmpSpec = new double[newMaxFreq];
                    System.arraycopy(outputVocalTractSpectrum, 0, tmpSpec, 0, tmpSpec.length);
                    tmpSpec = MathUtils.amp2db(tmpSpec);
                    DisplayUtils.plot(tmpSpec, "8.Output Vocal Tract");
                }
                //

                for (k = 1; k <= newMaxFreq; k++) {
                    outputDft.real[k - 1] = outputResidual.real[k - 1] * outputVocalTractSpectrum[k - 1];
                    outputDft.imag[k - 1] = outputResidual.imag[k - 1] * outputVocalTractSpectrum[k - 1];
                }

                for (k = newMaxFreq + 1; k <= newFrmSize; k++) {
                    outputDft.real[k - 1] = outputDft.real[2 * newMaxFreq - 1 - k];
                    outputDft.imag[k - 1] = -outputDft.imag[2 * newMaxFreq - 1 - k];
                }

                // For checking
                if (bShowSpectralPlots && psFrm.getCurrentTime() >= desiredFrameTime) {
                    tmpComp = new ComplexArray(outputDft);
                    tmpSpec = MathUtils.dft2ampdb(tmpComp, 0, newMaxFreq);
                    DisplayUtils.plot(tmpSpec, "9.Output DFT");
                    bShowSpectralPlots = false;
                }
                //

                // Convert back to time domain
                outputDft = FFTMixedRadix.ifft(outputDft);

                frmy = new double[newFrmSize];
                System.arraycopy(outputDft.real, 0, frmy, 0, newFrmSize);
            } else {
                if (frmSize < newFrmSize)
                    newFrmSize = frmSize;

                frmy = new double[newFrmSize];
            }

            frmy = SignalProcUtils.removePreemphasis(frmy, baseParams.lsfParams.preCoef);
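            // Energy bookkeeping: gain matches the per-sample RMS of the synthesized frame to that of the analysis
            // frame, then applies the requested energy scale. Continuing the earlier illustrative numbers: if
            // frmEn / sqrt(480) is twice frmyEn / sqrt(344), gain = 2.0 * escale and every output sample is doubled
            // before overlap-add.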
            frmyEn = SignalProcUtils.getEnergy(frmy);

            gain = (frmEn / Math.sqrt(frmSize)) / (frmyEn / Math.sqrt(newFrmSize)) * escale;

            if (!(isVoiced && pscale != 1.0) && !bWarp && !isTransformUnvoiced) {
                for (k = 0; k < frmSize; k++)
                    frmy[k] = frm[k] * wgt[k];
            }

            // Energy scale compensation + modification
            for (k = 0; k < newFrmSize; k++)
                frmy[k] *= gain;

            for (j = 1; j <= repeatSkipCount + 1; j++) {
                if (!baseParams.isFixedRateVocalTractConversion) {
                    if (isVoiced)
                        newSkipSize = (int) Math.floor(currentPeriod / pscale + 0.5);
                    else
                        newSkipSize = (int) Math.floor(currentPeriod + 0.5);
                } else
                    newSkipSize = (int) Math.floor(ssFixedInSeconds * fs + 0.5);

                if ((isLastInputFrame && j == repeatSkipCount + 1)) // | (i~=numfrm & all(repeatSkipCounts(i+1:numfrm)==-1)))
                    bLastFrame = true;
                else
                    bLastFrame = false;

                synthFrameInd++;

                wgty = windowOut.values(newFrmSize);

                if (synthFrameInd == 1) // First frame: Do not window the first half of output speech frame to prevent
                                        // overflow in normalization with Hanning coeffs
                {
                    halfWin = (int) Math.floor(newFrmSize / 2.0 + 0.5);
                    synthTotal = synthSt + newFrmSize;

                    // Keep output in an overlap-add buffer
                    if (ySynthInd + newFrmSize - 1 <= maxNewFrmSize) {
                        for (k = ySynthInd; k <= ySynthInd + halfWin - 1; k++) {
                            ySynthBuff[k - 1] = frmy[k - ySynthInd];
                            wSynthBuff[k - 1] = 1.0;
                        }

                        for (k = ySynthInd + halfWin; k <= ySynthInd + newFrmSize - 1; k++) {
                            ySynthBuff[k - 1] += frmy[k - ySynthInd] * wgty[k - ySynthInd];
                            wSynthBuff[k - 1] += wgty[k - ySynthInd] * wgty[k - ySynthInd];
                        }
                    } else {
                        for (k = ySynthInd; k <= maxNewFrmSize; k++) {
                            if (k - ySynthInd < halfWin) {
                                ySynthBuff[k - 1] = frmy[k - ySynthInd];
                                wSynthBuff[k - 1] = 1.0;
                            } else {
                                ySynthBuff[k - 1] += frmy[k - ySynthInd] * wgty[k - ySynthInd];
                                wSynthBuff[k - 1] += wgty[k - ySynthInd] * wgty[k - ySynthInd];
                            }
                        }

                        for (k = 1; k <= newFrmSize - 1 - maxNewFrmSize + ySynthInd; k++) {
                            if (maxNewFrmSize - ySynthInd + k < halfWin) {
                                ySynthBuff[k - 1] = frmy[maxNewFrmSize - ySynthInd + k];
                                wSynthBuff[k - 1] = 1.0;
                            } else {
                                ySynthBuff[k - 1] += frmy[maxNewFrmSize - ySynthInd + k]
                                        * wgty[maxNewFrmSize - ySynthInd + k];
                                wSynthBuff[k - 1] += wgty[maxNewFrmSize - ySynthInd + k]
                                        * wgty[maxNewFrmSize - ySynthInd + k];
                            }
                        }
                    }
                    //

                    if (!bSilent)
                        System.out.println("Synthesized using frame " + String.valueOf(inputFrameIndex + 1));
                } else if (bLastFrame) // Last frame: Do not window the second half of output speech frame to prevent
                                       // overflow in normalization with Hanning coeffs
                {
                    halfWin = (int) Math.floor(newFrmSize / 2.0 + 0.5);
                    remain = newFrmSize - halfWin;
                    synthTotal = synthSt + halfWin + remain - 1;

                    // Keep output in an overlap-add buffer
                    if (ySynthInd + newFrmSize - 1 <= maxNewFrmSize) {
                        for (k = ySynthInd; k <= ySynthInd + halfWin - 1; k++) {
                            ySynthBuff[k - 1] += frmy[k - ySynthInd] * wgty[k - ySynthInd];
                            wSynthBuff[k - 1] += wgty[k - ySynthInd] * wgty[k - ySynthInd];
                        }

                        for (k = ySynthInd + halfWin; k <= ySynthInd + newFrmSize - 1; k++) {
                            ySynthBuff[k - 1] += frmy[k - ySynthInd];
                            wSynthBuff[k - 1] = 1.0;
                        }
                    } else {
                        for (k = ySynthInd; k <= maxNewFrmSize; k++) {
                            if (k - ySynthInd < halfWin) {
                                ySynthBuff[k - 1] += frmy[k - ySynthInd] * wgty[k - ySynthInd];
                                wSynthBuff[k - 1] += wgty[k - ySynthInd] * wgty[k - ySynthInd];
                            } else {
                                ySynthBuff[k - 1] += frmy[k - ySynthInd];
                                wSynthBuff[k - 1] = 1.0;
                            }
                        }

                        for (k = 1; k <= newFrmSize - 1 - maxNewFrmSize + ySynthInd; k++) {
                            if (maxNewFrmSize - ySynthInd + k < halfWin) {
                                ySynthBuff[k - 1] += frmy[maxNewFrmSize - ySynthInd + k]
                                        * wgty[maxNewFrmSize - ySynthInd + k];
                                wSynthBuff[k - 1] += wgty[maxNewFrmSize - ySynthInd + k]
                                        * wgty[maxNewFrmSize - ySynthInd + k];
                            } else {
                                ySynthBuff[k - 1] += frmy[maxNewFrmSize - ySynthInd + k];
                                wSynthBuff[k - 1] = 1.0;
                            }
                        }
                    }
                    //

                    if (!bSilent)
                        System.out.println("Synthesized using frame " + String.valueOf(inputFrameIndex + 1));
                } else // Normal frame
                {
                    if (!isVoiced && ((repeatSkipCount % 2) == 1)) // Reverse unvoiced repeated frames once in two
                                                                   // consecutive repetitions to reduce distortion
                        frmy = SignalProcUtils.reverse(frmy);

                    synthTotal = synthSt + newFrmSize;

                    // Keep output in an overlap-add buffer
                    if (ySynthInd + newFrmSize - 1 <= maxNewFrmSize) {
                        for (k = ySynthInd; k <= ySynthInd + newFrmSize - 1; k++) {
                            ySynthBuff[k - 1] += frmy[k - ySynthInd] * wgty[k - ySynthInd];
                            wSynthBuff[k - 1] += wgty[k - ySynthInd] * wgty[k - ySynthInd];
                        }
                    } else {
                        for (k = ySynthInd; k <= maxNewFrmSize; k++) {
                            ySynthBuff[k - 1] += frmy[k - ySynthInd] * wgty[k - ySynthInd];
                            wSynthBuff[k - 1] += wgty[k - ySynthInd] * wgty[k - ySynthInd];
                        }

                        for (k = 1; k <= newFrmSize - 1 - maxNewFrmSize + ySynthInd; k++) {
                            ySynthBuff[k - 1] += frmy[k + maxNewFrmSize - ySynthInd]
                                    * wgty[k + maxNewFrmSize - ySynthInd];
                            wSynthBuff[k - 1] += wgty[k + maxNewFrmSize - ySynthInd]
                                    * wgty[k + maxNewFrmSize - ySynthInd];
                        }
                    }
                    //

                    if (!bSilent) {
                        if (j == 1)
                            System.out.println("Synthesized using frame " + String.valueOf(inputFrameIndex + 1));
                        else
                            System.out.println("Repeated using frame " + String.valueOf(inputFrameIndex + 1));
                    }
                }
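                // Weighted overlap-add normalization: each ready output sample is ySynthBuff / wSynthBuff, i.e.
                // sum(w * frame) / sum(w * w) over all frames covering that sample; with overlapping Hanning windows
                // this cancels the synthesis-window weighting. Samples never touched by a window (wSynthBuff == 0)
                // are passed through unnormalized.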
                // Write to output buffer
                for (k = 0; k <= newSkipSize - 1; k++) {
                    kInd = (k + ySynthInd) % maxNewFrmSize;
                    if (kInd == 0)
                        kInd = maxNewFrmSize;

                    if (wSynthBuff[kInd - 1] > 0.0)
                        outBuff[outBuffStart - 1] = ySynthBuff[kInd - 1] / wSynthBuff[kInd - 1];
                    else
                        outBuff[outBuffStart - 1] = ySynthBuff[kInd - 1];

                    ySynthBuff[kInd - 1] = 0.0;
                    wSynthBuff[kInd - 1] = 0.0;

                    outBuffStart++;

                    if (outBuffStart > outBuffLen) {
                        if (tscaleSingle != 1.0 || totalWrittenToFile + outBuffLen <= origLen) {
                            dout.writeDouble(outBuff, 0, outBuffLen);
                            totalWrittenToFile += outBuffLen;
                        } else {
                            dout.writeDouble(outBuff, 0, origLen - totalWrittenToFile);
                            totalWrittenToFile = origLen;
                        }

                        outBuffStart = 1;
                    }
                }
                //

                synthSt += newSkipSize;

                // if (!bLastFrame)
                // {
                if (ySynthInd + newSkipSize <= maxNewFrmSize)
                    ySynthInd += newSkipSize;
                else
                    ySynthInd += newSkipSize - maxNewFrmSize;
                // }
                // ///////

                if (bLastFrame) {
                    bBroke = true;
                    break;
                }
            }
        } else {
            if (!bSilent)
                System.out.println("Skipped frame " + String.valueOf(inputFrameIndex + 1));
        }

        inputFrameIndex++;

        return output;
    }

    public double[] writeFinal() throws IOException {
        double[] output = null;
        double[] outputTmp = null;

        int k, kInd;

        if (tscaleSingle == 1.0)
            synthTotal = origLen;

        if (outBuffLen > synthTotal)
            outBuffLen = synthTotal;

        // Write the final segment
        for (k = synthSt; k <= synthTotal; k++) {
            kInd = (k - synthSt + ySynthInd) % maxNewFrmSize;
            if (kInd == 0)
                kInd = maxNewFrmSize;

            if (wSynthBuff[kInd - 1] > 0.0)
                outBuff[outBuffStart - 1] = ySynthBuff[kInd - 1] / wSynthBuff[kInd - 1];
            else
                outBuff[outBuffStart - 1] = ySynthBuff[kInd - 1];

            ySynthBuff[kInd - 1] = 0.0;
            wSynthBuff[kInd - 1] = 0.0;

            outBuffStart++;

            if (outBuffStart > outBuffLen) {
                if (tscaleSingle != 1.0 || totalWrittenToFile + outBuffLen <= origLen) {
                    dout.writeDouble(outBuff, 0, outBuffLen);
                    totalWrittenToFile += outBuffLen;
                } else {
                    dout.writeDouble(outBuff, 0, origLen - totalWrittenToFile);
                    totalWrittenToFile = origLen;
                }
                outBuffStart = 1;
            }
        }

        if (outBuffStart > 1) {
            if (tscaleSingle != 1.0 || totalWrittenToFile + outBuffStart - 1 <= origLen) {
                dout.writeDouble(outBuff, 0, outBuffStart - 1);
                totalWrittenToFile += outBuffStart - 1;
            } else {
                dout.writeDouble(outBuff, 0, origLen - totalWrittenToFile);
                totalWrittenToFile = origLen;
            }
        }
        //

        if (dout != null)
            dout.close();

        return output;
    }

    public void convertToWav(AudioFormat audioformat) throws IOException {
        // Read the temp binary file into a wav file and delete the temp binary file
        if (tempOutBinaryFile != null) {
            double[] yOut = null;

            din = new LEDataInputStream(tempOutBinaryFile);
            yOut = din.readDouble(totalWrittenToFile);
            din.close();

            double tmpMax = MathUtils.getAbsMax(yOut);
            if (tmpMax > 1.0) {
                for (int n = 0; n < yOut.length; n++)
                    yOut[n] /= tmpMax;
            }

            outputAudio = new DDSAudioInputStream(new BufferedDoubleDataSource(yOut), audioformat);
            AudioSystem.write(outputAudio, AudioFileFormat.Type.WAVE, new File(outputFile));

            FileUtils.delete(tempOutBinaryFile);
            //
        }
    }
}