/** * Copyright 2007 DFKI GmbH. * All Rights Reserved. Use is subject to license terms. * * This file is part of MARY TTS. * * MARY TTS is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, version 3 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package marytts.signalproc.adaptation.gmm.jointgmm; import java.io.IOException; import javax.sound.sampled.UnsupportedAudioFileException; import marytts.machinelearning.ContextualGMMParams; import marytts.signalproc.adaptation.BaselineAdaptationItem; import marytts.signalproc.adaptation.BaselineAdaptationSet; import marytts.signalproc.adaptation.BaselineFeatureExtractor; import marytts.signalproc.adaptation.BaselinePostprocessor; import marytts.signalproc.adaptation.BaselinePreprocessor; import marytts.signalproc.adaptation.BaselineTransformer; import marytts.signalproc.adaptation.BaselineTransformerParams; import marytts.signalproc.adaptation.FdpsolaAdapter; import marytts.signalproc.adaptation.MfccAdapter; import marytts.signalproc.adaptation.TargetLsfCopyMapper; import marytts.signalproc.adaptation.prosody.PitchMapping; import marytts.signalproc.adaptation.prosody.PitchMappingFile; import marytts.signalproc.adaptation.prosody.PitchStatistics; import marytts.signalproc.adaptation.prosody.PitchTransformationData; import marytts.signalproc.adaptation.prosody.ProsodyTransformerParams; import marytts.signalproc.adaptation.smoothing.SmoothingDefinitions; import marytts.signalproc.analysis.LsfFileHeader; import 
marytts.signalproc.analysis.MfccFileHeader; import marytts.util.io.BasenameList; import marytts.util.io.FileUtils; import marytts.util.string.StringUtils; /** * * Voice conversion transformation using Joint-GMM approach. * * Reference: A. Kain and M. Macon, “Spectral voice conversion for text-to-speech synthesis,” in Proc. of the IEEE ICASSP 1998, * vol. 1, pp. 285-288. * * @author Oytun Türk */ public class JointGMMTransformer extends BaselineTransformer { public JointGMMTransformerParams params; public JointGMMMapper mapper; public JointGMMSet jointGmmSet; private PitchMappingFile pitchMappingFile; public PitchMapping pitchMapping; public JointGMMTransformer(BaselinePreprocessor pp, BaselineFeatureExtractor fe, BaselinePostprocessor po, JointGMMTransformerParams pa) { super(pp, fe, po, pa); params = new JointGMMTransformerParams(pa); jointGmmSet = null; mapper = null; } public boolean checkParams() throws IOException { params.inputFolder = StringUtils.checkLastSlash(params.inputFolder); params.outputBaseFolder = StringUtils.checkLastSlash(params.outputBaseFolder); // Read joint GMM file JointGMM nonNullGmm = null; if (!FileUtils.exists(params.jointGmmFile)) { System.out.println("Error: Codebook file " + params.jointGmmFile + " not found!"); return false; } else // Read full GMM from the joint GMM file { jointGmmSet = new JointGMMSet(params.jointGmmFile); assert jointGmmSet.gmms != null; for (int i = 0; i < jointGmmSet.gmms.length; i++) { if (jointGmmSet.gmms[i] != null) { nonNullGmm = new JointGMM(jointGmmSet.gmms[i]); break; } } if (nonNullGmm != null) { if (nonNullGmm.featureType == BaselineFeatureExtractor.LSF_FEATURES) params.lsfParams = new LsfFileHeader((LsfFileHeader) nonNullGmm.featureParams); else if (nonNullGmm.featureType == BaselineFeatureExtractor.MFCC_FEATURES_FROM_FILES) params.mfccParams = new MfccFileHeader((MfccFileHeader) nonNullGmm.featureParams); } } if (nonNullGmm == null) { System.out.println("Error! 
All GMMs are null in " + params.jointGmmFile); return false; } // // Read pitch mapping file if (!FileUtils.exists(params.pitchMappingFile)) { System.out.println("Error: Pitch mapping file " + params.pitchMappingFile + " not found!"); return false; } else // Read lsfParams from the codebook header { pitchMappingFile = new PitchMappingFile(params.pitchMappingFile, PitchMappingFile.OPEN_FOR_READ); pitchMapping = new PitchMapping(); pitchMapping.header = pitchMappingFile.readPitchMappingHeader(); } // if (!FileUtils.exists(params.inputFolder) || !FileUtils.isDirectory(params.inputFolder)) { System.out.println("Error: Input folder " + params.inputFolder + " not found!"); return false; } if (!FileUtils.isDirectory(params.outputBaseFolder)) { System.out.println("Creating output base folder " + params.outputBaseFolder + "..."); FileUtils.createDirectory(params.outputBaseFolder); } if (params.outputFolderInfoString != "") { params.outputFolder = params.outputBaseFolder + params.outputFolderInfoString + "_mixes" + String.valueOf(nonNullGmm.source.totalComponents) + "_prosody" + String.valueOf(params.prosodyParams.pitchStatisticsType) + "x" + String.valueOf(params.prosodyParams.pitchTransformationMethod) + "x" + String.valueOf(params.prosodyParams.durationTransformationMethod); } else { params.outputFolder = params.outputBaseFolder + "_mixes" + String.valueOf(nonNullGmm.source.totalComponents) + "_prosody" + String.valueOf(params.prosodyParams.pitchStatisticsType) + "x" + String.valueOf(params.prosodyParams.pitchTransformationMethod) + "x" + String.valueOf(params.prosodyParams.durationTransformationMethod); } if (!FileUtils.isDirectory(params.outputFolder)) { System.out.println("Creating output folder " + params.outputFolder + "..."); FileUtils.createDirectory(params.outputFolder); } if (!params.isSeparateProsody) params.isSaveVocalTractOnlyVersion = false; if (params.isPitchFromTargetFile) params.prosodyParams.pitchTransformationMethod = 
ProsodyTransformerParams.CUSTOM_TRANSFORMATION; if (params.isDurationFromTargetFile) params.prosodyParams.durationTransformationMethod = ProsodyTransformerParams.CUSTOM_TRANSFORMATION; if (params.isEnergyFromTargetFile) params.prosodyParams.energyTransformationMethod = ProsodyTransformerParams.CUSTOM_TRANSFORMATION; if (!params.isVocalTractTransformation && !params.isLsfsFromTargetFile) params.isTemporalSmoothing = false; return true; } public void run() throws IOException, UnsupportedAudioFileException { if (checkParams()) { BaselineAdaptationSet inputSet = getInputSet(params.inputFolder); if (inputSet == null) System.out.println("No input files found in " + params.inputFolder); else { BaselineAdaptationSet outputSet = getOutputSet(inputSet, params.outputFolder); transform(inputSet, outputSet); } } } // Create list of input files public BaselineAdaptationSet getInputSet(String inputFolder) { BasenameList b = new BasenameList(inputFolder, BaselineAdaptationSet.WAV_EXTENSION_DEFAULT); BaselineAdaptationSet inputSet = new BaselineAdaptationSet(b.getListAsVector().size()); for (int i = 0; i < inputSet.items.length; i++) inputSet.items[i].setFromWavFilename(inputFolder + b.getName(i) + BaselineAdaptationSet.WAV_EXTENSION_DEFAULT); return inputSet; } // // Create list of output files using input set public BaselineAdaptationSet getOutputSet(BaselineAdaptationSet inputSet, String outputFolder) { BaselineAdaptationSet outputSet = null; outputFolder = StringUtils.checkLastSlash(outputFolder); if (inputSet != null && inputSet.items != null) { outputSet = new BaselineAdaptationSet(inputSet.items.length); for (int i = 0; i < inputSet.items.length; i++) { outputSet.items[i].audioFile = outputFolder + StringUtils.getFileName(inputSet.items[i].audioFile) + "_output" + BaselineAdaptationSet.WAV_EXTENSION_DEFAULT; outputSet.items[i].rawMfccFile = StringUtils.modifyExtension(outputSet.items[i].audioFile, BaselineAdaptationSet.RAWMFCC_EXTENSION_DEFAULT); } } return outputSet; } // 
    /**
     * Transforms every item of inputSet into the corresponding item of outputSet: runs the preprocessor, extracts F0
     * features, reads the body of the pitch mapping file, then converts each input/output pair via transformOneItem.
     *
     * @param inputSet input wav files
     * @param outputSet output files, parallel to inputSet
     * @throws UnsupportedAudioFileException on audio format problems
     */
    public void transform(BaselineAdaptationSet inputSet, BaselineAdaptationSet outputSet) throws UnsupportedAudioFileException {
        System.out.println("Transformation started...");

        if (inputSet.items != null && outputSet.items != null) {
            int numItems = Math.min(inputSet.items.length, outputSet.items.length);
            if (numItems > 0) {
                preprocessor.run(inputSet);

                int desiredFeatures = BaselineFeatureExtractor.F0_FEATURES;

                try {
                    featureExtractor.run(inputSet, params, desiredFeatures);
                } catch (IOException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            }

            // Read the pitch mapping file
            pitchMappingFile.readPitchMappingFileExcludingHeader(pitchMapping);

            // Create a mapper object
            mapper = new JointGMMMapper();

            // Do the transformations now
            for (int i = 0; i < numItems; i++) {
                try {
                    transformOneItem(inputSet.items[i], outputSet.items[i], params, mapper, jointGmmSet, pitchMapping);
                } catch (UnsupportedAudioFileException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                } catch (IOException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }

                System.out.println("Transformed file " + String.valueOf(i + 1) + " of " + String.valueOf(numItems));
            }
        }

        System.out.println("Transformation completed...");
    }

    // This function performs the actual voice conversion
    /**
     * Converts a single item. For LSF-based GMM sets the conversion runs audio-to-audio through FdpsolaAdapter,
     * optionally in two passes (vocal tract first, prosody second, with optional temporal smoothing of the vocal tract
     * filter). For MFCC-based GMM sets it runs MFCC-file-to-MFCC-file through MfccAdapter.
     *
     * @param inputItem source item (note: its audioFile field is temporarily swapped during the prosody pass)
     * @param outputItem target item
     * @param wctParams transformation parameters; some flags on this object are mutated by this method
     * @param jgMapper joint GMM mapper
     * @param jgSet joint GMM set
     * @param pMap pitch transformation data
     * @throws UnsupportedAudioFileException on audio format problems
     * @throws IOException on file access problems
     */
    public static void transformOneItem(BaselineAdaptationItem inputItem, BaselineAdaptationItem outputItem,
            JointGMMTransformerParams wctParams, JointGMMMapper jgMapper, JointGMMSet jgSet, PitchTransformationData pMap)
            throws UnsupportedAudioFileException, IOException {
        // Used when LSFs are copied from a target file instead of being mapped through the GMMs
        TargetLsfCopyMapper tcMapper = new TargetLsfCopyMapper();

        // LSF transformation is done fully from audio to audio
        if (jgSet.gmms[0].featureType == BaselineFeatureExtractor.LSF_FEATURES) {
            if (wctParams.isTemporalSmoothing) // Need to do two pass for smoothing
            {
                wctParams.isSeparateProsody = true;
                wctParams.isFixedRateVocalTractConversion = true;
            }

            if (wctParams.isFixedRateVocalTractConversion) {
                // NOTE(review): the first two clauses test the same field against two different constants joined by ||,
                // so this condition is always true; '&&' (or a different field) may have been intended — confirm.
                if (wctParams.prosodyParams.pitchTransformationMethod != ProsodyTransformerParams.NO_TRANSFORMATION
                        || wctParams.prosodyParams.pitchTransformationMethod != ProsodyTransformerParams.CUSTOM_TRANSFORMATION
                        || wctParams.prosodyParams.durationTransformationMethod != ProsodyTransformerParams.CUSTOM_TRANSFORMATION
                        || wctParams.prosodyParams.energyTransformationMethod != ProsodyTransformerParams.CUSTOM_TRANSFORMATION) {
                    wctParams.isSeparateProsody = true;
                }
            }

            // Desired values should be specified in the following four parameters
            double[] pscales = { 1.0 };
            double[] tscales = { 1.0 };
            double[] escales = { 1.0 };
            double[] vscales = { 1.0 };

            // These are for fixed rate vocal tract transformation: Do not change these!!!
            double[] pscalesNone = { 1.0 };
            double[] tscalesNone = { 1.0 };
            double[] escalesNone = { 1.0 };
            double[] vscalesNone = { 1.0 };
            //

            FdpsolaAdapter adapter = null;

            JointGMMTransformerParams currentWctParams = new JointGMMTransformerParams(wctParams);

            String firstPassOutputWavFile = "";
            String smoothedVocalTractFile = "";

            if (currentWctParams.isSeparateProsody) // First pass with no prosody modifications
            {
                firstPassOutputWavFile = StringUtils.getFolderName(outputItem.audioFile)
                        + StringUtils.getFileName(outputItem.audioFile) + "_vt.wav";
                smoothedVocalTractFile = StringUtils.getFolderName(outputItem.audioFile)
                        + StringUtils.getFileName(outputItem.audioFile) + "_vt.vtf";

                // Remember the requested prosody settings; the first pass runs with prosody disabled
                int tmpPitchTransformationMethod = currentWctParams.prosodyParams.pitchTransformationMethod;
                int tmpDurationTransformationMethod = currentWctParams.prosodyParams.durationTransformationMethod;
                int tmpEnergyTransformationMethod = currentWctParams.prosodyParams.energyTransformationMethod;
                currentWctParams.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.NO_TRANSFORMATION;
                currentWctParams.prosodyParams.durationTransformationMethod = ProsodyTransformerParams.NO_TRANSFORMATION;
                currentWctParams.prosodyParams.energyTransformationMethod = ProsodyTransformerParams.NO_TRANSFORMATION;

                boolean tmpPitchFromTargetFile = currentWctParams.isPitchFromTargetFile;
                boolean tmpDurationFromTargetFile = currentWctParams.isDurationFromTargetFile;
                boolean tmpEnergyFromTargetFile = currentWctParams.isEnergyFromTargetFile;
                currentWctParams.isPitchFromTargetFile = false;
                currentWctParams.isDurationFromTargetFile = false;
                currentWctParams.isEnergyFromTargetFile = false;

                if (currentWctParams.isTemporalSmoothing) // This estimates the vocal tract filter but performs no prosody and
                                                          // vocal tract transformations
                {
                    currentWctParams.smoothingState = SmoothingDefinitions.ESTIMATING_SMOOTHED_VOCAL_TRACT;
                    currentWctParams.smoothedVocalTractFile = smoothedVocalTractFile; // It is an output at first pass

                    adapter = new FdpsolaAdapter(inputItem, firstPassOutputWavFile, currentWctParams, pscalesNone, tscalesNone,
                            escalesNone, vscalesNone);

                    adapter.bSilent = !currentWctParams.isDisplayProcessingFrameCount;
                    if (!currentWctParams.isLsfsFromTargetFile)
                        adapter.fdpsolaOnline(jgMapper, jgSet, pMap);
                    else
                        adapter.fdpsolaOnline(tcMapper, jgSet, pMap);

                    currentWctParams.smoothingState = SmoothingDefinitions.TRANSFORMING_TO_SMOOTHED_VOCAL_TRACT;
                    currentWctParams.smoothedVocalTractFile = smoothedVocalTractFile; // Now it is an input

                    adapter = new FdpsolaAdapter(inputItem, firstPassOutputWavFile, currentWctParams, pscalesNone, tscalesNone,
                            escalesNone, vscalesNone);
                } else {
                    currentWctParams.smoothingMethod = SmoothingDefinitions.NO_SMOOTHING;
                    currentWctParams.smoothingState = SmoothingDefinitions.NONE;
                    currentWctParams.smoothedVocalTractFile = "";

                    adapter = new FdpsolaAdapter(inputItem, firstPassOutputWavFile, currentWctParams, pscalesNone, tscalesNone,
                            escalesNone, vscalesNone);
                }

                // Restore the caller's target-file flags for the prosody pass
                currentWctParams.isPitchFromTargetFile = tmpPitchFromTargetFile;
                currentWctParams.isDurationFromTargetFile = tmpDurationFromTargetFile;
                currentWctParams.isEnergyFromTargetFile = tmpEnergyFromTargetFile;

                if (adapter != null) {
                    adapter.bSilent = !currentWctParams.isDisplayProcessingFrameCount;
                    if (!currentWctParams.isLsfsFromTargetFile)
                        adapter.fdpsolaOnline(jgMapper, jgSet, pMap);
                    else
                        adapter.fdpsolaOnline(tcMapper, jgSet, pMap);

                    // Then second step: prosody modification (with possible additional vocal tract scaling)
                    if (isScalingsRequired(pscales, tscales, escales, vscales)
                            || tmpPitchTransformationMethod != ProsodyTransformerParams.NO_TRANSFORMATION) {
                        System.out.println("Performing prosody modifications...");

                        currentWctParams.isVocalTractTransformation = false; // isVocalTractTransformation should be false
                        currentWctParams.isFixedRateVocalTractConversion = false; // isFixedRateVocalTractConversion should be
                                                                                  // false to enable prosody modifications with
                                                                                  // FD-PSOLA
                        currentWctParams.isResynthesizeVocalTractFromSourceModel = false; // isResynthesizeVocalTractFromSourceCodebook
                                                                                          // should be false
                        currentWctParams.isVocalTractMatchUsingTargetModel = false; // isVocalTractMatchUsingTargetCodebook should
                                                                                    // be false
                        currentWctParams.prosodyParams.pitchTransformationMethod = tmpPitchTransformationMethod;
                        currentWctParams.prosodyParams.durationTransformationMethod = tmpDurationTransformationMethod;
                        currentWctParams.prosodyParams.energyTransformationMethod = tmpEnergyTransformationMethod;
                        currentWctParams.smoothingMethod = SmoothingDefinitions.NO_SMOOTHING;
                        currentWctParams.smoothingState = SmoothingDefinitions.NONE;
                        currentWctParams.smoothedVocalTractFile = "";

                        // Feed the first-pass output in as the source of the prosody pass, then restore the input path
                        String tmpInputWavFile = inputItem.audioFile;
                        inputItem.audioFile = firstPassOutputWavFile;

                        adapter = new FdpsolaAdapter(inputItem, outputItem.audioFile, currentWctParams, pscales, tscales,
                                escales, vscales);

                        inputItem.audioFile = tmpInputWavFile;

                        adapter.bSilent = true;
                        adapter.fdpsolaOnline(null, jgSet, pMap);
                    } else
                        // Copy output file
                        FileUtils.copy(firstPassOutputWavFile, outputItem.audioFile);

                    // Delete first pass output file
                    if (!currentWctParams.isSaveVocalTractOnlyVersion)
                        FileUtils.delete(firstPassOutputWavFile);

                    System.out.println("Done...");
                }
            } else // Single-pass prosody+vocal tract transformation and modification
            {
                currentWctParams.smoothingMethod = SmoothingDefinitions.NO_SMOOTHING;
                currentWctParams.smoothingState = SmoothingDefinitions.NONE;
                currentWctParams.smoothedVocalTractFile = "";

                adapter = new FdpsolaAdapter(inputItem, outputItem.audioFile, currentWctParams, pscales, tscales, escales,
                        vscales);

                adapter.bSilent = !wctParams.isDisplayProcessingFrameCount;
                if (!currentWctParams.isLsfsFromTargetFile)
                    adapter.fdpsolaOnline(jgMapper, jgSet, pMap);
                else
                    adapter.fdpsolaOnline(tcMapper, jgSet, pMap);
            }
        } else if (jgSet.gmms[0].featureType == BaselineFeatureExtractor.MFCC_FEATURES_FROM_FILES) {
            // MFCC transformation is done from MFCC file to MFCC file
            MfccAdapter adapter = null;

            JointGMMTransformerParams currentWctParams = new JointGMMTransformerParams(wctParams);

            adapter = new MfccAdapter(inputItem, outputItem.rawMfccFile, currentWctParams);

            // Then second step: vocal tract transformation
            if (adapter != null) {
                adapter.bSilent = !currentWctParams.isDisplayProcessingFrameCount;
                adapter.transformOnline(jgMapper, jgSet); // Call voice conversion version

                System.out.println("Done...");
            }
        }
    }

    public static void main(String[] args) throws IOException, UnsupportedAudioFileException {
        // mainIeeeTaslp2009_rap(args);

        // mainIeeeTaslp2009_mary(args);

        // mainInterspeech2008(args);

        // mainHmmVoiceConversion(args);

        // mainQuickTest(args);

        mainQuickTest2(args);
    }

    public static void mainHmmVoiceConversion(String[] args) throws UnsupportedAudioFileException, IOException {
        // String wavBaseFolder = "D:/Oytun/DFKI/voices/hmmVoiceConversionTest/hsmmMfcc_25Dimensional/";
        // String wavBaseFolder = "D:/Oytun/DFKI/voices/hmmVoiceConversionTest/lsp_21Dimensional/";
        // String wavBaseFolder = "D:/Oytun/DFKI/voices/hmmVoiceConversionTest/mellsp_21Dimensional/";
        // String baseFolder = "D:/Oytun/DFKI/voices/hmmVoiceConversionTest/mfcc_25Dimensional/";
        String wavBaseFolder = "D:/Oytun/DFKI/voices/hmmVoiceConversionTest2/";

        String sourceTag = "hmmSource_nogv";
        String
targetTag = "origTarget";
        String method = "F";

        BaselinePreprocessor pp = new BaselinePreprocessor();
        BaselineFeatureExtractor fe = new BaselineFeatureExtractor();
        BaselinePostprocessor po = new BaselinePostprocessor();
        JointGMMTransformerParams pa = new JointGMMTransformerParams();

        int numTrainingFiles = 1092;
        int i;

        boolean isContextualGMMs = false;
        int contextClassificationType = ContextualGMMParams.NO_PHONEME_CLASS;
        int[] numComponents = { 128 };
        // Alternative context classification setups, kept for reference:
        // int contextClassificationType = ContextualGMMParams.SILENCE_SPEECH; int[] numComponents = {16, 128};
        // int contextClassificationType = ContextualGMMParams.VOWEL_SILENCE_CONSONANT; int[] numComponents = {128, 16, 128};
        // int contextClassificationType = ContextualGMMParams.PHONOLOGY_CLASS; int[] numComponents = {numMixes};
        // int contextClassificationType = ContextualGMMParams.FRICATIVE_GLIDELIQUID_NASAL_PLOSIVE_VOWEL_OTHER; int[]
        // numComponents = {128, 128, 128, 128, 128, 16};
        // int contextClassificationType = ContextualGMMParams.PHONEME_IDENTITY; int[] numComponents = {128};

        String inputFolder = wavBaseFolder + "/" + sourceTag + "/test_1/";
        String outputBaseFolder;
        if (!isContextualGMMs) {
            outputBaseFolder = wavBaseFolder + "output/" + sourceTag + "2" + targetTag + "/gmm" + method + "_"
                    + String.valueOf(numTrainingFiles);
        } else {
            outputBaseFolder = wavBaseFolder + "output/" + sourceTag + "2" + targetTag + "/gmm" + method + "_"
                    + String.valueOf(numTrainingFiles) + "_" + "context" + String.valueOf(contextClassificationType);
            for (i = 0; i < numComponents.length; i++)
                outputBaseFolder += "_" + String.valueOf(numComponents[i]);
        }

        String baseFile = wavBaseFolder + "output/" + sourceTag + "2" + targetTag + "/" + sourceTag + method + "_X_" + targetTag
                + method + "_" + String.valueOf(numTrainingFiles);

        pa.isForcedAnalysis = false;
        // pa.isSourceVocalTractSpectrumFromModel = false;
        pa.isSourceVocalTractSpectrumFromModel = true;
        pa.isVocalTractTransformation = true;
        pa.isResynthesizeVocalTractFromSourceModel = false;
        pa.isVocalTractMatchUsingTargetModel = false;

        // Smoothing
        pa.isTemporalSmoothing = false;
        pa.smoothingNumNeighbours = 3;
        if (!pa.isTemporalSmoothing)
            pa.smoothingNumNeighbours = 0;
        // pa.smoothingMethod = SmoothingDefinitions.OUTPUT_LSFCONTOUR_SMOOTHING;
        // pa.smoothingMethod = SmoothingDefinitions.OUTPUT_VOCALTRACTSPECTRUM_SMOOTHING;
        pa.smoothingMethod = SmoothingDefinitions.TRANSFORMATION_FILTER_SMOOTHING;

        // pa.isDisplayProcessingFrameCount = true;

        pa.inputFolder = inputFolder;
        pa.outputBaseFolder = outputBaseFolder;

        if (!isContextualGMMs)
            pa.jointGmmFile = baseFile + "_" + String.valueOf(numComponents[0]) + JointGMMSet.DEFAULT_EXTENSION;
        else {
            pa.jointGmmFile = baseFile + "_context" + String.valueOf(contextClassificationType);
            for (i = 0; i < numComponents.length; i++)
                pa.jointGmmFile += "_" + String.valueOf(numComponents[i]);
            pa.jointGmmFile += JointGMMSet.DEFAULT_EXTENSION;
        }

        pa.pitchMappingFile = baseFile + PitchMappingFile.DEFAULT_EXTENSION;

        pa.outputFolderInfoString = "isSrc" + String.valueOf(pa.isSourceVocalTractSpectrumFromModel ? 1 : 0) + "_smooth"
                + String.valueOf(pa.isTemporalSmoothing ? 1 : 0) + "_" + String.valueOf(pa.smoothingNumNeighbours);

        pa.isSeparateProsody = false;
        pa.isSaveVocalTractOnlyVersion = false;
        pa.isFixedRateVocalTractConversion = true;

        // Prosody transformation
        pa.prosodyParams.pitchStatisticsType = PitchStatistics.STATISTICS_IN_HERTZ;
        // pa.prosodyParams.pitchStatisticsType = PitchStatistics.STATISTICS_IN_LOGHERTZ;

        pa.prosodyParams.durationTransformationMethod = ProsodyTransformerParams.NO_TRANSFORMATION;
        // pa.prosodyParams.durationTransformationMethod = ProsodyTransformerParams.CUSTOM_TRANSFORMATION;

        // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.NO_TRANSFORMATION;
        // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.GLOBAL_MEAN;
        // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.GLOBAL_STDDEV;

        pa.prosodyParams.isUseInputMeanPitch = false;
        pa.prosodyParams.isUseInputStdDevPitch = false;
        pa.prosodyParams.isUseInputRangePitch = false;
        pa.prosodyParams.isUseInputInterceptPitch = false;
        pa.prosodyParams.isUseInputSlopePitch = false;
        //

        // TTS tests
        pa.isPitchFromTargetFile = false;
        pa.isDurationFromTargetFile = false;
        pa.isEnergyFromTargetFile = false;
        pa.targetAlignmentFileType = BaselineTransformerParams.LABELS;
        //

        JointGMMTransformer t = new JointGMMTransformer(pp, fe, po, pa);
        t.run();
    }

    /** Demo configuration: transforms wavs under D:/quickTest using a 10-component joint GMM (hard-coded paths). */
    public static void mainQuickTest(String[] args) throws UnsupportedAudioFileException, IOException {
        String wavBaseFolder = "D:/quickTest/";

        String sourceTag = "source";
        String targetTag = "target";
        String method = "F";

        BaselinePreprocessor pp = new BaselinePreprocessor();
        BaselineFeatureExtractor fe = new BaselineFeatureExtractor();
        BaselinePostprocessor po = new BaselinePostprocessor();
        JointGMMTransformerParams pa = new JointGMMTransformerParams();

        int numTrainingFiles = 50;
        int i;

        boolean isContextualGMMs = false;
        int contextClassificationType = ContextualGMMParams.NO_PHONEME_CLASS;
        int[] numComponents = { 10 };
        // int contextClassificationType = ContextualGMMParams.SILENCE_SPEECH; int[] numComponents = {16, 128};
        // int contextClassificationType = ContextualGMMParams.VOWEL_SILENCE_CONSONANT; int[] numComponents = {128, 16, 128};
        // int contextClassificationType = ContextualGMMParams.PHONOLOGY_CLASS; int[] numComponents = {numMixes};
        // int contextClassificationType = ContextualGMMParams.FRICATIVE_GLIDELIQUID_NASAL_PLOSIVE_VOWEL_OTHER; int[]
        // numComponents = {128, 128, 128, 128, 128, 16};
        // int contextClassificationType = ContextualGMMParams.PHONEME_IDENTITY; int[] numComponents = {128};

        String inputFolder = wavBaseFolder + "/" + sourceTag + "/test_5/";
        String outputBaseFolder;
        if (!isContextualGMMs) {
            outputBaseFolder = wavBaseFolder + "output/" + sourceTag + "2" + targetTag + "/gmm" + method + "_"
                    + String.valueOf(numTrainingFiles);
        } else {
            outputBaseFolder = wavBaseFolder + "output/" + sourceTag + "2" + targetTag + "/gmm" + method + "_"
                    + String.valueOf(numTrainingFiles) + "_" + "context" + String.valueOf(contextClassificationType);
            for (i = 0; i < numComponents.length; i++)
                outputBaseFolder += "_" + String.valueOf(numComponents[i]);
        }

        String baseFile = wavBaseFolder + "output/" + sourceTag + "2" + targetTag + "/" + sourceTag + method + "_X_" + targetTag
                + method + "_" + String.valueOf(numTrainingFiles);

        pa.isForcedAnalysis = false;
        pa.isSourceVocalTractSpectrumFromModel = false;
        // pa.isSourceVocalTractSpectrumFromModel = true;
        pa.isVocalTractTransformation = true;
        pa.isResynthesizeVocalTractFromSourceModel = false;
        pa.isVocalTractMatchUsingTargetModel = false;

        // Smoothing
        pa.isTemporalSmoothing = true;
        pa.smoothingNumNeighbours = 10;
        if (!pa.isTemporalSmoothing)
            pa.smoothingNumNeighbours = 0;
        // pa.smoothingMethod = SmoothingDefinitions.OUTPUT_LSFCONTOUR_SMOOTHING;
        // pa.smoothingMethod = SmoothingDefinitions.OUTPUT_VOCALTRACTSPECTRUM_SMOOTHING;
        pa.smoothingMethod = SmoothingDefinitions.TRANSFORMATION_FILTER_SMOOTHING;

        // pa.isDisplayProcessingFrameCount = true;

        pa.inputFolder = inputFolder;
        pa.outputBaseFolder = outputBaseFolder;

        if (!isContextualGMMs)
            pa.jointGmmFile = baseFile + "_" + String.valueOf(numComponents[0]) + JointGMMSet.DEFAULT_EXTENSION;
        else {
            pa.jointGmmFile = baseFile + "_context" + String.valueOf(contextClassificationType);
            for (i = 0; i < numComponents.length; i++)
                pa.jointGmmFile += "_" + String.valueOf(numComponents[i]);
            pa.jointGmmFile += JointGMMSet.DEFAULT_EXTENSION;
        }

        pa.pitchMappingFile = baseFile + PitchMappingFile.DEFAULT_EXTENSION;

        pa.outputFolderInfoString = "isSrc" + String.valueOf(pa.isSourceVocalTractSpectrumFromModel ? 1 : 0) + "_smooth"
                + String.valueOf(pa.isTemporalSmoothing ? 1 : 0) + "_" + String.valueOf(pa.smoothingNumNeighbours);

        pa.isSeparateProsody = false;
        pa.isSaveVocalTractOnlyVersion = false;
        pa.isFixedRateVocalTractConversion = true;

        // Prosody transformation
        pa.prosodyParams.pitchStatisticsType = PitchStatistics.STATISTICS_IN_HERTZ;
        // pa.prosodyParams.pitchStatisticsType = PitchStatistics.STATISTICS_IN_LOGHERTZ;

        pa.prosodyParams.durationTransformationMethod = ProsodyTransformerParams.NO_TRANSFORMATION;
        // pa.prosodyParams.durationTransformationMethod = ProsodyTransformerParams.CUSTOM_TRANSFORMATION;

        // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.NO_TRANSFORMATION;
        // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.GLOBAL_MEAN;
        // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.GLOBAL_STDDEV;

        pa.prosodyParams.isUseInputMeanPitch = false;
        pa.prosodyParams.isUseInputStdDevPitch = false;
        pa.prosodyParams.isUseInputRangePitch = false;
        pa.prosodyParams.isUseInputInterceptPitch = false;
        pa.prosodyParams.isUseInputSlopePitch = false;
        //

        // TTS tests
        pa.isPitchFromTargetFile = false;
        pa.isDurationFromTargetFile = false;
        pa.isEnergyFromTargetFile = false;
        pa.targetAlignmentFileType = BaselineTransformerParams.LABELS;
        //

        JointGMMTransformer t = new JointGMMTransformer(pp, fe, po, pa);
        t.run();
    }

    /***
     * This example uses the output of the example:
     * marytts.signalproc.adaptation.gmm.jointgmm.JointGMMParallelTrainer.mainQuickTest2() Input:
     * /Neutral-Spike-Conversion/ouput/source2target/sourceF_X_targetF_99_10.jgs /Neutral-Spike-Conversion/source/wav/*.wav (files
     * to convert, normally these files are different from the training set) Output:
     * /Neutral-Spike-Conversion/output/source2target/gmmF_99/isSrc0_smooth1_10_mixes10_prosody1x0x0/*.wav transformed files
     * 
     * @param args
     *            args
     * @throws UnsupportedAudioFileException
     *             UnsupportedAudioFileException
     * @throws IOException
     *             IOException
     */
    public static void mainQuickTest2(String[] args) throws UnsupportedAudioFileException, IOException {
        String wavBaseFolder = "/project/mary/marcela/VoiceConversion/Neutral-Spike-Conversion/";

        String sourceTag = "source";
        String targetTag = "target";
        String method = "F";

        BaselinePreprocessor pp = new BaselinePreprocessor();
        BaselineFeatureExtractor fe = new BaselineFeatureExtractor();
        BaselinePostprocessor po = new BaselinePostprocessor();
        JointGMMTransformerParams pa = new JointGMMTransformerParams();

        int numTrainingFiles = 99;
        int i;

        boolean isContextualGMMs = false;
        int contextClassificationType = ContextualGMMParams.NO_PHONEME_CLASS;
        int[] numComponents = { 10 };
        // int contextClassificationType = ContextualGMMParams.SILENCE_SPEECH; int[] numComponents = {16, 128};
        // int contextClassificationType = ContextualGMMParams.VOWEL_SILENCE_CONSONANT; int[] numComponents = {128, 16, 128};
        // int contextClassificationType = ContextualGMMParams.PHONOLOGY_CLASS; int[] numComponents = {numMixes};
        // int contextClassificationType = ContextualGMMParams.FRICATIVE_GLIDELIQUID_NASAL_PLOSIVE_VOWEL_OTHER; int[]
        // numComponents = {128, 128, 128, 128, 128, 16};
        // int contextClassificationType = ContextualGMMParams.PHONEME_IDENTITY; int[] numComponents = {128};

        String inputFolder = wavBaseFolder + "/" + sourceTag + "/wav/";
        String outputBaseFolder;
        if (!isContextualGMMs) {
            outputBaseFolder = wavBaseFolder + "output/" + sourceTag + "2" + targetTag + "/gmm" + method + "_"
                    + String.valueOf(numTrainingFiles);
        } else {
            outputBaseFolder = wavBaseFolder + "output/" + sourceTag + "2" + targetTag + "/gmm" + method + "_"
                    + String.valueOf(numTrainingFiles) + "_" + "context" + String.valueOf(contextClassificationType);
            for (i = 0; i < numComponents.length; i++)
                outputBaseFolder += "_" + String.valueOf(numComponents[i]);
        }

        String baseFile = wavBaseFolder + "output/" + sourceTag + "2" + targetTag + "/" + sourceTag + method + "_X_" + targetTag
                + method + "_" + String.valueOf(numTrainingFiles);

        pa.isForcedAnalysis = false;
        pa.isSourceVocalTractSpectrumFromModel = false;
        // pa.isSourceVocalTractSpectrumFromModel = true;
        pa.isVocalTractTransformation = true;
        pa.isResynthesizeVocalTractFromSourceModel = false;
        pa.isVocalTractMatchUsingTargetModel = false;

        // Smoothing
        pa.isTemporalSmoothing = true;
        pa.smoothingNumNeighbours = 10;
        if (!pa.isTemporalSmoothing)
            pa.smoothingNumNeighbours = 0;
        // pa.smoothingMethod = SmoothingDefinitions.OUTPUT_LSFCONTOUR_SMOOTHING;
        // pa.smoothingMethod = SmoothingDefinitions.OUTPUT_VOCALTRACTSPECTRUM_SMOOTHING;
        pa.smoothingMethod = SmoothingDefinitions.TRANSFORMATION_FILTER_SMOOTHING;

        // pa.isDisplayProcessingFrameCount = true;

        pa.inputFolder = inputFolder;
        pa.outputBaseFolder = outputBaseFolder;

        if (!isContextualGMMs)
            pa.jointGmmFile = baseFile + "_" + String.valueOf(numComponents[0]) + JointGMMSet.DEFAULT_EXTENSION;
        else {
            pa.jointGmmFile = baseFile + "_context" + String.valueOf(contextClassificationType);
            for (i = 0; i < numComponents.length; i++)
                pa.jointGmmFile += "_" + String.valueOf(numComponents[i]);
            pa.jointGmmFile += JointGMMSet.DEFAULT_EXTENSION;
        }

        pa.pitchMappingFile = baseFile + PitchMappingFile.DEFAULT_EXTENSION;

        pa.outputFolderInfoString = "isSrc" + String.valueOf(pa.isSourceVocalTractSpectrumFromModel ? 1 : 0) + "_smooth"
                + String.valueOf(pa.isTemporalSmoothing ? 1 : 0) + "_" + String.valueOf(pa.smoothingNumNeighbours);

        pa.isSeparateProsody = false;
        pa.isSaveVocalTractOnlyVersion = false;
        pa.isFixedRateVocalTractConversion = true;

        // Prosody transformation
        pa.prosodyParams.pitchStatisticsType = PitchStatistics.STATISTICS_IN_HERTZ;
        // pa.prosodyParams.pitchStatisticsType = PitchStatistics.STATISTICS_IN_LOGHERTZ;

        pa.prosodyParams.durationTransformationMethod = ProsodyTransformerParams.NO_TRANSFORMATION;
        // pa.prosodyParams.durationTransformationMethod = ProsodyTransformerParams.CUSTOM_TRANSFORMATION;

        // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.NO_TRANSFORMATION;
        // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.GLOBAL_MEAN;
        // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.GLOBAL_STDDEV;

        pa.prosodyParams.isUseInputMeanPitch = false;
        pa.prosodyParams.isUseInputStdDevPitch = false;
        pa.prosodyParams.isUseInputRangePitch = false;
        pa.prosodyParams.isUseInputInterceptPitch = false;
        pa.prosodyParams.isUseInputSlopePitch = false;
        //

        // TTS tests
        pa.isPitchFromTargetFile = false;
        pa.isDurationFromTargetFile = false;
        pa.isEnergyFromTargetFile = false;
        pa.targetAlignmentFileType = BaselineTransformerParams.LABELS;
        //

        JointGMMTransformer t = new JointGMMTransformer(pp, fe, po, pa);
        t.run();
    }

    public static void mainInterspeech2008(String[] args) throws IOException, UnsupportedAudioFileException {
        String emotion = "angry";
        String method = "F";
        int numTrainingFiles = 200; // 2, 20, 200, 350
        int i;

        boolean isContextualGMMs = false;
        int contextClassificationType = ContextualGMMParams.NO_PHONEME_CLASS;
        int[] numComponents = { 40 };
        // int contextClassificationType = ContextualGMMParams.SILENCE_SPEECH; int[] numComponents = {16, 128};
        // int contextClassificationType = ContextualGMMParams.VOWEL_SILENCE_CONSONANT; int[] numComponents = {128, 16, 128};
        // int contextClassificationType = ContextualGMMParams.PHONOLOGY_CLASS; int[] numComponents = {numMixes};
        // int contextClassificationType = ContextualGMMParams.FRICATIVE_GLIDELIQUID_NASAL_PLOSIVE_VOWEL_OTHER; int[]
        // numComponents = {128, 128, 128, 128, 128, 16};
        // int contextClassificationType = ContextualGMMParams.PHONEME_IDENTITY; int[] numComponents = {128};

        String inputFolder = "D:/Oytun/DFKI/voices/Interspeech08/neutral/test_tts_" + emotion;
        String outputBaseFolder;
        if (!isContextualGMMs) {
            outputBaseFolder = "D:/Oytun/DFKI/voices/Interspeech08_out2/neutral2" + emotion + "/neutral2" + emotion + "Out_gmm"
                    + method + "_" + String.valueOf(numTrainingFiles) + "_" + String.valueOf(numComponents[0]);
        } else {
            outputBaseFolder = "D:/Oytun/DFKI/voices/Interspeech08_out2/neutral2" + emotion + "/neutral2" + emotion + "Out_gmm"
                    + method + "_" + String.valueOf(numTrainingFiles) + "_" + "context"
                    + String.valueOf(contextClassificationType);
            for (i = 0; i < numComponents.length; i++)
                outputBaseFolder += "_" + String.valueOf(numComponents[i]);
        }

        String baseFile = "D:/Oytun/DFKI/voices/Interspeech08_out2/neutral2" + emotion + "/neutral" + method + "_X_" + emotion
                + method + "_" + String.valueOf(numTrainingFiles);

        boolean isSourceVocalTractSpectrumFromModel = false;

        boolean isTemporalSmoothing = true;
        int smoothingNumNeighbours = 10;

        // Note that these two can be true or false together, not yet implemented separate processing
        boolean isPitchFromTargetFile = true;
        int pitchFromTargetMethod = ProsodyTransformerParams.FULL_CONTOUR;
        // int pitchFromTargetMethod = ProsodyTransformerParams.SENTENCE_MEAN;
        // int pitchFromTargetMethod = ProsodyTransformerParams.SENTENCE_MEAN_STDDEV;
        boolean isDurationFromTargetFile = true;
        int durationFromTargetMethod = ProsodyTransformerParams.PHONEME_DURATIONS;
        // int durationFromTargetMethod = ProsodyTransformerParams.TRIPHONE_DURATIONS;
        // int durationFromTargetMethod = ProsodyTransformerParams.SENTENCE_DURATION;

        boolean isEnergyFromTargetFile = false;
        boolean isLsfsFromTargetFile = false;
        int targetAlignmentFileType =
BaselineTransformerParams.FESTIVAL_UTT; // String outputFolderInfoString = "isSrc" + String.valueOf(isSourceVocalTractSpectrumFromModel ? 1 : 0) + "_smooth" + String.valueOf(isTemporalSmoothing ? 1 : 0) + "_" + String.valueOf(smoothingNumNeighbours) + "_psUtt" + String.valueOf(isPitchFromTargetFile ? 1 : 0) + "_tsUtt" + String.valueOf(isDurationFromTargetFile ? 1 : 0); mainParametric(inputFolder, outputBaseFolder, baseFile, outputFolderInfoString, isSourceVocalTractSpectrumFromModel, isTemporalSmoothing, smoothingNumNeighbours, isPitchFromTargetFile, pitchFromTargetMethod, isDurationFromTargetFile, durationFromTargetMethod, isEnergyFromTargetFile, isLsfsFromTargetFile, targetAlignmentFileType, isContextualGMMs, contextClassificationType, numComponents); } public static void mainIeeeTaslp2009_mary(String[] args) throws IOException, UnsupportedAudioFileException { String[] emotions = { "angry", "happy", "sad" }; for (int emCount = 0; emCount < emotions.length; emCount++) // for (int emCount=0; emCount<1; emCount++) { String emotion = emotions[emCount]; String method = "F"; int numTrainingFiles = 200; // 2, 20, 200, 350 int i; boolean isContextualGMMs = false; int contextClassificationType = ContextualGMMParams.NO_PHONEME_CLASS; int[] numComponents = { 40 }; // int contextClassificationType = ContextualGMMParams.SILENCE_SPEECH; int[] numComponents = {16, 128}; // int contextClassificationType = ContextualGMMParams.VOWEL_SILENCE_CONSONANT; int[] numComponents = {128, 16, 128}; // int contextClassificationType = ContextualGMMParams.PHONOLOGY_CLASS; int[] numComponents = {numMixes}; // int contextClassificationType = ContextualGMMParams.FRICATIVE_GLIDELIQUID_NASAL_PLOSIVE_VOWEL_OTHER; int[] // numComponents = {128, 128, 128, 128, 128, 16}; // int contextClassificationType = ContextualGMMParams.PHONEME_IDENTITY; int[] numComponents = {128}; String inputFolder = "D:/publications/IEEE_TASLP/2009/expressiveVC/voice_conversion/test_neutral2" + emotion; // String inputFolder = 
"D:/publications/IEEE_TASLP/2009/expressiveVC/voice_conversion/test_neutral_short"; String outputBaseString = "D:/publications/IEEE_TASLP/2009/expressiveVC/voice_conversion/out2_neutral2"; // String outputBaseString = "D:/publications/IEEE_TASLP/2009/expressiveVC/voice_conversion/out_short_neutral2"; String outputBaseFolder = outputBaseString + emotion + "/neutral2" + emotion + "Out_gmm" + method + "_" + String.valueOf(numTrainingFiles) + "_"; if (!isContextualGMMs) outputBaseFolder += String.valueOf(numComponents[0]); else { outputBaseFolder += "context" + String.valueOf(contextClassificationType); for (i = 0; i < numComponents.length; i++) outputBaseFolder += "_" + String.valueOf(numComponents[i]); } String baseFile = outputBaseString + emotion + "/neutral" + method + "_X_" + emotion + method + "_" + String.valueOf(numTrainingFiles); boolean isSourceVocalTractSpectrumFromModel = false; boolean isTemporalSmoothing = false; int smoothingNumNeighbours = 5; // Note that pitch and duration can be true or false together, not yet implemented separate processing boolean isPitchFromTargetFile = true; // int pitchFromTargetMethod = ProsodyTransformerParams.FULL_CONTOUR; // int pitchFromTargetMethod = ProsodyTransformerParams.SENTENCE_MEAN; int pitchFromTargetMethod = ProsodyTransformerParams.SENTENCE_MEAN_STDDEV; boolean isDurationFromTargetFile = true; int durationFromTargetMethod = ProsodyTransformerParams.PHONEME_DURATIONS; // int durationFromTargetMethod = ProsodyTransformerParams.TRIPHONE_DURATIONS; // int durationFromTargetMethod = ProsodyTransformerParams.SENTENCE_DURATION; boolean isEnergyFromTargetFile = false; boolean isLsfsFromTargetFile = false; int targetAlignmentFileType = BaselineTransformerParams.LABELS; // String outputFolderInfoString = "isSrc" + String.valueOf(isSourceVocalTractSpectrumFromModel ? 1 : 0) + "_smooth" + String.valueOf(isTemporalSmoothing ? 
1 : 0) + "_" + String.valueOf(smoothingNumNeighbours) + "_psUtt" + String.valueOf(isPitchFromTargetFile ? 1 : 0) + "_tsUtt" + String.valueOf(isDurationFromTargetFile ? 1 : 0); mainParametric(inputFolder, outputBaseFolder, baseFile, outputFolderInfoString, isSourceVocalTractSpectrumFromModel, isTemporalSmoothing, smoothingNumNeighbours, isPitchFromTargetFile, pitchFromTargetMethod, isDurationFromTargetFile, durationFromTargetMethod, isEnergyFromTargetFile, isLsfsFromTargetFile, targetAlignmentFileType, isContextualGMMs, contextClassificationType, numComponents); } } public static void mainIeeeTaslp2009_rap(String[] args) throws IOException, UnsupportedAudioFileException { int i; String source = "uch"; String target = "target"; String method = "F"; boolean isContextualGMMs = false; int contextClassificationType = ContextualGMMParams.NO_PHONEME_CLASS; int[] numComponents = { 32 }; // int contextClassificationType = ContextualGMMParams.SILENCE_SPEECH; int[] numComponents = {16, 128}; // int contextClassificationType = ContextualGMMParams.VOWEL_SILENCE_CONSONANT; int[] numComponents = {128, 16, 128}; // int contextClassificationType = ContextualGMMParams.PHONOLOGY_CLASS; int[] numComponents = {numMixes}; // int contextClassificationType = ContextualGMMParams.FRICATIVE_GLIDELIQUID_NASAL_PLOSIVE_VOWEL_OTHER; int[] // numComponents = {128, 128, 128, 128, 128, 16}; // int contextClassificationType = ContextualGMMParams.PHONEME_IDENTITY; int[] numComponents = {128}; String inputFolder = "D:/Oytun/Papers/IEEE_Transaction_VT/musicVC/final_gmm/uch_test"; String outputBaseFolder; if (!isContextualGMMs) { outputBaseFolder = "D:/Oytun/Papers/IEEE_Transaction_VT/musicVC/final_gmm/out_gmm" + method + "_" + String.valueOf(numComponents[0]); } else { outputBaseFolder = "D:/Oytun/Papers/IEEE_Transaction_VT/musicVC/final_gmm/out_gmm" + method + "_" + String.valueOf(numComponents[0]) + "_" + "context" + String.valueOf(contextClassificationType); for (i = 0; i < numComponents.length; i++) 
outputBaseFolder += "_" + String.valueOf(numComponents[i]); } String baseFile = "D:/Oytun/Papers/IEEE_Transaction_VT/musicVC/final_gmm/" + source + method + "_X_" + target + method; boolean isSourceVocalTractSpectrumFromModel = false; boolean isTemporalSmoothing = true; int smoothingNumNeighbours = 20; // 2 4 10 20 // Note that these two can be true or false together, not yet implemented separate processing boolean isPitchFromTargetFile = false; int pitchFromTargetMethod = ProsodyTransformerParams.FULL_CONTOUR; // int pitchFromTargetMethod = ProsodyTransformerParams.SENTENCE_MEAN; // int pitchFromTargetMethod = ProsodyTransformerParams.SENTENCE_MEAN_STDDEV; boolean isDurationFromTargetFile = false; int durationFromTargetMethod = ProsodyTransformerParams.PHONEME_DURATIONS; // int durationFromTargetMethod = ProsodyTransformerParams.TRIPHONE_DURATIONS; // int durationFromTargetMethod = ProsodyTransformerParams.SENTENCE_DURATION; boolean isEnergyFromTargetFile = false; boolean isLsfsFromTargetFile = false; int targetAlignmentFileType = BaselineTransformerParams.FESTIVAL_UTT; // String outputFolderInfoString = "isSrc" + String.valueOf(isSourceVocalTractSpectrumFromModel ? 1 : 0) + "_smooth" + String.valueOf(isTemporalSmoothing ? 1 : 0) + "_" + String.valueOf(smoothingNumNeighbours) + "_psUtt" + String.valueOf(isPitchFromTargetFile ? 1 : 0) + "_tsUtt" + String.valueOf(isDurationFromTargetFile ? 
1 : 0); mainParametric(inputFolder, outputBaseFolder, baseFile, outputFolderInfoString, isSourceVocalTractSpectrumFromModel, isTemporalSmoothing, smoothingNumNeighbours, isPitchFromTargetFile, pitchFromTargetMethod, isDurationFromTargetFile, durationFromTargetMethod, isEnergyFromTargetFile, isLsfsFromTargetFile, targetAlignmentFileType, isContextualGMMs, contextClassificationType, numComponents); } public static void mainParametric(String inputFolder, String outputBaseFolder, String baseFile, String outputFolderInfoString, boolean isSourceVocalTractSpectrumFromModel, boolean isTemporalSmoothing, int smoothingNumNeighbours, boolean isPitchFromTargetFile, int pitchFromTargetMethod, boolean isDurationFromTargetFile, int durationFromTargetMethod, boolean isEnergyFromTargetFile, boolean isLsfsFromTargetFile, int targetAlignmentFileType, boolean isContextualGMMs, int contextClassificationType, int[] numComponents) throws IOException, UnsupportedAudioFileException { BaselinePreprocessor pp = new BaselinePreprocessor(); BaselineFeatureExtractor fe = new BaselineFeatureExtractor(); BaselinePostprocessor po = new BaselinePostprocessor(); JointGMMTransformerParams pa = new JointGMMTransformerParams(); int i; pa.isDisplayProcessingFrameCount = true; pa.inputFolder = inputFolder; pa.outputBaseFolder = outputBaseFolder; if (!isContextualGMMs) pa.jointGmmFile = baseFile + "_" + String.valueOf(numComponents[0]) + JointGMMSet.DEFAULT_EXTENSION; else { pa.jointGmmFile = baseFile + "_context" + String.valueOf(contextClassificationType); for (i = 0; i < numComponents.length; i++) pa.jointGmmFile += "_" + String.valueOf(numComponents[i]); pa.jointGmmFile += JointGMMSet.DEFAULT_EXTENSION; } pa.pitchMappingFile = baseFile + PitchMappingFile.DEFAULT_EXTENSION; pa.outputFolderInfoString = outputFolderInfoString; pa.isForcedAnalysis = false; pa.isSourceVocalTractSpectrumFromModel = isSourceVocalTractSpectrumFromModel; pa.isVocalTractTransformation = true; 
pa.isResynthesizeVocalTractFromSourceModel = false; pa.isVocalTractMatchUsingTargetModel = false; pa.isSeparateProsody = true; pa.isSaveVocalTractOnlyVersion = false; pa.isFixedRateVocalTractConversion = true; // Prosody transformation pa.prosodyParams.pitchStatisticsType = PitchStatistics.STATISTICS_IN_HERTZ; // pa.prosodyParams.pitchStatisticsType = PitchStatistics.STATISTICS_IN_LOGHERTZ; pa.prosodyParams.durationTransformationMethod = ProsodyTransformerParams.NO_TRANSFORMATION; pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.NO_TRANSFORMATION; // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.GLOBAL_MEAN; // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.GLOBAL_STDDEV; // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.GLOBAL_RANGE; // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.GLOBAL_SLOPE; // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.GLOBAL_INTERCEPT; // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.GLOBAL_MEAN_STDDEV; // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.GLOBAL_MEAN_SLOPE; // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.GLOBAL_INTERCEPT_STDDEV; // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.GLOBAL_INTERCEPT_SLOPE; // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.SENTENCE_MEAN; // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.SENTENCE_STDDEV; // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.SENTENCE_RANGE; // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.SENTENCE_SLOPE; // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.SENTENCE_INTERCEPT; // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.SENTENCE_MEAN_STDDEV; // pa.prosodyParams.pitchTransformationMethod = 
ProsodyTransformerParams.SENTENCE_MEAN_SLOPE; // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.SENTENCE_INTERCEPT_STDDEV; // pa.prosodyParams.pitchTransformationMethod = ProsodyTransformerParams.SENTENCE_INTERCEPT_SLOPE; pa.prosodyParams.isUseInputMeanPitch = false; pa.prosodyParams.isUseInputStdDevPitch = false; pa.prosodyParams.isUseInputRangePitch = false; pa.prosodyParams.isUseInputInterceptPitch = false; pa.prosodyParams.isUseInputSlopePitch = false; // // Smoothing pa.isTemporalSmoothing = isTemporalSmoothing; pa.smoothingNumNeighbours = smoothingNumNeighbours; // pa.smoothingMethod = SmoothingDefinitions.OUTPUT_LSFCONTOUR_SMOOTHING; // pa.smoothingMethod = SmoothingDefinitions.OUTPUT_VOCALTRACTSPECTRUM_SMOOTHING; pa.smoothingMethod = SmoothingDefinitions.TRANSFORMATION_FILTER_SMOOTHING; // // TTS tests pa.isPitchFromTargetFile = isPitchFromTargetFile; pa.pitchFromTargetMethod = pitchFromTargetMethod; pa.isDurationFromTargetFile = isDurationFromTargetFile; pa.durationFromTargetMethod = durationFromTargetMethod; pa.isEnergyFromTargetFile = isEnergyFromTargetFile; pa.isLsfsFromTargetFile = isLsfsFromTargetFile; pa.targetAlignmentFileType = targetAlignmentFileType; // JointGMMTransformer t = new JointGMMTransformer(pp, fe, po, pa); t.run(); } }