/** * Copyright 2007 DFKI GmbH. * All Rights Reserved. Use is subject to license terms. * * This file is part of MARY TTS. * * MARY TTS is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, version 3 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package marytts.signalproc.adaptation.gmm.jointgmm; import java.io.IOException; import java.util.Arrays; import javax.sound.sampled.UnsupportedAudioFileException; import marytts.exceptions.MaryConfigurationException; import marytts.machinelearning.ContextualGMMParams; import marytts.machinelearning.GMM; import marytts.machinelearning.GMMTrainer; import marytts.machinelearning.GMMTrainerParams; import marytts.modules.phonemiser.AllophoneSet; import marytts.signalproc.adaptation.BaselineFeatureExtractor; import marytts.signalproc.adaptation.BaselinePreprocessor; import marytts.signalproc.adaptation.codebook.WeightedCodebook; import marytts.signalproc.adaptation.codebook.WeightedCodebookFile; import marytts.signalproc.adaptation.codebook.WeightedCodebookFileHeader; import marytts.signalproc.adaptation.codebook.WeightedCodebookParallelTrainer; import marytts.signalproc.adaptation.codebook.WeightedCodebookTrainerParams; import marytts.signalproc.adaptation.outlier.KMeansMappingEliminatorParams; import marytts.signalproc.adaptation.outlier.TotalStandardDeviations; import marytts.signalproc.adaptation.prosody.PitchMappingFile; import marytts.signalproc.analysis.distance.DistanceComputer; import marytts.signalproc.window.Window; import 
marytts.util.io.FileUtils; import marytts.util.string.StringUtils; /** * Joint-GMM voice conversion training using parallel source and target databases * * Reference: A. Kain and M. Macon, “Spectral voice conversion for text-to-speech synthesis,” in Proc. of the IEEE ICASSP 1998, * vol. 1, pp. 285-288. * * @author Oytun Türk */ public class JointGMMParallelTrainer extends JointGMMTrainer { protected WeightedCodebookParallelTrainer wcpTrainer; protected JointGMMTrainerParams jgParams; public JointGMMParallelTrainer(BaselinePreprocessor pp, BaselineFeatureExtractor fe, WeightedCodebookTrainerParams pa, JointGMMTrainerParams gp, ContextualGMMParams cg) { super(pp, fe, pa, gp, cg); wcpTrainer = new WeightedCodebookParallelTrainer(pp, fe, pa); jgParams = new JointGMMTrainerParams(gp); } public void run() { train(); } public void train() { if (!FileUtils.exists(codebookTrainerParams.codebookFile)) { // Parallel codebook training try { wcpTrainer.run(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (UnsupportedAudioFileException e) { // TODO Auto-generated catch block e.printStackTrace(); } // } // Read parallel codebook WeightedCodebookFile codebookFile = new WeightedCodebookFile(wcpTrainer.wcParams.codebookFile, WeightedCodebookFile.OPEN_FOR_READ); WeightedCodebook codebook = null; try { codebook = codebookFile.readCodebookFile(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } // // Get codebook entries in suitable format for GMM training and train joint GMMs if (cgParams == null || cgParams.phoneClasses == null) // No context { JointGMMSet gmmSet = null; GMM gmm = null; if (codebook != null) { double[][] xy = null; boolean bFeatureExisting = false; if (codebookTrainerParams.codebookHeader.vocalTractFeature == BaselineFeatureExtractor.LSF_FEATURES) { xy = new double[codebook.entries.length][2 * codebook.header.lsfParams.dimension]; for (int i = 0; i < codebook.entries.length; i++) { 
System.arraycopy(codebook.entries[i].sourceItem.lsfs, 0, xy[i], 0, codebook.header.lsfParams.dimension); System.arraycopy(codebook.entries[i].targetItem.lsfs, 0, xy[i], codebook.header.lsfParams.dimension, codebook.header.lsfParams.dimension); } bFeatureExisting = true; } else if (codebookTrainerParams.codebookHeader.vocalTractFeature == BaselineFeatureExtractor.MFCC_FEATURES_FROM_FILES) { xy = new double[codebook.entries.length][2 * codebook.header.mfccParams.dimension]; for (int i = 0; i < codebook.entries.length; i++) { System.arraycopy(codebook.entries[i].sourceItem.mfccs, 0, xy[i], 0, codebook.header.mfccParams.dimension); System.arraycopy(codebook.entries[i].targetItem.mfccs, 0, xy[i], codebook.header.mfccParams.dimension, codebook.header.mfccParams.dimension); } bFeatureExisting = true; } assert bFeatureExisting; GMMTrainer g = new GMMTrainer(); gmmSet = new JointGMMSet(1, cgParams); gmm = g.train(xy, jgParams.gmmEMTrainerParams); if (codebookTrainerParams.codebookHeader.vocalTractFeature == BaselineFeatureExtractor.LSF_FEATURES) gmmSet.gmms[0] = new JointGMM(gmm, codebook.header.lsfParams); else if (codebookTrainerParams.codebookHeader.vocalTractFeature == BaselineFeatureExtractor.MFCC_FEATURES_FROM_FILES) gmmSet.gmms[0] = new JointGMM(gmm, codebook.header.mfccParams); } // Convert joint GMM into a suitable format for using in transformation and save to a binary output file if (gmmSet != null) gmmSet.write(jgParams.jointGMMFile); } else // Train contextual GMMs - a separate GMM will be trained for each phone class, all GMMs will be written to the // same GMM file { double[][] xy = null; int[] totals = new int[cgParams.phoneClasses.length + 1]; int[] classIndices = new int[codebook.entries.length]; Arrays.fill(totals, 0); int i, n; JointGMMSet gmmSet = new JointGMMSet(totals.length, cgParams); if (codebook != null) { for (i = 0; i < codebook.entries.length; i++) { classIndices[i] = cgParams.getClassIndex(codebook.entries[i].sourceItem.phn); if 
(classIndices[i] < 0) classIndices[i] = totals.length - 1; totals[classIndices[i]]++; } } for (n = 0; n < totals.length; n++) { GMM gmm = null; int count = 0; if (totals[n] > 0) { if (codebookTrainerParams.codebookHeader.vocalTractFeature == BaselineFeatureExtractor.LSF_FEATURES) { xy = new double[totals[n]][2 * codebook.header.lsfParams.dimension]; for (i = 0; i < classIndices.length; i++) { if (count >= totals[n]) break; if (classIndices[i] == n) { System.arraycopy(codebook.entries[i].sourceItem.lsfs, 0, xy[count], 0, codebook.header.lsfParams.dimension); System.arraycopy(codebook.entries[i].targetItem.lsfs, 0, xy[count], codebook.header.lsfParams.dimension, codebook.header.lsfParams.dimension); count++; } } } else if (codebookTrainerParams.codebookHeader.vocalTractFeature == BaselineFeatureExtractor.MFCC_FEATURES_FROM_FILES) { xy = new double[totals[n]][2 * codebook.header.mfccParams.dimension]; for (i = 0; i < classIndices.length; i++) { if (count >= totals[n]) break; if (classIndices[i] == n) { System.arraycopy(codebook.entries[i].sourceItem.mfccs, 0, xy[count], 0, codebook.header.mfccParams.dimension); System.arraycopy(codebook.entries[i].targetItem.mfccs, 0, xy[count], codebook.header.mfccParams.dimension, codebook.header.mfccParams.dimension); count++; } } } GMMTrainer g = new GMMTrainer(); gmm = g.train(xy, cgParams.classTrainerParams[n]); if (n < totals.length - 1) { gmm.info = ""; for (i = 0; i < cgParams.phoneClasses[n].length - 1; i++) gmm.info += cgParams.phoneClasses[n][i] + " "; gmm.info += cgParams.phoneClasses[n][cgParams.phoneClasses[n].length - 1]; } else gmm.info = "other"; if (codebookTrainerParams.codebookHeader.vocalTractFeature == BaselineFeatureExtractor.LSF_FEATURES) { codebook.header.lsfParams.numfrm = totals[n]; gmmSet.gmms[n] = new JointGMM(gmm, codebook.header.lsfParams); } else if (codebookTrainerParams.codebookHeader.vocalTractFeature == BaselineFeatureExtractor.MFCC_FEATURES_FROM_FILES) { codebook.header.mfccParams.numfrm = 
totals[n]; gmmSet.gmms[n] = new JointGMM(gmm, codebook.header.mfccParams); } } } // Convert joint GMM into a suitable format for using in transformation and save to a binary output file if (gmmSet != null) gmmSet.write(jgParams.jointGMMFile); // } // System.out.println("Joint source-target GMM training completed..."); } public static void main(String[] args) throws UnsupportedAudioFileException, IOException, MaryConfigurationException { // mainIEEE_TASLP_2009_rap(args); // mainInterspeech2008(args); // mainHmmVoiceConversion(args); // mainQuickTest(args); mainQuickTest2(args); } public static void mainIEEE_TASLP_2009_rap(String[] args) throws UnsupportedAudioFileException, IOException, MaryConfigurationException { String wavBaseFolder = "D:/Oytun/Papers/IEEE_Transaction_VT/musicVC/final_gmm/"; String sourceTag = "uch"; String targetTag = "target"; String method; boolean isContextualGMMs = false; int contextClassificationType = ContextualGMMParams.NO_PHONEME_CLASS; int[] numComponents = { 32 }; // int contextClassificationType = ContextualGMMParams.SILENCE_SPEECH; int[] numComponents = {16, 128}; // int contextClassificationType = ContextualGMMParams.VOWEL_SILENCE_CONSONANT; int[] numComponents = {128, 16, 128}; // int contextClassificationType = ContextualGMMParams.PHONOLOGY_CLASS; int[] numComponents = {numMixes}; // int contextClassificationType = ContextualGMMParams.FRICATIVE_GLIDELIQUID_NASAL_PLOSIVE_VOWEL_OTHER; int[] // numComponents = {128, 128, 128, 128, 128, 16}; // int contextClassificationType = ContextualGMMParams.PHONEME_IDENTITY; int[] numComponents = {128}; BaselinePreprocessor pp = new BaselinePreprocessor(); BaselineFeatureExtractor fe = new BaselineFeatureExtractor(); WeightedCodebookTrainerParams pa = new WeightedCodebookTrainerParams(); JointGMMTrainerParams gp = new JointGMMTrainerParams(); ContextualGMMParams cg = null; int i; pa.codebookHeader.codebookType = WeightedCodebookFileHeader.FRAMES; method = "F"; // Frame-by-frame mapping of 
features // pa.codebookHeader.codebookType = WeightedCodebookFileHeader.FRAME_GROUPS; method = "FG"; // pa.codebookHeader.numNeighboursInFrameGroups = 3; //Mapping of frame average features (no label information but fixed // amount of neighbouring frames is used) // pa.codebookHeader.codebookType = WeightedCodebookFileHeader.LABELS; method = "L"; //Mapping of label average features // pa.codebookHeader.codebookType = WeightedCodebookFileHeader.LABEL_GROUPS; method = "LG"; // pa.codebookHeader.numNeighboursInLabelGroups = 1; //Mapping of average features collected across label groups (i.e. // vowels, consonants, etc) // pa.codebookHeader.codebookType = WeightedCodebookFileHeader.SPEECH; method = "S"; //Mapping of average features // collected across all speech parts (i.e. like spectral equalization) pa.codebookHeader.vocalTractFeature = BaselineFeatureExtractor.LSF_FEATURES; // Use Lsf features - full speech to speech // transformation // pa.codebookHeader.vocalTractFeature = BaselineFeatureExtractor.MFCC_FEATURES_FROM_FILES; //Use MFCC features - // currently supports only feature to featur etransformation pa.codebookHeader.sourceTag = sourceTag + method; // Source name tag (i.e. style or speaker identity) pa.codebookHeader.targetTag = targetTag + method; // Target name tag (i.e. 
style or speaker identity) pa.trainingBaseFolder = wavBaseFolder; // Training base directory pa.sourceTrainingFolder = wavBaseFolder + sourceTag + "_train/"; // Source training folder pa.targetTrainingFolder = wavBaseFolder + targetTag + "_train/"; // Target training folder pa.indexMapFileExtension = ".imf"; // Index map file extensions pa.codebookHeader.lsfParams.dimension = 0; // Auto set pa.codebookHeader.lsfParams.preCoef = 0.97f; pa.codebookHeader.lsfParams.skipsize = 0.010f; pa.codebookHeader.lsfParams.winsize = 0.020f; pa.codebookHeader.lsfParams.windowType = Window.HAMMING; pa.codebookHeader.lsfParams.isBarkScaled = true; // Gaussian trainer params: commenting out results in using default value for each gp.vocalTractFeature = pa.codebookHeader.vocalTractFeature; gp.isContextualGMMs = isContextualGMMs; gp.gmmEMTrainerParams.totalComponents = numComponents[0]; gp.gmmEMTrainerParams.isDiagonalCovariance = true; gp.gmmEMTrainerParams.kmeansMaxIterations = 200; gp.gmmEMTrainerParams.kmeansMinClusterChangePercent = 0.1; gp.gmmEMTrainerParams.kmeansMinSamplesInOneCluster = 50; gp.gmmEMTrainerParams.emMinIterations = 100; gp.gmmEMTrainerParams.emMaxIterations = 400; gp.gmmEMTrainerParams.isUpdateCovariances = true; gp.gmmEMTrainerParams.tinyLogLikelihoodChangePercent = 1e-5; gp.gmmEMTrainerParams.minCovarianceAllowed = 1e-4; gp.gmmEMTrainerParams.useNativeCLibTrainer = true; if (gp.isContextualGMMs) { GMMTrainerParams[] gmmParams = new GMMTrainerParams[numComponents.length]; for (i = 0; i < numComponents.length; i++) { gmmParams[i] = new GMMTrainerParams(gp.gmmEMTrainerParams); gmmParams[i].totalComponents = numComponents[i]; } String phonemeSetFile = "D:/Mary TTS New/lib/modules/de/cap/phoneme-list-de.xml"; cg = getContextualGMMParams(phonemeSetFile, gmmParams, contextClassificationType); } String baseFile = StringUtils.checkLastSlash(pa.trainingBaseFolder) + pa.codebookHeader.sourceTag + "_X_" + pa.codebookHeader.targetTag; // pa.codebookFile = baseFile + "_" + 
String.valueOf(gp.gmmEMTrainerParams.totalComponents) + // WeightedCodebookFile.DEFAULT_EXTENSION; pa.codebookFile = baseFile + WeightedCodebookFile.DEFAULT_EXTENSION; pa.pitchMappingFile = baseFile + PitchMappingFile.DEFAULT_EXTENSION; if (!isContextualGMMs) gp.jointGMMFile = baseFile + "_" + String.valueOf(gp.gmmEMTrainerParams.totalComponents) + JointGMMSet.DEFAULT_EXTENSION; else { gp.jointGMMFile = baseFile + "_context" + String.valueOf(contextClassificationType); for (i = 0; i < numComponents.length; i++) gp.jointGMMFile += "_" + String.valueOf(numComponents[i]); gp.jointGMMFile += JointGMMSet.DEFAULT_EXTENSION; } pa.isForcedAnalysis = false; pa.codebookHeader.ptcParams.windowSizeInSeconds = 0.040; pa.codebookHeader.ptcParams.skipSizeInSeconds = 0.005; pa.codebookHeader.ptcParams.voicingThreshold = 0.30; pa.codebookHeader.ptcParams.isDoublingCheck = false; pa.codebookHeader.ptcParams.isHalvingCheck = false; pa.codebookHeader.ptcParams.minimumF0 = 40.0f; pa.codebookHeader.ptcParams.maximumF0 = 400.0f; pa.codebookHeader.ptcParams.centerClippingRatio = 0.3; pa.codebookHeader.ptcParams.cutOff1 = pa.codebookHeader.ptcParams.minimumF0 - 20.0; pa.codebookHeader.ptcParams.cutOff2 = pa.codebookHeader.ptcParams.maximumF0 + 200.0; pa.codebookHeader.energyParams.windowSizeInSeconds = 0.020; pa.codebookHeader.energyParams.skipSizeInSeconds = 0.010; TotalStandardDeviations tsd = new TotalStandardDeviations(); tsd.lsf = 1.5; tsd.f0 = 1.0; tsd.duration = 1.0; tsd.energy = 2.0; // Gaussian outlier eliminator // Decreasing totalStandardDeviations will lead to more outlier eliminations, i.e. 
smaller codebooks pa.gaussianEliminatorParams.isActive = true; // Set to false if you do not want to use this eliminator at all pa.gaussianEliminatorParams.isCheckLsfOutliers = true; pa.gaussianEliminatorParams.isEliminateTooSimilarLsf = false; pa.gaussianEliminatorParams.isCheckF0Outliers = true; pa.gaussianEliminatorParams.isCheckDurationOutliers = true; pa.gaussianEliminatorParams.isCheckEnergyOutliers = true; pa.gaussianEliminatorParams.totalStandardDeviations = new TotalStandardDeviations(tsd); // // KMeans one-to-many and many-to-one mapping eliminator pa.kmeansEliminatorParams.isActive = false; // Set to false if you do not want to use this eliminator at all // pa.kmeansEliminatorParams.eliminationAlgorithm = KMeansMappingEliminatorParams.ELIMINATE_LEAST_LIKELY_MAPPINGS; // pa.kmeansEliminatorParams.eliminationLikelihood = 0.20; pa.kmeansEliminatorParams.eliminationAlgorithm = KMeansMappingEliminatorParams.ELIMINATE_MEAN_DISTANCE_MISMATCHES; pa.kmeansEliminatorParams.distanceType = DistanceComputer.NORMALIZED_EUCLIDEAN_DISTANCE; // pa.kmeansEliminatorParams.distanceType = DistanceComputer.EUCLIDEAN_DISTANCE; pa.kmeansEliminatorParams.isGlobalVariance = true; // pa.kmeansEliminatorParams.eliminationAlgorithm = // KMeansMappingEliminatorParams.ELIMINATE_USING_SUBCLUSTER_MEAN_DISTANCES; pa.kmeansEliminatorParams.isSeparateClustering = false; // Cluster features separately(true) or together(false)? 
// Effective only when isSeparateClustering clustering is false tsd.general = 0.1; pa.kmeansEliminatorParams.numClusters = 30; // Effective only when isSeparateClustering clustering is true tsd.lsf = 1.0; tsd.f0 = 1.0; tsd.duration = 1.0; tsd.energy = 1.0; pa.kmeansEliminatorParams.numClustersLsf = 30; pa.kmeansEliminatorParams.numClustersF0 = 50; pa.kmeansEliminatorParams.numClustersDuration = 5; pa.kmeansEliminatorParams.numClustersEnergy = 5; pa.kmeansEliminatorParams.isCheckLsfOutliers = false; pa.kmeansEliminatorParams.isCheckF0Outliers = false; pa.kmeansEliminatorParams.isCheckDurationOutliers = false; pa.kmeansEliminatorParams.isCheckEnergyOutliers = false; // pa.kmeansEliminatorParams.totalStandardDeviations = new TotalStandardDeviations(tsd); // JointGMMParallelTrainer t = new JointGMMParallelTrainer(pp, fe, pa, gp, cg); t.run(); } // Testing voice conversion as a post-processor to enhance HMM based synthesis output // The idea is to train a voice conversion function between HMM outputs and natural recordings // Then, any HMM output is to be transformed with the voice conversion function to make it closer to original recordings public static void mainHmmVoiceConversion(String[] args) throws UnsupportedAudioFileException, IOException, MaryConfigurationException { // String wavBaseFolder = "D:/Oytun/DFKI/voices/hmmVoiceConversionTest/hsmmMfcc_25Dimensional/"; // String wavBaseFolder = "D:/Oytun/DFKI/voices/hmmVoiceConversionTest/lsp_21Dimensional/"; // String wavBaseFolder = "D:/Oytun/DFKI/voices/hmmVoiceConversionTest/mellsp_21Dimensional/"; // String baseFolder = "D:/Oytun/DFKI/voices/hmmVoiceConversionTest/mfcc_25Dimensional/"; // String wavBaseFolder = "/home/oytun/"; String wavBaseFolder = "D:/Oytun/DFKI/voices/hmmVoiceConversionTest2/"; String sourceTag = "hmmSource_gv"; String targetTag = "origTarget"; String method; int numTrainingFiles = 1092; boolean isContextualGMMs = false; int contextClassificationType = ContextualGMMParams.NO_PHONEME_CLASS; 
int[] numComponents = { 128 }; // int contextClassificationType = ContextualGMMParams.SILENCE_SPEECH; int[] numComponents = {16, 128}; // int contextClassificationType = ContextualGMMParams.VOWEL_SILENCE_CONSONANT; int[] numComponents = {128, 16, 128}; // int contextClassificationType = ContextualGMMParams.PHONOLOGY_CLASS; int[] numComponents = {numMixes}; // int contextClassificationType = ContextualGMMParams.FRICATIVE_GLIDELIQUID_NASAL_PLOSIVE_VOWEL_OTHER; int[] // numComponents = {128, 128, 128, 128, 128, 16}; // int contextClassificationType = ContextualGMMParams.PHONEME_IDENTITY; int[] numComponents = {128}; BaselinePreprocessor pp = new BaselinePreprocessor(); BaselineFeatureExtractor fe = new BaselineFeatureExtractor(); WeightedCodebookTrainerParams pa = new WeightedCodebookTrainerParams(); JointGMMTrainerParams gp = new JointGMMTrainerParams(); ContextualGMMParams cg = null; int i; pa.codebookHeader.codebookType = WeightedCodebookFileHeader.FRAMES; method = "F"; // Frame-by-frame mapping of features // pa.codebookHeader.codebookType = WeightedCodebookFileHeader.FRAME_GROUPS; method = "FG"; // pa.codebookHeader.numNeighboursInFrameGroups = 3; //Mapping of frame average features (no label information but fixed // amount of neighbouring frames is used) // pa.codebookHeader.codebookType = WeightedCodebookFileHeader.LABELS; method = "L"; //Mapping of label average features // pa.codebookHeader.codebookType = WeightedCodebookFileHeader.LABEL_GROUPS; method = "LG"; // pa.codebookHeader.numNeighboursInLabelGroups = 1; //Mapping of average features collected across label groups (i.e. // vowels, consonants, etc) // pa.codebookHeader.codebookType = WeightedCodebookFileHeader.SPEECH; method = "S"; //Mapping of average features // collected across all speech parts (i.e. 
like spectral equalization) pa.codebookHeader.vocalTractFeature = BaselineFeatureExtractor.LSF_FEATURES; // Use Lsf features - full speech to speech // transformation // pa.codebookHeader.vocalTractFeature = BaselineFeatureExtractor.MFCC_FEATURES_FROM_FILES; //Use MFCC features - // currently supports only feature to featur etransformation pa.codebookHeader.sourceTag = sourceTag + method; // Source name tag (i.e. style or speaker identity) pa.codebookHeader.targetTag = targetTag + method; // Target name tag (i.e. style or speaker identity) pa.trainingBaseFolder = wavBaseFolder + "output/" + sourceTag + "2" + targetTag; // Training base directory pa.sourceTrainingFolder = wavBaseFolder + sourceTag + "/train_" + String.valueOf(numTrainingFiles) + "/"; // Source // training // folder pa.targetTrainingFolder = wavBaseFolder + targetTag + "/train_" + String.valueOf(numTrainingFiles) + "/"; // Target // training // folder pa.indexMapFileExtension = ".imf"; // Index map file extensions pa.codebookHeader.lsfParams.dimension = 0; // Auto set pa.codebookHeader.lsfParams.preCoef = 0.97f; pa.codebookHeader.lsfParams.skipsize = 0.010f; pa.codebookHeader.lsfParams.winsize = 0.020f; pa.codebookHeader.lsfParams.windowType = Window.HAMMING; // Gaussian trainer params: commenting out results in using default value for each gp.vocalTractFeature = pa.codebookHeader.vocalTractFeature; gp.isContextualGMMs = isContextualGMMs; gp.gmmEMTrainerParams.totalComponents = numComponents[0]; gp.gmmEMTrainerParams.isDiagonalCovariance = true; gp.gmmEMTrainerParams.kmeansMaxIterations = 200; gp.gmmEMTrainerParams.kmeansMinClusterChangePercent = 0.1; gp.gmmEMTrainerParams.kmeansMinSamplesInOneCluster = 50; gp.gmmEMTrainerParams.emMinIterations = 100; gp.gmmEMTrainerParams.emMaxIterations = 400; gp.gmmEMTrainerParams.isUpdateCovariances = true; gp.gmmEMTrainerParams.tinyLogLikelihoodChangePercent = 1e-5; gp.gmmEMTrainerParams.minCovarianceAllowed = 1e-4; gp.gmmEMTrainerParams.useNativeCLibTrainer = 
true; if (gp.isContextualGMMs) { GMMTrainerParams[] gmmParams = new GMMTrainerParams[numComponents.length]; for (i = 0; i < numComponents.length; i++) { gmmParams[i] = new GMMTrainerParams(gp.gmmEMTrainerParams); gmmParams[i].totalComponents = numComponents[i]; } String phoneSetFile = "D:/Mary TTS New/lib/modules/de/cap/phone-list-de.xml"; cg = getContextualGMMParams(phoneSetFile, gmmParams, contextClassificationType); } String baseFile = StringUtils.checkLastSlash(pa.trainingBaseFolder) + pa.codebookHeader.sourceTag + "_X_" + pa.codebookHeader.targetTag; // pa.codebookFile = baseFile + "_" + String.valueOf(gp.gmmEMTrainerParams.totalComponents) + // WeightedCodebookFile.DEFAULT_EXTENSION; pa.codebookFile = baseFile + "_" + String.valueOf(numTrainingFiles) + WeightedCodebookFile.DEFAULT_EXTENSION; pa.pitchMappingFile = baseFile + "_" + String.valueOf(numTrainingFiles) + PitchMappingFile.DEFAULT_EXTENSION; if (!isContextualGMMs) gp.jointGMMFile = baseFile + "_" + String.valueOf(numTrainingFiles) + "_" + String.valueOf(gp.gmmEMTrainerParams.totalComponents) + JointGMMSet.DEFAULT_EXTENSION; else { gp.jointGMMFile = baseFile + "_" + String.valueOf(numTrainingFiles) + "_context" + String.valueOf(contextClassificationType); for (i = 0; i < numComponents.length; i++) gp.jointGMMFile += "_" + String.valueOf(numComponents[i]); gp.jointGMMFile += JointGMMSet.DEFAULT_EXTENSION; } pa.isForcedAnalysis = false; pa.codebookHeader.ptcParams.windowSizeInSeconds = 0.040; pa.codebookHeader.ptcParams.skipSizeInSeconds = 0.005; pa.codebookHeader.ptcParams.voicingThreshold = 0.30; pa.codebookHeader.ptcParams.isDoublingCheck = false; pa.codebookHeader.ptcParams.isHalvingCheck = false; pa.codebookHeader.ptcParams.minimumF0 = 40.0f; pa.codebookHeader.ptcParams.maximumF0 = 400.0f; pa.codebookHeader.ptcParams.centerClippingRatio = 0.3; pa.codebookHeader.ptcParams.cutOff1 = pa.codebookHeader.ptcParams.minimumF0 - 20.0; pa.codebookHeader.ptcParams.cutOff2 = 
pa.codebookHeader.ptcParams.maximumF0 + 200.0; pa.codebookHeader.energyParams.windowSizeInSeconds = 0.020; pa.codebookHeader.energyParams.skipSizeInSeconds = 0.010; TotalStandardDeviations tsd = new TotalStandardDeviations(); tsd.lsf = 1.5; tsd.f0 = 1.0; tsd.duration = 1.0; tsd.energy = 2.0; // Gaussian outlier eliminator // Decreasing totalStandardDeviations will lead to more outlier eliminations, i.e. smaller codebooks pa.gaussianEliminatorParams.isActive = true; // Set to false if you do not want to use this eliminator at all pa.gaussianEliminatorParams.isCheckLsfOutliers = true; pa.gaussianEliminatorParams.isEliminateTooSimilarLsf = false; pa.gaussianEliminatorParams.isCheckF0Outliers = true; pa.gaussianEliminatorParams.isCheckDurationOutliers = true; pa.gaussianEliminatorParams.isCheckEnergyOutliers = true; pa.gaussianEliminatorParams.totalStandardDeviations = new TotalStandardDeviations(tsd); // // KMeans one-to-many and many-to-one mapping eliminator pa.kmeansEliminatorParams.isActive = false; // Set to false if you do not want to use this eliminator at all // pa.kmeansEliminatorParams.eliminationAlgorithm = KMeansMappingEliminatorParams.ELIMINATE_LEAST_LIKELY_MAPPINGS; // pa.kmeansEliminatorParams.eliminationLikelihood = 0.20; pa.kmeansEliminatorParams.eliminationAlgorithm = KMeansMappingEliminatorParams.ELIMINATE_MEAN_DISTANCE_MISMATCHES; pa.kmeansEliminatorParams.distanceType = DistanceComputer.NORMALIZED_EUCLIDEAN_DISTANCE; // pa.kmeansEliminatorParams.distanceType = DistanceComputer.EUCLIDEAN_DISTANCE; pa.kmeansEliminatorParams.isGlobalVariance = true; // pa.kmeansEliminatorParams.eliminationAlgorithm = // KMeansMappingEliminatorParams.ELIMINATE_USING_SUBCLUSTER_MEAN_DISTANCES; pa.kmeansEliminatorParams.isSeparateClustering = false; // Cluster features separately(true) or together(false)? 
// Effective only when isSeparateClustering clustering is false tsd.general = 0.1; pa.kmeansEliminatorParams.numClusters = 30; // Effective only when isSeparateClustering clustering is true tsd.lsf = 1.0; tsd.f0 = 1.0; tsd.duration = 1.0; tsd.energy = 1.0; pa.kmeansEliminatorParams.numClustersLsf = 30; pa.kmeansEliminatorParams.numClustersF0 = 50; pa.kmeansEliminatorParams.numClustersDuration = 5; pa.kmeansEliminatorParams.numClustersEnergy = 5; pa.kmeansEliminatorParams.isCheckLsfOutliers = false; pa.kmeansEliminatorParams.isCheckF0Outliers = false; pa.kmeansEliminatorParams.isCheckDurationOutliers = false; pa.kmeansEliminatorParams.isCheckEnergyOutliers = false; // pa.kmeansEliminatorParams.totalStandardDeviations = new TotalStandardDeviations(tsd); // JointGMMParallelTrainer t = new JointGMMParallelTrainer(pp, fe, pa, gp, cg); t.run(); } public static void mainInterspeech2008(String[] args) throws UnsupportedAudioFileException, IOException, MaryConfigurationException { String emotion = "angry"; String method = "F"; int numTrainingFiles = 200; // 2, 20, 200, 350 boolean isContextualGMMs = false; int contextClassificationType = ContextualGMMParams.NO_PHONEME_CLASS; int[] numComponents = { 40 }; // int contextClassificationType = ContextualGMMParams.SILENCE_SPEECH; int[] numComponents = {16, 128}; // int contextClassificationType = ContextualGMMParams.VOWEL_SILENCE_CONSONANT; int[] numComponents = {128, 16, 128}; // int contextClassificationType = ContextualGMMParams.PHONOLOGY_CLASS; int[] numComponents = {numMixes}; // int contextClassificationType = ContextualGMMParams.FRICATIVE_GLIDELIQUID_NASAL_PLOSIVE_VOWEL_OTHER; int[] // numComponents = {128, 128, 128, 128, 128, 16}; // int contextClassificationType = ContextualGMMParams.PHONEME_IDENTITY; int[] numComponents = {128}; mainParametric(numTrainingFiles, numComponents, isContextualGMMs, contextClassificationType, "neutral", emotion, method); /* * mainParametric(numTrainingFiles, numMixes, isContextualGMMs, 
contextClassificationType, "neutral", "angry", method); * mainParametric(numTrainingFiles, numMixes, isContextualGMMs, contextClassificationType, "neutral", "happy", method); * mainParametric(numTrainingFiles, numMixes, isContextualGMMs, contextClassificationType, "neutral", "sad", method); */ } public static void mainParametric(int numTrainingFiles, int[] numComponents, boolean isContextualGMMs, int contextClassificationType, String sourceTag, String targetTag, String method) throws UnsupportedAudioFileException, IOException, MaryConfigurationException { BaselinePreprocessor pp = new BaselinePreprocessor(); BaselineFeatureExtractor fe = new BaselineFeatureExtractor(); WeightedCodebookTrainerParams pa = new WeightedCodebookTrainerParams(); JointGMMTrainerParams gp = new JointGMMTrainerParams(); ContextualGMMParams cg = null; int i; pa.codebookHeader.codebookType = WeightedCodebookFileHeader.FRAMES; // Frame-by-frame mapping of features // pa.codebookHeader.codebookType = WeightedCodebookFileHeader.FRAME_GROUPS; pa.codebookHeader.numNeighboursInFrameGroups // = 3; //Mapping of frame average features (no label information but fixed amount of neighbouring frames is used) // pa.codebookHeader.codebookType = WeightedCodebookFileHeader.LABELS; //Mapping of label average features // pa.codebookHeader.codebookType = WeightedCodebookFileHeader.LABEL_GROUPS; pa.codebookHeader.numNeighboursInLabelGroups // = 1; //Mapping of average features collected across label groups (i.e. vowels, consonants, etc) // pa.codebookHeader.codebookType = WeightedCodebookFileHeader.SPEECH; //Mapping of average features collected across all // speech parts (i.e. like spectral equalization) pa.codebookHeader.sourceTag = sourceTag + method; // Source name tag (i.e. style or speaker identity) pa.codebookHeader.targetTag = targetTag + method; // Target name tag (i.e. 
style or speaker identity) pa.trainingBaseFolder = "D:/Oytun/DFKI/voices/Interspeech08_out2/" + sourceTag + "2" + targetTag; // Training base // directory pa.sourceTrainingFolder = "D:/Oytun/DFKI/voices/Interspeech08/" + sourceTag + "/train_" + String.valueOf(numTrainingFiles); // Source training folder pa.targetTrainingFolder = "D:/Oytun/DFKI/voices/Interspeech08/" + targetTag + "/train_" + String.valueOf(numTrainingFiles); // Target training folder pa.indexMapFileExtension = ".imf"; // Index map file extensions pa.codebookHeader.lsfParams.dimension = 0; // Auto set pa.codebookHeader.lsfParams.preCoef = 0.97f; pa.codebookHeader.lsfParams.skipsize = 0.010f; pa.codebookHeader.lsfParams.winsize = 0.020f; pa.codebookHeader.lsfParams.windowType = Window.HAMMING; // Gaussian trainer params: commenting out results in using default value for each gp.isContextualGMMs = isContextualGMMs; gp.gmmEMTrainerParams.totalComponents = numComponents[0]; gp.gmmEMTrainerParams.isDiagonalCovariance = true; gp.gmmEMTrainerParams.kmeansMaxIterations = 200; gp.gmmEMTrainerParams.kmeansMinClusterChangePercent = 0.1; gp.gmmEMTrainerParams.kmeansMinSamplesInOneCluster = 50; gp.gmmEMTrainerParams.emMinIterations = 200; gp.gmmEMTrainerParams.emMaxIterations = 2000; gp.gmmEMTrainerParams.isUpdateCovariances = true; gp.gmmEMTrainerParams.tinyLogLikelihoodChangePercent = 1e-5; gp.gmmEMTrainerParams.minCovarianceAllowed = 1e-4; gp.gmmEMTrainerParams.useNativeCLibTrainer = true; if (gp.isContextualGMMs) { GMMTrainerParams[] gmmParams = new GMMTrainerParams[numComponents.length]; for (i = 0; i < numComponents.length; i++) { gmmParams[i] = new GMMTrainerParams(gp.gmmEMTrainerParams); gmmParams[i].totalComponents = numComponents[i]; } String phoneSetFile = "D:/Mary TTS New/lib/modules/de/cap/phone-list-de.xml"; cg = getContextualGMMParams(phoneSetFile, gmmParams, contextClassificationType); } String baseFile = StringUtils.checkLastSlash(pa.trainingBaseFolder) + pa.codebookHeader.sourceTag + "_X_" + 
pa.codebookHeader.targetTag; // pa.codebookFile = baseFile + "_" + String.valueOf(gp.gmmEMTrainerParams.totalComponents) + // WeightedCodebookFile.DEFAULT_EXTENSION; pa.codebookFile = baseFile + "_" + String.valueOf(numTrainingFiles) + WeightedCodebookFile.DEFAULT_EXTENSION; pa.pitchMappingFile = baseFile + "_" + String.valueOf(numTrainingFiles) + PitchMappingFile.DEFAULT_EXTENSION; if (!isContextualGMMs) gp.jointGMMFile = baseFile + "_" + String.valueOf(numTrainingFiles) + "_" + String.valueOf(gp.gmmEMTrainerParams.totalComponents) + JointGMMSet.DEFAULT_EXTENSION; else { gp.jointGMMFile = baseFile + "_" + String.valueOf(numTrainingFiles) + "_context" + String.valueOf(contextClassificationType); for (i = 0; i < numComponents.length; i++) gp.jointGMMFile += "_" + String.valueOf(numComponents[i]); gp.jointGMMFile += JointGMMSet.DEFAULT_EXTENSION; } pa.isForcedAnalysis = false; pa.codebookHeader.ptcParams.windowSizeInSeconds = 0.040; pa.codebookHeader.ptcParams.skipSizeInSeconds = 0.005; pa.codebookHeader.ptcParams.voicingThreshold = 0.30; pa.codebookHeader.ptcParams.isDoublingCheck = false; pa.codebookHeader.ptcParams.isHalvingCheck = false; pa.codebookHeader.ptcParams.minimumF0 = 40.0f; pa.codebookHeader.ptcParams.maximumF0 = 400.0f; pa.codebookHeader.ptcParams.centerClippingRatio = 0.3; pa.codebookHeader.ptcParams.cutOff1 = pa.codebookHeader.ptcParams.minimumF0 - 20.0; pa.codebookHeader.ptcParams.cutOff2 = pa.codebookHeader.ptcParams.maximumF0 + 200.0; pa.codebookHeader.energyParams.windowSizeInSeconds = 0.020; pa.codebookHeader.energyParams.skipSizeInSeconds = 0.010; TotalStandardDeviations tsd = new TotalStandardDeviations(); tsd.lsf = 1.5; tsd.f0 = 1.0; tsd.duration = 1.0; tsd.energy = 2.0; // Gaussian outlier eliminator // Decreasing totalStandardDeviations will lead to more outlier eliminations, i.e. 
smaller codebooks pa.gaussianEliminatorParams.isActive = true; // Set to false if you do not want to use this eliminator at all pa.gaussianEliminatorParams.isCheckLsfOutliers = true; pa.gaussianEliminatorParams.isEliminateTooSimilarLsf = true; pa.gaussianEliminatorParams.isCheckF0Outliers = true; pa.gaussianEliminatorParams.isCheckDurationOutliers = true; pa.gaussianEliminatorParams.isCheckEnergyOutliers = true; pa.gaussianEliminatorParams.totalStandardDeviations = new TotalStandardDeviations(tsd); // // KMeans one-to-many and many-to-one mapping eliminator pa.kmeansEliminatorParams.isActive = true; // Set to false if you do not want to use this eliminator at all // pa.kmeansEliminatorParams.eliminationAlgorithm = KMeansMappingEliminatorParams.ELIMINATE_LEAST_LIKELY_MAPPINGS; // pa.kmeansEliminatorParams.eliminationLikelihood = 0.20; pa.kmeansEliminatorParams.eliminationAlgorithm = KMeansMappingEliminatorParams.ELIMINATE_MEAN_DISTANCE_MISMATCHES; pa.kmeansEliminatorParams.distanceType = DistanceComputer.NORMALIZED_EUCLIDEAN_DISTANCE; // pa.kmeansEliminatorParams.distanceType = DistanceComputer.EUCLIDEAN_DISTANCE; pa.kmeansEliminatorParams.isGlobalVariance = true; // pa.kmeansEliminatorParams.eliminationAlgorithm = // KMeansMappingEliminatorParams.ELIMINATE_USING_SUBCLUSTER_MEAN_DISTANCES; pa.kmeansEliminatorParams.isSeparateClustering = false; // Cluster features separately(true) or together(false)? 
// Effective only when isSeparateClustering clustering is false
        tsd.general = 0.1;
        pa.kmeansEliminatorParams.numClusters = 30;

        // Effective only when isSeparateClustering clustering is true
        tsd.lsf = 1.0;
        tsd.f0 = 1.0;
        tsd.duration = 1.0;
        tsd.energy = 1.0;
        pa.kmeansEliminatorParams.numClustersLsf = 30;
        pa.kmeansEliminatorParams.numClustersF0 = 50;
        pa.kmeansEliminatorParams.numClustersDuration = 5;
        pa.kmeansEliminatorParams.numClustersEnergy = 5;

        pa.kmeansEliminatorParams.isCheckLsfOutliers = true;
        pa.kmeansEliminatorParams.isCheckF0Outliers = false;
        pa.kmeansEliminatorParams.isCheckDurationOutliers = false;
        pa.kmeansEliminatorParams.isCheckEnergyOutliers = false;
        //

        pa.labelsToExcludeFromTraining = new String[1];
        pa.labelsToExcludeFromTraining[0] = "_";

        pa.kmeansEliminatorParams.totalStandardDeviations = new TotalStandardDeviations(tsd);
        //

        JointGMMParallelTrainer t = new JointGMMParallelTrainer(pp, fe, pa, gp, cg);

        t.run();
    }

    /**
     * Quick test of voice conversion as a post-processor to enhance HMM based synthesis output.
     * <p>
     * The idea is to train a voice conversion function between HMM outputs and natural recordings. Any HMM output can then be
     * transformed with that function to bring it closer to the original recordings.
     * <p>
     * Trains with 50 training files per speaker below the hard-coded base folder {@code D:/quickTest/}.
     *
     * @param args
     *            command line arguments; not read by this method
     * @throws UnsupportedAudioFileException
     *             declared for callers; may be raised while the training audio is processed
     * @throws IOException
     *             declared for callers; may be raised while training files are read or written
     * @throws MaryConfigurationException
     *             if the phone set needed for contextual GMMs cannot be loaded
     */
    public static void mainQuickTest(String[] args) throws UnsupportedAudioFileException, IOException,
            MaryConfigurationException {
        runQuickTest("D:/quickTest/", 50, "D:/Mary TTS New/lib/modules/de/cap/phone-list-de.xml");
    }

    /**
     * Trains joint GMMs for the "Neutral-Spike-Conversion" data set with 99 training files and 10 mixture components.
     * <p>
     * Input: the two folders {@code <base>/source/train_99/} and {@code <base>/target/train_99/}, each holding the
     * {@code .wav} and {@code .lab} files, where {@code <base>} is
     * {@code /project/mary/marcela/VoiceConversion/Neutral-Spike-Conversion/}. According to the original notes, the
     * {@code .lsf}, {@code .ptc} and {@code .ene} analysis files are computed inside these folders.
     * <p>
     * Output: {@code <base>/output/source2target/sourceF_X_targetF_99_10.jgs}, i.e. numTrainingFiles = 99 and
     * numComponents = 10 (the extension is {@code JointGMMSet.DEFAULT_EXTENSION}; {@code .jgs} according to the original
     * notes).
     *
     * @param args
     *            command line arguments; not read by this method
     * @throws UnsupportedAudioFileException
     *             declared for callers; may be raised while the training audio is processed
     * @throws IOException
     *             declared for callers; may be raised while training files are read or written
     * @throws MaryConfigurationException
     *             if the phone set needed for contextual GMMs cannot be loaded
     */
    public static void mainQuickTest2(String[] args) throws UnsupportedAudioFileException, IOException,
            MaryConfigurationException {
        runQuickTest("/project/mary/marcela/VoiceConversion/Neutral-Spike-Conversion/", 99,
                "/project/mary/marcela/openmary/lib/modules/de/cap/phone-list-de.xml");
    }

    /**
     * Shared body of the quick-test entry points: fills in all trainer parameters and runs a {@link JointGMMParallelTrainer}.
     * <p>
     * The entry points differ only in the three values passed in here; every other setting is identical between them.
     *
     * @param wavBaseFolder
     *            base folder (with trailing slash) that contains the {@code source/} and {@code target/} training folders and
     *            receives the {@code output/} folder
     * @param numTrainingFiles
     *            number of training files; used in the {@code train_<n>} folder names and in the output file names
     * @param phoneSetFile
     *            allophone set XML file; only read when contextual GMMs are enabled
     * @throws UnsupportedAudioFileException
     *             declared so that the public entry points keep their original contract
     * @throws IOException
     *             declared so that the public entry points keep their original contract
     * @throws MaryConfigurationException
     *             if the phone set cannot be loaded
     */
    private static void runQuickTest(String wavBaseFolder, int numTrainingFiles, String phoneSetFile)
            throws UnsupportedAudioFileException, IOException, MaryConfigurationException {
        String sourceTag = "source";
        String targetTag = "target";
        String method;
        String fileCount = String.valueOf(numTrainingFiles);

        // Context setup. To train contextual GMMs set isContextualGMMs to true and pick one of the alternatives
        // (one component count per context class), e.g.
        //   ContextualGMMParams.SILENCE_SPEECH -> {16, 128}
        //   ContextualGMMParams.VOWEL_SILENCE_CONSONANT -> {128, 16, 128}
        //   ContextualGMMParams.PHONOLOGY_CLASS -> {numMixes}
        //   ContextualGMMParams.FRICATIVE_GLIDELIQUID_NASAL_PLOSIVE_VOWEL_OTHER -> {128, 128, 128, 128, 128, 16}
        //   ContextualGMMParams.PHONEME_IDENTITY -> {128}
        boolean isContextualGMMs = false;
        int contextClassificationType = ContextualGMMParams.NO_PHONEME_CLASS;
        int[] numComponents = { 10 };

        BaselinePreprocessor preprocessor = new BaselinePreprocessor();
        BaselineFeatureExtractor featureExtractor = new BaselineFeatureExtractor();
        WeightedCodebookTrainerParams wcParams = new WeightedCodebookTrainerParams();
        JointGMMTrainerParams jgmmParams = new JointGMMTrainerParams();
        ContextualGMMParams contextParams = null;

        // Frame-by-frame mapping of features. Other codebook types and their method tags:
        //   FRAME_GROUPS ("FG", with numNeighboursInFrameGroups = 3): frame average features, no label information but a
        //   fixed amount of neighbouring frames
        //   LABELS ("L"): label average features
        //   LABEL_GROUPS ("LG", with numNeighboursInLabelGroups = 1): average features across label groups (vowels,
        //   consonants, etc)
        //   SPEECH ("S"): average features across all speech parts (like spectral equalization)
        wcParams.codebookHeader.codebookType = WeightedCodebookFileHeader.FRAMES;
        method = "F";

        // LSF features: full speech to speech transformation.
        // (BaselineFeatureExtractor.MFCC_FEATURES_FROM_FILES currently supports only feature to feature transformation.)
        wcParams.codebookHeader.vocalTractFeature = BaselineFeatureExtractor.LSF_FEATURES;

        // Name tags (i.e. style or speaker identity)
        wcParams.codebookHeader.sourceTag = sourceTag + method;
        wcParams.codebookHeader.targetTag = targetTag + method;

        // Training base directory and the source / target training folders
        wcParams.trainingBaseFolder = wavBaseFolder + "output/" + sourceTag + "2" + targetTag;
        wcParams.sourceTrainingFolder = wavBaseFolder + sourceTag + "/train_" + fileCount + "/";
        wcParams.targetTrainingFolder = wavBaseFolder + targetTag + "/train_" + fileCount + "/";

        wcParams.indexMapFileExtension = ".imf"; // Index map file extension

        wcParams.codebookHeader.lsfParams.dimension = 0; // Auto set
        wcParams.codebookHeader.lsfParams.preCoef = 0.97f;
        wcParams.codebookHeader.lsfParams.skipsize = 0.010f;
        wcParams.codebookHeader.lsfParams.winsize = 0.020f;
        wcParams.codebookHeader.lsfParams.windowType = Window.HAMMING;

        // Gaussian trainer params: commenting one out results in using its default value
        jgmmParams.vocalTractFeature = wcParams.codebookHeader.vocalTractFeature;
        jgmmParams.isContextualGMMs = isContextualGMMs;
        jgmmParams.gmmEMTrainerParams.totalComponents = numComponents[0];
        jgmmParams.gmmEMTrainerParams.isDiagonalCovariance = true;
        jgmmParams.gmmEMTrainerParams.kmeansMaxIterations = 200;
        jgmmParams.gmmEMTrainerParams.kmeansMinClusterChangePercent = 0.1;
        jgmmParams.gmmEMTrainerParams.kmeansMinSamplesInOneCluster = 50;
        jgmmParams.gmmEMTrainerParams.emMinIterations = 100;
        jgmmParams.gmmEMTrainerParams.emMaxIterations = 400;
        jgmmParams.gmmEMTrainerParams.isUpdateCovariances = true;
        jgmmParams.gmmEMTrainerParams.tinyLogLikelihoodChangePercent = 1e-5;
        jgmmParams.gmmEMTrainerParams.minCovarianceAllowed = 1e-4;
        jgmmParams.gmmEMTrainerParams.useNativeCLibTrainer = true;

        if (jgmmParams.isContextualGMMs) {
            // One copy of the EM settings per context class, differing only in the number of components
            GMMTrainerParams[] perClassParams = new GMMTrainerParams[numComponents.length];
            for (int c = 0; c < numComponents.length; c++) {
                perClassParams[c] = new GMMTrainerParams(jgmmParams.gmmEMTrainerParams);
                perClassParams[c].totalComponents = numComponents[c];
            }

            contextParams = getContextualGMMParams(phoneSetFile, perClassParams, contextClassificationType);
        }

        // Output file names all share <trainingBaseFolder>/<sourceTag>_X_<targetTag>_<numTrainingFiles>
        String baseFile = StringUtils.checkLastSlash(wcParams.trainingBaseFolder) + wcParams.codebookHeader.sourceTag + "_X_"
                + wcParams.codebookHeader.targetTag;
        wcParams.codebookFile = baseFile + "_" + fileCount + WeightedCodebookFile.DEFAULT_EXTENSION;
        wcParams.pitchMappingFile = baseFile + "_" + fileCount + PitchMappingFile.DEFAULT_EXTENSION;

        if (!isContextualGMMs) {
            jgmmParams.jointGMMFile = baseFile + "_" + fileCount + "_"
                    + String.valueOf(jgmmParams.gmmEMTrainerParams.totalComponents) + JointGMMSet.DEFAULT_EXTENSION;
        } else {
            // Contextual case: append the classification type and every per-class component count
            StringBuilder gmmFileName = new StringBuilder(baseFile + "_" + fileCount + "_context"
                    + String.valueOf(contextClassificationType));
            for (int componentCount : numComponents) {
                gmmFileName.append("_").append(componentCount);
            }
            gmmFileName.append(JointGMMSet.DEFAULT_EXTENSION);
            jgmmParams.jointGMMFile = gmmFileName.toString();
        }

        wcParams.isForcedAnalysis = false;

        // Pitch (ptc) analysis settings
        wcParams.codebookHeader.ptcParams.windowSizeInSeconds = 0.040;
        wcParams.codebookHeader.ptcParams.skipSizeInSeconds = 0.005;
        wcParams.codebookHeader.ptcParams.voicingThreshold = 0.30;
        wcParams.codebookHeader.ptcParams.isDoublingCheck = false;
        wcParams.codebookHeader.ptcParams.isHalvingCheck = false;
        wcParams.codebookHeader.ptcParams.minimumF0 = 40.0f;
        wcParams.codebookHeader.ptcParams.maximumF0 = 400.0f;
        wcParams.codebookHeader.ptcParams.centerClippingRatio = 0.3;
        // Cut-offs are derived from the F0 range set just above
        wcParams.codebookHeader.ptcParams.cutOff1 = wcParams.codebookHeader.ptcParams.minimumF0 - 20.0;
        wcParams.codebookHeader.ptcParams.cutOff2 = wcParams.codebookHeader.ptcParams.maximumF0 + 200.0;

        // Energy analysis settings
        wcParams.codebookHeader.energyParams.windowSizeInSeconds = 0.020;
        wcParams.codebookHeader.energyParams.skipSizeInSeconds = 0.010;

        TotalStandardDeviations deviations = new TotalStandardDeviations();
        deviations.lsf = 1.5;
        deviations.f0 = 1.0;
        deviations.duration = 1.0;
        deviations.energy = 2.0;

        // Gaussian outlier eliminator
        // Decreasing totalStandardDeviations will lead to more outlier eliminations, i.e. smaller codebooks
        wcParams.gaussianEliminatorParams.isActive = true; // Set to false if you do not want to use this eliminator at all
        wcParams.gaussianEliminatorParams.isCheckLsfOutliers = true;
        wcParams.gaussianEliminatorParams.isEliminateTooSimilarLsf = false;
        wcParams.gaussianEliminatorParams.isCheckF0Outliers = true;
        wcParams.gaussianEliminatorParams.isCheckDurationOutliers = true;
        wcParams.gaussianEliminatorParams.isCheckEnergyOutliers = true;
        // Copy taken here, before the values are changed for the k-means eliminator below
        wcParams.gaussianEliminatorParams.totalStandardDeviations = new TotalStandardDeviations(deviations);

        // KMeans one-to-many and many-to-one mapping eliminator
        // Other algorithms: ELIMINATE_LEAST_LIKELY_MAPPINGS (with eliminationLikelihood = 0.20),
        // ELIMINATE_USING_SUBCLUSTER_MEAN_DISTANCES; other distance type: DistanceComputer.EUCLIDEAN_DISTANCE
        wcParams.kmeansEliminatorParams.isActive = false; // Set to false if you do not want to use this eliminator at all
        wcParams.kmeansEliminatorParams.eliminationAlgorithm = KMeansMappingEliminatorParams.ELIMINATE_MEAN_DISTANCE_MISMATCHES;
        wcParams.kmeansEliminatorParams.distanceType = DistanceComputer.NORMALIZED_EUCLIDEAN_DISTANCE;
        wcParams.kmeansEliminatorParams.isGlobalVariance = true;
        wcParams.kmeansEliminatorParams.isSeparateClustering = false; // Cluster features separately(true) or together(false)?

        // Effective only when isSeparateClustering is false
        deviations.general = 0.1;
        wcParams.kmeansEliminatorParams.numClusters = 30;

        // Effective only when isSeparateClustering is true
        deviations.lsf = 1.0;
        deviations.f0 = 1.0;
        deviations.duration = 1.0;
        deviations.energy = 1.0;
        wcParams.kmeansEliminatorParams.numClustersLsf = 30;
        wcParams.kmeansEliminatorParams.numClustersF0 = 50;
        wcParams.kmeansEliminatorParams.numClustersDuration = 5;
        wcParams.kmeansEliminatorParams.numClustersEnergy = 5;

        wcParams.kmeansEliminatorParams.isCheckLsfOutliers = false;
        wcParams.kmeansEliminatorParams.isCheckF0Outliers = false;
        wcParams.kmeansEliminatorParams.isCheckDurationOutliers = false;
        wcParams.kmeansEliminatorParams.isCheckEnergyOutliers = false;

        wcParams.kmeansEliminatorParams.totalStandardDeviations = new TotalStandardDeviations(deviations);

        wcParams.labelsToExcludeFromTraining = new String[1];
        wcParams.labelsToExcludeFromTraining[0] = "_";

        JointGMMParallelTrainer trainer = new JointGMMParallelTrainer(preprocessor, featureExtractor, wcParams, jgmmParams,
                contextParams);
        trainer.run();
    }

    /**
     * Builds the contextual GMM parameters from a phone set file.
     *
     * @param phoneSetFile
     *            allophone set file handed to {@link AllophoneSet#getAllophoneSet(String)}
     * @param params
     *            GMM trainer parameters passed on to the {@link ContextualGMMParams} constructor
     * @param contextClassificationType
     *            context classification type, one of the constants in {@link ContextualGMMParams}
     * @return the newly created contextual GMM parameters
     * @throws MaryConfigurationException
     *             if the allophone set cannot be obtained
     */
    public static ContextualGMMParams getContextualGMMParams(String phoneSetFile, GMMTrainerParams[] params,
            int contextClassificationType) throws MaryConfigurationException {
        AllophoneSet allophoneSet = AllophoneSet.getAllophoneSet(phoneSetFile);
        assert allophoneSet != null;

        return new ContextualGMMParams(allophoneSet, params, contextClassificationType);
    }
}