/** * Copyright 2007 DFKI GmbH. * All Rights Reserved. Use is subject to license terms. * * This file is part of MARY TTS. * * MARY TTS is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, version 3 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package marytts.signalproc.adaptation.codebook; import java.io.IOException; import java.util.Arrays; import marytts.signalproc.adaptation.BaselineAdaptationSet; import marytts.signalproc.adaptation.Context; import marytts.signalproc.adaptation.IndexMap; import marytts.signalproc.analysis.EnergyContourRms; import marytts.signalproc.analysis.Labels; import marytts.signalproc.analysis.MfccFileHeader; import marytts.signalproc.analysis.Mfccs; import marytts.signalproc.analysis.PitchReaderWriter; import marytts.util.math.MathUtils; import marytts.util.signal.SignalProcUtils; /** * * Implements mapping functionality of MFCCs between source and target * * @author Oytun Türk */ public class WeightedCodebookMfccMapper extends WeightedCodebookFeatureMapper { private WeightedCodebookTrainerParams params; public WeightedCodebookMfccMapper(WeightedCodebookTrainerParams pa) { params = new WeightedCodebookTrainerParams(pa); } public void learnMappingFrames(WeightedCodebookFile codebookFile, WeightedCodebookFeatureCollection fcol, BaselineAdaptationSet sourceTrainingSet, BaselineAdaptationSet targetTrainingSet, int[] map) throws IOException { assert params.codebookHeader.codebookType == WeightedCodebookFileHeader.FRAMES; IndexMap imap = new IndexMap(); int i, j, index; WeightedCodebookEntry entry = null; boolean bHeaderWritten = false; // Take directly the corresponding source-target frame vocal tract feature vectors and write them as a new entry for (i = 0; i < fcol.indexMapFiles.length; i++) { System.out.println("MFCC mapping for pair " + String.valueOf(i + 1) + " of " + String.valueOf(fcol.indexMapFiles.length) + ":"); try { imap.readFromFile(fcol.indexMapFiles[i]); // imap keeps information about a single source-target pair only } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } if (imap.files != null && sourceTrainingSet.items.length > i && targetTrainingSet.items.length > i) { // Mfccs Mfccs srcFeatures = new Mfccs(sourceTrainingSet.items[i].mfccFile); Mfccs tgtFeatures = new Mfccs(targetTrainingSet.items[map[i]].mfccFile); // // Pitch: for outlier elimination not prosody modeling! PitchReaderWriter sourceF0s = new PitchReaderWriter(sourceTrainingSet.items[i].pitchFile); PitchReaderWriter targetF0s = new PitchReaderWriter(targetTrainingSet.items[map[i]].pitchFile); // // Duration Labels sourceLabels = new Labels(sourceTrainingSet.items[i].labelFile); Labels targetLabels = new Labels(targetTrainingSet.items[map[i]].labelFile); // // Energy EnergyContourRms sourceEnergies = EnergyContourRms.ReadEnergyFile(sourceTrainingSet.items[i].energyFile); EnergyContourRms targetEnergies = EnergyContourRms.ReadEnergyFile(targetTrainingSet.items[map[i]].energyFile); // if (!bHeaderWritten) { params.codebookHeader.mfccParams.dimension = ((MfccFileHeader) (srcFeatures.params)).dimension; params.codebookHeader.mfccParams.samplingRate = ((MfccFileHeader) (srcFeatures.params)).samplingRate; codebookFile.writeCodebookHeader(params.codebookHeader); bHeaderWritten = true; } if (srcFeatures.mfccs != null) { for (j = 0; j < imap.files[0].indicesMap.length; j++) // j is the index for labels { if (srcFeatures.mfccs.length > imap.files[0].indicesMap[j][0] && tgtFeatures.mfccs.length > imap.files[0].indicesMap[j][1]) { // Write to codebook file entry = new WeightedCodebookEntry(0, ((MfccFileHeader) (srcFeatures.params)).dimension); entry.setMfccs(srcFeatures.mfccs[imap.files[0].indicesMap[j][0]], tgtFeatures.mfccs[imap.files[0].indicesMap[j][1]]); // Pitch index = MathUtils.linearMap(imap.files[0].indicesMap[j][0], 0, srcFeatures.mfccs.length - 1, 0, sourceF0s.contour.length - 1); entry.sourceItem.f0 = sourceF0s.contour[index]; index = MathUtils.linearMap(imap.files[0].indicesMap[j][1], 0, tgtFeatures.mfccs.length - 1, 0, targetF0s.contour.length - 1); entry.targetItem.f0 = targetF0s.contour[index]; // // Duration & Phone index = SignalProcUtils.frameIndex2LabelIndex(imap.files[0].indicesMap[j][0], sourceLabels, ((MfccFileHeader) (srcFeatures.params)).winsize, ((MfccFileHeader) (srcFeatures.params)).skipsize); if (index > 0) entry.sourceItem.duration = sourceLabels.items[index].time - sourceLabels.items[index - 1].time; else entry.sourceItem.duration = sourceLabels.items[index].time; entry.sourceItem.phn = sourceLabels.items[index].phn; entry.sourceItem.context = new Context(sourceLabels, index, WeightedCodebookTrainerParams.MAXIMUM_CONTEXT); index = SignalProcUtils.frameIndex2LabelIndex(imap.files[0].indicesMap[j][1], targetLabels, ((MfccFileHeader) (tgtFeatures.params)).winsize, ((MfccFileHeader) (tgtFeatures.params)).skipsize); if (index > 0) entry.targetItem.duration = targetLabels.items[index].time - targetLabels.items[index - 1].time; else entry.targetItem.duration = targetLabels.items[index].time; entry.targetItem.phn = targetLabels.items[index].phn; entry.targetItem.context = new Context(targetLabels, index, WeightedCodebookTrainerParams.MAXIMUM_CONTEXT); // // Energy index = MathUtils.linearMap(imap.files[0].indicesMap[j][0], 0, srcFeatures.mfccs.length - 1, 0, sourceEnergies.contour.length - 1); index = MathUtils.CheckLimits(index, 0, sourceEnergies.contour.length - 1); entry.sourceItem.energy = sourceEnergies.contour[index]; index = MathUtils.linearMap(imap.files[0].indicesMap[j][1], 0, tgtFeatures.mfccs.length - 1, 0, targetEnergies.contour.length - 1); index = MathUtils.CheckLimits(index, 0, targetEnergies.contour.length - 1); entry.targetItem.energy = targetEnergies.contour[index]; // if ((entry.sourceItem.f0 > 10.0 && entry.targetItem.f0 > 10.0) || (entry.sourceItem.f0 <= 10.0 && entry.targetItem.f0 <= 10.0)) codebookFile.writeEntry(entry); // } } System.out.println("Frame pairs processed in file " + String.valueOf(i + 1) + " of " + String.valueOf(fcol.indexMapFiles.length)); } } } } public void learnMappingFrameGroups(WeightedCodebookFile codebookFile, WeightedCodebookFeatureCollection fcol, BaselineAdaptationSet sourceTrainingSet, BaselineAdaptationSet targetTrainingSet, int[] map) throws IOException { assert params.codebookHeader.codebookType == WeightedCodebookFileHeader.FRAME_GROUPS; IndexMap imap = new IndexMap(); int i, j, k, n, totalFrames, index; double[] meanSourceEntries = null; double[] meanTargetEntries = null; double sourceAverageF0; double targetAverageF0; double sourceAverageDuration; double targetAverageDuration; double sourceAverageEnergy; double targetAverageEnergy; int sourceTotalVoiceds; int targetTotalVoiceds; int sourceTotal; int targetTotal; String sourcePhn = ""; String targetPhn = ""; Context sourceContext = null; Context targetContext = null; int middle; boolean bSourceOK = false; boolean bTargetOK = false; WeightedCodebookEntry entry = null; boolean bHeaderWritten = false; // Average neighbouring frame lsfs to obtain a smoother estimate of the source and target LSF vectors and write the // averaged versions as a new entry for (i = 0; i < fcol.indexMapFiles.length; i++) { System.out.println("LSF mapping for pair " + String.valueOf(i + 1) + " of " + String.valueOf(fcol.indexMapFiles.length) + ":"); try { imap.readFromFile(fcol.indexMapFiles[i]); // imap keeps information about a single source-target pair only } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } if (imap.files != null && sourceTrainingSet.items.length > i && targetTrainingSet.items.length > i) { // Mfccs Mfccs srcFeatures = new Mfccs(sourceTrainingSet.items[i].mfccFile); Mfccs tgtFeatures = new Mfccs(targetTrainingSet.items[map[i]].mfccFile); // // Pitch: for outlier elimination not prosody modeling! PitchReaderWriter sourceF0s = new PitchReaderWriter(sourceTrainingSet.items[i].pitchFile); PitchReaderWriter targetF0s = new PitchReaderWriter(targetTrainingSet.items[map[i]].pitchFile); // // Duration Labels sourceLabels = new Labels(sourceTrainingSet.items[i].labelFile); Labels targetLabels = new Labels(targetTrainingSet.items[map[i]].labelFile); // // Energy EnergyContourRms sourceEnergies = EnergyContourRms.ReadEnergyFile(sourceTrainingSet.items[i].energyFile); EnergyContourRms targetEnergies = EnergyContourRms.ReadEnergyFile(targetTrainingSet.items[map[i]].energyFile); // if (!bHeaderWritten) { params.codebookHeader.mfccParams.dimension = ((MfccFileHeader) (srcFeatures.params)).dimension; params.codebookHeader.mfccParams.samplingRate = ((MfccFileHeader) (srcFeatures.params)).samplingRate; codebookFile.writeCodebookHeader(params.codebookHeader); bHeaderWritten = true; } if (i == 0) { meanSourceEntries = new double[((MfccFileHeader) (srcFeatures.params)).dimension]; meanTargetEntries = new double[((MfccFileHeader) (tgtFeatures.params)).dimension]; } else { if (meanSourceEntries.length != ((MfccFileHeader) (srcFeatures.params)).dimension) { System.out.println("Error! LSF vector size mismatch in source lsf file " + sourceTrainingSet.items[i].mfccFile); return; } if (meanTargetEntries.length != ((MfccFileHeader) (tgtFeatures.params)).dimension) { System.out.println("Error! LSF vector size mismatch in target lsf file " + targetTrainingSet.items[map[i]].mfccFile); return; } } if (srcFeatures.mfccs != null && tgtFeatures.mfccs != null) { for (j = 0; j < imap.files[0].indicesMap.length; j++) // j is the index for labels { Arrays.fill(meanSourceEntries, 0.0); Arrays.fill(meanTargetEntries, 0.0); sourceAverageF0 = 0.0; targetAverageF0 = 0.0; sourceAverageDuration = 0.0; targetAverageDuration = 0.0; sourceAverageEnergy = 0.0; targetAverageEnergy = 0.0; sourceTotalVoiceds = 0; targetTotalVoiceds = 0; sourceTotal = 0; targetTotal = 0; totalFrames = 0; bSourceOK = false; middle = (int) Math.floor(0.5 * (imap.files[0].indicesMap[j][0] + imap.files[0].indicesMap[j][1]) + 0.5); for (k = imap.files[0].indicesMap[j][0]; k <= imap.files[0].indicesMap[j][1]; k++) { if (k >= 0 && k < srcFeatures.mfccs.length) { totalFrames++; bSourceOK = true; for (n = 0; n < ((MfccFileHeader) (srcFeatures.params)).dimension; n++) meanSourceEntries[n] += srcFeatures.mfccs[k][n]; // Pitch index = MathUtils.linearMap(k, 0, srcFeatures.mfccs.length - 1, 0, sourceF0s.contour.length - 1); if (sourceF0s.contour[index] > 10.0) { sourceAverageF0 += sourceF0s.contour[index]; sourceTotalVoiceds++; } // // Duration index = SignalProcUtils.frameIndex2LabelIndex(k, sourceLabels, ((MfccFileHeader) (srcFeatures.params)).winsize, ((MfccFileHeader) (srcFeatures.params)).skipsize); if (index > 0) sourceAverageDuration += sourceLabels.items[index].time - sourceLabels.items[index - 1].time; else sourceAverageDuration += sourceLabels.items[index].time; // // Phone: Middle frames phonetic identity if (k == middle) { sourcePhn = sourceLabels.items[index].phn; sourceContext = new Context(sourceLabels, index, WeightedCodebookTrainerParams.MAXIMUM_CONTEXT); } // // Energy index = MathUtils.linearMap(k, 0, srcFeatures.mfccs.length - 1, 0, sourceEnergies.contour.length - 1); index = MathUtils.CheckLimits(index, 0, sourceEnergies.contour.length - 1); sourceAverageEnergy += sourceEnergies.contour[index]; // sourceTotal++; } } if (bSourceOK) { for (n = 0; n < ((MfccFileHeader) (srcFeatures.params)).dimension; n++) meanSourceEntries[n] /= totalFrames; totalFrames = 0; bTargetOK = false; middle = (int) Math .floor(0.5 * (imap.files[0].indicesMap[j][2] + imap.files[0].indicesMap[j][3]) + 0.5); for (k = imap.files[0].indicesMap[j][2]; k <= imap.files[0].indicesMap[j][3]; k++) { if (k >= 0 && k < tgtFeatures.mfccs.length) { totalFrames++; bTargetOK = true; for (n = 0; n < ((MfccFileHeader) (tgtFeatures.params)).dimension; n++) meanTargetEntries[n] += tgtFeatures.mfccs[k][n]; // Pitch index = MathUtils.linearMap(k, 0, tgtFeatures.mfccs.length - 1, 0, targetF0s.contour.length - 1); if (targetF0s.contour[index] > 10.0) { targetAverageF0 += targetF0s.contour[index]; targetTotalVoiceds++; } // // Duration index = SignalProcUtils.frameIndex2LabelIndex(k, targetLabels, ((MfccFileHeader) (tgtFeatures.params)).winsize, ((MfccFileHeader) (tgtFeatures.params)).skipsize); if (index > 0) targetAverageDuration += targetLabels.items[index].time - targetLabels.items[index - 1].time; else targetAverageDuration += targetLabels.items[index].time; // // Phone: Middle frames phonetic identity if (k == middle) { targetPhn = targetLabels.items[index].phn; targetContext = new Context(targetLabels, index, WeightedCodebookTrainerParams.MAXIMUM_CONTEXT); } // // Energy index = MathUtils.linearMap(k, 0, tgtFeatures.mfccs.length - 1, 0, targetEnergies.contour.length - 1); index = MathUtils.CheckLimits(index, 0, targetEnergies.contour.length - 1); targetAverageEnergy += targetEnergies.contour[index]; // targetTotal++; } } if (bTargetOK) { for (n = 0; n < ((MfccFileHeader) (tgtFeatures.params)).dimension; n++) meanTargetEntries[n] /= totalFrames; // Write to codebook file entry = new WeightedCodebookEntry(0, meanSourceEntries.length); entry.setMfccs(meanSourceEntries, meanTargetEntries); // Pitch if (sourceTotalVoiceds > 0) sourceAverageF0 /= sourceTotalVoiceds; if (targetTotalVoiceds > 0) targetAverageF0 /= targetTotalVoiceds; entry.sourceItem.f0 = sourceAverageF0; entry.targetItem.f0 = targetAverageF0; // // Duration if (sourceTotal > 0) sourceAverageDuration /= sourceTotal; if (targetTotal > 0) sourceAverageDuration /= targetTotal; entry.sourceItem.duration = sourceAverageDuration; entry.targetItem.duration = targetAverageDuration; // // Phone entry.sourceItem.phn = sourcePhn; entry.targetItem.phn = targetPhn; entry.sourceItem.context = new Context(sourceContext); entry.targetItem.context = new Context(targetContext); // // Energy if (sourceTotal > 0) sourceAverageEnergy /= sourceTotal; if (targetTotal > 0) targetAverageEnergy /= targetTotal; entry.sourceItem.energy = sourceAverageEnergy; entry.targetItem.energy = targetAverageEnergy; // if ((entry.sourceItem.f0 > 10.0 && entry.targetItem.f0 > 10.0) || (entry.sourceItem.f0 <= 10.0 && entry.targetItem.f0 <= 10.0)) codebookFile.writeEntry(entry); // } } } System.out.println("Frame pairs processed in file " + String.valueOf(i + 1) + " of " + String.valueOf(fcol.indexMapFiles.length)); } } } } public void learnMappingLabels(WeightedCodebookFile codebookFile, WeightedCodebookFeatureCollection fcol, BaselineAdaptationSet sourceTrainingSet, BaselineAdaptationSet targetTrainingSet, int[] map) throws IOException { assert params.codebookHeader.codebookType == WeightedCodebookFileHeader.LABELS; IndexMap imap = new IndexMap(); int i, j, k, n, totalFrames, index; boolean bSourceOK = false; boolean bTargetOK = false; double[] meanSourceEntries = null; double[] meanTargetEntries = null; double sourceAverageF0; double targetAverageF0; double sourceAverageDuration; double targetAverageDuration; double sourceAverageEnergy; double targetAverageEnergy; int sourceTotalVoiceds; int targetTotalVoiceds; int sourceTotal; int targetTotal; String sourcePhn = ""; String targetPhn = ""; Context sourceContext = null; Context targetContext = null; int middle; WeightedCodebookEntry entry = null; boolean bHeaderWritten = false; // Take an average of LSF vectors within each label pair and write the resulting vector as the state // average for source and target // To do: Weighting of vectors within each label according to some criteria // on how typical they represent the current phone. // This can be implemented by looking at some distance measure (eucledian, mahalonoibis, LSF, etc) // to the cluster mean (i.e. mean of all LSF vectors for this phone), for example. for (i = 0; i < fcol.indexMapFiles.length; i++) { System.out.println("LSF mapping for pair " + String.valueOf(i + 1) + " of " + String.valueOf(fcol.indexMapFiles.length) + ":"); try { imap.readFromFile(fcol.indexMapFiles[i]); // imap keeps information about a single source-target pair only } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } if (imap.files != null && sourceTrainingSet.items.length > i && targetTrainingSet.items.length > i) { Mfccs srcFeatures = new Mfccs(sourceTrainingSet.items[i].mfccFile); Mfccs tgtFeatures = new Mfccs(targetTrainingSet.items[map[i]].mfccFile); // Pitch: for outlier elimination not prosody modeling! PitchReaderWriter sourceF0s = new PitchReaderWriter(sourceTrainingSet.items[i].pitchFile); PitchReaderWriter targetF0s = new PitchReaderWriter(targetTrainingSet.items[map[i]].pitchFile); // // Duration Labels sourceLabels = new Labels(sourceTrainingSet.items[i].labelFile); Labels targetLabels = new Labels(targetTrainingSet.items[map[i]].labelFile); // // Energy EnergyContourRms sourceEnergies = EnergyContourRms.ReadEnergyFile(sourceTrainingSet.items[i].energyFile); EnergyContourRms targetEnergies = EnergyContourRms.ReadEnergyFile(targetTrainingSet.items[map[i]].energyFile); // if (!bHeaderWritten) { params.codebookHeader.mfccParams.dimension = ((MfccFileHeader) (srcFeatures.params)).dimension; params.codebookHeader.mfccParams.samplingRate = ((MfccFileHeader) (srcFeatures.params)).samplingRate; codebookFile.writeCodebookHeader(params.codebookHeader); bHeaderWritten = true; } if (i == 0) { meanSourceEntries = new double[((MfccFileHeader) (srcFeatures.params)).dimension]; meanTargetEntries = new double[((MfccFileHeader) (tgtFeatures.params)).dimension]; } else { if (meanSourceEntries.length != ((MfccFileHeader) (srcFeatures.params)).dimension) { System.out.println("Error! LSF vector size mismatch in source lsf file " + sourceTrainingSet.items[i].mfccFile); return; } if (meanTargetEntries.length != ((MfccFileHeader) (tgtFeatures.params)).dimension) { System.out.println("Error! LSF vector size mismatch in target lsf file " + targetTrainingSet.items[map[i]].mfccFile); return; } } if (srcFeatures.mfccs != null && tgtFeatures.mfccs != null) { for (j = 0; j < imap.files[0].indicesMap.length; j++) // j is the index for labels { Arrays.fill(meanSourceEntries, 0.0); Arrays.fill(meanTargetEntries, 0.0); sourceAverageF0 = 0.0; targetAverageF0 = 0.0; sourceAverageDuration = 0.0; targetAverageDuration = 0.0; sourceAverageEnergy = 0.0; targetAverageEnergy = 0.0; sourceTotalVoiceds = 0; targetTotalVoiceds = 0; sourceTotal = 0; targetTotal = 0; totalFrames = 0; bSourceOK = false; middle = (int) Math.floor(0.5 * (imap.files[0].indicesMap[j][0] + imap.files[0].indicesMap[j][1]) + 0.5); for (k = imap.files[0].indicesMap[j][0]; k <= imap.files[0].indicesMap[j][1]; k++) { if (k >= 0 && k < srcFeatures.mfccs.length) { totalFrames++; bSourceOK = true; for (n = 0; n < ((MfccFileHeader) (srcFeatures.params)).dimension; n++) meanSourceEntries[n] += srcFeatures.mfccs[k][n]; // Pitch index = MathUtils.linearMap(k, 0, srcFeatures.mfccs.length - 1, 0, sourceF0s.contour.length - 1); if (sourceF0s.contour[index] > 10.0) { sourceAverageF0 += sourceF0s.contour[index]; sourceTotalVoiceds++; } // // Duration index = SignalProcUtils.frameIndex2LabelIndex(k, sourceLabels, ((MfccFileHeader) (srcFeatures.params)).winsize, ((MfccFileHeader) (srcFeatures.params)).skipsize); if (index > 0) sourceAverageDuration += sourceLabels.items[index].time - sourceLabels.items[index - 1].time; else sourceAverageDuration += sourceLabels.items[index].time; // // Phone: Middle frames phonetic identity if (k == middle) { sourcePhn = sourceLabels.items[index].phn; sourceContext = new Context(sourceLabels, index, WeightedCodebookTrainerParams.MAXIMUM_CONTEXT); } // // Energy index = MathUtils.linearMap(k, 0, srcFeatures.mfccs.length - 1, 0, sourceEnergies.contour.length - 1); index = MathUtils.CheckLimits(index, 0, sourceEnergies.contour.length - 1); sourceAverageEnergy += sourceEnergies.contour[index]; // sourceTotal++; } } if (bSourceOK) { for (n = 0; n < ((MfccFileHeader) (srcFeatures.params)).dimension; n++) meanSourceEntries[n] /= totalFrames; totalFrames = 0; bTargetOK = false; middle = (int) Math .floor(0.5 * (imap.files[0].indicesMap[j][2] + imap.files[0].indicesMap[j][3]) + 0.5); for (k = imap.files[0].indicesMap[j][2]; k <= imap.files[0].indicesMap[j][3]; k++) { if (k >= 0 && k < tgtFeatures.mfccs.length) { totalFrames++; bTargetOK = true; for (n = 0; n < ((MfccFileHeader) (tgtFeatures.params)).dimension; n++) meanTargetEntries[n] += tgtFeatures.mfccs[k][n]; // Pitch index = MathUtils.linearMap(k, 0, tgtFeatures.mfccs.length - 1, 0, targetF0s.contour.length - 1); if (targetF0s.contour[index] > 10.0) { targetAverageF0 += targetF0s.contour[index]; targetTotalVoiceds++; } // // Duration index = SignalProcUtils.frameIndex2LabelIndex(k, targetLabels, ((MfccFileHeader) (tgtFeatures.params)).winsize, ((MfccFileHeader) (tgtFeatures.params)).skipsize); if (index > 0) targetAverageDuration += targetLabels.items[index].time - targetLabels.items[index - 1].time; else targetAverageDuration += targetLabels.items[index].time; // // Phone: Middle frames phonetic identity if (k == middle) { targetPhn = targetLabels.items[index].phn; targetContext = new Context(targetLabels, index, WeightedCodebookTrainerParams.MAXIMUM_CONTEXT); } // // Energy index = MathUtils.linearMap(k, 0, tgtFeatures.mfccs.length - 1, 0, targetEnergies.contour.length - 1); index = MathUtils.CheckLimits(index, 0, targetEnergies.contour.length - 1); targetAverageEnergy += targetEnergies.contour[index]; // targetTotal++; } } if (bTargetOK) { for (n = 0; n < ((MfccFileHeader) (tgtFeatures.params)).dimension; n++) meanTargetEntries[n] /= totalFrames; // Write to codebook file entry = new WeightedCodebookEntry(0, meanSourceEntries.length); entry.setMfccs(meanSourceEntries, meanTargetEntries); // Pitch if (sourceTotalVoiceds > 0) sourceAverageF0 /= sourceTotalVoiceds; if (targetTotalVoiceds > 0) targetAverageF0 /= targetTotalVoiceds; entry.sourceItem.f0 = sourceAverageF0; entry.targetItem.f0 = targetAverageF0; // // Duration if (sourceTotal > 0) sourceAverageDuration /= sourceTotal; if (targetTotal > 0) sourceAverageDuration /= targetTotal; entry.sourceItem.duration = sourceAverageDuration; entry.targetItem.duration = targetAverageDuration; // // Phone entry.sourceItem.phn = sourcePhn; entry.targetItem.phn = targetPhn; entry.sourceItem.context = new Context(sourceContext); entry.targetItem.context = new Context(targetContext); // // Energy if (sourceTotal > 0) sourceAverageEnergy /= sourceTotal; if (targetTotal > 0) targetAverageEnergy /= targetTotal; entry.sourceItem.energy = sourceAverageEnergy; entry.targetItem.energy = targetAverageEnergy; // if ((entry.sourceItem.f0 > 10.0 && entry.targetItem.f0 > 10.0) || (entry.sourceItem.f0 <= 10.0 && entry.targetItem.f0 <= 10.0)) codebookFile.writeEntry(entry); // System.out.println("Label pair " + String.valueOf(j + 1) + " of " + String.valueOf(imap.files[0].indicesMap.length)); } } } } } } } // This function is identical to learnMappingLabels since the mapping is performed accordingly in previous steps public void learnMappingLabelGroups(WeightedCodebookFile codebookFile, WeightedCodebookFeatureCollection fcol, BaselineAdaptationSet sourceTrainingSet, BaselineAdaptationSet targetTrainingSet, int[] map) throws IOException { learnMappingLabels(codebookFile, fcol, sourceTrainingSet, targetTrainingSet, map); } public void learnMappingSpeech(WeightedCodebookFile codebookFile, WeightedCodebookFeatureCollection fcol, BaselineAdaptationSet sourceTrainingSet, BaselineAdaptationSet targetTrainingSet, int[] map) { assert params.codebookHeader.codebookType == WeightedCodebookFileHeader.SPEECH; int i, j, n; double[] meanSourceEntries = null; double[] meanTargetEntries = null; WeightedCodebookEntry entry = null; boolean bHeaderWritten = false; // Take an average of LSF vectors within each label pair and write the resulting vector as the state // average for source and target // To do: Weighting of vectors within each label according to some criteria // on how typical they represent the current phone. // This can be implemented by looking at some distance measure (eucledian, mahalonoibis, LSF, etc) // to the cluster mean (i.e. mean of all LSF vectors for this phone), for example. int totalFramesSrc = 0; boolean bSourceOK = false; int totalFramesTgt = 0; boolean bTargetOK = false; int lpOrderSrc = 0; int lpOrderTgt = 0; for (i = 0; i < fcol.indexMapFiles.length; i++) { System.out.println("LSF mapping for pair " + String.valueOf(i + 1) + " of " + String.valueOf(fcol.indexMapFiles.length) + ":"); if (sourceTrainingSet.items.length > i) { Mfccs srcFeatures = new Mfccs(sourceTrainingSet.items[i].mfccFile); Mfccs tgtFeatures = new Mfccs(targetTrainingSet.items[map[i]].mfccFile); if (!bHeaderWritten) { params.codebookHeader.mfccParams.dimension = ((MfccFileHeader) (srcFeatures.params)).dimension; params.codebookHeader.mfccParams.samplingRate = ((MfccFileHeader) (srcFeatures.params)).samplingRate; codebookFile.writeCodebookHeader(params.codebookHeader); bHeaderWritten = true; } if (i == 0) { meanSourceEntries = new double[((MfccFileHeader) (srcFeatures.params)).dimension]; meanTargetEntries = new double[((MfccFileHeader) (tgtFeatures.params)).dimension]; Arrays.fill(meanSourceEntries, 0.0); Arrays.fill(meanTargetEntries, 0.0); lpOrderSrc = ((MfccFileHeader) (srcFeatures.params)).dimension; lpOrderTgt = ((MfccFileHeader) (srcFeatures.params)).dimension; } else { if (meanSourceEntries.length != ((MfccFileHeader) (srcFeatures.params)).dimension) { System.out.println("Error! LSF vector size mismatch in source lsf file " + sourceTrainingSet.items[i].mfccFile); return; } if (meanTargetEntries.length != ((MfccFileHeader) (tgtFeatures.params)).dimension) { System.out.println("Error! LSF vector size mismatch in target lsf file " + targetTrainingSet.items[map[i]].mfccFile); return; } } if (srcFeatures.mfccs != null) { for (j = 0; j < ((MfccFileHeader) (srcFeatures.params)).numfrm; j++) { totalFramesSrc++; bSourceOK = true; for (n = 0; n < lpOrderSrc; n++) meanSourceEntries[n] += srcFeatures.mfccs[j][n]; } } if (tgtFeatures.mfccs != null) { for (j = 0; j < ((MfccFileHeader) (tgtFeatures.params)).numfrm; j++) { totalFramesTgt++; bTargetOK = true; for (n = 0; n < lpOrderTgt; n++) meanTargetEntries[n] += tgtFeatures.mfccs[j][n]; } } } } if (bSourceOK) { for (n = 0; n < lpOrderSrc; n++) meanSourceEntries[n] /= totalFramesSrc; } if (bTargetOK) { for (n = 0; n < lpOrderTgt; n++) meanTargetEntries[n] /= totalFramesTgt; } if (bSourceOK && bTargetOK) { // Write to codebook file entry = new WeightedCodebookEntry(0, meanSourceEntries.length); entry.setMfccs(meanSourceEntries, meanTargetEntries); codebookFile.writeEntry(entry); // } } }