/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved. Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.signalproc.adaptation;

import java.io.IOException;

import marytts.signalproc.analysis.FeatureFileHeader;
import marytts.signalproc.analysis.Labels;
import marytts.signalproc.analysis.LsfFileHeader;
import marytts.signalproc.analysis.MfccFileHeader;
import marytts.util.data.AlignLabelsUtils;
import marytts.util.math.MathUtils;
import marytts.util.signal.SignalProcUtils;
import marytts.util.string.StringUtils;

/**
 * Generic utilities for voice conversion.
 *
 * Each <code>map*</code> method aligns a source and a target utterance via their phone
 * label files and produces an {@link IndexMap} relating source feature frames (or label
 * units) to target ones. An optimal label alignment is found by dynamic programming
 * (see {@link AlignLabelsUtils#alignLabels}) if the label sequences are not identical.
 *
 * @author Oytun T&uuml;rk
 */
public class AdaptationUtils {

    /** Sentinel frame index meaning "use every available training frame". */
    public static int ALL_AVAILABLE_TRAINING_FRAMES = -1;

    /**
     * Reads the source and target vocal tract feature file headers of the requested type.
     *
     * @param sourceFeatureFile source feature file
     * @param targetFeatureFile target feature file
     * @param vocalTractFeature one of {@link BaselineFeatureExtractor#LSF_FEATURES} or
     *            {@link BaselineFeatureExtractor#MFCC_FEATURES_FROM_FILES}
     * @return a two-element array {sourceHeader, targetHeader}; both entries are null if
     *         <code>vocalTractFeature</code> is not a supported feature type
     * @throws IOException if a feature file header cannot be read
     */
    private static FeatureFileHeader[] readFeatureHeaders(String sourceFeatureFile, String targetFeatureFile,
            int vocalTractFeature) throws IOException {
        FeatureFileHeader hdr1 = null;
        FeatureFileHeader hdr2 = null;
        if (vocalTractFeature == BaselineFeatureExtractor.LSF_FEATURES) {
            hdr1 = new LsfFileHeader(sourceFeatureFile);
            hdr2 = new LsfFileHeader(targetFeatureFile);
        } else if (vocalTractFeature == BaselineFeatureExtractor.MFCC_FEATURES_FROM_FILES) {
            hdr1 = new MfccFileHeader(sourceFeatureFile);
            hdr2 = new MfccFileHeader(targetFeatureFile);
        }
        return new FeatureFileHeader[] { hdr1, hdr2 };
    }

    /**
     * Maps each source feature frame to the single corresponding target feature frame.
     *
     * An optimal alignment is found by dynamic programming if the labels are not identical.
     * For every source frame, the enclosing source label is located, the matching target
     * label is looked up in the alignment map, and the frame time is linearly mapped from
     * the source label interval to the target label interval.
     *
     * @param sourceLabelFile source phone label file
     * @param targetLabelFile target phone label file
     * @param sourceFeatureFile source vocal tract feature file
     * @param targetFeatureFile target vocal tract feature file
     * @param vocalTractFeature feature type constant (LSF or MFCC)
     * @param labelsToExcludeFromTraining phones to skip, or null to use all
     * @return an IndexMap with pairs {sourceFrameIndex, targetFrameIndex}, or null if the
     *         headers or labels could not be read or no alignment was found
     * @throws IOException if label or feature files cannot be read
     */
    public static IndexMap mapFramesFeatures(String sourceLabelFile, String targetLabelFile, String sourceFeatureFile,
            String targetFeatureFile, int vocalTractFeature, String[] labelsToExcludeFromTraining) throws IOException {
        IndexMap im = null;

        // Read label files
        Labels sourceLabels = new Labels(sourceLabelFile);
        Labels targetLabels = new Labels(targetLabelFile);

        // Read feature file headers
        FeatureFileHeader[] hdrs = readFeatureHeaders(sourceFeatureFile, targetFeatureFile, vocalTractFeature);
        FeatureFileHeader hdr1 = hdrs[0];
        FeatureFileHeader hdr2 = hdrs[1];

        if (hdr1 != null && hdr2 != null && sourceLabels.items != null && targetLabels.items != null) {
            // Find the optimum alignment between the source and the target labels since the
            // phone sequences may not be identical due to silence periods etc.
            int[][] labelMap = AlignLabelsUtils.alignLabels(sourceLabels.items, targetLabels.items);

            if (labelMap != null) {
                int j, srcLabInd, tgtLabInd, tgtFrmInd;
                double time1, time2;
                double srcStartTime, srcEndTime, tgtStartTime, tgtEndTime;
                srcLabInd = 0;

                // Find the corresponding target frame index for each source frame index
                int count = 0;
                im = new IndexMap(1);
                im.files[0] = new FileMap(hdr1.numfrm, 2);

                for (j = 0; j < hdr1.numfrm; j++) {
                    time1 = SignalProcUtils.frameIndex2Time(j, hdr1.winsize, hdr1.skipsize);

                    // Advance to the source label that contains this frame time
                    while (time1 > sourceLabels.items[srcLabInd].time) {
                        srcLabInd++;
                        if (srcLabInd > sourceLabels.items.length - 1) {
                            srcLabInd = sourceLabels.items.length - 1;
                            break;
                        }
                    }

                    tgtLabInd = StringUtils.findInMap(labelMap, srcLabInd);

                    // Only map frames whose source and target phones actually agree
                    if (tgtLabInd >= 0
                            && sourceLabels.items[srcLabInd].phn.compareTo(targetLabels.items[tgtLabInd].phn) == 0) {
                        boolean isLabelDesired = true;
                        if (labelsToExcludeFromTraining != null)
                            isLabelDesired = !StringUtils.isOneOf(sourceLabels.items[srcLabInd].phn,
                                    labelsToExcludeFromTraining);

                        if (isLabelDesired) {
                            // Label start times: previous label's end time, or 0.0 for the first label
                            if (srcLabInd > 0)
                                srcStartTime = sourceLabels.items[srcLabInd - 1].time;
                            else
                                srcStartTime = 0.0;

                            if (tgtLabInd > 0)
                                tgtStartTime = targetLabels.items[tgtLabInd - 1].time;
                            else
                                tgtStartTime = 0.0;

                            srcEndTime = sourceLabels.items[srcLabInd].time;
                            tgtEndTime = targetLabels.items[tgtLabInd].time;

                            // Linearly warp the frame time into the target label interval
                            time2 = MathUtils.linearMap(time1, srcStartTime, srcEndTime, tgtStartTime, tgtEndTime);

                            tgtFrmInd = SignalProcUtils.time2frameIndex(time2, hdr2.winsize, hdr2.skipsize);
                            tgtFrmInd = Math.max(0, tgtFrmInd);
                            tgtFrmInd = Math.min(tgtFrmInd, hdr2.numfrm - 1);

                            im.files[0].indicesMap[count][0] = j;
                            im.files[0].indicesMap[count][1] = tgtFrmInd;
                            count++;

                            if (count > hdr1.numfrm - 1)
                                break;
                        }
                    }
                }
            }
        }

        return im;
    }

    /**
     * Maps each source frame to a target frame as a group of frames, i.e. with
     * <code>numNeighbours</code> frames of left and right context on both sides.
     *
     * @param sourceLabelFile source phone label file
     * @param targetLabelFile target phone label file
     * @param sourceFeatureFile source vocal tract feature file
     * @param targetFeatureFile target vocal tract feature file
     * @param numNeighbours number of context frames on each side
     * @param vocalTractFeature feature type constant (LSF or MFCC)
     * @param labelsToExcludeFromTraining phones to skip, or null to use all
     * @return an IndexMap with quadruples {srcStartFrame, srcEndFrame, tgtStartFrame,
     *         tgtEndFrame}, or null if headers/labels could not be read or no alignment found
     * @throws IOException if label or feature files cannot be read
     */
    public static IndexMap mapFrameGroupsFeatures(String sourceLabelFile, String targetLabelFile,
            String sourceFeatureFile, String targetFeatureFile, int numNeighbours, int vocalTractFeature,
            String[] labelsToExcludeFromTraining) throws IOException {
        IndexMap im = null;

        // Read label files
        Labels sourceLabels = new Labels(sourceLabelFile);
        Labels targetLabels = new Labels(targetLabelFile);

        // Read feature file headers
        FeatureFileHeader[] hdrs = readFeatureHeaders(sourceFeatureFile, targetFeatureFile, vocalTractFeature);
        FeatureFileHeader hdr1 = hdrs[0];
        FeatureFileHeader hdr2 = hdrs[1];

        if (hdr1 != null && hdr2 != null && sourceLabels.items != null && targetLabels.items != null) {
            // Find the optimum alignment between the source and the target labels since the
            // phone sequences may not be identical due to silence periods etc.
            int[][] labelMap = AlignLabelsUtils.alignLabels(sourceLabels.items, targetLabels.items);

            if (labelMap != null) {
                int j, srcLabInd, tgtLabInd, tgtFrmInd;
                double time1, time2;
                double srcStartTime, srcEndTime, tgtStartTime, tgtEndTime;
                srcLabInd = 0;

                // Find the corresponding target frame index for each source frame index
                int count = 0;
                im = new IndexMap(1);
                im.files[0] = new FileMap(hdr1.numfrm, 4);

                for (j = 0; j < hdr1.numfrm; j++) {
                    time1 = SignalProcUtils.frameIndex2Time(j, hdr1.winsize, hdr1.skipsize);

                    // Advance to the source label that contains this frame time
                    while (time1 > sourceLabels.items[srcLabInd].time) {
                        srcLabInd++;
                        if (srcLabInd > sourceLabels.items.length - 1) {
                            srcLabInd = sourceLabels.items.length - 1;
                            break;
                        }
                    }

                    tgtLabInd = StringUtils.findInMap(labelMap, srcLabInd);

                    if (tgtLabInd >= 0
                            && sourceLabels.items[srcLabInd].phn.compareTo(targetLabels.items[tgtLabInd].phn) == 0) {
                        boolean isLabelDesired = true;
                        if (labelsToExcludeFromTraining != null)
                            isLabelDesired = !StringUtils.isOneOf(sourceLabels.items[srcLabInd].phn,
                                    labelsToExcludeFromTraining);

                        if (isLabelDesired) {
                            // Label start times: previous label's end time, or 0.0 for the first label
                            if (srcLabInd > 0)
                                srcStartTime = sourceLabels.items[srcLabInd - 1].time;
                            else
                                srcStartTime = 0.0;

                            if (tgtLabInd > 0)
                                tgtStartTime = targetLabels.items[tgtLabInd - 1].time;
                            else
                                tgtStartTime = 0.0;

                            srcEndTime = sourceLabels.items[srcLabInd].time;
                            tgtEndTime = targetLabels.items[tgtLabInd].time;

                            // Linearly warp the frame time into the target label interval
                            time2 = MathUtils.linearMap(time1, srcStartTime, srcEndTime, tgtStartTime, tgtEndTime);

                            tgtFrmInd = SignalProcUtils.time2frameIndex(time2, hdr2.winsize, hdr2.skipsize);

                            // Store the context window, clamped to valid frame ranges
                            im.files[0].indicesMap[count][0] = Math.max(0, j - numNeighbours);
                            im.files[0].indicesMap[count][1] = Math.min(j + numNeighbours, hdr1.numfrm - 1);
                            im.files[0].indicesMap[count][2] = Math.max(0, tgtFrmInd - numNeighbours);
                            im.files[0].indicesMap[count][3] = Math.min(tgtFrmInd + numNeighbours, hdr2.numfrm - 1);
                            count++;

                            if (count > hdr1.numfrm - 1)
                                break;
                        }
                    }
                }
            }
        }

        return im;
    }

    /**
     * Maps each source label (phone) to the corresponding target label, expressed as
     * start/end frame indices in the respective feature files.
     *
     * @param sourceLabelFile source phone label file
     * @param targetLabelFile target phone label file
     * @param sourceFeatureFile source vocal tract feature file
     * @param targetFeatureFile target vocal tract feature file
     * @param vocalTractFeature feature type constant (LSF or MFCC)
     * @param labelsToExcludeFromTraining phones to skip, or null to use all
     * @return an IndexMap with quadruples {srcStartFrame, srcEndFrame, tgtStartFrame,
     *         tgtEndFrame}, or null if headers/labels could not be read or no alignment found
     * @throws IOException if label or feature files cannot be read
     */
    public static IndexMap mapLabelsFeatures(String sourceLabelFile, String targetLabelFile, String sourceFeatureFile,
            String targetFeatureFile, int vocalTractFeature, String[] labelsToExcludeFromTraining) throws IOException {
        IndexMap im = null;

        // Read label files
        Labels sourceLabels = new Labels(sourceLabelFile);
        Labels targetLabels = new Labels(targetLabelFile);

        // Read feature file headers
        FeatureFileHeader[] hdrs = readFeatureHeaders(sourceFeatureFile, targetFeatureFile, vocalTractFeature);
        FeatureFileHeader hdr1 = hdrs[0];
        FeatureFileHeader hdr2 = hdrs[1];

        if (hdr1 != null && hdr2 != null && sourceLabels.items != null && targetLabels.items != null) {
            // Find the optimum alignment between the source and the target labels since the
            // phone sequences may not be identical due to silence periods etc.
            int[][] labelMap = AlignLabelsUtils.alignLabels(sourceLabels.items, targetLabels.items);

            if (labelMap != null) {
                int j, tgtLabInd;
                double srcStartTime, srcEndTime, tgtStartTime, tgtEndTime;

                // Find the corresponding target label interval for each source label
                int count = 0;
                im = new IndexMap(1);
                im.files[0] = new FileMap(sourceLabels.items.length, 4);

                for (j = 0; j < sourceLabels.items.length; j++) {
                    // Label start time: previous label's end time, or 0.0 for the first label
                    if (j > 0)
                        srcStartTime = sourceLabels.items[j - 1].time;
                    else
                        srcStartTime = 0.0;

                    tgtLabInd = StringUtils.findInMap(labelMap, j);

                    // Only map labels whose source and target phones actually agree
                    if (tgtLabInd >= 0
                            && sourceLabels.items[j].phn.compareTo(targetLabels.items[tgtLabInd].phn) == 0) {
                        boolean isLabelDesired = true;
                        if (labelsToExcludeFromTraining != null)
                            isLabelDesired = !StringUtils.isOneOf(sourceLabels.items[j].phn,
                                    labelsToExcludeFromTraining);

                        if (isLabelDesired) {
                            if (tgtLabInd > 0)
                                tgtStartTime = targetLabels.items[tgtLabInd - 1].time;
                            else
                                tgtStartTime = 0.0;

                            srcEndTime = sourceLabels.items[j].time;
                            tgtEndTime = targetLabels.items[tgtLabInd].time;

                            // Convert the label boundary times to frame indices
                            im.files[0].indicesMap[count][0] = SignalProcUtils.time2frameIndex(srcStartTime,
                                    hdr1.winsize, hdr1.skipsize);
                            im.files[0].indicesMap[count][1] = SignalProcUtils.time2frameIndex(srcEndTime,
                                    hdr1.winsize, hdr1.skipsize);
                            im.files[0].indicesMap[count][2] = SignalProcUtils.time2frameIndex(tgtStartTime,
                                    hdr2.winsize, hdr2.skipsize);
                            im.files[0].indicesMap[count][3] = SignalProcUtils.time2frameIndex(tgtEndTime,
                                    hdr2.winsize, hdr2.skipsize);
                            count++;

                            if (count > sourceLabels.items.length - 1)
                                break;
                        }
                    }
                }
            }
        }

        return im;
    }

    /**
     * Maps each source label to the corresponding target label as a group of labels, i.e.
     * with <code>numNeighbours</code> labels of left and right context on both sides,
     * expressed as start/end frame indices in the respective feature files.
     *
     * @param sourceLabelFile source phone label file
     * @param targetLabelFile target phone label file
     * @param sourceFeatureFile source vocal tract feature file
     * @param targetFeatureFile target vocal tract feature file
     * @param numNeighbours number of context labels on each side
     * @param vocalTractFeature feature type constant (LSF or MFCC)
     * @param labelsToExcludeFromTraining phones to skip, or null to use all
     * @return an IndexMap with quadruples {srcStartFrame, srcEndFrame, tgtStartFrame,
     *         tgtEndFrame}, or null if headers/labels could not be read or no alignment found
     * @throws IOException if label or feature files cannot be read
     */
    public static IndexMap mapLabelGroupsFeatures(String sourceLabelFile, String targetLabelFile,
            String sourceFeatureFile, String targetFeatureFile, int numNeighbours, int vocalTractFeature,
            String[] labelsToExcludeFromTraining) throws IOException {
        IndexMap im = null;

        // Read label files
        Labels sourceLabels = new Labels(sourceLabelFile);
        Labels targetLabels = new Labels(targetLabelFile);

        // Read feature file headers
        FeatureFileHeader[] hdrs = readFeatureHeaders(sourceFeatureFile, targetFeatureFile, vocalTractFeature);
        FeatureFileHeader hdr1 = hdrs[0];
        FeatureFileHeader hdr2 = hdrs[1];

        if (hdr1 != null && hdr2 != null && sourceLabels.items != null && targetLabels.items != null) {
            // Find the optimum alignment between the source and the target labels since the
            // phone sequences may not be identical due to silence periods etc.
            int[][] labelMap = AlignLabelsUtils.alignLabels(sourceLabels.items, targetLabels.items);

            if (labelMap != null) {
                int j, tgtLabInd;
                double srcStartTime, srcEndTime, tgtStartTime, tgtEndTime;

                // Find the corresponding target label group for each source label group
                int count = 0;
                im = new IndexMap(1);
                im.files[0] = new FileMap(sourceLabels.items.length, 4);

                for (j = 0; j < sourceLabels.items.length; j++) {
                    // Group start time: end time of the label just before the left context window
                    if (j - numNeighbours - 1 >= 0)
                        srcStartTime = sourceLabels.items[j - numNeighbours - 1].time;
                    else
                        srcStartTime = 0.0;

                    tgtLabInd = StringUtils.findInMap(labelMap, j);

                    // Only map labels whose source and target phones actually agree
                    if (tgtLabInd >= 0
                            && sourceLabels.items[j].phn.compareTo(targetLabels.items[tgtLabInd].phn) == 0) {
                        boolean isLabelDesired = true;
                        if (labelsToExcludeFromTraining != null)
                            isLabelDesired = !StringUtils.isOneOf(sourceLabels.items[j].phn,
                                    labelsToExcludeFromTraining);

                        if (isLabelDesired) {
                            if (tgtLabInd - numNeighbours - 1 >= 0)
                                tgtStartTime = targetLabels.items[tgtLabInd - numNeighbours - 1].time;
                            else
                                tgtStartTime = 0.0;

                            // Group end time: end of the right context window, clamped to the last label
                            srcEndTime = sourceLabels.items[Math.min(j + numNeighbours,
                                    sourceLabels.items.length - 1)].time;
                            tgtEndTime = targetLabels.items[Math.min(tgtLabInd + numNeighbours,
                                    targetLabels.items.length - 1)].time;

                            // Convert the group boundary times to frame indices
                            im.files[0].indicesMap[count][0] = SignalProcUtils.time2frameIndex(srcStartTime,
                                    hdr1.winsize, hdr1.skipsize);
                            im.files[0].indicesMap[count][1] = SignalProcUtils.time2frameIndex(srcEndTime,
                                    hdr1.winsize, hdr1.skipsize);
                            im.files[0].indicesMap[count][2] = SignalProcUtils.time2frameIndex(tgtStartTime,
                                    hdr2.winsize, hdr2.skipsize);
                            im.files[0].indicesMap[count][3] = SignalProcUtils.time2frameIndex(tgtEndTime,
                                    hdr2.winsize, hdr2.skipsize);
                            count++;

                            if (count > sourceLabels.items.length - 1)
                                break;
                        }
                    }
                }
            }
        }

        return im;
    }

    /**
     * Returns a trivial map selecting all available training frames.
     *
     * @return an IndexMap whose single entry is {@link #ALL_AVAILABLE_TRAINING_FRAMES}
     */
    public static IndexMap mapSpeechFeatures() {
        IndexMap im = new IndexMap(1);
        im.files[0] = new FileMap(1, 1);
        im.files[0].indicesMap[0][0] = ALL_AVAILABLE_TRAINING_FRAMES;

        return im;
    }
}