/** * Copyright 2007 DFKI GmbH. * All Rights Reserved. Use is subject to license terms. * * This file is part of MARY TTS. * * MARY TTS is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, version 3 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package marytts.signalproc.process; import java.io.File; import java.io.IOException; import java.util.Arrays; import javax.sound.sampled.AudioInputStream; import javax.sound.sampled.AudioSystem; import javax.sound.sampled.UnsupportedAudioFileException; import marytts.signalproc.adaptation.BaselineTransformerParams; import marytts.signalproc.adaptation.prosody.ProsodyTransformerParams; import marytts.signalproc.analysis.AlignmentData; import marytts.signalproc.analysis.F0ReaderWriter; import marytts.signalproc.analysis.FestivalUtt; import marytts.signalproc.analysis.Labels; import marytts.signalproc.analysis.PitchReaderWriter; import marytts.util.data.AlignLabelsUtils; import marytts.util.data.DoubleDataSource; import marytts.util.data.audio.AudioDoubleDataSource; import marytts.util.io.FileUtils; import marytts.util.math.MathUtils; import marytts.util.signal.SignalProcUtils; import marytts.util.string.StringUtils; /** * @author Oytun Türk * */ public class VoiceModificationParametersPreprocessor extends VoiceModificationParameters { public double[] pscalesVar; public double[] tscalesVar; public double[] escalesVar; public double[] vscalesVar; public double tscaleSingle; public int numPeriods; public VoiceModificationParametersPreprocessor(int samplingRate, int LPOrder, double[] pscalesIn, double[] tscalesIn, double[] escalesIn, double[] vscalesIn, int[] pitchMarksIn, double wsFixedIn, double ssFixedIn, int numfrm, int numfrmFixed, int numPeriodsIn, boolean isFixedRate) { super(samplingRate, LPOrder, pscalesIn, tscalesIn, escalesIn, vscalesIn); initialise(pitchMarksIn, wsFixedIn, ssFixedIn, numfrm, numfrmFixed, numPeriodsIn, isFixedRate); } // To do: Handle all isPscaleFromFestivalUttFile, isTscaleFromFestivalUttFile, isEscaleFromTargetWavFile, // requests separately. Currently, there is no isEscaleFromTargetWavFile support // and no support for using isPscaleFromFestivalUttFile but not isTscaleFromFestivalUttFile // and vice versa. // This constructor should also be combined with the above constructor // which takes user specified scaling factors. // Therefore, in the final version the user can request all variations, // i.e. pscale as in the utt file with some additional scaling or shifting, // escale using only scale values provided by the user, etc public VoiceModificationParametersPreprocessor( String sourcePitchFile, boolean isF0File, String sourceLabelFile, String sourceWavFile, // only required for escales String targetPitchFile, // only required for copy pitch synthesis String targetWavFile, // only required for escales boolean isPitchFromTargetFile, int pitchFromTargetMethod, boolean isDurationFromTargetFile, int durationFromTargetMethod, boolean isEnergyFromTargetFile, int targetAlignmentFileType, String targetAlignmentFile, int[] pitchMarks, double wsFixed, double ssFixed, int numfrmIn, int numfrmFixedIn, int numPeriodsIn, boolean isFixedRate) throws IOException { super(); numPeriods = numPeriodsIn; double[] sourceEns = null; double[] targetEns = null; if (isEnergyFromTargetFile) { AudioInputStream inputAudioSrc = null; try { inputAudioSrc = AudioSystem.getAudioInputStream(new File(sourceWavFile)); } catch (UnsupportedAudioFileException e) { throw new IOException("Cannot open audio " + sourceWavFile, e); } AudioInputStream inputAudioTgt = null; try { FileUtils.copy(targetWavFile, targetWavFile + ".wav"); inputAudioTgt = AudioSystem.getAudioInputStream(new File(targetWavFile + ".wav")); } catch (UnsupportedAudioFileException e) { throw new IOException("Cannot open audio " + targetWavFile + ".wav", e); } if (inputAudioSrc != null && inputAudioTgt != null) { DoubleDataSource inputSrc = new AudioDoubleDataSource(inputAudioSrc); double[] sourceSignal = inputSrc.getAllData(); int fsSource = (int) inputAudioSrc.getFormat().getSampleRate(); DoubleDataSource inputTgt = new AudioDoubleDataSource(inputAudioTgt); double[] targetSignal = inputTgt.getAllData(); int fsTarget = (int) inputAudioTgt.getFormat().getSampleRate(); inputAudioSrc.close(); inputAudioTgt.close(); FileUtils.delete(targetWavFile + ".wav"); sourceEns = SignalProcUtils.getEnergyContourRms(sourceSignal, wsFixed, ssFixed, fsSource); targetEns = SignalProcUtils.getEnergyContourRms(targetSignal, wsFixed, ssFixed, fsTarget); } } // Read from files (only necessary ones, you will need to read more when implementing escales etc) AlignmentData ad = null; if (isPitchFromTargetFile || isDurationFromTargetFile || isEnergyFromTargetFile) { if (FileUtils.exists(targetAlignmentFile)) { if (targetAlignmentFileType == BaselineTransformerParams.LABELS) ad = new Labels(targetAlignmentFile); else if (targetAlignmentFileType == BaselineTransformerParams.FESTIVAL_UTT) ad = new FestivalUtt(targetAlignmentFile); } } PitchReaderWriter sourceF0s = null; if (isF0File) sourceF0s = new F0ReaderWriter(sourcePitchFile); else sourceF0s = new PitchReaderWriter(sourcePitchFile); Labels sourceLabels = new Labels(sourceLabelFile); PitchReaderWriter targetF0s = null; if (targetPitchFile != null && FileUtils.exists(targetPitchFile)) { if (isF0File) targetF0s = new F0ReaderWriter(targetPitchFile); else targetF0s = new PitchReaderWriter(targetPitchFile); } // MaryUtils.plot(sourceF0s.contour); // MaryUtils.plot(targetF0s.contour); // Find pscalesVar and tscalesVar from targetFestivalUttFile, sourcePitchFile, sourceLabelFile tscaleSingle = -1; // Determine the pitch and time scaling factors corresponding to each pitch synchronous frame pscalesVar = MathUtils.ones(numfrmIn); double[] sourceMappedF0s = MathUtils.zeros(numfrmIn); double[] targetMappedF0s = MathUtils.zeros(numfrmIn); tscalesVar = MathUtils.ones(numfrmIn); escalesVar = MathUtils.ones(numfrmIn); vscalesVar = MathUtils.ones(numfrmIn); boolean[] voiceds = new boolean[numfrmIn]; Arrays.fill(voiceds, false); int i; double tSource, tTarget; int sourceLabInd, targetDurationLabInd, targetPitchLabInd, sourcePitchInd, targetPitchInd, sourceEnergyInd, targetEnergyInd; double sourceDuration, targetDuration, sourcePitch, targetPitch; double sourceDurationNeigh, targetDurationNeigh; double sourceLocationInLabelPercent; // Find the optimum alignment between the source and the target labels since the phone sequences may not be identical due // to silence periods etc. int[][] durationMap = null; Labels targetDurationLabels = null; Labels targetPitchLabels = null; if (ad != null) { if (ad instanceof FestivalUtt) { for (i = 0; i < ((FestivalUtt) ad).labels.length; i++) { if (((FestivalUtt) ad).keys[i].compareTo("==Segment==") == 0 && durationMap == null) { durationMap = AlignLabelsUtils.alignLabels(sourceLabels.items, ((FestivalUtt) ad).labels[i].items); targetDurationLabels = new Labels(((FestivalUtt) ad).labels[i].items); } else if (((FestivalUtt) ad).keys[i].compareTo("==Target==") == 0) targetPitchLabels = new Labels(((FestivalUtt) ad).labels[i]); } } else if (ad instanceof Labels) { durationMap = AlignLabelsUtils.alignLabels(sourceLabels.items, ((Labels) ad).items); targetDurationLabels = new Labels((Labels) ad); targetPitchLabels = new Labels((Labels) ad); } } // double[] modifiedContour = new double[numfrmIn]; if (durationMap != null && targetDurationLabels != null && targetPitchLabels != null) { for (i = 0; i < numfrmIn; i++) { if (!isFixedRate) tSource = (0.5 * (pitchMarks[i + numPeriods] + pitchMarks[i])) / fs; else tSource = i * ssFixed + 0.5 * wsFixed; sourceLabInd = SignalProcUtils.time2LabelIndex(tSource, sourceLabels); if (sourceLabInd > 0) { sourceDuration = sourceLabels.items[sourceLabInd].time - sourceLabels.items[sourceLabInd - 1].time; sourceLocationInLabelPercent = (tSource - sourceLabels.items[sourceLabInd - 1].time) / sourceDuration; } else { sourceDuration = sourceLabels.items[sourceLabInd].time; sourceLocationInLabelPercent = tSource / sourceLabels.items[sourceLabInd].time; } targetDurationLabInd = StringUtils.findInMap(durationMap, sourceLabInd); if (targetDurationLabInd > 0) targetDuration = targetDurationLabels.items[targetDurationLabInd].time - targetDurationLabels.items[targetDurationLabInd - 1].time; else targetDuration = targetDurationLabels.items[targetDurationLabInd].time; tscalesVar[i] = 1.0; if (durationFromTargetMethod == ProsodyTransformerParams.TRIPHONE_DURATIONS) { sourceDurationNeigh = sourceDuration; if (sourceLabInd > 1) sourceDurationNeigh += sourceLabels.items[sourceLabInd - 1].time - sourceLabels.items[sourceLabInd - 2].time; if (sourceLabInd < sourceLabels.items.length - 1) sourceDurationNeigh += sourceLabels.items[sourceLabInd + 1].time - sourceLabels.items[sourceLabInd].time; targetDurationNeigh = targetDuration; if (targetDurationLabInd > 1) targetDurationNeigh += targetDurationLabels.items[targetDurationLabInd - 1].time - targetDurationLabels.items[targetDurationLabInd - 2].time; if (targetDurationLabInd < targetDurationLabels.items.length - 1) targetDurationNeigh += targetDurationLabels.items[targetDurationLabInd + 1].time - targetDurationLabels.items[targetDurationLabInd].time; tscalesVar[i] = targetDurationNeigh / sourceDurationNeigh; } else if (durationFromTargetMethod == ProsodyTransformerParams.PHONEME_DURATIONS && targetDurationLabInd >= 0) tscalesVar[i] = targetDuration / sourceDuration; tTarget = -1.0; targetPitch = 0.0; sourcePitch = 0.0; pscalesVar[i] = 1.0; if (isPitchFromTargetFile) { sourcePitchInd = SignalProcUtils.time2frameIndex(tSource, sourceF0s.header.windowSizeInSeconds, sourceF0s.header.skipSizeInSeconds); if (sourcePitchInd > sourceF0s.header.numfrm - 1) sourcePitchInd = sourceF0s.header.numfrm - 1; sourcePitch = sourceF0s.contour[sourcePitchInd]; if (sourcePitch > 10.0) voiceds[i] = true; if (ad instanceof FestivalUtt) { tTarget = tSource; targetPitchLabInd = SignalProcUtils.time2LabelIndex(tTarget, targetPitchLabels); if (targetPitchLabInd > 0) { targetPitch = MathUtils.linearMap(tTarget, targetPitchLabels.items[targetPitchLabInd - 1].time, targetPitchLabels.items[targetPitchLabInd].time, targetPitchLabels.items[targetPitchLabInd - 1].valuesRest[0], targetPitchLabels.items[targetPitchLabInd].valuesRest[0]); } else targetPitch = targetPitchLabels.items[targetPitchLabInd].valuesRest[0]; } else if (ad instanceof Labels) // Pitch comes from a target pitch contour { if (targetF0s != null) { if (targetDurationLabInd > 0) tTarget = targetDurationLabels.items[targetDurationLabInd - 1].time + sourceLocationInLabelPercent * targetDuration; else tTarget = sourceLocationInLabelPercent * targetDuration; targetPitchInd = SignalProcUtils.time2frameIndex(tTarget, targetF0s.header.windowSizeInSeconds, targetF0s.header.skipSizeInSeconds); targetPitchInd = MathUtils.CheckLimits(targetPitchInd, 0, targetF0s.contour.length - 1); targetPitch = targetF0s.contour[targetPitchInd]; } else targetPitch = sourcePitch; } sourceMappedF0s[i] = sourcePitch; targetMappedF0s[i] = targetPitch; if (pitchFromTargetMethod == ProsodyTransformerParams.FULL_CONTOUR) { if (targetPitch > 10.0 && sourcePitch > 10.0) pscalesVar[i] = targetPitch / sourcePitch; else pscalesVar[i] = 1.0; } } if (isEnergyFromTargetFile && sourceEns != null && targetEns != null) { sourceEnergyInd = SignalProcUtils.time2frameIndex(tSource, wsFixed, ssFixed); sourceEnergyInd = MathUtils.CheckLimits(sourceEnergyInd, 0, sourceEns.length - 1); targetEnergyInd = SignalProcUtils.time2frameIndex(tTarget, wsFixed, ssFixed); targetEnergyInd = MathUtils.CheckLimits(targetEnergyInd, 0, targetEns.length - 1); escalesVar[i] = targetEns[targetEnergyInd] / sourceEns[sourceEnergyInd]; // escalesVar[i] = ((double)i)/numfrmIn; //To test if this works } System.out.println("SLab=" + sourceLabels.items[sourceLabInd].phn + " TLab=" + targetDurationLabels.items[targetDurationLabInd].phn + " STime=" + String.valueOf(tSource) + " TTime=" + String.valueOf(tTarget) + " SPtich=" + sourcePitch + " TPitch=" + targetPitch + " ps=" + String.valueOf(pscalesVar[i]) + " ts=" + String.valueOf(tscalesVar[i])); } if (pitchFromTargetMethod == ProsodyTransformerParams.FULL_CONTOUR) { int smootherLen = 4; // pscalesVar = SignalProcUtils.meanFilter(pscalesVar, smootherLen); // pscalesVar = SignalProcUtils.shift(pscalesVar, (int)Math.floor(0.5*smootherLen)); for (i = 0; i < numfrmIn; i++) { if (!voiceds[i]) pscalesVar[i] = 1.0; pscalesVar[i] = Math.max(pscalesVar[i], BaselineTransformerParams.MINIMUM_ALLOWED_PITCH_SCALE); pscalesVar[i] = Math.min(pscalesVar[i], BaselineTransformerParams.MAXIMUM_ALLOWED_PITCH_SCALE); } // tscalesVar = SignalProcUtils.meanFilter(tscalesVar, smootherLen); // tscalesVar = SignalProcUtils.shift(tscalesVar, (int)Math.floor(0.5*smootherLen)); for (i = 0; i < numfrmIn; i++) { tscalesVar[i] = Math.max(tscalesVar[i], BaselineTransformerParams.MINIMUM_ALLOWED_TIME_SCALE); tscalesVar[i] = Math.min(tscalesVar[i], BaselineTransformerParams.MAXIMUM_ALLOWED_TIME_SCALE); } } else if (pitchFromTargetMethod == ProsodyTransformerParams.SENTENCE_MEAN || pitchFromTargetMethod == ProsodyTransformerParams.SENTENCE_MEAN_STDDEV) { double[] sourceVoicedF0s = MathUtils.findValues(sourceF0s.contour, MathUtils.GREATER_THAN, 10.0); double[] targetVoicedF0s = MathUtils.findValues(targetF0s.contour, MathUtils.GREATER_THAN, 10.0); double sourceF0Mean = MathUtils.mean(sourceVoicedF0s); double targetF0Mean = MathUtils.mean(targetVoicedF0s); if (pitchFromTargetMethod == ProsodyTransformerParams.SENTENCE_MEAN_STDDEV) { double sourceF0Std = MathUtils.standardDeviation(sourceVoicedF0s, sourceF0Mean); double targetF0Std = MathUtils.standardDeviation(targetVoicedF0s, targetF0Mean); for (i = 0; i < numfrmIn; i++) { pscalesVar[i] = 1.0; if (sourceMappedF0s[i] > 10.0 && targetMappedF0s[i] > 10.0) { double tF0 = ((sourceMappedF0s[i] - sourceF0Mean) / sourceF0Std) * targetF0Std + targetF0Mean; pscalesVar[i] = tF0 / sourceMappedF0s[i]; } } } else { for (i = 0; i < numfrmIn; i++) { pscalesVar[i] = 1.0; if (sourceMappedF0s[i] > 10.0 && targetMappedF0s[i] > 10.0) pscalesVar[i] = targetF0Mean / sourceF0Mean; } } } // Average duration scale estimation // This matches average duration of source sentence with the target excluding silence (Silence labels should be // appropriately listed below) if (isDurationFromTargetFile && durationFromTargetMethod == ProsodyTransformerParams.SENTENCE_DURATION) { String[] silenceLabels = { "H#", "_" }; double totalSourceDur = 0.0; double totalTargetDur = 0.0; for (i = 0; i < sourceLabels.items.length; i++) { if (!StringUtils.isOneOf(sourceLabels.items[i].phn, silenceLabels)) { if (i > 0) sourceDuration = sourceLabels.items[i].time - sourceLabels.items[i - 1].time; else sourceDuration = sourceLabels.items[i].time; targetDurationLabInd = StringUtils.findInMap(durationMap, i); if (targetDurationLabInd > 0) targetDuration = targetDurationLabels.items[targetDurationLabInd].time - targetDurationLabels.items[targetDurationLabInd - 1].time; else targetDuration = targetDurationLabels.items[targetDurationLabInd].time; totalSourceDur += sourceDuration; totalTargetDur += targetDuration; } } Arrays.fill(tscalesVar, totalTargetDur / totalSourceDur); System.out.println("Average duration scale=" + String.valueOf(totalTargetDur / totalSourceDur)); } // Arrays.fill(pscalesVar, 0.8); // MaryUtils.plot(pscalesVar); // MaryUtils.plot(tscalesVar); // MaryUtils.plot(escalesVar); } } private void initialise(int[] pitchMarksIn, double wsFixedIn, double ssFixedIn, int numfrm, int numfrmFixed, int numPeriodsIn, boolean isFixedRate) { numPeriods = numPeriodsIn; if (pitchMarksIn != null) { getScalesVar(pitchMarksIn, wsFixedIn, ssFixedIn, numfrm, numfrmFixed, isFixedRate); } } private void getScalesVar(int[] pitchMarks, double wsFixed, double ssFixed, int numfrm, int numfrmFixed, boolean isFixedRate) { if (tscales.length == 1) tscaleSingle = tscales[0]; else tscaleSingle = -1; // Find pscale, tscale and escale values corresponding to each fixed skip rate frame if (pscales.length != numfrmFixed) pscales = MathUtils.modifySize(pscales, numfrmFixed); if (tscales.length != numfrmFixed) tscales = MathUtils.modifySize(tscales, numfrmFixed); if (escales.length != numfrmFixed) escales = MathUtils.modifySize(escales, numfrmFixed); if (vscales.length != numfrmFixed) vscales = MathUtils.modifySize(vscales, numfrmFixed); // // Determine the pitch, time, and energy scaling factors corresponding to each pitch synchronous frame pscalesVar = MathUtils.ones(numfrm); tscalesVar = MathUtils.ones(numfrm); escalesVar = MathUtils.ones(numfrm); vscalesVar = MathUtils.ones(numfrm); double tVar; int ind; for (int i = 0; i < numfrm; i++) { if (!isFixedRate) tVar = (0.5 * (pitchMarks[i + numPeriods] + pitchMarks[i])) / fs; else tVar = i * ssFixed + 0.5 * wsFixed; ind = (int) (Math.floor((tVar - 0.5 * wsFixed) / ssFixed + 0.5)); if (ind < 0) ind = 0; if (ind > numfrmFixed - 1) ind = numfrmFixed - 1; pscalesVar[i] = pscales[ind]; tscalesVar[i] = tscales[ind]; escalesVar[i] = escales[ind]; vscalesVar[i] = vscales[ind]; } // } }