/** * Copyright 2007 DFKI GmbH. * All Rights Reserved. Use is subject to license terms. * * This file is part of MARY TTS. * * MARY TTS is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, version 3 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package marytts.signalproc.sinusoidal; import marytts.util.signal.SignalProcUtils; /** * This class generates the sinusoidal tracks given individual peak amplitudes measured from the DFT spectrum. * * Reference: R.J. McAulay and T.F. Quatieri, "Speech Analysis/Synthesis Based on a Sinusoidal Representation," IEEE Transactions * on Acoustics, Speech and Signal Processing, vol. ASSP-34, no. 4, August 1986. * * @author Oytun Türk */ public class TrackGenerator { public static float ZERO_AMP_SHIFT_IN_SECONDS = 0.001f; // Time instant before/after current time to insert a turning-on/off // event // The amplitudes and synthesis freqs/phases are accordingly // interpolated to provide a smooth transition public static int MEAN_FILTER_FREQ_AXIS = 3; // Median filter each tracks frequency values public static int MEAN_FILTER_AMP_AXIS = 3; // Median filter each tracks amplitude values public TrackGenerator() { } /* * Group individual sinusoids into tracks by considering closeness in frequency Current version is a simple implementation of * checking the frequency difference between neighbouring sinusoids and assigning them to same track if the absolute * difference is less than a threshold Possible ways to improve this process would be to employ: - constraints on amplitude * continuity - constraints on phase continuity (i.e. the phase difference between two consecutive sinusoids should not be * larger or smaller than some percent of the period * * framesSins[i][] : Array of sinusoidal parameters (amps, freqs, phases) extracted from ith speech frame framesSins[i][j]: * Sinusoidal parameters of the jth peak sinusoid in the DFT spectrum of speech frame i Returns a number of sinusoidal tracks * * This version uses a simple search mechanism to compare a current sinusoid frequecny with the previous and if the difference * is smaller than +-deltaInHz, assigns the new sinusoid to the previous sinusoid´s track In the assignment, longer previous * paths are favoured in a weighted manner, i.e. the longer a candidate track, the more likely the current sinusoid gets * assigned to that track */ public SinusoidalTracks generateTracks(NonharmonicSinusoidalSpeechSignal sinSignal, float deltaInHz, int samplingRate) { int numFrames = sinSignal.framesSins.length; float deltaInRadians = SignalProcUtils.hz2radian(deltaInHz, samplingRate); SinusoidalTracks tr = null; int i; Sinusoid zeroAmpSin; if (numFrames > 0) { int j, k; float tmpDist, minDist; int trackInd; boolean[] bSinAssigneds = null; for (i = 0; i < numFrames; i++) { if (tr == null) // If no tracks yet, assign the current sinusoids to new tracks { tr = new SinusoidalTracks(sinSignal.framesSins[i].sinusoids.length, samplingRate); tr.setSysAmpsAndTimes(sinSignal.framesSins); for (j = 0; j < sinSignal.framesSins[i].sinusoids.length; j++) { // First add a zero amplitude sinusoid at previous time instant to allow smooth synthesis (i.e. // "turning on" the track) zeroAmpSin = new Sinusoid(0.0f, sinSignal.framesSins[i].sinusoids[j].freq, 0.0f, Sinusoid.NON_EXISTING_FRAME_INDEX); tr.add(new SinusoidalTrack(sinSignal.framesSins[i].time - ZERO_AMP_SHIFT_IN_SECONDS, zeroAmpSin, sinSignal.framesSins[i].maxFreqOfVoicing, SinusoidalTrack.TURNED_ON)); // tr.tracks[tr.currentIndex].add(sinSignal.framesSins[i].time, sinSignal.framesSins[i].sinusoids[j], sinSignal.framesSins[i].maxFreqOfVoicing, SinusoidalTrack.ACTIVE); } } else // If there are tracks, first check "continuations" by checking whether a given sinusoid is in the // +-deltaInRadians neighbourhood of the previous track. // Those tracks that do not continue are "turned off". // All sinusoids of the current frame that are not assigned to any of the "continuations" or "turned off" // are "birth"s of new tracks. { for (j = 0; j < tr.currentIndex + 1; j++) { if (tr.tracks[j] != null) tr.tracks[j].resetCandidate(); } bSinAssigneds = new boolean[sinSignal.framesSins[i].sinusoids.length]; // Continuations: for (k = 0; k < sinSignal.framesSins[i].sinusoids.length; k++) { minDist = Math.abs(sinSignal.framesSins[i].sinusoids[k].freq - tr.tracks[0].freqs[tr.tracks[0].currentIndex]); if (minDist < deltaInRadians) trackInd = 0; else trackInd = -1; for (j = 1; j < tr.currentIndex + 1; j++) { tmpDist = Math.abs(sinSignal.framesSins[i].sinusoids[k].freq - tr.tracks[j].freqs[tr.tracks[j].currentIndex]); if (tmpDist < deltaInRadians && (trackInd == -1 || tmpDist < minDist)) { minDist = tmpDist; trackInd = j; } } if (trackInd > -1) { if (tr.tracks[trackInd].newCandidateInd > -1) bSinAssigneds[tr.tracks[trackInd].newCandidateInd] = false; tr.tracks[trackInd].newCandidate = new Sinusoid(sinSignal.framesSins[i].sinusoids[k]); tr.tracks[trackInd].newCandidateInd = k; bSinAssigneds[k] = true; // The sinusoid might be assigned to an existing track provided that a closer // sinusoid is not found } else bSinAssigneds[k] = false; // This is the birth of a new track since it does not match any existing // tracks } // Here is the actual assignment of sinusoids to existing tracks for (j = 0; j < tr.currentIndex + 1; j++) { if (tr.tracks[j].newCandidate != null) { Sinusoid tmpSin = new Sinusoid(tr.tracks[j].newCandidate); if (tr.tracks[j].states[tr.tracks[j].currentIndex] != SinusoidalTrack.ACTIVE) { zeroAmpSin = new Sinusoid(0.0f, tr.tracks[j].freqs[tr.tracks[j].totalSins - 1], 0.0f, Sinusoid.NON_EXISTING_FRAME_INDEX); tr.tracks[j].add(sinSignal.framesSins[i].time - ZERO_AMP_SHIFT_IN_SECONDS, zeroAmpSin, sinSignal.framesSins[i].maxFreqOfVoicing, SinusoidalTrack.TURNED_ON); } tr.tracks[j].add(sinSignal.framesSins[i].time, tmpSin, sinSignal.framesSins[i].maxFreqOfVoicing, SinusoidalTrack.ACTIVE); } else // Turn off tracks that are not assigned any new sinusoid { if (tr.tracks[j].states[tr.tracks[j].currentIndex] != SinusoidalTrack.TURNED_OFF) { zeroAmpSin = new Sinusoid(0.0f, tr.tracks[j].freqs[tr.tracks[j].totalSins - 1], 0.0f, Sinusoid.NON_EXISTING_FRAME_INDEX); tr.tracks[j].add(sinSignal.framesSins[i].time + ZERO_AMP_SHIFT_IN_SECONDS, zeroAmpSin, sinSignal.framesSins[i].maxFreqOfVoicing, SinusoidalTrack.TURNED_OFF); } } } // Births: Create new tracks from sinusoids that are not assigned to existing tracks for (k = 0; k < bSinAssigneds.length; k++) { if (!bSinAssigneds[k]) { // First add a zero amplitude sinusoid to previous frame to allow smooth synthesis (i.e. "turning on" // the track) zeroAmpSin = new Sinusoid(0.0f, sinSignal.framesSins[i].sinusoids[k].freq, 0.0f, Sinusoid.NON_EXISTING_FRAME_INDEX); tr.add(new SinusoidalTrack(sinSignal.framesSins[i].time - ZERO_AMP_SHIFT_IN_SECONDS, zeroAmpSin, sinSignal.framesSins[i].maxFreqOfVoicing, SinusoidalTrack.TURNED_ON)); // tr.tracks[tr.currentIndex].add(sinSignal.framesSins[i].time, sinSignal.framesSins[i].sinusoids[k], sinSignal.framesSins[i].maxFreqOfVoicing, SinusoidalTrack.ACTIVE); } } System.out.println("Track generation using frame " + String.valueOf(i + 1) + " of " + String.valueOf(numFrames)); } // Turn-off all active tracks after the last speech frame if (i == numFrames - 1) { for (j = 0; j < tr.currentIndex + 1; j++) { if (Math.abs(sinSignal.framesSins[i].time - tr.tracks[j].times[tr.tracks[j].totalSins - 1]) < ZERO_AMP_SHIFT_IN_SECONDS) { if (tr.tracks[j].states[tr.tracks[j].currentIndex] == SinusoidalTrack.ACTIVE) { zeroAmpSin = new Sinusoid(0.0f, tr.tracks[j].freqs[tr.tracks[j].totalSins - 1], 0.0f, Sinusoid.NON_EXISTING_FRAME_INDEX); tr.tracks[j].add(sinSignal.framesSins[i].time + ZERO_AMP_SHIFT_IN_SECONDS, zeroAmpSin, sinSignal.framesSins[i].maxFreqOfVoicing, SinusoidalTrack.TURNED_OFF); } } } } // } } for (i = 0; i <= tr.currentIndex; i++) tr.tracks[i].correctTrack(); tr.setOriginalDurationManual(sinSignal.originalDurationInSeconds); SinusoidalTracks trOut = new SinusoidalTracks(tr, 0, tr.currentIndex); trOut = postProcess(trOut); return trOut; } // Simple median filtering along frequencies and amplitudes public static SinusoidalTracks postProcess(SinusoidalTracks st) { for (int i = 0; i < st.totalTracks; i++) { if (st.tracks[i].totalSins > 20) { st.tracks[i].freqs = SignalProcUtils.meanFilter(st.tracks[i].freqs, MEAN_FILTER_FREQ_AXIS); st.tracks[i].amps = SignalProcUtils.meanFilter(st.tracks[i].amps, MEAN_FILTER_AMP_AXIS); } } return st; } }