/**
* Copyright 2004-2006 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.signalproc.process;
import java.io.File;
import java.util.Arrays;
import javax.sound.sampled.AudioFileFormat;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import marytts.signalproc.display.FunctionGraph;
import marytts.signalproc.display.SignalGraph;
import marytts.signalproc.window.Window;
import marytts.util.data.BlockwiseDoubleDataSource;
import marytts.util.data.BufferedDoubleDataSource;
import marytts.util.data.DoubleDataSource;
import marytts.util.data.SequenceDoubleDataSource;
import marytts.util.data.audio.AudioDoubleDataSource;
import marytts.util.data.audio.DDSAudioInputStream;
import marytts.util.math.MathUtils;
/**
* Compute the overlap-add of a framewise-processed input signal, with optional time stretching (in subclasses). The OLA algorithm
* works as follows. 1. Assuming an input frameshift of 1/12th of the frame length, and a signal length equal to (frame
* length+4*frameshift), we cover the input data as follows: (+=valid data points, -=zero, |=valid data point at start/end of
* input)
*
* <pre>
* ---|++++++++
* --|+++++++++
* -|++++++++++
* |+++++++++++
* ++++++++++++
* ++++++++++++
* ++++++++++++
* +++++++++++|
* ++++++++++|-
* +++++++++|--
* ++++++++|---
* </pre>
*
* With a synthesis frameshift of 1/4th of the frame length, implying that four frames need to be overlapped to reconstruct the
* signal, this becomes:
*
* <pre>
* ---|++++++++
* --|+++++++++
* -|++++++++++
* |+++++++++++ *** first usable
* ++++++++++++
* ++++++++++++
* ++++++++++++
* +++++++++++|
* ++++++++++|-
* +++++++++|--
* ++++++++|--- *** last usable: first 3 of this
* </pre>
*
* It can be seen that three times the input frameshift needs to be zero-padded before the signal, and discarded to reach proper
* signal reconstruction.
*
* Similarly, the last frame to be used is the one to which three times the input shift has been zero-padded; only the first
* output frameshift samples of it can be used.
*
* 2. Assuming an input frameshift of 1/24th of the frame length, and a signal length equal to (frame length+4*frameshift), we
* cover the input data as follows: (+=valid data points, -=zero, |=valid data point at start/end of input)
*
* <pre>
* --------|+++++++++++++++
* -------|++++++++++++++++
* ------|+++++++++++++++++
* -----|++++++++++++++++++
* ----|+++++++++++++++++++
* ---|++++++++++++++++++++
* --|+++++++++++++++++++++
* -|++++++++++++++++++++++
* |+++++++++++++++++++++++
* ++++++++++++++++++++++++
* ++++++++++++++++++++++++
* ++++++++++++++++++++++++
* +++++++++++++++++++++++|
* ++++++++++++++++++++++|-
* +++++++++++++++++++++|--
* ++++++++++++++++++++|---
* +++++++++++++++++++|----
* ++++++++++++++++++|-----
* +++++++++++++++++|------
* ++++++++++++++++|-------
* +++++++++++++++|--------
* </pre>
*
* With a synthesis frameshift of 1/8th of the frame length, implying that eight frames need to be overlapped to reconstruct the
* signal, this becomes:
*
* <pre>
* -------|++++++++++++++++
* ------|+++++++++++++++++
* -----|++++++++++++++++++
* ----|+++++++++++++++++++
* ---|++++++++++++++++++++
* --|+++++++++++++++++++++
* -|++++++++++++++++++++++
* |+++++++++++++++++++++++ *** first usable
* ++++++++++++++++++++++++
* ++++++++++++++++++++++++
* ++++++++++++++++++++++++
* +++++++++++++++++++++++|
* ++++++++++++++++++++++|-
* +++++++++++++++++++++|--
* ++++++++++++++++++++|---
* +++++++++++++++++++|----
* ++++++++++++++++++|-----
* +++++++++++++++++|------
* ++++++++++++++++|------- *** last usable: first 3 of this
* </pre>
*
* It can be seen that seven times the input frameshift needs to be zero-padded before the signal, and discarded to reach proper
* signal reconstruction.
*
* Similarly, the last frame to be used is the one to which seven times the input shift has been zero-padded; only the first
* output frameshift samples of it can be used.
*
* 3. Assuming an input frameshift of 1/3rd of the frame length, and a signal length equal to (frame length+4*frameshift), we
* cover the input data as follows: (+=valid data points, -=zero, |=valid data point at start/end of input)
*
* <pre>
* --------|+++
* ----|+++++++
* |+++++++++++
* ++++++++++++
* ++++++++++++
* ++++++++++++
* +++++++++++|
* +++++++|----
* +++|--------
* </pre>
*
* With a synthesis frameshift of 1/4th of the frame length, implying that four frames need to be overlapped to reconstruct the
* signal, this becomes:
*
* <pre>
* --------|+++
* ----|+++++++
* |+++++++++++
* ++++++++++++ *** first usable
* ++++++++++++
* ++++++++++++
* +++++++++++|
* +++++++|----
* +++|-------- *** last usable: first 3 of this
* </pre>
*
* It can be seen that only two times the input frameshift needs to be zero-padded before the signal; nevertheless, the first
 * three frames need to be processed but discarded to reach proper signal reconstruction.
*
* Similarly, the last frame to be used is the one to which two times the input shift has been zero-padded; only the first output
* frameshift samples of it can be used.
*
 * 4. Assuming an input frameshift of 1/2 of the frame length, and a signal length equal to (frame length+4*frameshift), we
* cover the input data as follows: (+=valid data points, -=zero, |=valid data point at start/end of input)
*
* <pre>
* ------|+++++
* |+++++++++++
* ++++++++++++
* ++++++++++++
* ++++++++++++
* +++++++++++|
* +++++|------
* </pre>
*
* With a synthesis frameshift of 1/4th of the frame length, implying that four frames need to be overlapped to reconstruct the
* signal, this becomes:
*
* <pre>
* ------|+++++
* |+++++++++++
* ++++++++++++
* ++++++++++++ *** first usable
* ++++++++++++
* +++++++++++|
* +++++|------ *** last usable: first 3 of this
* </pre>
*
* It can be seen that only two times the input frameshift needs to be zero-padded before the signal; nevertheless, the first
 * three frames need to be processed but discarded to reach proper signal reconstruction.
*
* Similarly, the last frame to be used is the one to which two times the input shift has been zero-padded; only the first output
* frameshift samples of it can be used.
*
 * Generalising: Let ro be the output overlap ratio, ro = output frameshift / framelength, and ri be the input overlap ratio, ri =
* input frameshift / framelength, then n = 1/(1-ro) is the number of frames to be overlapped so that the signal is reconstructed.
* The amount of zeroes to be padded before and after the signal is (n-1)*input frameshift, or in the case of speeding up,
* (m-1)*input frameshift where m = 1/ri. (n-1) frames must be read and discarded before the actual data. If the signal length can
* be described as l = framelength + n*frameshift, exactly output frameshift samples are to be used from the last frame. If the
* signal is a bit shorter, i.e. l = framelength + n*frameshift - delta, then (output frameshift - delta) samples can be read from
* the last frame.
*
* @author Marc Schröder
*/
public class FrameOverlapAddSource extends BlockwiseDoubleDataSource {
public static final int DEFAULT_WINDOWTYPE = Window.HANNING;
protected FrameProvider frameProvider;
protected Window outputWindow;
protected double[] memory;
protected InlineDataProcessor processor;
/**
* Default constructor for subclasses who want to call initialise() themselves.
*/
protected FrameOverlapAddSource() {
super(null, 0); // need to set blockSize right later
}
public FrameOverlapAddSource(DoubleDataSource inputSource, int frameLength, int samplingRate, InlineDataProcessor processor) {
this(inputSource, DEFAULT_WINDOWTYPE, false, frameLength, samplingRate, processor);
}
public FrameOverlapAddSource(DoubleDataSource inputSource, int windowType, boolean applySynthesisWindow, int frameLength,
int samplingRate, InlineDataProcessor processor) {
super(null, 0); // need to set blockSize right later
initialise(inputSource, windowType, applySynthesisWindow, frameLength, samplingRate, processor);
}
/**
* To be called by constructor in order to set up this frame overlap add source.
*
* @param inputSource
* input source
* @param windowType
* window type
* @param applySynthesisWindow
* apply synthesis window
* @param frameLength
* frame length
* @param samplingRate
* sampling rate
* @param processor
* processor
*/
protected void initialise(DoubleDataSource inputSource, int windowType, boolean applySynthesisWindow, int frameLength,
int samplingRate, InlineDataProcessor processor) {
double overlapFraction;
double prescale = 1;
switch (windowType) {
case Window.HANNING:
overlapFraction = 0.75;
// Prescale to allow for perfect restitution for rate factor 1:
// If we overlap-add simple hann windows by 3/4, we increase the amplitude by 2;
// if we overlap-add squared hann windows by 3/4, we increase the amplitude by 1.5.
// for an overlap ratio of 7/8, these values are twice as large.
double onceOrTwice = 0.25 / (1 - overlapFraction); // == 2 for overlap 7/8, 1 for overlap 3/4
prescale = applySynthesisWindow ? Math.sqrt(2. / 3 / onceOrTwice) : 0.5 / onceOrTwice;
break;
case Window.BLACKMAN:
case Window.HAMMING:
overlapFraction = 0.875;
break;
default:
throw new IllegalArgumentException("Window type not supported");
}
// output frameshift is constrained by window type and frame length:
this.blockSize = (int) (frameLength * (1 - overlapFraction));
int inputFrameshift = getInputFrameshift(blockSize);
// System.err.println("Blocksize: "+blockSize+", inputFrameshift: "+inputFrameshift);
Window window = Window.get(windowType, frameLength + 1, prescale);
if (applySynthesisWindow)
this.outputWindow = window;
else
this.outputWindow = null;
this.memory = new double[frameLength];
// This is used when the last input frame has already been read,
// to do the last frame output properly:
this.processor = processor;
// We need to feed through (and discard) 3 (if overlapFraction == 3/4)
// blocks of zeroes, so that the first three blocks are properly rebuilt.
int nBlocks = (int) (1 / (1 - overlapFraction)) - 1;
// If we insist on 4-fold overlap for speeding up, we need to
// feed in less zeroes: (m-1)*inputFrameshift, where m = frameLength/inputFrameshift.
int m = frameLength / inputFrameshift;
int nZeroes = nBlocks * inputFrameshift < frameLength ? nBlocks : (m - 1);
DoubleDataSource padding1 = new BufferedDoubleDataSource(new double[nZeroes * inputFrameshift]);
DoubleDataSource padding2 = new BufferedDoubleDataSource(new double[nZeroes * inputFrameshift]);
DoubleDataSource paddedSource = new SequenceDoubleDataSource(new DoubleDataSource[] { padding1, inputSource, padding2 });
this.frameProvider = new FrameProvider(paddedSource, window, frameLength, inputFrameshift, samplingRate, true);
double[] dummy = new double[blockSize];
for (int i = 0; i < nBlocks; i++) {
// System.err.println("Discarding "+blockSize+" samples:");
getData(dummy, 0, blockSize);
}
this.frameProvider.resetInternalTimer();
}
/**
* Get the next frame of input data. This method is called by prepareBlock() when preparing the output data to be read. This
* implementation simply reads the data from the frameProvider.
*
* @return the next frame of frameProvider
*/
protected double[] getNextFrame() {
return frameProvider.getNextFrame();
}
/**
* Prepare one block of data for output. This method is called from the superclass before readBlock() is called.
*/
protected void prepareBlock() {
double[] frame = getNextFrame();
if (frame == null)
return;
int frameLength = frameProvider.getFrameLengthSamples();
if (processor != null)
processor.applyInline(frame, 0, frameLength);
if (outputWindow != null)
outputWindow.applyInline(frame, 0, frameLength);
// Extend memory if necessary:
if (memory.length < frameLength) {
double[] oldMemory = memory;
memory = new double[frameLength];
System.arraycopy(oldMemory, 0, memory, 0, oldMemory.length);
}
// The overlap-add part:
for (int i = 0; i < frameLength; i++) {
memory[i] += frame[i];
}
}
protected int getBlockSize() {
return blockSize;
}
/**
* Provide a block of data. This method is called from the superclass when data is requested. Note that prepareBlock() will be
* called before this.
*/
protected int readBlock(double[] target, int targetPos) {
// Now, the first blockSize samples can be output:
int blockSize = getBlockSize();
int validSamplesInFrame = frameProvider.validSamplesInFrame();
// System.err.println("OLA: valid samples in current frame: "+validSamplesInFrame);
int frameLength = frameProvider.getFrameLengthSamples();
if (validSamplesInFrame < frameLength) {
assert !frameProvider.hasMoreData();
// assert frameLength-validSamplesInFrame < frameProvider.getFrameShiftSamples();
// But in the case of speeding up, frameLength-validSamplesInFrame
// can still be > blockSize; in that case, copy only blockSize samples,
// and discard the rest.
int nCopied;
if (blockSize < (frameLength - validSamplesInFrame)) {
nCopied = blockSize;
} else {
nCopied = blockSize - (frameLength - validSamplesInFrame);
}
assert nCopied > 0; // otherwise someone should notice we should not be called
// System.err.println("OLA: Outputting last frame: "+nCopied+" samples ("+blockSize+","+frameLength+","+validSamplesInFrame+")");
System.arraycopy(memory, 0, target, targetPos, nCopied);
return nCopied;
} else {
// System.err.println("OLA: Outputting normal frame: "+blockSize+" samples (keeping "+(validSamplesInFrame-blockSize)+")");
System.arraycopy(memory, 0, target, targetPos, blockSize);
// Shift the data left in memory:
System.arraycopy(memory, blockSize, memory, 0, memory.length - blockSize);
Arrays.fill(memory, memory.length - blockSize, memory.length, 0);
return blockSize;
}
}
protected int getInputFrameshift(int outputFrameshift) {
return outputFrameshift; // default: inputFrameshift == outputFrameshift
}
public boolean hasMoreData() {
return frameProvider.hasMoreData();
}
public static void main(String[] args) throws Exception {
for (int i = 0; i < args.length; i++) {
AudioInputStream inputAudio = AudioSystem.getAudioInputStream(new File(args[i]));
int samplingRate = (int) inputAudio.getFormat().getSampleRate();
double[] signal = new AudioDoubleDataSource(inputAudio).getAllData();
FunctionGraph signalGraph = new SignalGraph(signal, samplingRate);
signalGraph.showInJFrame("signal", true, true);
FrameOverlapAddSource ola = new FrameOverlapAddSource(new BufferedDoubleDataSource(signal), 2048, samplingRate, null);
double[] result = ola.getAllData();
FunctionGraph resultGraph = new SignalGraph(result, samplingRate);
resultGraph.showInJFrame("result", true, true);
System.err.println("Signal has length " + signal.length + ", result " + result.length);
double err = MathUtils.sumSquaredError(signal, result);
System.err.println("Sum squared error: " + err);
double[] difference = MathUtils.subtract(signal, result);
FunctionGraph diffGraph = new SignalGraph(difference, samplingRate);
diffGraph.showInJFrame("difference", true, true);
DDSAudioInputStream outputAudio = new DDSAudioInputStream(new BufferedDoubleDataSource(ola), inputAudio.getFormat());
String outFileName = args[i].substring(0, args[i].length() - 4) + "_copy.wav";
AudioSystem.write(outputAudio, AudioFileFormat.Type.WAVE, new File(outFileName));
}
}
}