/**
* Copyright 2000-2007 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.util.data.audio;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.util.List;
import javax.sound.sampled.AudioFileFormat;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.UnsupportedAudioFileException;
import marytts.signalproc.analysis.EnergyAnalyser;
import marytts.signalproc.filter.LowPassFilter;
import marytts.util.data.BufferedDoubleDataSource;
import marytts.util.data.DoubleDataSource;
import marytts.util.signal.SignalProcUtils;
/**
*
* Audio Convertion Utilities
*
* @author Sathish Chandra Pammi
*
*/
public class AudioConverterUtils {
public static class SequenceAudioProcessor implements AudioProcessor {
private List<AudioProcessor> procs;
public SequenceAudioProcessor(List<AudioProcessor> procs) {
this.procs = procs;
}
public AudioInputStream apply(AudioInputStream ais) {
AudioInputStream soFar = ais;
for (AudioProcessor p : procs) {
soFar = p.apply(soFar);
}
return soFar;
}
}
public static class Stereo2Mono implements AudioProcessor {
private int mode;
/**
* Convert a stereo audio input stream to a mono audio input stream, using both channels.
*/
public Stereo2Mono() {
this(AudioPlayer.STEREO);
}
/**
* Convert a stereo audio input stream, using the channels as indicated by mode.
*
* @param mode
* AudioPlayer.LEFT_ONLY, AudioPlayer.RIGHT_ONLY or AudioPlayer.STEREO.
*/
public Stereo2Mono(int mode) {
this.mode = mode;
}
public AudioInputStream apply(AudioInputStream ais) {
return new MonoAudioInputStream(ais, mode);
}
}
/**
* A high-pass filter with flexible cutoff frequency and transition bandwidth.
*
* @author marc
*
*/
public static class HighPassFilter implements AudioProcessor {
private double cutoffFrequency;
private double transitionBandwidth;
public HighPassFilter(double cutoffFrequency, double transitionBandwidth) {
this.cutoffFrequency = cutoffFrequency;
this.transitionBandwidth = transitionBandwidth;
}
public AudioInputStream apply(AudioInputStream ais) {
float samplingRate = ais.getFormat().getSampleRate();
double cutOff = cutoffFrequency / samplingRate;
double transition = transitionBandwidth / samplingRate;
marytts.signalproc.filter.HighPassFilter hFilter = new marytts.signalproc.filter.HighPassFilter(cutOff, transition);
DoubleDataSource audio = new AudioDoubleDataSource(ais);
DoubleDataSource filtered = hFilter.apply(audio);
return new DDSAudioInputStream(filtered, ais.getFormat());
}
}
/**
* 24-Bit Audio to 16-bit Audio converter
*
* @param ais
* ais
* @return AudioInputStream audio input stream
* @throws Exception
* exception
*/
public static AudioInputStream convertBit24ToBit16(AudioInputStream ais) throws Exception {
int bitsPerSample = 24;
int targetBitsPerSample = 16;
int noOfbitsPerSample = ais.getFormat().getSampleSizeInBits();
if (noOfbitsPerSample != bitsPerSample) {
throw new Exception("24-Bit Audio Data Expected. But given Audio Data is " + noOfbitsPerSample + "-Bit data");
}
if (ais.getFormat().getChannels() != 1) {
throw new Exception("Expected Audio type is Mono. But given Audio Data has " + ais.getFormat().getChannels()
+ " channels");
}
float samplingRate = ais.getFormat().getSampleRate();
int channels = ais.getFormat().getChannels();
int nBytes = ais.available();
boolean bigEndian = ais.getFormat().isBigEndian();
byte[] byteBuf = new byte[nBytes];
int nBytesRead = ais.read(byteBuf, 0, nBytes); // Reading all Bytes at a time
int currentPos = 0;
int noOfSamples = nBytes / 3;
int[] sample = new int[noOfSamples];
for (int i = 0; i < nBytesRead; i += 3, currentPos++) {
byte lobyte;
byte midbyte;
byte hibyte;
if (!bigEndian) {
lobyte = byteBuf[i];
midbyte = byteBuf[i + 1];
hibyte = byteBuf[i + 2];
} else {
lobyte = byteBuf[i + 2];
midbyte = byteBuf[i + 1];
hibyte = byteBuf[i];
}
sample[currentPos] = hibyte << 16 | (midbyte & 0xFF) << 8 | lobyte & 0xFF;
}
int maxBitPos = 0;
int valueAfterShift;
for (int i = 0; i < sample.length; i++) {
for (int j = bitsPerSample; j >= 1; j--) {
valueAfterShift = Math.abs(sample[i]) >> j;
if (valueAfterShift != 0) {
if (maxBitPos < j)
maxBitPos = j;
break;
}
}
}
int shiftBits = maxBitPos - targetBitsPerSample + 2; // need to change 24 to 16
int sign;
for (int i = 0; (shiftBits > 0 && i < sample.length); i++) {
if (sample[i] < 0)
sign = -1;
else
sign = 1;
sample[i] = sign * (Math.abs(sample[i]) >> shiftBits);
}
currentPos = 0; // off
int nRead = sample.length;
byte[] b = new byte[2 * sample.length];
int MAX_AMPLITUDE = 32767;
// Conversion to BYTE ARRAY
for (int i = 0; i < nRead; i++, currentPos += 2) {
int samp = sample[i];
if (samp > MAX_AMPLITUDE || samp < -MAX_AMPLITUDE) {
System.err.println("Warning: signal amplitude out of range: " + samp);
}
byte hibyte = (byte) (samp >> 8);
byte lobyte = (byte) (samp & 0xFF);
if (!bigEndian) {
b[currentPos] = lobyte;
b[currentPos + 1] = hibyte;
} else {
b[currentPos] = hibyte;
b[currentPos + 1] = lobyte;
}
}
ByteArrayInputStream bais = new ByteArrayInputStream(b);
boolean signed = true; // true,false
AudioFormat af = new AudioFormat(samplingRate, targetBitsPerSample, channels, signed, bigEndian);
long lengthInSamples = b.length / (targetBitsPerSample / 8);
return new AudioInputStream(bais, af, lengthInSamples);
}
/**
* 24-Bit Audio to 16-bit Audio converter
*
* @param ais
* ais
* @param shiftBits
* shift bits
* @return AudioInputStream
* @throws Exception
* exception
*/
public static AudioInputStream convertBit24ToBit16(AudioInputStream ais, int shiftBits) throws Exception {
int bitsPerSample = 24;
int targetBitsPerSample = 16;
int noOfbitsPerSample = ais.getFormat().getSampleSizeInBits();
if (noOfbitsPerSample != bitsPerSample) {
throw new Exception("24-Bit Audio Data Expected. But given Audio Data is " + noOfbitsPerSample + "-Bit data");
}
if (ais.getFormat().getChannels() != 1) {
throw new Exception("Expected Audio type is Mono. But given Audio Data has " + ais.getFormat().getChannels()
+ " channels");
}
// System.out.println("Shift bits: "+shiftBits);
float samplingRate = ais.getFormat().getSampleRate();
int channels = ais.getFormat().getChannels();
int nBytes = ais.available();
boolean bigEndian = ais.getFormat().isBigEndian();
byte[] byteBuf = new byte[nBytes];
int nBytesRead = ais.read(byteBuf, 0, nBytes); // Reading all Bytes at a time
int currentPos = 0;
int noOfSamples = nBytes / 3;
int[] sample = new int[noOfSamples];
for (int i = 0; i < nBytesRead; i += 3, currentPos++) {
byte lobyte;
byte midbyte;
byte hibyte;
if (!bigEndian) {
lobyte = byteBuf[i];
midbyte = byteBuf[i + 1];
hibyte = byteBuf[i + 2];
} else {
lobyte = byteBuf[i + 2];
midbyte = byteBuf[i + 1];
hibyte = byteBuf[i];
}
sample[currentPos] = hibyte << 16 | (midbyte & 0xFF) << 8 | lobyte & 0xFF;
}
int sign;
for (int i = 0; (shiftBits > 0 && i < sample.length); i++) {
if (sample[i] < 0)
sign = -1;
else
sign = 1;
sample[i] = sign * (Math.abs(sample[i]) >> shiftBits);
}
currentPos = 0; // off
int nRead = sample.length;
byte[] b = new byte[2 * sample.length];
int MAX_AMPLITUDE = 32767;
// Conversion to BYTE ARRAY
for (int i = 0; i < nRead; i++, currentPos += 2) {
int samp = sample[i];
if (samp > MAX_AMPLITUDE || samp < -MAX_AMPLITUDE) {
System.err.println("Warning: signal amplitude out of range: " + samp);
}
byte hibyte = (byte) (samp >> 8);
byte lobyte = (byte) (samp & 0xFF);
if (!bigEndian) {
b[currentPos] = lobyte;
b[currentPos + 1] = hibyte;
} else {
b[currentPos] = hibyte;
b[currentPos + 1] = lobyte;
}
}
ByteArrayInputStream bais = new ByteArrayInputStream(b);
boolean signed = true; // true,false
AudioFormat af = new AudioFormat(samplingRate, targetBitsPerSample, channels, signed, bigEndian);
long lengthInSamples = b.length / (targetBitsPerSample / 8);
return new AudioInputStream(bais, af, lengthInSamples);
}
/**
* Get samples in Integer Format (un-normalized) from AudioInputStream
*
* @param ais
* ais
* @return samples
* @throws Exception
* exception
*/
public static int[] getSamples(AudioInputStream ais) throws Exception {
int noOfbitsPerSample = ais.getFormat().getSampleSizeInBits();
float samplingRate = ais.getFormat().getSampleRate();
int channels = ais.getFormat().getChannels();
int nBytes = ais.available();
boolean bigEndian = ais.getFormat().isBigEndian();
byte[] byteBuf = new byte[nBytes];
int nBytesRead = ais.read(byteBuf, 0, nBytes); // Reading all Bytes at a time
int noOfBytesPerSample = noOfbitsPerSample / 8;
int[] samples = new int[nBytes / noOfBytesPerSample];
int currentPos = 0; // off
if (noOfBytesPerSample == 1) {
for (int i = 0; i < nBytesRead; i++, currentPos++) {
samples[currentPos] = (byteBuf[i] << 8);
}
} else if (noOfBytesPerSample == 2) { // 16 bit
for (int i = 0; i < nBytesRead; i += 2, currentPos++) {
int sample;
byte lobyte;
byte hibyte;
if (!bigEndian) {
lobyte = byteBuf[i];
hibyte = byteBuf[i + 1];
} else {
lobyte = byteBuf[i + 1];
hibyte = byteBuf[i];
}
samples[currentPos] = hibyte << 8 | lobyte & 0xFF;
}
} else { // noOfBytesPerSample == 3, i.e. 24 bit
for (int i = 0; i < nBytesRead; i += 3, currentPos++) {
int sample;
byte lobyte;
byte midbyte;
byte hibyte;
if (!bigEndian) {
lobyte = byteBuf[i];
midbyte = byteBuf[i + 1];
hibyte = byteBuf[i + 2];
} else {
lobyte = byteBuf[i + 2];
midbyte = byteBuf[i + 1];
hibyte = byteBuf[i];
}
samples[currentPos] = hibyte << 16 | (midbyte & 0xFF) << 8 | lobyte & 0xFF;
}
}
return samples;
}
/**
* DownSampling given Audio Input Stream
*
* @param ais
* ais
* @param targetSamplingRate
* target sampling rate
* @return oais
* @throws Exception
* exception
*/
public static AudioInputStream downSampling(AudioInputStream ais, int targetSamplingRate) throws Exception {
float currentSamplingRate = ais.getFormat().getSampleRate();
if (targetSamplingRate >= currentSamplingRate) {
throw new Exception("Requested sampling rate " + targetSamplingRate
+ " is greater than or equal to Audio sampling rate " + currentSamplingRate);
}
int noOfbitsPerSample = ais.getFormat().getSampleSizeInBits();
int channels = ais.getFormat().getChannels();
int nBytes = ais.available();
boolean bigEndian = ais.getFormat().isBigEndian();
double[] samples = new AudioDoubleDataSource(ais).getAllData();
// **** Filtering to Remove Aliasing ******
double filterCutof = 0.5 * (double) targetSamplingRate / currentSamplingRate;
// System.out.println("filterCutof: "+filterCutof);
LowPassFilter filter = new LowPassFilter(filterCutof);
samples = filter.apply(samples);
double duration = (double) samples.length / currentSamplingRate;
// System.out.println("duration: "+duration);
int newSampleLen = (int) Math.floor(duration * targetSamplingRate);
// System.out.println("New Sample Length: "+newSampleLen);
double fraction = (double) currentSamplingRate / targetSamplingRate;
// System.out.println("Fraction: "+fraction);
double[] newSignal = new double[newSampleLen];
for (int i = 0; i < newSignal.length; i++) {
double posIdx = fraction * i;
int nVal = (int) Math.floor(posIdx);
double diffVal = posIdx - nVal;
// Linear Interpolation
newSignal[i] = (diffVal * samples[nVal + 1]) + ((1 - diffVal) * samples[nVal]);
}
boolean signed = true; // true,false
AudioFormat af = new AudioFormat(targetSamplingRate, noOfbitsPerSample, channels, signed, bigEndian);
DDSAudioInputStream oais = new DDSAudioInputStream(new BufferedDoubleDataSource(newSignal), af);
return oais;
}
/**
* Removes endpoints from given file.
*
* @param inputFile
* input file
* @param outputFile
* output file
* @param energyBufferLength
* energyBufferLength
* @param speechStartLikelihood
* speechStartLikelihood
* @param speechEndLikelihood
* speechEndLikelihood
* @param shiftFromMinimumEnergyCenter
* shiftFromMinimumEnergyCenter
* @param numClusters
* numClusters
* @param minimumStartSilenceInSeconds
* minimumStartSilenceInSeconds
* @param minimumEndSilenceInSeconds
* minimumEndSilenceInSeconds
* @throws IOException
* IOException
* @throws UnsupportedAudioFileException
* UnsupportedAudioFileException
*/
public static void removeEndpoints(String inputFile, String outputFile, int energyBufferLength, double speechStartLikelihood,
double speechEndLikelihood, double shiftFromMinimumEnergyCenter, int numClusters,
double minimumStartSilenceInSeconds, double minimumEndSilenceInSeconds) throws IOException,
UnsupportedAudioFileException {
/*
* 1. identify and remove end points 2. make sure at least some desired amount of silence in the beginning and at the end
* 3. store as output wavefile
*/
AudioInputStream ais = AudioSystem.getAudioInputStream(new File(inputFile));
if (!ais.getFormat().getEncoding().equals(AudioFormat.Encoding.PCM_SIGNED)) {
ais = AudioSystem.getAudioInputStream(AudioFormat.Encoding.PCM_SIGNED, ais);
}
if (ais.getFormat().getChannels() > 1) {
throw new IllegalArgumentException("Can only deal with mono audio signals");
}
int samplingRate = (int) ais.getFormat().getSampleRate();
DoubleDataSource signal = new AudioDoubleDataSource(ais);
int framelength = (int) (0.01 /* seconds */* samplingRate);
EnergyAnalyser ea = new EnergyAnalyser(signal, framelength, framelength, samplingRate);
// double[][] speechStretches = ea.getSpeechStretches();
double[][] speechStretches = ea.getSpeechStretchesUsingEnergyHistory(energyBufferLength, speechStartLikelihood,
speechEndLikelihood, shiftFromMinimumEnergyCenter, numClusters);
ais.close();
try {
ais = AudioSystem.getAudioInputStream(new File(inputFile));
} catch (UnsupportedAudioFileException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
signal = new AudioDoubleDataSource(ais);
double[] x = signal.getAllData();
ais.close();
if (speechStretches.length == 0) {
System.out.println("No segments detected in " + inputFile + " copying whole file...");
DDSAudioInputStream outputAudio = new DDSAudioInputStream(new BufferedDoubleDataSource(x), ais.getFormat());
AudioSystem.write(outputAudio, AudioFileFormat.Type.WAVE, new File(outputFile));
} else {
int numStretches = speechStretches.length;
int speechStartIndex = (int) (samplingRate * speechStretches[0][0]);
int speechEndIndex = (int) (samplingRate * speechStretches[numStretches - 1][1]);
// Check if sufficient silence exists in the input waveform, if not generate as required
int silStartRequired = Math.max(0, (int) (samplingRate * minimumStartSilenceInSeconds));
int silStartLen = 0;
if (speechStartIndex < silStartRequired) {
silStartLen = silStartRequired - speechStartIndex;
speechStartIndex = 0;
} else
speechStartIndex -= silStartRequired;
double[] silStart = null;
if (silStartLen > 0)
silStart = SignalProcUtils.getWhiteNoise(silStartLen, 1e-20);
int silEndRequired = Math.max(0, (int) (samplingRate * minimumEndSilenceInSeconds));
int silEndLen = 0;
if (x.length - speechEndIndex < silEndRequired) {
silEndLen = silEndRequired - (x.length - speechEndIndex);
speechEndIndex = x.length - 1;
} else
speechEndIndex += silEndRequired;
double[] silEnd = null;
if (silEndLen > 0)
silEnd = SignalProcUtils.getWhiteNoise(silEndLen, 1e-20);
//
double[] y = null;
if (speechEndIndex - speechStartIndex + silStartLen + silEndLen > 0)
y = new double[speechEndIndex - speechStartIndex + silStartLen + silEndLen];
else
throw new Error("No output samples to write for " + inputFile);
int start = 0;
if (silStartLen > 0) {
System.arraycopy(silStart, 0, y, start, silStartLen);
start += silStartLen;
}
if (speechEndIndex - speechStartIndex > 0) {
System.arraycopy(x, speechStartIndex, y, start, speechEndIndex - speechStartIndex);
start += (speechEndIndex - speechStartIndex);
}
if (silEndLen > 0) {
System.arraycopy(silEnd, 0, y, start, silEndLen);
start += silEndLen;
}
DDSAudioInputStream outputAudio = new DDSAudioInputStream(new BufferedDoubleDataSource(y), ais.getFormat());
AudioSystem.write(outputAudio, AudioFileFormat.Type.WAVE, new File(outputFile));
}
}
}