EnergyAnalyser.java example

Explorer
marytts-master
/**
 * Copyright 2004-2006 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.signalproc.analysis;

import java.io.File;
import java.io.FileWriter;
import java.io.PrintWriter;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.LinkedList;

import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.DataLine;
import javax.sound.sampled.LineUnavailableException;
import javax.sound.sampled.TargetDataLine;

import marytts.machinelearning.KMeansClusteringTrainer;
import marytts.machinelearning.KMeansClusteringTrainerParams;
import marytts.signalproc.window.RectWindow;
import marytts.util.data.DoubleDataSource;
import marytts.util.data.audio.AudioDoubleDataSource;
import marytts.util.math.MathUtils;
import marytts.util.string.PrintfFormat;

/**
 * 
 * @author Marc Schröder
 * 
 *         A class that analyses the energy distribution, and computes a silence cutoff threshold, in the linear energy domain.
 *
 */
public class EnergyAnalyser extends FrameBasedAnalyser<Double> {
	protected final int DEFAULT_MAXSIZE = Integer.MAX_VALUE / 2;
	/** array of frame energies, for further analysis */
	protected double[] frameEnergies = new double[16384];
	/**
	 * Beginning of valid data in frameEnergies; will be >0 only after more than maxSize frames have been read.
	 */
	protected int offset = 0;
	/**
	 * Length of valid data, counting from offset. This will count up to maxSize and then stay equal to maxSize.
	 */
	protected int len = 0;
	/** maximum size of the double[] storing the frame energies */
	protected int maxSize;

	public EnergyAnalyser(DoubleDataSource signal, int framelength, int samplingRate) {
		super(signal, new RectWindow(framelength), framelength, samplingRate);
		maxSize = DEFAULT_MAXSIZE;
	}

	public EnergyAnalyser(DoubleDataSource signal, int framelength, int frameShift, int samplingRate) {
		super(signal, new RectWindow(framelength), frameShift, samplingRate);
		maxSize = DEFAULT_MAXSIZE;
	}

	public EnergyAnalyser(DoubleDataSource signal, int framelength, int frameShift, int samplingRate, int maxSize) {
		super(signal, new RectWindow(framelength), frameShift, samplingRate);
		this.maxSize = maxSize;
	}

	/**
	 * Apply this FrameBasedAnalyser to the given data.
	 * 
	 * @param frame
	 *            the data to analyse, which must be of the length prescribed by this FrameBasedAnalyser, i.e. by works like
	 *            {@link #getFrameLengthSamples()} .
	 * @return a Double representing the total energy in the frame.
	 * @throws IllegalArgumentException
	 *             if frame does not have the prescribed length
	 */
	public Double analyse(double[] frame) {
		if (frame.length != getFrameLengthSamples())
			throw new IllegalArgumentException("Expected frame of length " + getFrameLengthSamples() + ", got " + frame.length);
		double totalEnergy = 0;
		for (int i = 0; i < frame.length; i++) {
			totalEnergy += frame[i] * frame[i];
		}
		rememberFrameEnergy(totalEnergy);
		return new Double(totalEnergy);
	}

	protected void rememberFrameEnergy(double energy) {
		if (offset + len == frameEnergies.length) { // need to make space
			if (len < maxSize) { // need to increase the array size
				assert offset == 0;
				double[] dummy = new double[2 * frameEnergies.length];
				System.arraycopy(frameEnergies, 0, dummy, 0, frameEnergies.length);
				frameEnergies = dummy;
			} else { // we have reached the maximum length
				if (frameEnergies.length < 2 * maxSize) { // make sure we have a buffer twice maxSize
					double[] dummy = new double[2 * maxSize];
					System.arraycopy(frameEnergies, offset, dummy, 0, len);
					frameEnergies = dummy;
					offset = 0;
				} else { // need to copy valid data to the beginning of the array
					System.arraycopy(frameEnergies, offset, frameEnergies, 0, len);
					offset = 0;
				}
			}
		}
		assert offset + len < frameEnergies.length;
		frameEnergies[offset + len] = energy;
		if (len < maxSize)
			len++;
		else
			offset++;
	}

	/**
	 * Compute the overall mean energy in all frames.
	 * 
	 * @return a double representing the mean energy (non-normalised, i.e. in units of square sample amplitudes).
	 */
	public double getMeanFrameEnergy() {
		double mean = 0;
		for (int i = 0; i < len; i++) {
			mean += frameEnergies[offset + i];
		}
		mean /= len;
		return mean;
	}

	/**
	 * Compute the overall maximum energy in all frames.
	 * 
	 * @return a double representing the maximum energy (non-normalised, i.e. in units of square sample amplitudes).
	 */
	public double getMaxFrameEnergy() {
		if (len == 0)
			return Double.NaN;
		// otherwise, we have at least one valid value
		double max = frameEnergies[offset];
		for (int i = 0; i < len; i++) {
			double val = frameEnergies[offset + i];
			if (val > max)
				max = val;
		}
		return max;
	}

	/**
	 * Compute the overall minimum energy in all frames.
	 * 
	 * @return a double representing the minimum energy (non-normalised, i.e. in units of square sample amplitudes).
	 */
	public double getMinFrameEnergy() {
		if (len == 0)
			return Double.NaN;
		// otherwise, we have at least one valid value
		double min = frameEnergies[offset];
		for (int i = 0; i < len; i++) {
			double val = frameEnergies[offset + i];
			if (val < min)
				min = val;
		}
		return min;
	}

	/**
	 * Compute a histogram of energies found in the data. Bin sizes are automatically determined based on the min and max frame
	 * energies, such that the interval between min and max energy is split into 100 bins.
	 * 
	 * @return an array of doubles of length nbins, representing percentage distribution across bins.
	 */
	public double[] getEnergyHistogram() {
		return getEnergyHistogram(100);
	}

	/**
	 * Compute a histogram of energies found in the data. Bin sizes are automatically determined based on the min and max frame
	 * energies, such that the interval between min and max energy is split into nbins bins.
	 * 
	 * @param nbins
	 *            the number of bins to compute, e.g. 100
	 * @return an array of doubles of length nbins, representing percentage distribution across bins.
	 */
	public double[] getEnergyHistogram(int nbins) {
		double[] histogram = new double[nbins];
		double min = getMinFrameEnergy();
		double range = getMaxFrameEnergy() - min;
		double binWidth = range / nbins;
		double increment = 1. / len;
		for (int i = 0; i < len; i++) {
			int bin = (int) Math.floor((frameEnergies[offset + i] - min) / binWidth);
			// special case maximum energy: it still belongs to the top bin
			if (bin == nbins)
				bin = nbins - 1;
			assert bin < nbins;
			histogram[bin] += increment;
		}
		return histogram;
	}

	/**
	 * Determine the energy level below which to find silence. This is based on the energy histogram.
	 * 
	 * @return the energy below which is silence.
	 */
	public double getSilenceCutoff() {
		double[] hist = getEnergyHistogram();
		double[] lowerHalf = new double[hist.length / 2];
		// computation of the length of upperHalf accounts for the possibility that hist.length is odd
		double[] upperHalf = new double[hist.length - lowerHalf.length];
		System.arraycopy(hist, 0, lowerHalf, 0, lowerHalf.length);
		System.arraycopy(hist, lowerHalf.length, upperHalf, 0, upperHalf.length);
		int silencePeak = MathUtils.findGlobalPeakLocation(lowerHalf);
		int speechPeak = lowerHalf.length + MathUtils.findGlobalPeakLocation(upperHalf);
		int iCutoff = silencePeak + (speechPeak - silencePeak) / 2;
		// Compute dB correlate of cutoff level
		double minEnergy = getMinFrameEnergy();
		double maxEnergy = getMaxFrameEnergy();
		double cutoffEnergy = minEnergy + (maxEnergy - minEnergy) * iCutoff / hist.length;

		return cutoffEnergy;
	}

	public double getSilenceCutoffFromSortedEnergies(FrameAnalysisResult[] far, double silenceThreshold) {
		double[] energies = new double[far.length];
		double cutoffEnergy;

		for (int i = 0; i < far.length; i++)
			energies[i] = ((Double) far[i].get()).doubleValue();

		MathUtils.quickSort(energies);
		int cutoffIndex = (int) Math.floor(silenceThreshold * energies.length);

		while (energies[cutoffIndex] == 0.0) {
			cutoffIndex++;
			if (cutoffIndex > energies.length - 1) {
				cutoffIndex = energies.length - 1;
				break;
			}
		}

		cutoffEnergy = energies[cutoffIndex];

		return cutoffEnergy;
	}

	/**
	 * For the current audio data and the automatically calculated silence cutoff, compute a list of start and end times
	 * representing speech stretches within the file. This method will take the following System properties into account:
	 * <ul>
	 * <li><code>signalproc.minsilenceduration</code> (default: 0.1 (seconds))
	 * <li><code>signalproc.minspeechduration</code> (default: 0.1 (seconds))
	 * </ul>
	 * Silence or speech stretches shorter than these values will be ignored.
	 * 
	 * @return an array of double pairs, representing start and end times (in seconds) for each speech stretch.
	 */
	public double[][] getSpeechStretches() {
		double minSilenceDur = Double.parseDouble(System.getProperty("signalproc.minsilenceduration", "0.1"));
		double minSpeechDur = Double.parseDouble(System.getProperty("signalproc.minspeechduration", "0.1"));
		FrameAnalysisResult[] far = analyseAllFrames();
		double silenceCutoff = getSilenceCutoff();
		LinkedList stretches = new LinkedList();
		boolean withinSpeech = false;
		for (int i = 0; i < far.length; i++) {
			double energy = ((Double) far[i].get()).doubleValue();
			if (energy > silenceCutoff) { // it's a speech frame
				if (!withinSpeech) { // previous was silence
					boolean addStretch = false;
					// Check that the preceding silence was long enough:
					if (stretches.size() == 0) {
						addStretch = true;
					} else { // there is a preceding stretch
						double silenceStart = ((double[]) stretches.getLast())[1];
						double silenceEnd = i * getFrameLengthTime(); // current time
						if (silenceEnd - silenceStart >= minSilenceDur) {
							addStretch = true;
						}
					}
					if (addStretch) {
						double[] newStretch = new double[2];
						// Start of current frame is start of new stretch
						newStretch[0] = i * getFrameLengthTime();
						stretches.add(newStretch);
					} // else, overwrite position [1] of existing stretch
					withinSpeech = true;
					assert stretches.size() > 0;
				}
			} else { // it's a silence frame
				if (withinSpeech) { // previous was speech
					assert stretches.size() > 0;
					double[] latestStretch = (double[]) stretches.getLast();
					double speechStart = latestStretch[0];
					double speechEnd = (double) (i + 1) * getFrameLengthTime(); // end of current frame
					if (speechEnd - speechStart >= minSpeechDur) { // long enough
						// complete the segment:
						latestStretch[1] = speechEnd;
					} else { // not long enough
						// delete the stretch
						stretches.removeLast();
					}
					withinSpeech = false;
				}
			}
		}
		return (double[][]) stretches.toArray(new double[0][0]);
	}

	public double getSilenceCutoffFromKMeansClustering(double shiftFromMinimumEnergyCenter, int numClusters) {
		int i, j;

		FrameAnalysisResult[] far = analyseAllFrames();

		double[][] energies = new double[far.length][1];
		for (i = 0; i < far.length; i++)
			energies[i][0] = ((Double) far[i].get()).doubleValue();

		KMeansClusteringTrainerParams p = new KMeansClusteringTrainerParams();
		p.numClusters = numClusters;
		p.maxIterations = 40;
		KMeansClusteringTrainer t = new KMeansClusteringTrainer();
		t.train(energies, p);

		double[] meanEns = new double[p.numClusters];
		for (i = 0; i < p.numClusters; i++) {
			meanEns[i] = t.clusters[i].meanVector[0];
			System.out.println(String.valueOf(meanEns[i]));
		}

		double minEnCenter = MathUtils.getMin(meanEns);
		double maxEnCenter = MathUtils.getMax(meanEns);

		double energyTh = minEnCenter + shiftFromMinimumEnergyCenter * (maxEnCenter - minEnCenter);
		// System.out.println(String.valueOf(energyTh));

		return energyTh;
	}

	/**
	 * 
	 * The latest version uses K-Means clustering to cluster energy values into 3 separate clusters. Then, the energy threshold is
	 * selected using the lowest and highest energy cluster centers
	 * 
	 * @param energyBufferLength
	 *            energyBufferLength
	 * @param speechStartLikelihood
	 *            speechStartLikelihood
	 * @param speechEndLikelihood
	 *            speechEndLikelihood
	 * @param shiftFromMinimumEnergyCenter
	 *            shiftFromMinimumEnergyCenter
	 * @param numClusters
	 *            numClusters
	 * @return stretches.toArray(new double[0][0])
	 */
	public double[][] getSpeechStretchesUsingEnergyHistory(int energyBufferLength, double speechStartLikelihood,
			double speechEndLikelihood, double shiftFromMinimumEnergyCenter, int numClusters) {
		int i, j;
		double minSilenceDur = Double.parseDouble(System.getProperty("signalproc.minsilenceduration", "0.3"));
		double minSpeechDur = Double.parseDouble(System.getProperty("signalproc.minspeechduration", "0.3"));

		FrameAnalysisResult<Double>[] far = analyseAllFrames();

		double[][] energies = new double[far.length][1];
		for (i = 0; i < far.length; i++)
			energies[i][0] = far[i].get();

		double[] isSpeechsAll = new double[far.length];
		Arrays.fill(isSpeechsAll, 0.0);

		KMeansClusteringTrainerParams p = new KMeansClusteringTrainerParams();
		p.numClusters = numClusters;
		p.maxIterations = 40;
		KMeansClusteringTrainer t = new KMeansClusteringTrainer();
		t.train(energies, p);

		double[] meanEns = new double[p.numClusters];
		// TODO: stop mixing log and non-log code -- either use log energy by using EnergyAnalyser_dB, or linear energy by using
		// EnergyAnalyser
		boolean takeLog = true;
		if (this instanceof EnergyAnalyser_dB)
			takeLog = false;
		for (i = 0; i < p.numClusters; i++) {
			meanEns[i] = t.clusters[i].meanVector[0];
			if (takeLog) {
				meanEns[i] = 10 * Math.log10(meanEns[i]);
			}
			// System.out.println(String.valueOf(meanEns[i]));
		}

		double minEnCenter = MathUtils.getMin(meanEns);
		double maxEnCenter = MathUtils.getMax(meanEns);

		double energyTh = minEnCenter + shiftFromMinimumEnergyCenter * (maxEnCenter - minEnCenter);
		// System.out.println(String.valueOf(energyTh));

		LinkedList stretches = new LinkedList();

		if (energyBufferLength > far.length)
			energyBufferLength = far.length;

		double[] energyBuffer = new double[energyBufferLength];

		int[] isSpeechs = new int[energyBufferLength];

		Arrays.fill(isSpeechs, 0);

		double ratio;
		int speechCount;

		int bufferInd = 0;
		for (i = 0; i < energyBufferLength - 1; i++) {
			energyBuffer[bufferInd] = energies[i][0];
			if (takeLog) {
				energyBuffer[bufferInd] = 10 * Math.log10(energyBuffer[bufferInd]);
			}
			bufferInd++;
		}

		boolean isSpeechStarted = false;
		int tmpSpeechStartIndex = -1;
		int tmpSpeechEndIndex = -1;
		int prevStartIndex = -1;

		double speechStart = -1.0;
		double speechEnd = -1.0;

		for (i = energyBufferLength - 1; i < energies.length; i++) {
			if (bufferInd > energyBufferLength - 1)
				bufferInd = 0;

			energyBuffer[bufferInd] = energies[i][0];
			if (takeLog) {
				energyBuffer[bufferInd] = 10 * Math.log10(energyBuffer[bufferInd]);
			}

			if (energyBuffer[bufferInd] > energyTh) {
				isSpeechs[bufferInd] = 1;
				isSpeechsAll[i] = 1;
			} else
				isSpeechs[bufferInd] = 0;

			speechCount = 0;
			for (j = 0; j < energyBufferLength; j++) {
				if (isSpeechs[j] == 1)
					speechCount++;
			}

			ratio = ((double) speechCount) / energyBufferLength;
			if (!isSpeechStarted && ratio > speechStartLikelihood) {
				isSpeechStarted = true;

				tmpSpeechStartIndex = i - energyBufferLength;
				speechStart = Math.max(0.0, tmpSpeechStartIndex * getFrameShiftTime() - 0.5 * getFrameLengthTime());

				tmpSpeechEndIndex = -1;
			} else if (isSpeechStarted && ratio <= speechEndLikelihood) {
				isSpeechStarted = false;
				tmpSpeechEndIndex = i;

				// System.out.println(String.valueOf(tmpSpeechStartIndex*0.01) + " " + String.valueOf(tmpSpeechEndIndex*0.01));

				speechEnd = Math.max(0.0, i * getFrameShiftTime() + 0.5 * getFrameLengthTime());

				double[] newStretch = new double[2];
				newStretch[0] = speechStart;
				newStretch[1] = speechEnd;
				stretches.add(newStretch);

				tmpSpeechStartIndex = -1;
			}

			bufferInd++;
		}
		if (isSpeechStarted) { // unfinished speech stretch
			speechEnd = (energies.length - 1) * getFrameShiftTime() + 0.5 * getFrameLengthTime();
			stretches.add(new double[] { speechStart, speechEnd });
		}

		double[][] speechStretches = (double[][]) stretches.toArray(new double[0][0]);
		boolean[] bRemoveds = new boolean[speechStretches.length];
		Arrays.fill(bRemoveds, false);

		// Check overlapping segments and short silence segments
		double[] stretch1 = new double[2];
		double[] stretch2 = new double[2];
		for (i = speechStretches.length - 1; i > 0; i--) {
			if (speechStretches[i][0] - speechStretches[i - 1][1] < minSilenceDur) {
				speechStretches[i - 1][1] = speechStretches[i][1];
				bRemoveds[i] = true;
			}
		}
		//

		// Check and remove short speech segments
		for (i = 0; i < speechStretches.length; i++) {
			if (!bRemoveds[i] && speechStretches[i][1] - speechStretches[i][0] < minSpeechDur)
				bRemoveds[i] = true;
		}
		//

		stretches.clear();
		for (i = 0; i < bRemoveds.length; i++) {
			if (!bRemoveds[i]) {
				double[] newStretch = new double[2];
				newStretch[0] = speechStretches[i][0];
				newStretch[1] = speechStretches[i][1];
				stretches.add(newStretch);
			}
		}

		return (double[][]) stretches.toArray(new double[0][0]);
	}

	/**
	 * Segment a WAVE file by energy, ideally one word per segment (the result might contain more); the result is saved in a file
	 * in transcriber format so the segmentation can be easily inspected and corrected. The parameters in:
	 * EnergyAnalyser.getSpeechStretchesUsingEnergyHistory(): signalproc.minsilenceduration signalproc.minspeechduration can be
	 * tuned to get better segmentation.
	 * 
	 * @param args
	 *            : first argument is the directory where the wav files are, next arguments in the list are the files for
	 *            segmenting.
	 * @throws Exception
	 *             : IOException, UnsupportedAudioFile exception and IllegalArgumentException when the file is not mono, it just
	 *             handles mono audio signals.
	 */
	public static void energySegmentation(String[] args) throws Exception {
		// First argument is the directory where the files are
		String wavDirectory = args[0];
		String fileNameNoExt;
		String segmentationFileName;
		float duration;
		int i;
		Date today;
		String currentDate;
		SimpleDateFormat formatter;
		formatter = new SimpleDateFormat("yyMMdd");
		today = new Date();
		currentDate = formatter.format(today);

		if (args.length > 0) {
			for (int file = 1; file < args.length; file++) {
				System.out.println("\nProcessing file: " + args[file]);
				AudioInputStream ais = AudioSystem.getAudioInputStream(new File(wavDirectory + "/" + args[file]));
				if (!ais.getFormat().getEncoding().equals(AudioFormat.Encoding.PCM_SIGNED)) {
					ais = AudioSystem.getAudioInputStream(AudioFormat.Encoding.PCM_SIGNED, ais);
				}
				if (ais.getFormat().getChannels() > 1) {
					throw new IllegalArgumentException("Can only deal with mono audio signals");
				}
				int samplingRate = (int) ais.getFormat().getSampleRate();
				DoubleDataSource signal = new AudioDoubleDataSource(ais);
				int framelength = (int) (0.01 /* seconds */* samplingRate);
				EnergyAnalyser ea = new EnergyAnalyser(signal, framelength, framelength, samplingRate);
				double[][] speechStretches1 = ea.getSpeechStretches();
				int energyBufferLength = 30;
				double speechStartLikelihood = 0.6;
				double speechEndLikelihood = 0.2;
				double shiftFromMinimumEnergyCenter = 0.1;
				int numClusters = 5;
				double[][] speechStretches2 = ea.getSpeechStretchesUsingEnergyHistory(energyBufferLength, speechStartLikelihood,
						speechEndLikelihood, shiftFromMinimumEnergyCenter, numClusters);

				System.out.println("Speech stretches1 in " + args[file] + ":");
				PrintfFormat format = new PrintfFormat("%.4f");
				for (i = 0; i < speechStretches1.length; i++) {
					System.out.println(format.sprintf(speechStretches1[i][0]) + " " + format.sprintf(speechStretches1[i][1]));
				}

				fileNameNoExt = args[file];
				fileNameNoExt = fileNameNoExt.replace(".wav", "");
				segmentationFileName = wavDirectory + "/" + fileNameNoExt + ".trs";
				PrintWriter toList = new PrintWriter(new FileWriter(segmentationFileName));

				toList.println("<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n" + "<!DOCTYPE Trans SYSTEM \"trans-14.dtd\">");
				toList.println("<Trans scribe=\"MARY (automatic)\" audio_filename=\"" + fileNameNoExt
						+ "\" version=\"1\" version_date=\"" + currentDate + "\">");

				// length in samples
				ais.getFrameLength();
				duration = ais.getFrameLength() / ais.getFormat().getFrameRate();

				toList.println("<Speakers>");
				toList.println("<Speaker id=\"spk1\" name=\"word\" check=\"no\" dialect=\"native\" accent=\"\" scope=\"local\"/>");
				toList.println("</Speakers>");

				toList.println("<Episode>");
				toList.println("<Section type=\"report\" startTime=\"0\" endTime=\"" + format.sprintf(duration) + "\">");
				toList.println("<Turn startTime=\"0\" endTime=\"" + format.sprintf(speechStretches2[0][0]) + "\">");
				toList.println("<Sync time=\"0\"/>");
				toList.println("");
				toList.println("</Turn>");

				System.out.println("Speech stretches2 in " + args[file] + ":");
				for (i = 0; i < speechStretches2.length; i++) {
					System.out.println(format.sprintf(speechStretches2[i][0]) + " " + format.sprintf(speechStretches2[i][1]));

					toList.println("<Turn speaker=\"spk1\" startTime=\"" + format.sprintf(speechStretches2[i][0])
							+ "0\" endTime=\"" + format.sprintf(speechStretches2[i][1]) + "\">");

					toList.println("<Sync time=\"" + format.sprintf(speechStretches2[i][0]) + "\"/>");
					toList.println("");
					toList.println("</Turn>");
				}
				toList.println("</Section>");
				toList.println("</Episode>");
				toList.println("</Trans>");
				toList.close();

				System.out.println("list of Speech stretches2 in " + segmentationFileName + "  num=" + i + "  dur=" + duration);
			}

		} else {
			System.out.println("No arguments provided: \n Usage: EnergyAnalyser wav_directory wav1 wav2 ... wavN");

		}

	}

	public static void main(String[] args) throws Exception {
		if (args.length > 0) {
			for (int file = 0; file < args.length; file++) {
				AudioInputStream ais = AudioSystem.getAudioInputStream(new File(args[file]));
				if (!ais.getFormat().getEncoding().equals(AudioFormat.Encoding.PCM_SIGNED)) {
					ais = AudioSystem.getAudioInputStream(AudioFormat.Encoding.PCM_SIGNED, ais);
				}
				if (ais.getFormat().getChannels() > 1) {
					throw new IllegalArgumentException("Can only deal with mono audio signals");
				}
				int samplingRate = (int) ais.getFormat().getSampleRate();
				DoubleDataSource signal = new AudioDoubleDataSource(ais);
				int framelength = (int) (0.01 /* seconds */* samplingRate);
				EnergyAnalyser ea = new EnergyAnalyser(signal, framelength, framelength, samplingRate);
				double[][] speechStretches1 = ea.getSpeechStretches();
				int energyBufferLength = 30;
				double speechStartLikelihood = 0.6;
				double speechEndLikelihood = 0.2;
				double shiftFromMinimumEnergyCenter = 0.1;
				int numClusters = 3;
				double[][] speechStretches2 = ea.getSpeechStretchesUsingEnergyHistory(energyBufferLength, speechStartLikelihood,
						speechEndLikelihood, shiftFromMinimumEnergyCenter, numClusters);

				System.out.println("Speech stretches1 in " + args[file] + ":");
				PrintfFormat format = new PrintfFormat("%.4f");
				for (int i = 0; i < speechStretches1.length; i++) {
					System.out.println(format.sprintf(speechStretches1[i][0]) + " " + format.sprintf(speechStretches1[i][1]));
				}

				System.out.println("Speech stretches2 in " + args[file] + ":");
				for (int i = 0; i < speechStretches2.length; i++) {
					System.out.println(format.sprintf(speechStretches2[i][0]) + " " + format.sprintf(speechStretches2[i][1]));
				}
			}

		} else {
			AudioFormat audioFormat = new AudioFormat(AudioFormat.Encoding.PCM_SIGNED, 44100.0F, 16, 1, 2, 44100.0F, false);
			DataLine.Info info = new DataLine.Info(TargetDataLine.class, audioFormat);
			AudioInputStream input = null;
			try {
				TargetDataLine mic = (TargetDataLine) AudioSystem.getLine(info);
				mic.open(audioFormat);
				mic.start();
				input = new AudioInputStream(mic);
			} catch (LineUnavailableException e) {
				e.printStackTrace();
			}
			DoubleDataSource signal = new AudioDoubleDataSource(input);
			int framelength = (int) (0.01 /* seconds */* audioFormat.getSampleRate());
			EnergyAnalyser ea = new EnergyAnalyser(signal, framelength, framelength, (int) audioFormat.getSampleRate());
			while (true) {
				try {
					Thread.sleep(100);
				} catch (InterruptedException ie) {
				}
				System.out.println(ea.getSilenceCutoff());
			}

		}

	}

}