/** * Copyright 2004-2006 DFKI GmbH. * All Rights Reserved. Use is subject to license terms. * * This file is part of MARY TTS. * * MARY TTS is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, version 3 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package marytts.signalproc.analysis; import java.io.File; import java.io.FileWriter; import java.io.PrintWriter; import java.text.SimpleDateFormat; import java.util.Arrays; import java.util.Date; import java.util.LinkedList; import javax.sound.sampled.AudioFormat; import javax.sound.sampled.AudioInputStream; import javax.sound.sampled.AudioSystem; import javax.sound.sampled.DataLine; import javax.sound.sampled.LineUnavailableException; import javax.sound.sampled.TargetDataLine; import marytts.machinelearning.KMeansClusteringTrainer; import marytts.machinelearning.KMeansClusteringTrainerParams; import marytts.signalproc.window.RectWindow; import marytts.util.data.DoubleDataSource; import marytts.util.data.audio.AudioDoubleDataSource; import marytts.util.math.MathUtils; import marytts.util.string.PrintfFormat; /** * * @author Marc Schröder * * A class that analyses the energy distribution, and computes a silence cutoff threshold, in the linear energy domain. * */ public class EnergyAnalyser extends FrameBasedAnalyser<Double> { protected final int DEFAULT_MAXSIZE = Integer.MAX_VALUE / 2; /** array of frame energies, for further analysis */ protected double[] frameEnergies = new double[16384]; /** * Beginning of valid data in frameEnergies; will be >0 only after more than maxSize frames have been read. */ protected int offset = 0; /** * Length of valid data, counting from offset. This will count up to maxSize and then stay equal to maxSize. */ protected int len = 0; /** maximum size of the double[] storing the frame energies */ protected int maxSize; public EnergyAnalyser(DoubleDataSource signal, int framelength, int samplingRate) { super(signal, new RectWindow(framelength), framelength, samplingRate); maxSize = DEFAULT_MAXSIZE; } public EnergyAnalyser(DoubleDataSource signal, int framelength, int frameShift, int samplingRate) { super(signal, new RectWindow(framelength), frameShift, samplingRate); maxSize = DEFAULT_MAXSIZE; } public EnergyAnalyser(DoubleDataSource signal, int framelength, int frameShift, int samplingRate, int maxSize) { super(signal, new RectWindow(framelength), frameShift, samplingRate); this.maxSize = maxSize; } /** * Apply this FrameBasedAnalyser to the given data. * * @param frame * the data to analyse, which must be of the length prescribed by this FrameBasedAnalyser, i.e. by works like * {@link #getFrameLengthSamples()} . * @return a Double representing the total energy in the frame. * @throws IllegalArgumentException * if frame does not have the prescribed length */ public Double analyse(double[] frame) { if (frame.length != getFrameLengthSamples()) throw new IllegalArgumentException("Expected frame of length " + getFrameLengthSamples() + ", got " + frame.length); double totalEnergy = 0; for (int i = 0; i < frame.length; i++) { totalEnergy += frame[i] * frame[i]; } rememberFrameEnergy(totalEnergy); return new Double(totalEnergy); } protected void rememberFrameEnergy(double energy) { if (offset + len == frameEnergies.length) { // need to make space if (len < maxSize) { // need to increase the array size assert offset == 0; double[] dummy = new double[2 * frameEnergies.length]; System.arraycopy(frameEnergies, 0, dummy, 0, frameEnergies.length); frameEnergies = dummy; } else { // we have reached the maximum length if (frameEnergies.length < 2 * maxSize) { // make sure we have a buffer twice maxSize double[] dummy = new double[2 * maxSize]; System.arraycopy(frameEnergies, offset, dummy, 0, len); frameEnergies = dummy; offset = 0; } else { // need to copy valid data to the beginning of the array System.arraycopy(frameEnergies, offset, frameEnergies, 0, len); offset = 0; } } } assert offset + len < frameEnergies.length; frameEnergies[offset + len] = energy; if (len < maxSize) len++; else offset++; } /** * Compute the overall mean energy in all frames. * * @return a double representing the mean energy (non-normalised, i.e. in units of square sample amplitudes). */ public double getMeanFrameEnergy() { double mean = 0; for (int i = 0; i < len; i++) { mean += frameEnergies[offset + i]; } mean /= len; return mean; } /** * Compute the overall maximum energy in all frames. * * @return a double representing the maximum energy (non-normalised, i.e. in units of square sample amplitudes). */ public double getMaxFrameEnergy() { if (len == 0) return Double.NaN; // otherwise, we have at least one valid value double max = frameEnergies[offset]; for (int i = 0; i < len; i++) { double val = frameEnergies[offset + i]; if (val > max) max = val; } return max; } /** * Compute the overall minimum energy in all frames. * * @return a double representing the minimum energy (non-normalised, i.e. in units of square sample amplitudes). */ public double getMinFrameEnergy() { if (len == 0) return Double.NaN; // otherwise, we have at least one valid value double min = frameEnergies[offset]; for (int i = 0; i < len; i++) { double val = frameEnergies[offset + i]; if (val < min) min = val; } return min; } /** * Compute a histogram of energies found in the data. Bin sizes are automatically determined based on the min and max frame * energies, such that the interval between min and max energy is split into 100 bins. * * @return an array of doubles of length nbins, representing percentage distribution across bins. */ public double[] getEnergyHistogram() { return getEnergyHistogram(100); } /** * Compute a histogram of energies found in the data. Bin sizes are automatically determined based on the min and max frame * energies, such that the interval between min and max energy is split into nbins bins. * * @param nbins * the number of bins to compute, e.g. 100 * @return an array of doubles of length nbins, representing percentage distribution across bins. */ public double[] getEnergyHistogram(int nbins) { double[] histogram = new double[nbins]; double min = getMinFrameEnergy(); double range = getMaxFrameEnergy() - min; double binWidth = range / nbins; double increment = 1. / len; for (int i = 0; i < len; i++) { int bin = (int) Math.floor((frameEnergies[offset + i] - min) / binWidth); // special case maximum energy: it still belongs to the top bin if (bin == nbins) bin = nbins - 1; assert bin < nbins; histogram[bin] += increment; } return histogram; } /** * Determine the energy level below which to find silence. This is based on the energy histogram. * * @return the energy below which is silence. */ public double getSilenceCutoff() { double[] hist = getEnergyHistogram(); double[] lowerHalf = new double[hist.length / 2]; // computation of the length of upperHalf accounts for the possibility that hist.length is odd double[] upperHalf = new double[hist.length - lowerHalf.length]; System.arraycopy(hist, 0, lowerHalf, 0, lowerHalf.length); System.arraycopy(hist, lowerHalf.length, upperHalf, 0, upperHalf.length); int silencePeak = MathUtils.findGlobalPeakLocation(lowerHalf); int speechPeak = lowerHalf.length + MathUtils.findGlobalPeakLocation(upperHalf); int iCutoff = silencePeak + (speechPeak - silencePeak) / 2; // Compute dB correlate of cutoff level double minEnergy = getMinFrameEnergy(); double maxEnergy = getMaxFrameEnergy(); double cutoffEnergy = minEnergy + (maxEnergy - minEnergy) * iCutoff / hist.length; return cutoffEnergy; } public double getSilenceCutoffFromSortedEnergies(FrameAnalysisResult[] far, double silenceThreshold) { double[] energies = new double[far.length]; double cutoffEnergy; for (int i = 0; i < far.length; i++) energies[i] = ((Double) far[i].get()).doubleValue(); MathUtils.quickSort(energies); int cutoffIndex = (int) Math.floor(silenceThreshold * energies.length); while (energies[cutoffIndex] == 0.0) { cutoffIndex++; if (cutoffIndex > energies.length - 1) { cutoffIndex = energies.length - 1; break; } } cutoffEnergy = energies[cutoffIndex]; return cutoffEnergy; } /** * For the current audio data and the automatically calculated silence cutoff, compute a list of start and end times * representing speech stretches within the file. This method will take the following System properties into account: * <ul> * <li><code>signalproc.minsilenceduration</code> (default: 0.1 (seconds)) * <li><code>signalproc.minspeechduration</code> (default: 0.1 (seconds)) * </ul> * Silence or speech stretches shorter than these values will be ignored. * * @return an array of double pairs, representing start and end times (in seconds) for each speech stretch. */ public double[][] getSpeechStretches() { double minSilenceDur = Double.parseDouble(System.getProperty("signalproc.minsilenceduration", "0.1")); double minSpeechDur = Double.parseDouble(System.getProperty("signalproc.minspeechduration", "0.1")); FrameAnalysisResult[] far = analyseAllFrames(); double silenceCutoff = getSilenceCutoff(); LinkedList stretches = new LinkedList(); boolean withinSpeech = false; for (int i = 0; i < far.length; i++) { double energy = ((Double) far[i].get()).doubleValue(); if (energy > silenceCutoff) { // it's a speech frame if (!withinSpeech) { // previous was silence boolean addStretch = false; // Check that the preceding silence was long enough: if (stretches.size() == 0) { addStretch = true; } else { // there is a preceding stretch double silenceStart = ((double[]) stretches.getLast())[1]; double silenceEnd = i * getFrameLengthTime(); // current time if (silenceEnd - silenceStart >= minSilenceDur) { addStretch = true; } } if (addStretch) { double[] newStretch = new double[2]; // Start of current frame is start of new stretch newStretch[0] = i * getFrameLengthTime(); stretches.add(newStretch); } // else, overwrite position [1] of existing stretch withinSpeech = true; assert stretches.size() > 0; } } else { // it's a silence frame if (withinSpeech) { // previous was speech assert stretches.size() > 0; double[] latestStretch = (double[]) stretches.getLast(); double speechStart = latestStretch[0]; double speechEnd = (double) (i + 1) * getFrameLengthTime(); // end of current frame if (speechEnd - speechStart >= minSpeechDur) { // long enough // complete the segment: latestStretch[1] = speechEnd; } else { // not long enough // delete the stretch stretches.removeLast(); } withinSpeech = false; } } } return (double[][]) stretches.toArray(new double[0][0]); } public double getSilenceCutoffFromKMeansClustering(double shiftFromMinimumEnergyCenter, int numClusters) { int i, j; FrameAnalysisResult[] far = analyseAllFrames(); double[][] energies = new double[far.length][1]; for (i = 0; i < far.length; i++) energies[i][0] = ((Double) far[i].get()).doubleValue(); KMeansClusteringTrainerParams p = new KMeansClusteringTrainerParams(); p.numClusters = numClusters; p.maxIterations = 40; KMeansClusteringTrainer t = new KMeansClusteringTrainer(); t.train(energies, p); double[] meanEns = new double[p.numClusters]; for (i = 0; i < p.numClusters; i++) { meanEns[i] = t.clusters[i].meanVector[0]; System.out.println(String.valueOf(meanEns[i])); } double minEnCenter = MathUtils.getMin(meanEns); double maxEnCenter = MathUtils.getMax(meanEns); double energyTh = minEnCenter + shiftFromMinimumEnergyCenter * (maxEnCenter - minEnCenter); // System.out.println(String.valueOf(energyTh)); return energyTh; } /** * * The latest version uses K-Means clustering to cluster energy values into 3 separate clusters. Then, the energy threshold is * selected using the lowest and highest energy cluster centers * * @param energyBufferLength * energyBufferLength * @param speechStartLikelihood * speechStartLikelihood * @param speechEndLikelihood * speechEndLikelihood * @param shiftFromMinimumEnergyCenter * shiftFromMinimumEnergyCenter * @param numClusters * numClusters * @return stretches.toArray(new double[0][0]) */ public double[][] getSpeechStretchesUsingEnergyHistory(int energyBufferLength, double speechStartLikelihood, double speechEndLikelihood, double shiftFromMinimumEnergyCenter, int numClusters) { int i, j; double minSilenceDur = Double.parseDouble(System.getProperty("signalproc.minsilenceduration", "0.3")); double minSpeechDur = Double.parseDouble(System.getProperty("signalproc.minspeechduration", "0.3")); FrameAnalysisResult<Double>[] far = analyseAllFrames(); double[][] energies = new double[far.length][1]; for (i = 0; i < far.length; i++) energies[i][0] = far[i].get(); double[] isSpeechsAll = new double[far.length]; Arrays.fill(isSpeechsAll, 0.0); KMeansClusteringTrainerParams p = new KMeansClusteringTrainerParams(); p.numClusters = numClusters; p.maxIterations = 40; KMeansClusteringTrainer t = new KMeansClusteringTrainer(); t.train(energies, p); double[] meanEns = new double[p.numClusters]; // TODO: stop mixing log and non-log code -- either use log energy by using EnergyAnalyser_dB, or linear energy by using // EnergyAnalyser boolean takeLog = true; if (this instanceof EnergyAnalyser_dB) takeLog = false; for (i = 0; i < p.numClusters; i++) { meanEns[i] = t.clusters[i].meanVector[0]; if (takeLog) { meanEns[i] = 10 * Math.log10(meanEns[i]); } // System.out.println(String.valueOf(meanEns[i])); } double minEnCenter = MathUtils.getMin(meanEns); double maxEnCenter = MathUtils.getMax(meanEns); double energyTh = minEnCenter + shiftFromMinimumEnergyCenter * (maxEnCenter - minEnCenter); // System.out.println(String.valueOf(energyTh)); LinkedList stretches = new LinkedList(); if (energyBufferLength > far.length) energyBufferLength = far.length; double[] energyBuffer = new double[energyBufferLength]; int[] isSpeechs = new int[energyBufferLength]; Arrays.fill(isSpeechs, 0); double ratio; int speechCount; int bufferInd = 0; for (i = 0; i < energyBufferLength - 1; i++) { energyBuffer[bufferInd] = energies[i][0]; if (takeLog) { energyBuffer[bufferInd] = 10 * Math.log10(energyBuffer[bufferInd]); } bufferInd++; } boolean isSpeechStarted = false; int tmpSpeechStartIndex = -1; int tmpSpeechEndIndex = -1; int prevStartIndex = -1; double speechStart = -1.0; double speechEnd = -1.0; for (i = energyBufferLength - 1; i < energies.length; i++) { if (bufferInd > energyBufferLength - 1) bufferInd = 0; energyBuffer[bufferInd] = energies[i][0]; if (takeLog) { energyBuffer[bufferInd] = 10 * Math.log10(energyBuffer[bufferInd]); } if (energyBuffer[bufferInd] > energyTh) { isSpeechs[bufferInd] = 1; isSpeechsAll[i] = 1; } else isSpeechs[bufferInd] = 0; speechCount = 0; for (j = 0; j < energyBufferLength; j++) { if (isSpeechs[j] == 1) speechCount++; } ratio = ((double) speechCount) / energyBufferLength; if (!isSpeechStarted && ratio > speechStartLikelihood) { isSpeechStarted = true; tmpSpeechStartIndex = i - energyBufferLength; speechStart = Math.max(0.0, tmpSpeechStartIndex * getFrameShiftTime() - 0.5 * getFrameLengthTime()); tmpSpeechEndIndex = -1; } else if (isSpeechStarted && ratio <= speechEndLikelihood) { isSpeechStarted = false; tmpSpeechEndIndex = i; // System.out.println(String.valueOf(tmpSpeechStartIndex*0.01) + " " + String.valueOf(tmpSpeechEndIndex*0.01)); speechEnd = Math.max(0.0, i * getFrameShiftTime() + 0.5 * getFrameLengthTime()); double[] newStretch = new double[2]; newStretch[0] = speechStart; newStretch[1] = speechEnd; stretches.add(newStretch); tmpSpeechStartIndex = -1; } bufferInd++; } if (isSpeechStarted) { // unfinished speech stretch speechEnd = (energies.length - 1) * getFrameShiftTime() + 0.5 * getFrameLengthTime(); stretches.add(new double[] { speechStart, speechEnd }); } double[][] speechStretches = (double[][]) stretches.toArray(new double[0][0]); boolean[] bRemoveds = new boolean[speechStretches.length]; Arrays.fill(bRemoveds, false); // Check overlapping segments and short silence segments double[] stretch1 = new double[2]; double[] stretch2 = new double[2]; for (i = speechStretches.length - 1; i > 0; i--) { if (speechStretches[i][0] - speechStretches[i - 1][1] < minSilenceDur) { speechStretches[i - 1][1] = speechStretches[i][1]; bRemoveds[i] = true; } } // // Check and remove short speech segments for (i = 0; i < speechStretches.length; i++) { if (!bRemoveds[i] && speechStretches[i][1] - speechStretches[i][0] < minSpeechDur) bRemoveds[i] = true; } // stretches.clear(); for (i = 0; i < bRemoveds.length; i++) { if (!bRemoveds[i]) { double[] newStretch = new double[2]; newStretch[0] = speechStretches[i][0]; newStretch[1] = speechStretches[i][1]; stretches.add(newStretch); } } return (double[][]) stretches.toArray(new double[0][0]); } /** * Segment a WAVE file by energy, ideally one word per segment (the result might contain more); the result is saved in a file * in transcriber format so the segmentation can be easily inspected and corrected. The parameters in: * EnergyAnalyser.getSpeechStretchesUsingEnergyHistory(): signalproc.minsilenceduration signalproc.minspeechduration can be * tuned to get better segmentation. * * @param args * : first argument is the directory where the wav files are, next arguments in the list are the files for * segmenting. * @throws Exception * : IOException, UnsupportedAudioFile exception and IllegalArgumentException when the file is not mono, it just * handles mono audio signals. */ public static void energySegmentation(String[] args) throws Exception { // First argument is the directory where the files are String wavDirectory = args[0]; String fileNameNoExt; String segmentationFileName; float duration; int i; Date today; String currentDate; SimpleDateFormat formatter; formatter = new SimpleDateFormat("yyMMdd"); today = new Date(); currentDate = formatter.format(today); if (args.length > 0) { for (int file = 1; file < args.length; file++) { System.out.println("\nProcessing file: " + args[file]); AudioInputStream ais = AudioSystem.getAudioInputStream(new File(wavDirectory + "/" + args[file])); if (!ais.getFormat().getEncoding().equals(AudioFormat.Encoding.PCM_SIGNED)) { ais = AudioSystem.getAudioInputStream(AudioFormat.Encoding.PCM_SIGNED, ais); } if (ais.getFormat().getChannels() > 1) { throw new IllegalArgumentException("Can only deal with mono audio signals"); } int samplingRate = (int) ais.getFormat().getSampleRate(); DoubleDataSource signal = new AudioDoubleDataSource(ais); int framelength = (int) (0.01 /* seconds */* samplingRate); EnergyAnalyser ea = new EnergyAnalyser(signal, framelength, framelength, samplingRate); double[][] speechStretches1 = ea.getSpeechStretches(); int energyBufferLength = 30; double speechStartLikelihood = 0.6; double speechEndLikelihood = 0.2; double shiftFromMinimumEnergyCenter = 0.1; int numClusters = 5; double[][] speechStretches2 = ea.getSpeechStretchesUsingEnergyHistory(energyBufferLength, speechStartLikelihood, speechEndLikelihood, shiftFromMinimumEnergyCenter, numClusters); System.out.println("Speech stretches1 in " + args[file] + ":"); PrintfFormat format = new PrintfFormat("%.4f"); for (i = 0; i < speechStretches1.length; i++) { System.out.println(format.sprintf(speechStretches1[i][0]) + " " + format.sprintf(speechStretches1[i][1])); } fileNameNoExt = args[file]; fileNameNoExt = fileNameNoExt.replace(".wav", ""); segmentationFileName = wavDirectory + "/" + fileNameNoExt + ".trs"; PrintWriter toList = new PrintWriter(new FileWriter(segmentationFileName)); toList.println("<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n" + "<!DOCTYPE Trans SYSTEM \"trans-14.dtd\">"); toList.println("<Trans scribe=\"MARY (automatic)\" audio_filename=\"" + fileNameNoExt + "\" version=\"1\" version_date=\"" + currentDate + "\">"); // length in samples ais.getFrameLength(); duration = ais.getFrameLength() / ais.getFormat().getFrameRate(); toList.println("<Speakers>"); toList.println("<Speaker id=\"spk1\" name=\"word\" check=\"no\" dialect=\"native\" accent=\"\" scope=\"local\"/>"); toList.println("</Speakers>"); toList.println("<Episode>"); toList.println("<Section type=\"report\" startTime=\"0\" endTime=\"" + format.sprintf(duration) + "\">"); toList.println("<Turn startTime=\"0\" endTime=\"" + format.sprintf(speechStretches2[0][0]) + "\">"); toList.println("<Sync time=\"0\"/>"); toList.println(""); toList.println("</Turn>"); System.out.println("Speech stretches2 in " + args[file] + ":"); for (i = 0; i < speechStretches2.length; i++) { System.out.println(format.sprintf(speechStretches2[i][0]) + " " + format.sprintf(speechStretches2[i][1])); toList.println("<Turn speaker=\"spk1\" startTime=\"" + format.sprintf(speechStretches2[i][0]) + "0\" endTime=\"" + format.sprintf(speechStretches2[i][1]) + "\">"); toList.println("<Sync time=\"" + format.sprintf(speechStretches2[i][0]) + "\"/>"); toList.println(""); toList.println("</Turn>"); } toList.println("</Section>"); toList.println("</Episode>"); toList.println("</Trans>"); toList.close(); System.out.println("list of Speech stretches2 in " + segmentationFileName + " num=" + i + " dur=" + duration); } } else { System.out.println("No arguments provided: \n Usage: EnergyAnalyser wav_directory wav1 wav2 ... wavN"); } } public static void main(String[] args) throws Exception { if (args.length > 0) { for (int file = 0; file < args.length; file++) { AudioInputStream ais = AudioSystem.getAudioInputStream(new File(args[file])); if (!ais.getFormat().getEncoding().equals(AudioFormat.Encoding.PCM_SIGNED)) { ais = AudioSystem.getAudioInputStream(AudioFormat.Encoding.PCM_SIGNED, ais); } if (ais.getFormat().getChannels() > 1) { throw new IllegalArgumentException("Can only deal with mono audio signals"); } int samplingRate = (int) ais.getFormat().getSampleRate(); DoubleDataSource signal = new AudioDoubleDataSource(ais); int framelength = (int) (0.01 /* seconds */* samplingRate); EnergyAnalyser ea = new EnergyAnalyser(signal, framelength, framelength, samplingRate); double[][] speechStretches1 = ea.getSpeechStretches(); int energyBufferLength = 30; double speechStartLikelihood = 0.6; double speechEndLikelihood = 0.2; double shiftFromMinimumEnergyCenter = 0.1; int numClusters = 3; double[][] speechStretches2 = ea.getSpeechStretchesUsingEnergyHistory(energyBufferLength, speechStartLikelihood, speechEndLikelihood, shiftFromMinimumEnergyCenter, numClusters); System.out.println("Speech stretches1 in " + args[file] + ":"); PrintfFormat format = new PrintfFormat("%.4f"); for (int i = 0; i < speechStretches1.length; i++) { System.out.println(format.sprintf(speechStretches1[i][0]) + " " + format.sprintf(speechStretches1[i][1])); } System.out.println("Speech stretches2 in " + args[file] + ":"); for (int i = 0; i < speechStretches2.length; i++) { System.out.println(format.sprintf(speechStretches2[i][0]) + " " + format.sprintf(speechStretches2[i][1])); } } } else { AudioFormat audioFormat = new AudioFormat(AudioFormat.Encoding.PCM_SIGNED, 44100.0F, 16, 1, 2, 44100.0F, false); DataLine.Info info = new DataLine.Info(TargetDataLine.class, audioFormat); AudioInputStream input = null; try { TargetDataLine mic = (TargetDataLine) AudioSystem.getLine(info); mic.open(audioFormat); mic.start(); input = new AudioInputStream(mic); } catch (LineUnavailableException e) { e.printStackTrace(); } DoubleDataSource signal = new AudioDoubleDataSource(input); int framelength = (int) (0.01 /* seconds */* audioFormat.getSampleRate()); EnergyAnalyser ea = new EnergyAnalyser(signal, framelength, framelength, (int) audioFormat.getSampleRate()); while (true) { try { Thread.sleep(100); } catch (InterruptedException ie) { } System.out.println(ea.getSilenceCutoff()); } } } }