/*
 * Copyright 2007 Sun Microsystems, Inc.
 *
 * This file is part of jVoiceBridge.
 *
 * jVoiceBridge is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation and distributed hereunder
 * to you.
 *
 * jVoiceBridge is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * Sun designates this particular file as subject to the "Classpath"
 * exception as provided by Sun in the License file that accompanied this
 * code.
 */

package com.sun.voip;

import java.io.IOException;

/**
 * Energy-based speech detector.
 *
 * Incoming 16-bit linear audio is reduced to groups of 8 samples; for each
 * group the most-significant bytes are averaged and the squared average is
 * accumulated.  Once at least {@code cnThresh} groups have been accumulated,
 * the mean of those squares is compared against an adaptive power threshold
 * ({@code powthresh}) and on/off counters decide whether the speaking state
 * flips.
 */
public class SpeechDetector {

    /* Current speaking state as of the last completed evaluation window. */
    private boolean isSpeaking = false;

    static final int POW_THRESH = 50000;    // initial power threshold

    /*
     * Number of 8-sample averages that must be accumulated before the
     * speaking state is evaluated.  The constructor rescales this from the
     * sample rate and channel count.
     */
    static int cnThresh = 50;

    /*
     * When a window is at or below the threshold, powthresh is clamped to at
     * most (window power * powerThresholdLimit + 2).
     */
    static double powerThresholdLimit = 1.05;

    /* Speaking starts once oncount exceeds onThresh. */
    static int onThresh = 1;

    /* Speaking stops once offcount exceeds offThresh. */
    static int offThresh = 4;

    double powthresh = POW_THRESH;  // adaptive power threshold

    int oncount = 0;    // consecutive windows above powthresh
    int offcount = 0;   // consecutive windows at or below powthresh

    double sum = 0;     // running sum of squared averages in the current window
    double cnt = 0;     // number of averages accumulated in the current window

    int speechDetectorCalls;    // number of processData() invocations
    long speechDetectorTime;    // total time spent in processData(), in CurrentTime units

    String id;
    MediaInfo mediaInfo;

    /**
     * Creates a detector for the given call.
     *
     * NOTE: this assigns the static {@code cnThresh}, so constructing a
     * detector changes the window size used by every SpeechDetector instance
     * and overrides any value previously installed with
     * {@link #setCnThresh(int)}.
     *
     * @param id        identifier used as a prefix in printStatistics(); may be null
     * @param mediaInfo supplies the sample rate and channel count
     */
    public SpeechDetector(String id, MediaInfo mediaInfo) {
        this.id = id;
        this.mediaInfo = mediaInfo;

        /*
         * Scale the base of 50 averages (at 8000 Hz) by the sample rate.
         * Multiply before dividing so that rates which are not a multiple
         * of 8000 are not truncated before scaling.
         */
        cnThresh = mediaInfo.getSampleRate() * 50 / 8000;

        if (mediaInfo.getChannels() == 2) {
            cnThresh *= 2;
        }
    }

    /**
     * Sets the number of averages required before the speaking state is
     * evaluated.
     */
    public static void setCnThresh(int cnThresh) {
        SpeechDetector.cnThresh = cnThresh;

        if (Logger.logLevel >= Logger.LOG_MOREINFO) {
            Logger.println("cnThresh set to " + cnThresh);
        }
    }

    /** Returns the number of averages required per evaluation window. */
    public static int getCnThresh() {
        return cnThresh;
    }

    /**
     * Sets the multiplier used when clamping the adaptive power threshold
     * after a window at or below the threshold.
     */
    public static void setPowerThresholdLimit(double powerThresholdLimit) {
        SpeechDetector.powerThresholdLimit = powerThresholdLimit;

        if (Logger.logLevel >= Logger.LOG_MOREINFO) {
            Logger.println("powerThresholdLimit set to " + powerThresholdLimit);
        }
    }

    /** Returns the multiplier used when clamping the power threshold. */
    public static double getPowerThresholdLimit() {
        return powerThresholdLimit;
    }

    /**
     * Sets the count of consecutive above-threshold windows that must be
     * exceeded before speaking is reported as started.
     */
    public static void setOnThresh(int onThresh) {
        SpeechDetector.onThresh = onThresh;

        if (Logger.logLevel >= Logger.LOG_MOREINFO) {
            Logger.println("onThresh set to " + onThresh);
        }
    }

    /** Returns the current on-threshold. */
    public static int getOnThresh() {
        return onThresh;
    }

    /**
     * Sets the count of consecutive windows at or below the threshold that
     * must be exceeded before speaking is reported as stopped.
     */
    public static void setOffThresh(int offThresh) {
        SpeechDetector.offThresh = offThresh;

        if (Logger.logLevel >= Logger.LOG_MOREINFO) {
            Logger.println("offThresh set to " + offThresh);
        }
    }

    /** Returns the current off-threshold. */
    public static int getOffThresh() {
        return offThresh;
    }

    /**
     * Resets the detector to its initial, not-speaking state.
     *
     * The accumulation window, the adaptive power threshold and the on/off
     * counters are all cleared so that state from before the reset cannot
     * influence the first decisions made afterwards.
     *
     * @return the speaking state that was in effect before the reset
     */
    public boolean reset() {
        boolean oldIsSpeaking = isSpeaking;

        sum = 0;
        cnt = 0;
        oncount = 0;
        offcount = 0;
        powthresh = POW_THRESH;
        isSpeaking = false;

        return oldIsSpeaking;
    }

    /**
     * Processes 16-bit linear data held in a byte array.  Only the bytes at
     * even offsets are examined (one byte per 16-bit sample).
     *
     * @param linearData 16-bit linear samples, two bytes per sample
     * @return true if speaking started or stopped
     */
    public boolean processData(byte[] linearData) {
        speechDetectorCalls++;

        long start = CurrentTime.getTime();

        /*
         * Round down to a 16 byte boundary in case the length isn't
         * a multiple of 16; trailing bytes are ignored.
         */
        int length = (linearData.length / 16) * 16;

        for (int i = 0; i < length; i += 16) {
            /* Sum of the examined byte of each of the next 8 samples. */
            int msbSum = linearData[i] + linearData[i + 2]
                + linearData[i + 4] + linearData[i + 6]
                + linearData[i + 8] + linearData[i + 10]
                + linearData[i + 12] + linearData[i + 14];

            accumulate(msbSum);
        }

        speechDetectorTime += (CurrentTime.getTime() - start);
        return speakingChanged();
    }

    /**
     * Processes 16-bit linear data held in an int array, one sample per
     * element.  Bits 8-15 of each element are used, as a signed byte.
     *
     * @param linearData 16-bit linear samples, one per int
     * @return true if speaking started or stopped
     */
    public boolean processData(int[] linearData) {
        speechDetectorCalls++;

        long start = CurrentTime.getTime();

        /*
         * Round down to an 8 sample boundary; trailing samples are ignored.
         */
        int nSamples = (linearData.length / 8) * 8;

        for (int i = 0; i < nSamples; i += 8) {
            int msbSum = 0;

            for (int j = 0; j < 8; j++) {
                msbSum += (byte) (linearData[i + j] >> 8);
            }

            accumulate(msbSum);
        }

        speechDetectorTime += (CurrentTime.getTime() - start);
        return speakingChanged();
    }

    /*
     * Folds one group of 8 samples into the current window.
     * msbSum is the sum of the 8 per-sample bytes.
     */
    private void accumulate(int msbSum) {
        double avg = msbSum / 8.;

        /*
         * Divide by the number of channels.  For stereo we're
         * likely to be getting similar sounds in each channel.
         */
        avg /= mediaInfo.getChannels();

        /*
         * By squaring the average, the larger values weigh more
         * than the smaller ones.
         * Also, squaring makes everything positive.
         */
        sum += (avg * avg);
        cnt++;
    }

    /*
     * Evaluates the current window if it is complete.
     * Returns true if the speaker has started or stopped speaking.
     */
    private boolean speakingChanged() {
        /*
         * Not enough data yet.  The cnt == 0 check avoids computing 0/0
         * when cnThresh has been set to zero or less.
         */
        if (cnt == 0 || cnt < cnThresh) {
            return false;
        }

        /* value is the power in this sample set. */
        double value = sum / cnt;

        boolean changed = false;

        if (value > powthresh) {
            oncount++;
            offcount = 0;

            if (oncount > onThresh && !isSpeaking) {
                isSpeaking = true;
                changed = true;
            }

            // drag powthresh up
            powthresh = ((powthresh + 2) * 63 + value) / 64;
        } else {
            offcount++;
            oncount = 0;

            if (offcount > offThresh && isSpeaking) {
                isSpeaking = false;
                changed = true;
            }

            // make sure powthresh is <= value * powerThresholdLimit + 2
            double limit = value * powerThresholdLimit + 2;

            if (powthresh > limit) {
                powthresh = limit;
            }
        }

        /* Start a new window. */
        sum = 0;
        cnt = 0;

        return changed;
    }

    /**
     * Returns the speaking state determined by the most recently completed
     * evaluation window.  This is false initially and after reset(), and it
     * does not change while a window is still being accumulated.
     */
    public boolean isSpeaking() {
        return isSpeaking;
    }

    /**
     * Writes the call count and the average time per processData() call to
     * the log file.
     */
    public void printStatistics() {
        String s = "";

        if (id != null) {
            s += "Call " + id + ":  ";
        }

        Logger.writeFile(s + "Speech detector calls:  " + speechDetectorCalls);

        if (speechDetectorCalls != 0) {
            /*
             * NOTE(review): the label says "ms" but dividing by time units
             * per second appears to yield seconds -- confirm against
             * CurrentTime before changing.
             */
            Logger.writeFile(s + "SpeechDetector average ms per call:  "
                + ((float) ((float) speechDetectorTime / speechDetectorCalls)
                / CurrentTime.getTimeUnitsPerSecond()));
        }
    }

    /**
     * Test driver: runs the detector over an audio file and logs each
     * transition between speaking and not speaking.
     */
    public static void main(String[] args) {
        if (args.length != 1) {
            Logger.println("Usage:  java SpeechDetector <.au file>");
            System.exit(1);
        }

        TreatmentManager treatmentManager = null;

        try {
            treatmentManager = new TreatmentManager(args[0], 0);
        } catch (IOException e) {
            System.out.println("Can't get treatment " + e.getMessage());
            System.exit(1);
        }

        MediaInfo mediaInfo = new MediaInfo((byte) 0, RtpPacket.PCM_ENCODING,
            treatmentManager.getSampleRate(), treatmentManager.getChannels(),
            false);

        Logger.println("MediaInfo " + mediaInfo);

        SpeechDetector speechDetector = new SpeechDetector("Test", mediaInfo);

        byte[] linearData;

        while ((linearData = treatmentManager.getLinearDataBytes(
                RtpPacket.PACKET_PERIOD)) != null) {

            if (speechDetector.processData(linearData)) {
                if (speechDetector.isSpeaking()) {
                    Logger.println("Started speaking...");
                } else {
                    Logger.println("Stopped speaking...");
                }
            }
        }
    }
}