/*
* Copyright 1999-2002 Carnegie Mellon University.
* Portions Copyright 2002 Sun Microsystems, Inc.
* Portions Copyright 2002 Mitsubishi Electric Research Laboratories.
* All Rights Reserved. Use is subject to license terms.
*
* See the file "license.terms" for information on usage and
* redistribution of this file, and for a DISCLAIMER OF ALL
* WARRANTIES.
*
*/
package edu.cmu.sphinx.frontend.endpoint;
import edu.cmu.sphinx.frontend.*;
import edu.cmu.sphinx.util.LogMath;
import edu.cmu.sphinx.util.props.PropertyException;
import edu.cmu.sphinx.util.props.PropertySheet;
import edu.cmu.sphinx.util.props.S4Double;
import edu.cmu.sphinx.util.props.S4Integer;
import java.util.logging.Level;
/**
* Implements a level tracking endpointer invented by Bent Schmidt Nielsen.
* <p>This endpointer is composed of two main steps.
* <ol>
* <li>classification of audio into speech and non-speech
* <li>inserting SPEECH_START and SPEECH_END signals around speech and removing non-speech regions
* </ol>
* <p>
* The first step, classification of audio into speech and non-speech, uses Bent Schmidt Nielsen's algorithm. Each
* time audio comes in, the average signal level and the background noise level are updated, using the signal level of
* the current audio. If the average signal level is greater than the background noise level by a certain threshold
* value (configurable), then the current audio is marked as speech. Otherwise, it is marked as non-speech.
* <p>
* The second step of this endpointer is documented in the class {@link SpeechMarker SpeechMarker}
*
* @see SpeechMarker
*/
public class SpeechClassifier extends AbstractVoiceActivityDetector {

    /** The property specifying the endpointing frame length in milliseconds. */
    @S4Integer(defaultValue = 10)
    public static final String PROP_FRAME_LENGTH_MS = "frameLengthInMs";

    /** The property specifying the minimum signal level used to update the background signal level. */
    @S4Double(defaultValue = 0)
    public static final String PROP_MIN_SIGNAL = "minSignal";

    /**
     * The property specifying the threshold. If the current signal level is greater than the background level by
     * this threshold, then the current signal is marked as speech. Therefore, a lower threshold will make the
     * endpointer more sensitive, that is, mark more audio as speech. A higher threshold will make the endpointer less
     * sensitive, that is, mark less audio as speech.
     */
    @S4Double(defaultValue = 10)
    public static final String PROP_THRESHOLD = "threshold";

    /** The property specifying the adjustment, i.e. the rate at which the background level tracks a rising signal. */
    @S4Double(defaultValue = 0.003)
    public static final String PROP_ADJUSTMENT = "adjustment";

    /** Weight given to the running average when folding in the current frame (current frame has weight 1). */
    protected final double averageNumber = 1;

    /** Rate at which the background level is pulled upward toward a louder current signal. */
    protected double adjustment;

    /** Average signal level. */
    protected double level;

    /** Background signal level. */
    protected double background;

    /** Minimum valid signal level. */
    protected double minSignal;

    /** Difference (level - background) above which a frame is classified as speech. */
    protected double threshold;

    /** Frame length in seconds, derived from {@link #PROP_FRAME_LENGTH_MS}. */
    protected float frameLengthSec;

    /** Classification result of the most recently processed frame. */
    protected boolean isSpeech;

    /* Statistics accumulated for signal-to-noise estimation (see getSNR). */
    protected long speechFrames;
    protected long backgroundFrames;
    protected double totalBackgroundLevel;
    protected double totalSpeechLevel;

    /**
     * Creates the classifier with explicit parameters.
     *
     * @param frameLengthMs frame length in milliseconds
     * @param adjustment    rate at which the background level tracks a rising signal
     * @param threshold     level-over-background difference required to mark a frame as speech
     * @param minSignal     minimum signal level used to update the levels
     */
    public SpeechClassifier(int frameLengthMs, double adjustment, double threshold, double minSignal) {
        initLogger();
        this.frameLengthSec = frameLengthMs / 1000.f;
        this.adjustment = adjustment;
        this.threshold = threshold;
        this.minSignal = minSignal;
        initialize();
    }

    /** Default constructor for configuration via {@link #newProperties(PropertySheet)}. */
    public SpeechClassifier() {
    }

    @Override
    public void newProperties(PropertySheet ps) throws PropertyException {
        super.newProperties(ps);
        int frameLengthMs = ps.getInt(PROP_FRAME_LENGTH_MS);
        frameLengthSec = frameLengthMs / 1000.f;
        adjustment = ps.getDouble(PROP_ADJUSTMENT);
        threshold = ps.getDouble(PROP_THRESHOLD);
        minSignal = ps.getDouble(PROP_MIN_SIGNAL);
        logger = ps.getLogger();
        initialize();
    }

    /** Initializes this LevelTracker endpointer and DataProcessor predecessor. */
    @Override
    public void initialize() {
        super.initialize();
        reset();
    }

    /** Resets this LevelTracker to a starting state. */
    protected void reset() {
        level = 0;
        // Start the background estimate high; it only decreases when a quieter frame arrives,
        // so it converges down to the true noise floor quickly.
        background = 300;
        resetStats();
    }

    /**
     * Returns the logarithm base 10 of the root mean square of the given samples.
     *
     * @param samples the samples; must be non-empty
     * @return the calculated log root mean square in log 10, scaled to dB (x 20)
     */
    public static double logRootMeanSquare(double[] samples) {
        assert samples.length > 0;
        double sumOfSquares = 0.0;
        for (double sample : samples) {
            sumOfSquares += sample * sample;
        }
        double rootMeanSquare = Math.sqrt(sumOfSquares / samples.length);
        // Clamp to 1 so the logarithm is never negative (log10(1) == 0).
        rootMeanSquare = Math.max(rootMeanSquare, 1);
        return (LogMath.log10((float) rootMeanSquare) * 20);
    }

    /**
     * Classifies the given audio frame as speech or not, and updates the endpointing parameters.
     *
     * @param audio the audio frame
     * @return Data with classification flag
     */
    protected SpeechClassifiedData classify(DoubleData audio) {
        double current = logRootMeanSquare(audio.getValues());
        isSpeech = false;
        // Frames below minSignal are ignored: neither the levels nor the decision are updated.
        if (current >= minSignal) {
            // Exponential-style running average of the signal level.
            level = ((level * averageNumber) + current) / (averageNumber + 1);
            if (current < background) {
                // Background snaps down immediately to a quieter frame...
                background = current;
            } else {
                // ...but rises only slowly toward a louder one.
                background += (current - background) * adjustment;
            }
            // The averaged level must never fall below the background estimate.
            if (level < background) {
                level = background;
            }
            isSpeech = (level - background > threshold);
        }
        SpeechClassifiedData labeledAudio = new SpeechClassifiedData(audio, isSpeech);
        if (logger.isLoggable(Level.FINEST)) {
            String speech = "";
            if (labeledAudio.isSpeech())
                speech = "*";
            logger.finest("Bkg: " + background + ", level: " + level +
                    ", current: " + current + ' ' + speech);
        }
        collectStats(isSpeech);
        return labeledAudio;
    }

    /**
     * Resets the SNR statistics. Frame counters start at 1 so the divisions
     * in {@link #getSNR()} are always well-defined.
     */
    private void resetStats() {
        backgroundFrames = 1;
        speechFrames = 1;
        totalSpeechLevel = 0;
        totalBackgroundLevel = 0;
    }

    /**
     * Collects the statistics to provide information about signal to noise ratio in channel
     *
     * @param isSpeech if the current frame is classified as speech
     */
    private void collectStats(boolean isSpeech) {
        if (isSpeech) {
            totalSpeechLevel = totalSpeechLevel + level;
            speechFrames = speechFrames + 1;
        } else {
            totalBackgroundLevel = totalBackgroundLevel + background;
            backgroundFrames = backgroundFrames + 1;
        }
    }

    /**
     * Returns the next Data object, classifying DoubleData frames as speech or
     * non-speech and resetting the tracker at each DataStartSignal.
     *
     * @return the next Data object, or null if none available
     * @throws DataProcessingException if a data processing error occurs
     */
    @Override
    public Data getData() throws DataProcessingException {
        Data audio = getPredecessor().getData();

        if (audio instanceof DataStartSignal)
            reset();

        if (audio instanceof DoubleData) {
            DoubleData data = (DoubleData) audio;
            audio = classify(data);
        }
        return audio;
    }

    /**
     * Method that returns if current returned frame contains speech.
     * It could be used by noise filter for example to adjust noise
     * spectrum estimation.
     *
     * @return if current frame is speech
     */
    @Override
    public boolean isSpeech() {
        return isSpeech;
    }

    /**
     * Retrieves accumulated signal to noise ratio in dbScale
     *
     * @return signal to noise ratio
     */
    public double getSNR() {
        double snr = (totalSpeechLevel / speechFrames) - (totalBackgroundLevel / backgroundFrames);
        logger.fine("Background " + totalBackgroundLevel / backgroundFrames);
        logger.fine("Speech " + totalSpeechLevel / speechFrames);
        logger.fine("SNR is " + snr);
        return snr;
    }

    /**
     * Return the estimation if input data was noisy enough to break
     * recognition. The audio is counted noisy if signal to noise ratio
     * is less then 20dB.
     *
     * @return estimation of data being noisy
     */
    public boolean getNoisy() {
        return (getSNR() < 20);
    }
}