/*
* Copyright 1999-2002 Carnegie Mellon University.
* Portions Copyright 2002 Sun Microsystems, Inc.
* Portions Copyright 2002 Mitsubishi Electric Research Laboratories.
* All Rights Reserved. Use is subject to license terms.
*
* See the file "license.terms" for information on usage and
* redistribution of this file, and for a DISCLAIMER OF ALL
* WARRANTIES.
*
*/
package edu.cmu.sphinx.frontend.endpoint;
import edu.cmu.sphinx.frontend.*;
import edu.cmu.sphinx.util.props.PropertyException;
import edu.cmu.sphinx.util.props.PropertySheet;
import edu.cmu.sphinx.util.props.S4Integer;
import java.util.LinkedList;
/**
* Converts a stream of SpeechClassifiedData objects, marked as speech and
* non-speech, and marks out the regions that are considered speech. This is done
* by inserting SPEECH_START and SPEECH_END signals into the stream.
* <p>
* The algorithm for inserting the two signals is as follows.
* <p>
* The algorithm is always in one of two states: 'in-speech' and
* 'out-of-speech'. If 'out-of-speech', it will read in audio until we hit audio
* that is speech. If we have read more than 'startSpeech' amount of
* <i>continuous</i> speech, we consider that speech has started, and insert a
* SPEECH_START at 'speechLeader' time before speech first started. The state of
* the algorithm changes to 'in-speech'.
* <p>
* Now consider the case when the algorithm is in the 'in-speech' state. If it
* reads audio that is speech, the audio is scheduled for output. If the audio is
* non-speech, we read ahead until we have 'endSilence' amount of
* <i>continuous</i> non-speech. At that point we consider that speech has ended.
* A SPEECH_END signal is inserted at the time of the frame that completed the
* silence run. The algorithm returns to the 'out-of-speech' state. If any
* speech audio is encountered in-between, the accounting starts all over again.
*
* While speech audio is processed delay is lowered to some minimal amount. This
* helps to segment both slow speech with visible delays and fast speech when
* delays are minimal.
*/
public class SpeechMarker extends BaseDataProcessor {

    /**
     * The property for the minimum amount of <i>continuous</i> speech
     * (in milliseconds) to be considered as utterance start.
     */
    @S4Integer(defaultValue = 200)
    public static final String PROP_START_SPEECH = "startSpeech";
    private int startSpeechTime;

    /**
     * The property for the amount of <i>continuous</i> silence
     * (in milliseconds) to be considered as utterance end.
     */
    @S4Integer(defaultValue = 200)
    public static final String PROP_END_SILENCE = "endSilence";
    private int endSilenceTime;

    /**
     * The property for the amount of time (in milliseconds) before speech start
     * to be included as speech data.
     */
    @S4Integer(defaultValue = 50)
    public static final String PROP_SPEECH_LEADER = "speechLeader";
    private int speechLeader;

    // Frames buffered while out-of-speech so the leader plus the start-speech
    // run can be replayed once speech is confirmed; appended at the end.
    private LinkedList<Data> inputQueue;
    // Data ready to be returned by getData(); consumed from the front.
    private LinkedList<Data> outputQueue;

    private boolean inSpeech;   // true when in the 'in-speech' state
    private int speechCount;    // length of the current run of speech frames
    private int silenceCount;   // length of the current run of non-speech frames

    // Millisecond thresholds converted to frame counts. NOTE(review): the /10
    // in reset() assumes 10 ms classifier frames -- confirm against frontend.
    private int startSpeechFrames;
    private int endSilenceFrames;
    private int speechLeaderFrames;

    /**
     * Creates a SpeechMarker with explicit timing parameters.
     *
     * @param startSpeechTime minimum continuous speech (ms) marking utterance start
     * @param endSilenceTime  minimum continuous silence (ms) marking utterance end
     * @param speechLeader    audio (ms) before speech start to include as speech
     */
    public SpeechMarker(int startSpeechTime, int endSilenceTime, int speechLeader) {
        initLogger();
        this.startSpeechTime = startSpeechTime;
        this.speechLeader = speechLeader;
        this.endSilenceTime = endSilenceTime;
    }

    public SpeechMarker() {
    }

    @Override
    public void newProperties(PropertySheet ps) throws PropertyException {
        super.newProperties(ps);
        startSpeechTime = ps.getInt(PROP_START_SPEECH);
        endSilenceTime = ps.getInt(PROP_END_SILENCE);
        speechLeader = ps.getInt(PROP_SPEECH_LEADER);
    }

    /**
     * Initializes this SpeechMarker.
     */
    @Override
    public void initialize() {
        super.initialize();
        reset();
    }

    /**
     * Resets this SpeechMarker to the 'out-of-speech' starting state and
     * recomputes the frame-count thresholds from the millisecond settings.
     */
    private void reset() {
        inSpeech = false;
        speechCount = 0;
        silenceCount = 0;
        startSpeechFrames = startSpeechTime / 10;
        endSilenceFrames = endSilenceTime / 10;
        speechLeaderFrames = speechLeader / 10;
        this.inputQueue = new LinkedList<Data>();
        this.outputQueue = new LinkedList<Data>();
    }

    /**
     * Returns the next Data object, inserting SpeechStartSignal and
     * SpeechEndSignal markers around detected speech regions.
     *
     * @return the next Data object, or null if none available
     * @throws DataProcessingException
     *             if a data processing error occurs
     */
    @Override
    public Data getData() throws DataProcessingException {
        while (outputQueue.isEmpty()) {
            Data data = getPredecessor().getData();
            if (data == null)
                break;
            if (data instanceof DataStartSignal) {
                reset();
                outputQueue.add(data);
                break;
            }
            if (data instanceof DataEndSignal) {
                // Close any open speech region before forwarding end-of-data.
                if (inSpeech) {
                    outputQueue.add(new SpeechEndSignal());
                }
                outputQueue.add(data);
                break;
            }
            if (data instanceof SpeechClassifiedData) {
                SpeechClassifiedData cdata = (SpeechClassifiedData) data;
                // Track runs of consecutive speech / non-speech frames; a
                // frame of the opposite class resets the other counter.
                if (cdata.isSpeech()) {
                    speechCount++;
                    silenceCount = 0;
                } else {
                    speechCount = 0;
                    silenceCount++;
                }
                if (inSpeech) {
                    outputQueue.add(data);
                } else {
                    // Keep only as much history as could ever be replayed.
                    inputQueue.add(data);
                    if (inputQueue.size() > startSpeechFrames + speechLeaderFrames) {
                        inputQueue.removeFirst();
                    }
                }
                if (!inSpeech && speechCount == startSpeechFrames) {
                    inSpeech = true;
                    // Place the start marker 'speechLeader' ms before the
                    // speech run began. FIX: subtract startSpeechTime (ms),
                    // not startSpeechFrames (a frame count) -- the original
                    // mixed units and put the marker at the wrong time.
                    outputQueue.add(new SpeechStartSignal(cdata.getCollectTime() - speechLeader - startSpeechTime));
                    // Replay the buffered leader + start-speech frames.
                    outputQueue.addAll(inputQueue.subList(
                            Math.max(0, inputQueue.size() - startSpeechFrames - speechLeaderFrames), inputQueue.size()));
                    inputQueue.clear();
                }
                if (inSpeech && silenceCount == endSilenceFrames) {
                    inSpeech = false;
                    outputQueue.add(new SpeechEndSignal(cdata.getCollectTime()));
                }
            }
        }
        // If we have something left, return that
        if (!outputQueue.isEmpty()) {
            Data audio = outputQueue.removeFirst();
            // Unwrap classified frames back to plain audio for consumers.
            if (audio instanceof SpeechClassifiedData) {
                SpeechClassifiedData data = (SpeechClassifiedData) audio;
                audio = data.getDoubleData();
            }
            return audio;
        } else {
            return null;
        }
    }

    /**
     * Returns whether this marker is currently inside a speech region.
     *
     * @return true if in the 'in-speech' state
     */
    public boolean inSpeech() {
        return inSpeech;
    }
}