/*
* Copyright 1999-2002 Carnegie Mellon University.
* Portions Copyright 2002 Sun Microsystems, Inc.
* Portions Copyright 2002 Mitsubishi Electric Research Laboratories.
* All Rights Reserved. Use is subject to license terms.
*
* See the file "license.terms" for information on usage and
* redistribution of this file, and for a DISCLAIMER OF ALL
* WARRANTIES.
*
*/
package edu.cmu.sphinx.trainer;
import edu.cmu.sphinx.util.props.PropertySheet;
import edu.cmu.sphinx.util.props.PropertyException;
import edu.cmu.sphinx.util.props.S4Component;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Logger;
/** Provides mechanisms for accessing the next utterance's file name and transcription. */
public class SimpleControlFile implements ControlFile {

    /** Name of the property that holds the {@link TrainerDictionary} component. */
    @S4Component(type = TrainerDictionary.class)
    public static final String DICTIONARY = "dictionary";

    /** Dictionary handed to every utterance together with its transcript. */
    private TrainerDictionary dictionary;

    private String audioFile; // name of the control file listing the audio files
    private String transcriptFile; // name of the file listing the transcriptions
    private String wordSeparator; // characters that separate words in a transcript
    private int currentPartition; // zero-based index of the partition to process
    private int numberOfPartitions; // total number of partitions
    private Iterator<String> audioFileIterator; // iterator over audioFileList
    private Iterator<String> transcriptFileIterator; // iterator over transcriptFileList
    private List<String> audioFileList; // audio file entries of the current partition
    private List<String> transcriptFileList; // transcript lines of the current partition

    /** The logger for this class. */
    private Logger logger;

    /**
     * Reads the configuration, allocates the dictionary and loads the audio
     * control file and the transcript file (restricted to the configured
     * partition when more than one partition is requested).
     *
     * @param ps the property sheet to read the configuration from
     * @throws PropertyException if the dictionary cannot be allocated
     */
    public void newProperties(PropertySheet ps) throws PropertyException {
        logger = ps.getLogger();
        this.dictionary = (TrainerDictionary) ps.getComponent(DICTIONARY);
        try {
            dictionary.allocate();
        } catch (IOException e) {
            throw new PropertyException(e);
        }

        this.audioFile = ps.getString(PROP_AUDIO_FILE);
        this.transcriptFile = ps.getString(PROP_TRANSCRIPT_FILE);
        this.currentPartition = ps.getInt(PROP_WHICH_BATCH);
        this.numberOfPartitions = ps.getInt(PROP_TOTAL_BATCHES);

        logger.info("Audio control file: " + this.audioFile);
        logger.info("Transcript file: " + this.transcriptFile);
        this.wordSeparator = " \t\n\r\f"; // the white spaces
        logger.info("Processing part " + this.currentPartition +
                " of " + this.numberOfPartitions);

        // NOTE(review): throwing java.lang.Error for an unreadable file is
        // unusual; it is kept so that callers observe the same failure type.
        try {
            this.audioFileList = getLines(audioFile);
        } catch (IOException ioe) {
            throw new Error("IOE: Can't open file " + audioFile, ioe);
        }
        try {
            this.transcriptFileList = getLines(transcriptFile);
        } catch (IOException ioe) {
            throw new Error("IOE: Can't open file " + transcriptFile, ioe);
        }
    }

    /**
     * Starts (or restarts) the iteration over the utterances by creating
     * fresh iterators over the loaded audio and transcript lists.
     */
    public void startUtteranceIterator() {
        audioFileIterator = audioFileList.iterator();
        transcriptFileIterator = transcriptFileList.iterator();
    }

    /**
     * Returns whether there is another utterance, i.e. whether both the audio
     * list and the transcript list still have an entry. A warning is logged
     * when only one of the two lists has entries left, since that indicates
     * that the two files are out of step.
     *
     * @return true if there is another utterance.
     */
    public boolean hasMoreUtterances() {
        boolean moreAudio = audioFileIterator.hasNext();
        boolean moreTranscripts = transcriptFileIterator.hasNext();
        if (moreAudio != moreTranscripts) {
            logger.warning("Audio control file \"" + audioFile
                    + "\" and transcript file \"" + transcriptFile
                    + "\" do not contain the same number of entries");
        }
        return moreAudio && moreTranscripts;
    }

    /**
     * Gets the next utterance. The next audio entry (with ".mfc" appended)
     * names the utterance; the next transcript line, stripped of its trailing
     * "(id)" group, is added to it.
     *
     * @return the next utterance.
     */
    public Utterance nextUtterance() {
        logger.fine("processing next utterance");

        String utteranceLine = audioFileIterator.next() + ".mfc";
        Utterance utterance = new SimpleUtterance(utteranceLine);

        // Strip the directory part and everything from the first dot on.
        String utteranceFilename =
                utteranceLine.replaceFirst("^.*/", "").replaceFirst("\\..*$", "");
        String transcriptLine = transcriptFileIterator.next();

        // The transcript line has to end with the utterance id in
        // parentheses, preceded by a space or a tab. The comparison is done
        // literally so that ids containing regex metacharacters work too.
        String idSuffix = "(" + utteranceFilename + ")";
        assert transcriptLine.endsWith(" " + idSuffix)
                || transcriptLine.endsWith("\t" + idSuffix) :
                "File name in transcript \"" + transcriptLine +
                "\" and control file \"" + utteranceFilename +
                "\" have to match.";

        // Removes the filename from the transcript line.
        // The transcript line is of the form:
        //    She washed her dark suit (st002)
        // Only the last parenthesised group is removed, so parentheses that
        // occur earlier in the transcript are preserved.
        String transcript =
                transcriptLine.replaceFirst("[ \t]\\([^()]*\\)$", "");
        utterance.add(transcript, dictionary, false, wordSeparator);
        return utterance;
    }

    // Next method copied from decoder.BatchDecoder

    /**
     * Gets the set of lines from the file. When more than one partition is
     * configured, only the lines belonging to the current partition are
     * returned; the last partition also receives the lines left over by the
     * integer division. An out-of-range partition index is clamped into
     * {@code [0, numberOfPartitions - 1]}, and a partition that starts past
     * the end of the file yields an empty list.
     *
     * @param file the name of the file
     * @return the lines of the file that belong to the current partition
     * @throws IOException if error occurs while reading file
     */
    private List<String> getLines(String file) throws IOException {
        List<String> list = new ArrayList<String>();
        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                list.add(line);
            }
        } finally {
            // Close the reader even if reading fails part-way through.
            reader.close();
        }

        if (numberOfPartitions > 1) {
            int totalLines = list.size();
            int linesPerBatch = Math.max(1, totalLines / numberOfPartitions);

            if (currentPartition >= numberOfPartitions) {
                currentPartition = numberOfPartitions - 1;
            }
            if (currentPartition < 0) {
                currentPartition = 0;
            }

            // Clamp both bounds: with fewer lines than partitions the
            // computed start would otherwise lie beyond the end of the list.
            int startLine = Math.min(currentPartition * linesPerBatch, totalLines);
            int endLine;
            if (currentPartition == (numberOfPartitions - 1)) {
                // last batch needs to get all remaining lines
                endLine = totalLines;
            } else {
                endLine = Math.min(startLine + linesPerBatch, totalLines);
            }
            list = list.subList(startLine, endLine);
        }
        return list;
    }
}