package edu.cmu.sphinx.tools.batch; import edu.cmu.sphinx.decoder.search.Token; import edu.cmu.sphinx.frontend.DataProcessor; import edu.cmu.sphinx.frontend.util.StreamCepstrumSource; import edu.cmu.sphinx.frontend.util.StreamDataSource; import edu.cmu.sphinx.linguist.WordSearchState; import edu.cmu.sphinx.linguist.dictionary.Word; import edu.cmu.sphinx.recognizer.Recognizer; import edu.cmu.sphinx.result.Result; import edu.cmu.sphinx.util.Utilities; import edu.cmu.sphinx.util.props.*; import java.io.*; import java.net.URL; import java.util.Iterator; import java.util.List; import java.util.logging.Logger; /** * Copyright 1999-2002 Carnegie Mellon University. Portions Copyright 2002 Sun Microsystems, Inc. Portions Copyright * 2002 Mitsubishi Electric Research Laboratories. All Rights Reserved. Use is subject to license terms. * <p> * See the file "license.terms" for information on usage and redistribution of this file, and for a DISCLAIMER OF ALL * WARRANTIES. * <p> * User: Peter Wolf Date: Nov 10, 2005 Time: 2:42:06 PM Copyright 2005, Peter Wolf * <p> * Runs a NIST corpus as used by the GALE project. The inputs are a CTL file, and a REF file. The output is a CTM * file. * <p> * A CTL file contains a list of utterances to decode. The format is * <p> * <utterance file> <start offset> <end offset> <utterance name> * <p> * The <utterance file> is a base to which the property "dataDirectory" is prepended, and ".raw" is appended. The * utterance file should be raw PCM that agrees with the "bitsPerSample", "channelCount", "samplesPerSecond", and * "framesPerSecond" properties. * <p> * The <start offset> and <end offset> are specified in frames, where * <p> * bytesPerFrame = (bitsPerSample/8)*channelCount*samplesPerSecond/framesPerSecond * <p> * The <utterance name> should be a unique string. For example "<utterance file>_<start offset>_<end offset>". * <p> * A REF file contains the correct transcripts of the utterances specified in the CTL file. Each line should be of the * form * <p> * <ASCII transcript> (<utterance name>) * <p> * The output is a "processed" CTM file. It is used by the NIST tools to compute the performance on the copus. The * format is not documented because it is currently a hack to get the Dry Run going. We need to think more about it. If * you want to use this tool talk to Peter Wolf, or Arthur Chan. */ public class BatchNISTRecognizer extends BatchModeRecognizer { protected String ctlFile; protected String dataDir; protected String refFile; protected String ctmFile; protected int bitsPerSample; protected int samplesPerSecond; protected int framesPerSecond; protected int channelCount; protected int bytesPerFrame; /** * The property that specifies the file containing the corpus utterance audio */ @S4String(defaultValue = "<raw data directory not set>") public final static String PROP_DATA_DIR = "dataDirectory"; /** * The property that specifies the file containing the corpus utterance audio */ @S4String(defaultValue = "<ctl file not set>") public final static String PROP_CTL_FILE = "ctlFile"; /** * The property that specifies the file containing the transcripts of the corpus */ @S4String(defaultValue = "<ref file not set>") public final static String PROP_REF_FILE = "refFile"; /** * The property that specifies the the directory where the output XXX files should be placed */ @S4String(defaultValue = "<ctm file not set>") public final static String PROP_CTM_FILE = "ctmFile"; /** * The sphinx properties that specify the format of the PCM audio in the data file */ @S4Integer(defaultValue = 16) public final static String PROP_BITS_PER_SAMPLE = "bitsPerSample"; @S4Integer(defaultValue = 1) public final static String PROP_CHANNEL_COUNT = "channelCount"; @S4Integer(defaultValue = 16000) public final static String PROP_SAMPLES_PER_SECOND = "samplesPerSecond"; @S4Integer(defaultValue = 100) public final static String PROP_FRAMES_PER_SECOND = "framesPerSecond"; public BatchNISTRecognizer( Recognizer recognizer, List<DataProcessor> inputDataProcessors, String ctlFile, String dataDir, String refFile, String ctmFile, int bitsPerSample, int samplesPerSecond, int framesPerSecond, int channelCount ) { this.logger = Logger.getLogger(getClass().getName()); this.recognizer = recognizer; this.inputDataProcessors = inputDataProcessors; this.dataDir = dataDir; this.ctlFile = ctlFile; this.refFile = refFile; this.ctmFile = ctmFile; this.bitsPerSample = bitsPerSample; this.channelCount = channelCount; this.samplesPerSecond = samplesPerSecond; this.framesPerSecond = framesPerSecond; this.bytesPerFrame = ((bitsPerSample / 8) * channelCount * samplesPerSecond) / framesPerSecond; logger.info( "BatchNISTRecognizer:\n" + " dataDirectory=" + dataDir + '\n' + " ctlFile=" + ctlFile + '\n' + " bitsPerSample=" + bitsPerSample + '\n' + " channelCount=" + channelCount + '\n' + " samplesPerSecond=" + samplesPerSecond + '\n' + " framesPerSecond=" + framesPerSecond + '\n'); } public BatchNISTRecognizer() { } /* * (non-Javadoc) * * @see edu.cmu.sphinx.util.props.Configurable#newProperties(edu.cmu.sphinx.util.props.PropertySheet) */ @Override public void newProperties(PropertySheet ps) throws PropertyException { logger = ps.getLogger(); recognizer = (Recognizer) ps.getComponent(PROP_RECOGNIZER); inputDataProcessors = ps.getComponentList(PROP_INPUT_DATA_PROCESSORS, DataProcessor.class); dataDir = ps.getString(PROP_DATA_DIR); ctlFile = ps.getString(PROP_CTL_FILE); refFile = ps.getString(PROP_REF_FILE); ctmFile = ps.getString(PROP_CTM_FILE); bitsPerSample = ps.getInt(PROP_BITS_PER_SAMPLE); channelCount = ps.getInt(PROP_CHANNEL_COUNT); samplesPerSecond = ps.getInt(PROP_SAMPLES_PER_SECOND); framesPerSecond = ps.getInt(PROP_FRAMES_PER_SECOND); bytesPerFrame = ((bitsPerSample / 8) * channelCount * samplesPerSecond) / framesPerSecond; logger.info( "BatchNISTRecognizer:\n" + " dataDirectory=" + dataDir + '\n' + " ctlFile=" + ctlFile + '\n' + " bitsPerSample=" + bitsPerSample + '\n' + " channelCount=" + channelCount + '\n' + " samplesPerSecond=" + samplesPerSecond + '\n' + " framesPerSecond=" + framesPerSecond + '\n'); } @SuppressWarnings("serial") protected class CTLException extends Exception { CTLException(String msg) { super(msg); } } public class CTLUtterance { int startOffset; int endOffset; String name; byte[] data; final String ref; public String getFile() { return file; } String file; CTLUtterance(String ctl, String ref) throws CTLException { /* example line: 20040422_150000_NTDTV.80Hz-6400Hz 64155 65103 20040422_150000_NTDTV_64155-65103_spk8 */ this.ref = ref; String[] fields = ctl.split(" "); if (fields.length != 4) throw new CTLException("CTL Syntax Error: " + ctl); startOffset = Integer.parseInt(fields[1]); endOffset = Integer.parseInt(fields[2]); name = fields[3]; data = new byte[(endOffset - startOffset) * bytesPerFrame]; int i = fields[0].indexOf('.'); file = fields[0]; if (i >= 0) { file = file.substring(0, i); } file = dataDir + '/' + file + ".raw"; try { InputStream dataStream = new FileInputStream(file); dataStream.skip(startOffset * bytesPerFrame); if (dataStream.read(data) != data.length) { dataStream.close(); throw new CTLException("Unable to read " + data.length + " bytes of utterance " + name); } dataStream.close(); } catch (IOException e) { throw new CTLException("Unable to read utterance " + name + ": " + e.getMessage()); } } public InputStream getInputStream() { return new ByteArrayInputStream(data); } public String getName() { return name; } public String getRef() { return ref; } public int getStartOffset() { return startOffset; } public int getEndOffset() { return endOffset; } } protected class CTLIterator implements Iterator<CTLUtterance> { CTLUtterance utterance; LineNumberReader ctlReader; LineNumberReader refReader; public CTLIterator() throws IOException { ctlReader = new LineNumberReader(new FileReader(ctlFile)); refReader = new LineNumberReader(new FileReader(refFile)); utterance = nextUtterance(); } private CTLUtterance nextUtterance() { try { String ctl = ctlReader.readLine(); String ref = refReader.readLine(); if (ctl == null || ref == null) return null; else return new CTLUtterance(ctl, ref); } catch (Exception e) { throw new Error(e.getMessage()); } } public boolean hasNext() { return utterance != null; } public CTLUtterance next() { CTLUtterance u = utterance; utterance = nextUtterance(); return u; } public void remove() { throw new Error("Not implemented"); } } protected void setInputStream(CTLUtterance utt) throws IOException { for (DataProcessor dataSource : inputDataProcessors) { if (dataSource instanceof StreamDataSource) { ((StreamDataSource) dataSource).setInputStream(utt.getInputStream()); } else if (dataSource instanceof StreamCepstrumSource) { boolean isBigEndian = Utilities .isCepstraFileBigEndian(utt.getName()); StreamCepstrumSource cepstrumSource = (StreamCepstrumSource) dataSource; cepstrumSource.setInputStream(utt.getInputStream(), isBigEndian); } } } public void decode() { try { utteranceId = 0; DataOutputStream ctm = new DataOutputStream(new FileOutputStream(ctmFile)); recognizer.allocate(); for (Iterator<CTLUtterance> i = new CTLIterator(); i.hasNext();) { CTLUtterance utt = i.next(); setInputStream(utt); Result result = recognizer.recognize(); System.out.println("Utterance " + utteranceId + ": " + utt.getName()); System.out.println("Reference: " + utt.getRef()); System.out.println("Result : " + result); logger.info("Utterance " + utteranceId + ": " + utt.getName()); logger.info("Result : " + result); handleResult(ctm, utt, result); utteranceId++; } recognizer.deallocate(); } catch (IOException io) { logger.severe("I/O error during decoding: " + io.getMessage()); } logger.info("BatchCTLDecoder: " + utteranceId + " utterances decoded"); } protected void handleResult(DataOutputStream out, CTLUtterance utt, Result result) throws IOException { dumpBestPath(out, utt, result.getBestFinalToken()); } private long dumpBestPath(DataOutputStream out, CTLUtterance utt, Token token) throws IOException { if (token == null) return 0; Token pred = token.getPredecessor(); long startFrame = dumpBestPath(out, utt, pred); if (token.isWord()) { long endFrame = token.getCollectTime(); WordSearchState wordState = (WordSearchState) token.getSearchState(); Word word = wordState.getPronunciation().getWord(); String spelling = word.getSpelling(); if (!spelling.startsWith("<")) { String[] names = utt.name.split("_"); out.write((names[0] + '_' + names[1] + '_' + names[2] + " 1 " + (utt.startOffset + startFrame) / 100.0 + ' ' + (endFrame - startFrame) / 100.0 + ' ').getBytes()); out.write(hex2Binary(spelling)); out.write(" 0.700000\n".getBytes()); } return endFrame; } return startFrame; } static public byte[] hex2Binary(String spelling) { byte[] bin = new byte[spelling.length() / 2]; for (int i = 0; i < spelling.length(); i += 2) { int i0 = hexToByte(spelling.charAt(i)); int i1 = hexToByte(spelling.charAt(i + 1)); bin[i / 2] = (byte) (i1 + (16 * i0)); } return bin; } static private int hexToByte(char c) { switch (c) { case '0': return 0; case '1': return 1; case '2': return 2; case '3': return 3; case '4': return 4; case '5': return 5; case '6': return 6; case '7': return 7; case '8': return 8; case '9': return 9; case 'a': return 10; case 'b': return 11; case 'c': return 12; case 'd': return 13; case 'e': return 14; case 'f': return 15; default: throw new Error("Bad hex char " + c); } } public static void main(String[] argv) { if (argv.length != 1) { System.out.println( "Usage: BatchNISTRecognizer propertiesFile"); System.exit(1); } String propertiesFile = argv[0]; ConfigurationManager cm; BatchNISTRecognizer bmr; try { URL url = new File(propertiesFile).toURI().toURL(); cm = new ConfigurationManager(url); bmr = (BatchNISTRecognizer) cm.lookup("batchNIST"); } catch (IOException ioe) { System.err.println("I/O error during initialization: \n " + ioe); return; } catch (PropertyException e) { System.err.println("Error during initialization: \n " + e); return; } if (bmr == null) { System.err.println("Can't find batchNIST in " + propertiesFile); return; } bmr.decode(); } }