HmmReader.java example

Explorer
contexttoolkit-master
- src
/*
 * Copyright (c) 2004-2009, Jean-Marc François. All Rights Reserved.
 * Licensed under the New BSD license.  See the LICENSE file.
 */

package be.ac.ulg.montefiore.run.jahmm.io;

import java.io.*;
import java.util.ArrayList;
import java.util.List;

import be.ac.ulg.montefiore.run.jahmm.*;


/**
 * This class can read Hidden Markov Models represented as text files.
 * The file syntax is as follows.
 * <p>
 * A '#' character introduces a comment; the rest of the line is skipped.
 * Words must be separated with a white space (space, tab or new line).
 * The file is case-sensitive.
 * The file must begin with the words <tt>Hmm</tt> and <tt>v1.0</tt>.
 * <p>
 * The next word must be <tt>NbStates</tt> followed by a number. This
 * number is the HMM's number of states.
 * <p>
 * Then comes a description of each state.  The n-th description is
 * related to the n-th state.  A state description begins with the keywords
 * <tt>State</tt> and <tt>Pi</tt>, followed by the initial probability of
 * this state.  Then comes the letter <tt>A</tt> followed by the state
 * transition probabilities separated by a space, in the right order.
 * Then comes a description of an observation distribution which depends on
 * the type of observation handled by the HMM.
 * <p>
 * The opdfs associated with all the states must have the same type.
 * A HMM description file thus looks like this:
 * <pre>
 * # A simple Hmm
 * Hmm
 * v1.0
 * NbStates 2
 * 
 * State
 * Pi 0.7
 * A 0.1 0.9
 * IntegerOPDF [ .2 .3 .4 .1 ]
 *
 * State
 * Pi 0.3
 * A 0.4 0.6
 * IntegerOPDF [ .1 .1 .1 .7 ]
 * </pre>
 * The lines starting with 'IntegerOPDF' are distribution descriptions.
 */
public class HmmReader
{
	/**
	 * Reads a HMM from a text file.
	 * <p>
	 * The header (<tt>Hmm v1.0 NbStates n</tt>) is read first, then exactly
	 * <tt>n</tt> state descriptions.  The reader is not closed by this
	 * method.
	 *
	 * @param <O> The type of the observations handled by the HMM.
	 * @param reader The reader to read the HMM description from.
	 * @param opdfReader The {@link OpdfReader} used to read the observation
	 *        distributions.
	 * @return The HMM built from the initial probabilities, the transition
	 *         matrix and the observation distributions that have been read.
	 * @throws IOException If the underlying reader fails.
	 * @throws FileFormatException If the description is not well-formed,
	 *         including a number of states that is not a positive integer.
	 */
	public static <O extends Observation> Hmm<O>
	read(Reader reader, OpdfReader<? extends Opdf<O>> opdfReader)
	throws IOException, FileFormatException
	{
		StreamTokenizer st = new StreamTokenizer(reader);
		initSyntaxTable(st);
		
		readWords(st, "Hmm", "v1.0", "NbStates");
		
		// The tokenizer only yields doubles.  Reject values that do not
		// survive the round trip through int (fractional or out of range)
		// as well as non-positive counts, rather than truncating silently
		// or failing later with a NegativeArraySizeException.
		double nbStatesValue = readNumber(st);
		int nbStates = (int) nbStatesValue;
		if (nbStates < 1 || nbStates != nbStatesValue) {
			throw new FileFormatException(st.lineno(),
					"Syntax error: the number of states must be a " +
					"positive integer");
		}
		
		double[] pi = new double[nbStates];
		double[][] a = new double[nbStates][nbStates];
		List<Opdf<O>> opdfs = new ArrayList<Opdf<O>>(nbStates);
		
		for (int i = 0; i < nbStates; i++) {
			readState(st, nbStates, i, pi, a, opdfs, opdfReader);
		}
		
		return new Hmm<O>(pi, a, opdfs);
	}
	
	
	/**
	 * Reads the description of one state: the keywords <tt>State</tt> and
	 * <tt>Pi</tt> followed by a number, the keyword <tt>A</tt> followed by
	 * <code>nbStates</code> numbers, then one observation distribution.
	 *
	 * @param st The stream tokenizer to read from.
	 * @param nbStates The number of transition probabilities to read.
	 * @param stateNb The index of the state being read.
	 * @param pi The array whose element <code>stateNb</code> receives the
	 *        initial probability.
	 * @param a The matrix whose row <code>stateNb</code> receives the
	 *        transition probabilities.
	 * @param opdfs The list the distribution read is appended to.
	 * @param opdfReader The reader used to parse the distribution.
	 */
	private static <O extends Observation> void
	readState(StreamTokenizer st, int nbStates, int stateNb, double[] pi,
			double[][] a,
			List<Opdf<O>> opdfs, OpdfReader<? extends Opdf<O>> opdfReader)
	throws IOException, FileFormatException
	{
		readWords(st, "State", "Pi");
		pi[stateNb] = readNumber(st);
		
		readWords(st, "A");
		for (int i = 0; i < nbStates; i++) {
			a[stateNb][i] = readNumber(st);
		}
		
		opdfs.add(opdfReader.read(st));
	}
	
	
	/**
	 * Reads some keywords out of a {@link StreamTokenizer}.
	 * <p>
	 * Each expected word must match the next token exactly (the comparison
	 * is case-sensitive).  A one-character expected word also matches a
	 * single-character token holding that character.
	 *
	 * @param st A stream tokenizer.
	 * @param words The words to read, in the right order.
	 * @throws IOException If the underlying reader fails.
	 * @throws FileFormatException If a token is not the expected word; this
	 *         covers a number token and the end of the stream.
	 */
	static void readWords(StreamTokenizer st, String... words)
	throws IOException, FileFormatException
	{
		for (String word : words) {
			int type = st.nextToken();
			
			if (type == StreamTokenizer.TT_WORD) {
				if (!st.sval.equals(word)) {
					throw new FileFormatException(st.lineno(),
							"Syntax error: unexpected token '" +
							st.sval + "', ('" + word + "' expected)");
				}
			} else if (type > 0) {
				// A positive type is the code of a single character token.
				if (word.length() != 1 || type != (int) word.charAt(0)) {
					throw new FileFormatException(st.lineno(),
							"Syntax error: unexpected token '" +
							(char) type + "', ('" + word + "' expected)");
				}
			} else {
				// Number token or end of stream: nothing printable to show.
				throw new FileFormatException(st.lineno(), "Syntax error: '" +
						word + "' expected");
			}
		}
	}
	

	/**
	 * Reads the next token out of a {@link StreamTokenizer} and returns its
	 * numeric value.
	 *
	 * @param st A stream tokenizer.
	 * @return The value of the number read.
	 * @throws IOException If the underlying reader fails.
	 * @throws FileFormatException If the next token is not a number.
	 */
	static double readNumber(StreamTokenizer st)
	throws IOException, FileFormatException
	{
		if (st.nextToken() != StreamTokenizer.TT_NUMBER) {
			throw new FileFormatException(st.lineno(),
					"Syntax error: number expected");
		}
		
		return st.nval;
	}

	
	/**
	 * Initializes the syntax table of a stream tokenizer: numbers are
	 * parsed, ASCII letters are word characters, every character from 0 up
	 * to and including the space is a white space, ends of line are not
	 * reported as tokens and '#' starts a comment.  Any other character is
	 * returned as a single character token.
	 *
	 * @param st The stream tokenizer to set up.
	 */
	static void initSyntaxTable(StreamTokenizer st)
	{
		st.resetSyntax();
		st.parseNumbers();
		st.wordChars('a', 'z');
		st.wordChars('A', 'Z');
		// The range 0..' ' already contains tab, line feed and carriage
		// return, so no separate call is needed for them.
		st.whitespaceChars(0, (int) ' ');
		st.eolIsSignificant(false);
		st.commentChar((int) '#');
	}
}