ObservationSequencesReader.java example

Explorer
contexttoolkit-master
- src
/*
 * Copyright (c) 2004-2009, Jean-Marc François. All Rights Reserved.
 * Licensed under the New BSD license.  See the LICENSE file.
 */

package be.ac.ulg.montefiore.run.jahmm.io;

import java.io.*;
import java.util.*;

import be.ac.ulg.montefiore.run.jahmm.Observation;


/**
 * This class can read observation sequences from a file.
 * <p>
 * The file format has been chosen to be very simple:
 * <ul>
 * <li> a line per observation sequence, in pure 7 bits ASCII;</li>
 * <li> empty (white) lines, space and tab characters are not significant;</li>
 * <li> each observation is followed by a semi-colon
 *      (<i>i.e.</i> the line ends with a semi-colon);</li>
 * <li> The '#' character introduces a comment; the rest of the line is
 *      skipped; </li>
 * <li> A newline can be escaped using the '\' character; this character can't
 *      be used in any other context;</li>
 * <li> the format of each observation is defined by the corresponding
 *      IO class.</li>
 * </ul>
 * <p>
 * Those rules must be followed by {@link ObservationReader ObservationReader} 
 * subclasses.
 */
public class ObservationSequencesReader
{	
	/**
	 * Reads an observation sequences file.  Such a file holds a set of
	 * observation sequences, one per line.
	 * <p>
	 * Empty lines (including those following the last sequence) are
	 * skipped; they never produce an entry in the result.
	 *
	 * @param or An observation reader.
	 * @param reader Holds the character stream reader the sequences are read 
	 *               from.
	 * @return A {@link java.util.List List} of 
	 *         {@link java.util.List List}s of
	 *         {@link be.ac.ulg.montefiore.run.jahmm.Observation Observation}s,
	 *         in the order they were read; empty if no sequence is found.
	 * @throws IOException If the underlying reader fails.
	 * @throws FileFormatException If a '\' is not followed by a new line, or
	 *         if the end of the stream is reached in the middle of a
	 *         sequence (<i>i.e.</i> the last sequence is not terminated by a
	 *         new line).
	 */
	public static <O extends Observation> List<List<O>>
	readSequences(ObservationReader<O> or, Reader reader)
	throws IOException, FileFormatException
	{
		List<List<O>> sequences = new ArrayList<List<O>>();
		StreamTokenizer st = new StreamTokenizer(reader);
		
		initSyntaxTable(st);
		
		/*
		 * readSequence skips the line terminators that precede a sequence
		 * and returns null once only the end of the stream is left.  Using
		 * that null as the loop sentinel (instead of copying the result
		 * unconditionally) keeps trailing empty lines from triggering a
		 * NullPointerException.
		 */
		for (List<O> sequence = readSequence(or, st); sequence != null;
		sequence = readSequence(or, st))
			sequences.add(sequence);
		
		return sequences;
	}
	
	
	/*
	 * Initialize the syntax table of a stream tokenizer: every character is
	 * first made ordinary, then numbers are parsed, characters from 0 up to
	 * the space character are whitespace, ends of line are reported as
	 * tokens and '#' starts a comment.
	 */
	static void initSyntaxTable(StreamTokenizer st)
	{
		st.resetSyntax();
		st.parseNumbers();
		st.whitespaceChars(0, (int) ' ');
		st.eolIsSignificant(true);
		st.commentChar((int) '#');
	}
	
	
	/**
	 * Reads an observation sequence out of a {@link java.io.Reader
	 * Reader}.  A new tokenizer is created around <code>reader</code> on
	 * each call.
	 *
	 * @param oir An observation reader.
	 * @param reader Holds the character reader the sequence is read from.
	 * @return An observation sequence read from <code>reader</code> or null
	 *         if the end of the stream is reached before any sequence is
	 *         found.
	 * @throws IOException If the underlying reader fails.
	 * @throws FileFormatException If a '\' is not followed by a new line, or
	 *         if the end of the stream is reached before the new line that
	 *         terminates the sequence.
	 */
	public static <O extends Observation> List<O> 
	readSequence(ObservationReader<O> oir, Reader reader) 
	throws IOException, FileFormatException
	{	
		StreamTokenizer st = new StreamTokenizer(reader);
		initSyntaxTable(st);
		
		return readSequence(oir, st);
	}
	
	
	/*
	 * Reads an observation sequence out of a {@link java.io.StreamTokenizer
	 * StreamTokenizer}.  Empty lines or comments can appear before the
	 * sequence itself. <code>st</code>'s syntax table must be properly
	 * initialized (see initSyntaxTable).
	 *
	 * Returns null if the end of the stream is reached before any sequence
	 * is found.  On normal return, the last token consumed is the new line
	 * that terminates the sequence.
	 */
	static <O extends Observation> List<O>
	readSequence(ObservationReader<O> oir, StreamTokenizer st) 
	throws IOException, FileFormatException
	{	
		/* Skip the line terminators that precede the sequence */
		do {
			st.nextToken();
		} while (st.ttype == StreamTokenizer.TT_EOL);
		
		if (st.ttype == StreamTokenizer.TT_EOF)
			return null;
		
		List<O> sequence = new ArrayList<O>();
		
		do {
			/* The token just read belongs to the next observation: hand it
			 * back so that the observation reader sees it. */
			st.pushBack();
			sequence.add(oir.read(st));
			
			if (st.nextToken() == '\\') { /* New lines can be escaped by '\' */
				if (st.nextToken() != StreamTokenizer.TT_EOL)
					throw new FileFormatException("'\\' token is not followed " +
					"by a new line");
				
				/* Move past the escaped new line; the sequence goes on with
				 * whatever comes next. */
				st.nextToken();
			}
		} while (st.ttype != StreamTokenizer.TT_EOL &&
				st.ttype != StreamTokenizer.TT_EOF);
		
		/* A sequence must be terminated by a new line, not by the end of
		 * the stream. */
		if (st.ttype == StreamTokenizer.TT_EOF)
			throw new FileFormatException("Unexpected token: EOF"); 
		
		return sequence;
	}
}