/*
* Copyright (c) 2004-2009, Jean-Marc François. All Rights Reserved.
* Licensed under the New BSD license. See the LICENSE file.
*/
package be.ac.ulg.montefiore.run.jahmm.io;
import java.io.*;
import java.util.*;
import be.ac.ulg.montefiore.run.jahmm.Observation;
/**
* This class can read observation sequences from a file.
* <p>
* The file format has been chosen to be very simple:
* <ul>
* <li> a line per observation sequence, in pure 7 bits ASCII;</li>
* <li> empty (white) lines, space and tab characters are not significant;</li>
* <li> each observation is followed by a semi-colon
* (<i>i.e.</i> the line ends with a semi-colon);</li>
* <li> The '#' character introduces a comment; the rest of the line is
* skipped; </li>
* <li> A newline can be escaped using the '\' character; this character can't
* be used in any other context;</li>
* <li> the format of each observation is defined by the corresponding
* IO class.</li>
* </ul>
* <p>
* Those rules must be followed by {@link ObservationReader ObservationReader}
* subclasses.
*/
public class ObservationSequencesReader
{
    /**
     * Reads an observation sequences file. Such a file holds a set of
     * observation sequences, one per line; blank lines and comment lines
     * may appear anywhere, including after the last sequence.
     *
     * @param or An observation reader.
     * @param reader Holds the character stream reader the sequences are read
     *               from.
     * @return A {@link java.util.List List} of
     *         {@link java.util.List List}s of
     *         {@link be.ac.ulg.montefiore.run.jahmm.Observation Observation}s;
     *         empty (never <code>null</code>) if no sequence is found.
     * @throws IOException If the underlying reader fails.
     * @throws FileFormatException If a sequence is malformed (see
     *         {@link #readSequence(ObservationReader, Reader)}).
     */
    static public <O extends Observation> List<List<O>>
    readSequences(ObservationReader<O> or, Reader reader)
    throws IOException, FileFormatException
    {
        List<List<O>> sequences = new ArrayList<List<O>>();
        StreamTokenizer st = new StreamTokenizer(reader);

        initSyntaxTable(st);

        for (st.nextToken(); st.ttype != StreamTokenizer.TT_EOF;
             st.nextToken()) {
            /* The token just peeked at belongs to the next sequence (or is
             * an end-of-line to be skipped); hand it back. */
            st.pushBack();

            List<O> sequence = readSequence(or, st);

            /* Only blank lines or comments were left before the end of the
             * file: there is no further sequence to read. */
            if (sequence == null) {
                break;
            }

            sequences.add(sequence);
        }

        return sequences;
    }


    /*
     * Initializes the syntax table of a stream tokenizer: numbers are
     * parsed, every character up to and including the space is white space,
     * ends of line are reported as tokens and '#' starts a comment that
     * runs to the end of the line.  All other characters are left
     * "ordinary" by resetSyntax().
     */
    static void initSyntaxTable(StreamTokenizer st)
    {
        st.resetSyntax();
        st.parseNumbers();
        st.whitespaceChars(0, (int) ' ');
        st.eolIsSignificant(true);
        st.commentChar((int) '#');
    }


    /**
     * Reads an observation sequence out of a {@link java.io.Reader Reader}.
     * A new tokenizer is created over <code>reader</code> on each call.
     *
     * @param oir An observation reader.
     * @param reader Holds the character reader the sequence is read from.
     * @return An observation sequence read from <code>reader</code> or
     *         <code>null</code> if the end of the file is reached before
     *         any sequence is found.
     * @throws IOException If the underlying reader fails.
     * @throws FileFormatException If a '\' is not immediately followed by a
     *         new line, or if the end of the file is reached before the
     *         new line that terminates the sequence.
     */
    static public <O extends Observation> List<O>
    readSequence(ObservationReader<O> oir, Reader reader)
    throws IOException, FileFormatException
    {
        StreamTokenizer st = new StreamTokenizer(reader);
        initSyntaxTable(st);

        return readSequence(oir, st);
    }


    /**
     * Reads an observation sequence out of a {@link java.io.StreamTokenizer
     * StreamTokenizer}.  Empty lines or comments can appear before the
     * sequence itself.  <code>st</code>'s syntax table must be properly
     * initialized (see {@link #initSyntaxTable(StreamTokenizer)}).
     * <p>
     * On successful return, the tokenizer's current token is the end of
     * line that terminated the sequence.
     *
     * @param oir An observation reader; each observation is parsed by
     *            <code>oir.read(st)</code>.
     * @param st The tokenizer the sequence is read from.
     * @return The observation sequence, or <code>null</code> if the end of
     *         the file is reached before any sequence is found.
     * @throws IOException If the underlying reader fails.
     * @throws FileFormatException If a '\' is not immediately followed by a
     *         new line, or if the end of the file is reached before the
     *         new line that terminates the sequence.
     */
    static <O extends Observation> List<O>
    readSequence(ObservationReader<O> oir, StreamTokenizer st)
    throws IOException, FileFormatException
    {
        /* Skip the empty lines (comments are dropped by the tokenizer) */
        do {
            st.nextToken();
        } while (st.ttype == StreamTokenizer.TT_EOL);

        if (st.ttype == StreamTokenizer.TT_EOF) {
            return null;
        }

        List<O> sequence = new ArrayList<O>();

        do {
            /* The current token is the start of an observation; give it
             * back so that the observation reader sees it. */
            st.pushBack();
            sequence.add(oir.read(st));

            if (st.nextToken() == '\\') { /* New lines can be escaped by '\' */
                if (st.nextToken() != StreamTokenizer.TT_EOL) {
                    throw new FileFormatException("'\\' token is not " +
                            "followed by a new line");
                }

                /* Move past the escaped new line to the token that starts
                 * the continuation line. */
                st.nextToken();
            }
        } while (st.ttype != StreamTokenizer.TT_EOL &&
                 st.ttype != StreamTokenizer.TT_EOF);

        /* A sequence must be terminated by a new line */
        if (st.ttype == StreamTokenizer.TT_EOF) {
            throw new FileFormatException("Unexpected token: EOF");
        }

        return sequence;
    }
}