/* RISO: an implementation of distributed belief networks.
 * Copyright (C) 1999-2001, Robert Dodier.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA, 02111-1307, USA,
 * or visit the GNU web site, www.gnu.org.
 */
package riso.general;

import java.io.*;
import java.util.*;

/** Set up a tokenizer the way we like it.
  * In particular, ignore comments beginning with "<tt>%</tt>"; do not
  * parse numbers (use <tt>Double.parseDouble</tt> or <tt>Integer.parseInt</tt>
  * for that); make the special chars <tt>+-./#$?@[\]_:</tt> word characters;
  * and use <tt>"</tt> for quoted strings.
  */
public class SmarterTokenizer extends StreamTokenizer {
    /** The reader handed to the constructor; <tt>nextBlock</tt> reads
      * characters from it directly.
      */
    protected Reader reader;

    /** Optional table mapping token strings to <tt>Double</tt> values.
      * When non-null, <tt>nextNumber</tt> consults it before falling back
      * to <tt>Double.parseDouble</tt>.
      */
    public Hashtable string_to_numeric = null;

    // Quick reference: ascii character set.
    //  32= !"#$%&'()*+,-./0123456789:;<=>?@=64
    //  65=ABCDEFGHIJKLMNOPQRSTUVWXYZ=90
    //  91=[\]^_`=96
    //  97=abcdefghijklmnopqrstuvwxyz=122
    // 123={|}~=126

    /** Creates a tokenizer reading from <tt>r</tt> and installs the
      * character classes described in the class comment.
      *
      * @param r source of characters; also kept in <tt>reader</tt>.
      */
    public SmarterTokenizer(Reader r) {
        super(r);
        reader = r;

        // Set tokenizer so all printable ascii chars are `ordinary.'
        // This means that stream becomes a sequence of 1-char tokens.
        ordinaryChars(' ', '~');

        // Now assign all of the letters, digits, and some of the
        // special characters to be word parts.
        // Use parse* methods of Double, Int, or Long to parse numbers;
        // don't rely on StreamTokenizer.
        wordChars('A', 'Z');
        wordChars('a', 'z');
        wordChars('0', '9');
        wordChars('-', '/');    // '-', '.', '/'
        wordChars('#', '$');    // '#', '$'
        wordChars('?', '@');    // '?', '@'
        wordChars('[', ']');    // '[', '\', ']'
        wordChars('_', '_');
        wordChars(':', ':');
        wordChars('+', '+');

        commentChar('%');
        quoteChar('"');

        whitespaceChars(' ', ' ');
        whitespaceChars('\t', '\t');
        whitespaceChars('\n', '\n');
        whitespaceChars('\r', '\r');
    }

    /** Parse the next token in the input stream as a number.
      * Note that in this class, the function of parsing a number is separate
      * from that of parsing a string; the two functions are combined in
      * <tt>StreamTokenizer</tt>, but are separated here (1) so as to avoid
      * breaking existing code which parses strings and then calls
      * <tt>Double.parseDouble</tt>, and (2) so that string to numeric lookups
      * are only called when needed, instead of calling <tt>Hashtable.get</tt>
      * on every token; <tt>nextToken</tt> is slow enough as it is.
      *
      * <p> On success <tt>nval</tt> holds the value and <tt>ttype</tt> is
      * <tt>TT_NUMBER</tt>; <tt>sval</tt> still holds the token text.
      *
      * @return <tt>TT_NUMBER</tt>.
      * @throws IOException if reading the next token fails.
      * @throws NumberFormatException if the next token carries no text
      *   (end of input, or a single ordinary character such as a brace),
      *   or if its text is neither a key of <tt>string_to_numeric</tt> nor
      *   parseable by <tt>Double.parseDouble</tt>.
      */
    public int nextNumber() throws IOException {
        nextToken();

        // A token without text cannot be a number. Report that explicitly
        // instead of letting Hashtable.get(null) or Double.parseDouble(null)
        // fail with an uninformative NullPointerException.
        if (sval == null) {
            String found = (ttype == TT_EOF)
                ? "end of input"
                : "character '" + (char) ttype + "'";
            throw new NumberFormatException(
                "SmarterTokenizer.nextNumber: expected a number but found "
                + found + " at line " + lineno());
        }

        if (string_to_numeric != null) {
            Double mapped = (Double) string_to_numeric.get(sval);
            if (mapped != null) {
                nval = mapped.doubleValue();
                ttype = TT_NUMBER;
                return ttype;
            }
        }

        nval = Double.parseDouble(sval);
        ttype = TT_NUMBER;
        return ttype;
    }

    /** Parses the next block of input as a string. The <tt>ttype</tt>
      * is <tt>TT_WORD</tt> and <tt>sval</tt> is set to the string.
      * A ``block'' is a sequence of tokens between matching curly braces,
      * and includes the braces.
      *
      * <p> Characters are taken one at a time straight from <tt>reader</tt>
      * and copied verbatim, so any text ahead of the opening brace is part
      * of the returned string. Reading stops at end of input, or after a
      * closing brace that leaves no brace open; an unmatched closing brace
      * therefore also ends the block. If no character at all could be read,
      * <tt>ttype</tt> is <tt>TT_EOF</tt> and <tt>sval</tt> is null.
      *
      * <p> NOTE(review): since this bypasses the superclass, a character
      * which <tt>StreamTokenizer</tt> has already read ahead while scanning
      * the previous token is presumably not seen here -- confirm.
      *
      * @return <tt>TT_WORD</tt> if any characters were read, otherwise
      *   <tt>TT_EOF</tt>.
      * @throws IOException if reading from <tt>reader</tt> fails.
      */
    public int nextBlock() throws IOException {
        MultipleBuffer multi_buffer = new MultipleBuffer();
        int bracket_level = 0;
        int nchar = 0;
        int c;

        do {
            c = reader.read();
            if (c == -1) {
                break;
            }

            multi_buffer.store_to_buffer((char) c);
            ++nchar;

            if (c == '{') {
                ++bracket_level;
            } else if (c == '}') {
                --bracket_level;
            }
        } while (c != '}' || bracket_level > 0);

        if (nchar > 0) {
            sval = multi_buffer.toString();
            ttype = TT_WORD;
        } else {
            sval = null;
            ttype = TT_EOF;
        }

        return ttype;
    }

    /** Reads blocks from standard input until end of input and prints
      * the tokenizer state after each one.
      */
    public static void main(String[] args) {
        try {
            SmarterTokenizer st = new SmarterTokenizer(new InputStreamReader(System.in));

            for (st.nextBlock(); st.ttype != StreamTokenizer.TT_EOF; st.nextBlock()) {
                System.out.println("tokenizer: " + st);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

/** Growable character store built from a list of fixed-size
  * <tt>SimpleBuffer</tt> chunks.
  */
class MultipleBuffer {
    /** The chunk currently being filled; always the last one in the list. */
    SimpleBuffer current_buffer = null;

    /** Every chunk allocated so far, in the order it was filled. */
    Vector buffer_list = new Vector();

    MultipleBuffer() {
        current_buffer = new SimpleBuffer();
        buffer_list.addElement(current_buffer);
    }

    /** Appends <tt>c</tt>, starting a new chunk when the current one is full. */
    void store_to_buffer(char c) {
        if (!current_buffer.store_to_buffer(c)) {
            current_buffer = new SimpleBuffer();
            buffer_list.addElement(current_buffer);
            current_buffer.store_to_buffer(c);
        }
    }

    /** Returns all stored characters, in order, as one string. */
    public String toString() {
        // Every chunk except the last is full; the last holds `count' chars.
        int total_size = (buffer_list.size() - 1) * SimpleBuffer.BUFFER_SIZE + current_buffer.count;
        char[] total_buffer = new char[total_size];

        int offset = 0;
        for (int i = 0; i < buffer_list.size(); i++) {
            SimpleBuffer sb = (SimpleBuffer) buffer_list.elementAt(i);
            System.arraycopy(sb.buffer, 0, total_buffer, offset, sb.count);
            offset += sb.count;
        }

        return new String(total_buffer);
    }
}

/** Fixed-capacity character buffer. */
class SimpleBuffer {
    static final int BUFFER_SIZE = 4096;

    /** Number of characters stored so far. */
    int count = 0;

    char[] buffer = new char[BUFFER_SIZE];

    /** Stores <tt>c</tt> if there is room.
      *
      * @return true if the character was stored, false if the buffer is full.
      */
    boolean store_to_buffer(char c) {
        if (count < BUFFER_SIZE) {
            buffer[count++] = c;
            return true;
        } else {
            return false;
        }
    }
}