/* RISO: an implementation of distributed belief networks.
* Copyright (C) 1999-2001, Robert Dodier.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA, 02111-1307, USA,
* or visit the GNU web site, www.gnu.org.
*/
package riso.general;
import java.io.*;
import java.util.*;
/** Set up a tokenizer the way we like it.
* In particular, ignore comments beginning with "<tt>%</tt>"; do not
* parse numbers (use <tt>Double.parseDouble</tt> or <tt>Integer.parseInt</tt> for that); make the special chars
* <tt>+-./#$?@[\]_:</tt> word characters; and use <tt>"</tt> for
* quoted strings.
*/
public class SmarterTokenizer extends StreamTokenizer
{
protected Reader reader;
public Hashtable string_to_numeric = null;
// Quick reference: ascii character set.
// 32= !"#$%&'()*+,-./0123456789:;<=>?@=64
// 65=ABCDEFGHIJKLMNOPQRSTUVWXYZ=90
// 91=[\]^_`=96
// 97=abcdefghijklmnopqrstuvwxyz=122
// 123={|}~=126
public SmarterTokenizer( Reader r )
{
super(r);
reader = r;
// Set tokenizer so all printable ascii chars are `ordinary.'
// This means that stream becomes a sequence of 1-char tokens.
ordinaryChars( ' ', '~' );
// Now assign all of the letters, digits, and some of the
// special characters to be word parts.
// Use parse* methods of Double, Int, or Long to parse numbers;
// don't rely on StreamTokenizer.
wordChars( 'A', 'Z' );
wordChars( 'a', 'z' );
wordChars( '0', '9' );
wordChars( '-', '/' ); // '-', '.', '/'
wordChars( '#', '$' ); // '#', '$'
wordChars( '?', '@' ); // '?', '@'
wordChars( '[', ']' ); // '[', '\', ']'
wordChars( '_', '_' );
wordChars( ':', ':' );
wordChars( '+', '+' );
commentChar( '%' );
quoteChar( '"' );
whitespaceChars( ' ', ' ' );
whitespaceChars( '\t', '\t' );
whitespaceChars( '\n', '\n' );
whitespaceChars( '\r', '\r' );
}
/** Parse the next token in the input stream as a number.
* Note that in this class, the function of parsing a number is separate
* from that of parsing a string; the two functions are combined in
* <tt>StreamTokenizer</tt>, but are separated here (1) so as to avoid
* breaking existing code which parses strings and then calls <tt>Double.parseDouble</tt>,
* and (2) so that string to numeric lookups are only called when needed, instead
* of calling <tt>Hashtable.get</tt> on every token; <tt>nextToken</tt> is slow
* enough as it is.
*/
public int nextNumber() throws IOException
{
nextToken();
if ( string_to_numeric != null )
{
Double x;
if ( (x = (Double) string_to_numeric.get( sval )) != null )
{
nval = x.doubleValue();
ttype = TT_NUMBER;
return ttype;
}
}
nval = Double.parseDouble( sval );
ttype = TT_NUMBER;
return ttype;
}
/** Parses the next block of input as a string. The <tt>ttype</tt>
* is <tt>TT_WORD</tt> and <tt>sval</tt> is set to the string.
* A ``block'' is a sequence of tokens between matching curly braces,
* and includes the braces.
*/
public int nextBlock() throws IOException
{
MultipleBuffer multi_buffer = new MultipleBuffer();
int bracket_level = 0;
int nchar = 0;
int c;
do
{
c = reader.read();
if ( c == -1 ) break;
multi_buffer.store_to_buffer( (char)c );
++nchar;
if ( c == '{' )
++bracket_level;
else if ( c == '}' )
--bracket_level;
}
while ( c != '}' || bracket_level > 0 );
if ( nchar > 0 )
{
sval = multi_buffer.toString();
ttype = TT_WORD;
return ttype;
}
else
{
sval = null;
ttype = TT_EOF;
return ttype;
}
}
public static void main( String[] args )
{
try
{
SmarterTokenizer st = new SmarterTokenizer( new InputStreamReader( System.in ) );
for ( st.nextBlock(); st.ttype != StreamTokenizer.TT_EOF; st.nextBlock() )
System.out.println( "tokenizer: "+st );
}
catch (Exception e) { e.printStackTrace(); }
}
}
/** Accumulates characters across a chain of fixed-size buffers, so that
  * growing content never forces a reallocate-and-copy of what was already
  * stored. All buffers except the last one in the chain are full.
  */
class MultipleBuffer
{
    SimpleBuffer current_buffer = null;
    Vector buffer_list = new Vector();

    MultipleBuffer()
    {
        current_buffer = new SimpleBuffer();
        buffer_list.addElement( current_buffer );
    }

    /** Append one character, chaining on a fresh buffer when the current
      * one has no room left.
      */
    void store_to_buffer( char c )
    {
        if ( current_buffer.store_to_buffer(c) ) return;

        // Current buffer is full; start a new one and store there instead.
        SimpleBuffer fresh = new SimpleBuffer();
        buffer_list.addElement( fresh );
        current_buffer = fresh;
        fresh.store_to_buffer(c);
    }

    /** Concatenate the contents of every buffer in the chain into one string. */
    public String toString()
    {
        // Every buffer before the last holds exactly BUFFER_SIZE chars,
        // so the total is a whole number of full buffers plus the tail.
        int total = (buffer_list.size()-1)*SimpleBuffer.BUFFER_SIZE + current_buffer.count;
        char[] all = new char[ total ];

        int offset = 0;
        for ( Enumeration e = buffer_list.elements(); e.hasMoreElements(); )
        {
            SimpleBuffer piece = (SimpleBuffer) e.nextElement();
            System.arraycopy( piece.buffer, 0, all, offset, piece.count );
            offset += piece.count;
        }

        return new String( all );
    }
}
/** One fixed-capacity chunk of character storage, used as a link in
  * MultipleBuffer's chain.
  */
class SimpleBuffer
{
    static final int BUFFER_SIZE = 4096;

    int count = 0;                              // number of chars stored so far
    char[] buffer = new char[ BUFFER_SIZE ];

    /** Try to append one character.
      * @return true if stored; false when the buffer is already full
      *   (in which case the character is NOT stored).
      */
    boolean store_to_buffer( char c )
    {
        if ( count == BUFFER_SIZE ) return false;

        buffer[ count ] = c;
        count += 1;
        return true;
    }
}