Scan.java example

Explorer
BigSemanticsJava-master
/*
 * Copyright 1996-2002 by Andruid Kerne. All rights reserved.
 * CONFIDENTIAL. Use is subject to license terms.
 */
package ecologylab.bigsemantics.html.old;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;

import ecologylab.generic.Debug;
import ecologylab.generic.StringBuilderPool;
import ecologylab.generic.StringTools;

/**
 * Hand written HTML scanner; provides tokens to {@link Parser the Parser}.
 */
public class Scan
extends Debug
implements ecologylab.serialization.CharacterConstants
{
   /** Only kept so that debug messages can mention which document is being scanned. */
   public String urlString;

   /** Character source that the scanner reads from. */
   BufferedReader bufferedReader;

   ////////////// class wide //////////////

   /** The space character (0x20); also the highest index of the whitespace table. */
   static final char SPACE = ' ';

   /**
    * Lookup table indexed by character code, covering codes 0 through SPACE.
    * An entry is true when that character is treated as whitespace.
    */
   static boolean[] whitespaceChars;

   static
   {
      // line feed (0x0a), tab (0x09), form feed (0x0c), carriage return (0x0d), space (0x20)
      final char[] blanks = { '\n', '\t', '\f', '\r', ' ' };
      final boolean[] table = new boolean[SPACE + 1];
      for (char blank : blanks)
      {
         table[blank] = true;
      }
      whitespaceChars = table;
   }
   
   //+++++++++++++++++ Scanner State ++++++++++++++++++++//
   /**
	* Current scanner mode: one of the *_MODE constants declared below.
	* scan() switches on this value to decide how each character is handled.
	* Never explicitly initialised, so it starts at 0 (OUTSIDE_TAG_MODE).
	*/
   int		mode;
   // boolean	ignoreMode; // i added & removed this 2/05 -- andruid
   
   /**
	* Passed as the third argument when the shared StringBuilderPool is created.
	* NOTE(review): presumably the initial capacity of each pooled builder --
	* confirm against StringBuilderPool.
	*/
   static public final int BUFFER_SIZE	= 512;
   //StringBuffer buffer	= new StringBuffer(BUFFER_SIZE);

   /**
	* Pool shared by all Scan instances; each instance takes its token buffer from it.
	*/
   static StringBuilderPool stringBuffersPool = 
	   new StringBuilderPool(16, 16, BUFFER_SIZE);
   
   /**
	* Accumulates the characters of the token being built; cleared at the start of every scan().
	*/
   StringBuilder	buffer  	= stringBuffersPool.nextBuffer();
   
   /**
	* Text of the token most recently returned by scan(). scan() sets it to null
	* for token kinds that carry no text (whitespace, tag close, end of file, ...),
	* and leaves it untouched when it returns TT_COMMENT.
	*/
   public String	sval;
   // The two push-back fields below are not referenced by any code in this class.
   String	pushBackBuffer;
   int	  	pushBackVal;

   /**
	* char that delimited the current/last quoted value
	*/
   char		quoteChar	= '"';
   
   /**
	* Looking for entities (triggered by &), tags (triggered by start
	* tag), words, whitespace.
	*/
   static final int	OUTSIDE_TAG_MODE	= 0;
   
   /**
	* Looking for quoted vals, unquoted attr_vals, =, tag close.
	*/
   static final int	INSIDE_TAG_MODE		= 1;
   
   /**
	* triggered by = inside a tag
	*/
   static final int	VAL_MODE		= 2;
   
   /**
	* Inside a quoted attribute value (entered from VAL_MODE on ' or ").
	* Looking for the matching closing quote character, which is held in quoteChar.
	*/
   static final int	QUOTED_VAL_MODE		= 3;
   
   /**
	* Looking for end comment.
	*/
   static final int	COMMENT_MODE		= 4;
   
   
   /**
    * looking for comments in the javascript; scan() leaves this mode
    * (back to OUTSIDE_TAG_MODE) at the next line feed.
    */
   static final int SCRIPT_COMMENT_MODE	= 5;

   // from StreamTokenizer
   /**
	* scan() return value for end of file.
	*/
   public static final int TT_EOF		= -1;
   
   // i think that 0x0a = 10 decimal
   // public static final int TT_EOL		= '\n';
   // public static final int TT_NUMBER		= -2;
   /**
	* scan() return value for a word of text.
	*/
   public static final int TT_WORD		= -3;
   
   /**
	* Value scan() starts each call with, before any token kind has been
	* recognized; returned as-is if the call ends without classifying a token.
	*/
   public static final int TT_UNKNOWN		= -4;
   
   /**
	* scan() return value for whitespace between tokens.
	*/
   public static final int TT_WHITESPACE	= -5;
   
   /**
	* scan() return value for an HTML comment. By the time it is returned the
	* whole comment, through its closing "-->", has been read and discarded.
	*/
   public static final int TT_COMMENT		= -8;
   
   /**
	* scan() return value for end of an HTML comment (produced in COMMENT_MODE).
	*/
   public static final int TT_END_COMMENT	= -9;
   
   /**
	* scan() return value for an HTML tag (an xml element). The actual tag is in sval.
	*/
   public static final int TT_TAG		= -10;

   // ??? or do we just return '>' =  62
   /**
	* scan() return value for end of an HTML tag/element.
	*/
   public static final int TT_TAG_CLOSE		= -11;
   
   /**
	* scan() return value for a named or numbered entity.
	* sval holds the lower-cased entity text without the leading ampersand.
	*/
   public static final int TT_ENTITY		= -21;
   
   /**
	* scan() return value for the name of an HTML/XML attribute (lower-cased in sval).
	*/
   public static final int TT_ATTR		= -22;
   
   /**
	* scan() return value for the value of an HTML/XML attribute that was specified without quotes.
	*/
   public static final int TT_UNQUOTED_VAL	= -23;
   
   /**
	* scan() return value for the value of an HTML/XML attribute that was specified inside quotes.
	*/
   public static final int TT_QUOTED_VAL	= -24;

   /**
	* Try a 64k buffer.
	* Not referenced by any code in this class.
	*/
   static final int	READER_BUFFER_SIZE	= 1024 * 64;

   /**
    * Creates a scanner that reads characters from the given byte stream.
    * The bytes are decoded using the charset name "unicode"
    * (NOTE(review): understood to be a JDK alias for UTF-16 -- confirm).
    * <p>
    * If that charset is not available the problem is logged and the stream is
    * decoded with the platform default charset instead, so that
    * {@link #scan()} never runs into a null reader.
    *
    * @param stream	the raw HTML input; it is wrapped, not copied.
    */
   public Scan(InputStream stream)
   {
      BufferedReader reader;
      try
      {
         reader = new BufferedReader(new InputStreamReader(stream, "unicode"));
      }
      catch (UnsupportedEncodingException e)
      {
         error("uh oh! unsupported enconding. this means dnd is broken :(");
         e.printStackTrace();
         // Best effort: keep the scanner usable rather than leaving the reader
         // null, which would surface later as a NullPointerException in scan().
         reader = new BufferedReader(new InputStreamReader(stream));
      }
      bufferedReader = reader;
   }
   
   /** Character pushed back by the previous scan() call; meaningful only while usePrev is true. */
   char	prevChar;

   /** When true, the next scan() call re-processes prevChar before reading from the stream. */
   boolean usePrev = false;

   /**
    * Scan the input for the next token.
    * <p>
    * Characters below 0x20 other than TAB, CR, and LF are skipped wherever they
    * occur; they never interrupt or discard the token being built.
    * <p>
    * How a character is interpreted depends on {@link #mode}. Outside a tag the
    * scanner produces whitespace, words, entities and tag names; a word is also
    * ended right after a punctuation character such as '/', '-', '.' or '@'.
    * Inside a tag it produces attribute names, unquoted and quoted values and
    * the tag close. An HTML comment ("&lt;!--" ... "--&gt;") is consumed whole
    * and reported as TT_COMMENT.
    * <p>
    * At end of input a word or entity that is still being accumulated is
    * returned first; TT_EOF is then reported by the following call.
    *
    * @return	a TT_* code that indicates the kind of token that was found.
    * 			The text of the token, when it has one, is left in sval; tag,
    * 			entity and attribute names are lower-cased. sval is set to null
    * 			for every other token kind, except that it is left unchanged
    * 			when TT_COMMENT is returned.
    * @throws IOException	if reading from the underlying reader fails.
    */
   public int scan()
      throws IOException
   {
      int val           = TT_UNKNOWN;
      // Number of significant characters consumed by this call; only consulted
      // to cap the length of an entity name in OUTSIDE_TAG_MODE.
      int count         = 0;
      boolean todo      = true;
      boolean reset     = false;
      // Set after a punctuation character was appended to a word: the word
      // ends with the very next character.
      boolean breakTerm = false;

      // sval is only reassigned after the loop, so this is loop-invariant.
      // The null check keeps COMMENT_MODE from failing when no token text is
      // available.
      // NOTE(review): while this is true every '>' ends the comment, not only
      // one that directly follows the dashes -- kept as it was, confirm intent.
      final boolean svalEndsWithDashes =
         (mode == COMMENT_MODE) && (sval != null) && sval.endsWith("--");

      StringTools.clear(buffer);

      char c     = (char) -1;
      int dashes = 0;

      while (todo)
      {
         if (usePrev)
         {
            c       = prevChar;
            usePrev = false;
         }
         else
         {
            int ic = bufferedReader.read();
            if (ic == -1)
            {
               // End of input. A pending word or entity is emitted now instead
               // of being dropped; the next call reads -1 again and reports
               // TT_EOF. Anything else (e.g. an unfinished tag) is abandoned.
               if ((val != TT_WORD) && (val != TT_ENTITY))
                  val = TT_EOF;
               break;
            }
            c = (char) ic;
         }

         // Skip other control characters: the XML spec doesnt allow them and
         // we wont do anything useful with them. Skipping in place (rather than
         // restarting the scan) keeps the partially built token and avoids
         // unbounded recursion on long runs of such characters.
         if ((c < 0x20) && (c != TAB) && (c != CR) && (c != LF))
            continue;

         count++;
         switch (mode)
         {
         case OUTSIDE_TAG_MODE:
            // whitespace is significant
            // entity is significant
            // looking for  whitespace, entity, word, tag
            // tag will cause transition -> INSIDE_TAG_MODE
            if ((c <= SPACE) && whitespaceChars[c])
            {
               if (breakTerm)  // word ended by a special character; swallow this whitespace
               {
                  todo = false;
                  break;
               }
               switch (val)
               {
               case TT_UNKNOWN:
                  val   = TT_WHITESPACE;
                  break;
               case TT_WHITESPACE:
                  break;
               default:          // could end a tag or a regular word
                  reset = true;
                  todo  = false;
               }
            }
            else                 // process non-whitespace characters
            {
               if (breakTerm)  // word ended by a special character
               {
                  todo  = false;
                  reset = true;  // this character starts the next token
                  break;
               }
               switch (val)
               {
               case TT_UNKNOWN:
                  switch (c)
                  {
                  case '<':
                     val = TT_TAG;
                     break;
                  case '&':
                     val = TT_ENTITY;
                     break;
                  default:
                     buffer.append(c);
                     val = TT_WORD;
                  }
                  break;
               case TT_TAG:
                  if (buffer.length() == 0)
                  {
                     if (!(Character.isLetter(c) || (c == '/') || (c == '!')))
                     {
                        // not a real tag
                        buffer.append('<');
                        buffer.append(c);
                        val = TT_WORD;
                     }
                     else
                     {
                        buffer.append(c);
                     }
                  }
                  else if ((buffer.length() == 2) && (buffer.charAt(0) == '!') &&
                           (buffer.charAt(1) == '-') && (c == '-'))
                  {
                     // "<!--" seen: swallow the whole comment
                     return processComment();
                  }
                  else if (c == '>')
                  {
                     todo  = false;
                     reset = true;
                  }
                  else
                  {
                     buffer.append(c);
                  }
                  break;
               case TT_WHITESPACE:     // pushBack non whitespace & emit w no sval
                  reset = true;
                  todo  = false;
                  break;
               case TT_ENTITY:
                  switch (c)
                  {
                  case '&':      // funny ways to end an entity
                  case '<':
                  case '>':
                     reset = true;
                     // drop through!
                  case ';':
                     todo  = false;
                     break;
                  default:
                     // let Parser lookup find its just a word starting w &
                     if (count >= 7)
                        todo = false;
                     buffer.append(c);
                  }
                  break;
               default:          // TT_WORD: already processing some type of word
                  switch (c)
                  {
                  case '<':
                  case '&':
                     todo  = false;
                     reset = true;
                     break;

                  // Parsing the terms such as abc/edf to 'abc/' 'edf'
                  // This should display without "space after" on
                  // by eunyee
                  case '/': // parsing for the url terms such as http://www.abc.com or abc/def
                  case '-': // parsing for the terms like abc-edf
                  case ':':
                  case ',':
                  case '.':
                  case '(':
                  case ')':
                  case '[':
                  case ']':
                  case '{':
                  case '}':
                  case '|':
                  case '\\':
                  case '\'':
                  case '+':
                  case '%':
                  case '^':
                  case '#':
                  case '*':
                  case '$':
                  case '?':
                  case '!':
                  case '~':
                  case '_':
                  case '=':
                  case '@': //email
                     buffer.append(c);
                     breakTerm = true;
                     break;

                  default:
                     buffer.append(c);
                  }
               }
            }
            break;
         case INSIDE_TAG_MODE:
            // whitespace signifies end of an attr
            // whitespace cannot be returned as val
            // looking for end tag, unquoted_attr_val, start of val
            // looking for words - attrs
            // space and equals   =  transitions
            switch (c)
            {
            case ' ':
            case '\n':
            case '\r':
            case '\t':
               if (buffer.length() > 0)
               {
                  todo = false;
                  val  = TT_ATTR;
               }
               // else ignore leading whitespace and continue!
               break;
            case '>':
               if (buffer.length() > 0) // an attr is waiting to be emitted
               {
                  reset = true;  // pushback; the '>' is handled by the next call
                  val   = TT_ATTR;
               }
               else
               {
                  val   = TT_TAG_CLOSE;
                  mode  = OUTSIDE_TAG_MODE;
               }
               todo = false;
               break;
            case '=':
               if (buffer.length() > 0) // an attr is waiting to be emitted
               {
                  reset = true;  // pushback; the '=' is handled by the next call
                  val   = TT_ATTR;
                  todo  = false;
               }
               else
                  mode  = VAL_MODE;
               break;
            default:
               if (c > 0x20)
               {
                  // toss control characters
                  buffer.append(c);
               }
            }
            break;
         case VAL_MODE:
            // we've seen an equals sign. looking for words
            // start quote, space   =  transitions
            switch (c)
            {
            case ' ':
            case '\n':
            case '\r':
            case '\t':
               if (buffer.length() > 0)
               {
                  val  = TT_UNQUOTED_VAL;
                  mode = INSIDE_TAG_MODE;
                  todo = false;
               }
               // else ignore leading whitespace and continue!
               break;
            case '>':
               if (buffer.length() > 0) // a value is waiting to be emitted
               {
                  val   = TT_UNQUOTED_VAL;
                  mode  = INSIDE_TAG_MODE;
                  reset = true;  // pushback; the '>' is handled by the next call
               }
               else
               {
                  val   = TT_TAG_CLOSE;
                  mode  = OUTSIDE_TAG_MODE;
               }
               todo = false;
               break;
            case '\'':
            case '"':
               if (buffer.length() == 0)
               {
                  quoteChar = c;
                  mode      = QUOTED_VAL_MODE;
                  break;
               }
               // else fall through (a quote in the middle of an unquoted value)
            default:
               buffer.append(c);
            }
            break;
         case QUOTED_VAL_MODE:
            // end quote   = only transition
            // whitespace is not significant
            // end tag ignored
            switch (c)
            {
            case ' ':
            case '\n': // still delete leading whitespace
            case '\r':
            case '\t':
               if (buffer.length() > 0)
                  buffer.append(" "); // turn into plain space
               break;
            default:
               if (c == quoteChar)
               {
                  todo = false;
                  mode = INSIDE_TAG_MODE;
                  val  = TT_QUOTED_VAL;
               }
               else
                  buffer.append(c);
               break;
            }
            // Must not fall into COMMENT_MODE: otherwise a value containing
            // "-->" would be cut short and reported as TT_END_COMMENT.
            break;
         case COMMENT_MODE:
            // whitespace is not significant
            // end comment = only transition
            if (svalEndsWithDashes)
            {
               dashes = 2;
            }
            switch (c)
            {
            case '-':
               dashes++;
               break;
            case '>':
               if (dashes >= 2)
               {
                  val  = TT_END_COMMENT;
                  todo = false;
                  mode = OUTSIDE_TAG_MODE;
               }
               break;
            default:
               dashes = 0;
            }
            break;
         case SCRIPT_COMMENT_MODE:
            // everything up to the end of the line is skipped
            if (c == '\n')
            {
               mode = OUTSIDE_TAG_MODE;
               todo = false;
            }
            break;
         default:
            debug("scan() programmer error. Unknown mode="+mode);
            todo = false;
         }
      }

      if (reset)
      {
         // the character that ended this token belongs to the next one
         prevChar = c;
         usePrev  = true;
      }

      switch (val)
      {
      case TT_TAG:
         mode = INSIDE_TAG_MODE;
         // fall-through
      case TT_ENTITY:
      case TT_ATTR:
         StringTools.toLowerCase(buffer);
         sval = StringTools.toString(buffer);
         break;
      case TT_WORD:
      case TT_UNQUOTED_VAL:
      case TT_QUOTED_VAL:
         sval = StringTools.toString(buffer);
         break;
      default:
         // TT_END_COMMENT, TT_WHITESPACE, TT_TAG_CLOSE, TT_EOF, TT_UNKNOWN
         sval = null;
      }
      return val;
   }
      /*
	  public static void main(String args[])
	  {
      String arg	= args[0];
      InputStream stream	= null;
      if (arg.startsWith("http://"))
      {
	  URL url	= HTMLPage.newURL(null, arg, "", true, true);
	  try
	  {
	  stream
	  = url.openStream();
	  } catch (Exception e) { System.out.println(e); }
	  
      }
      else
	  try
	  {
	  stream		= new FileInputStream(Files.newFile(arg));
	  } catch (Exception e) { System.out.println(e); }
      Scan scanner		= new Scan(new BufferedInputStream(stream));
      int	val;
      try
      {
	  do 
	  {
	  val	= scanner.scan();
	  System.out.println("token = " + val + "\t" + scanner.sval
	  + "\tmode=" + scanner.mode);
	  if ((val == TT_TAG) && scanner.sval.startsWith("!--"))
	  {
	  System.out.println("\nCOMMENT_MODE");
	  scanner.mode	= COMMENT_MODE;
	  }
	  } while (val != TT_EOF);
      } catch (Exception e)
	  {
	  System.out.println(e);
	  e.printStackTrace();
	  }
	  
	  }
	  */
   /**
    * Reads and discards the body of an HTML comment, up to and including the
    * closing "--&gt;". Called by scan() right after it has consumed "&lt;!--".
    * <p>
    * The comment ends at the first '&gt;' that directly follows two or more
    * dashes, so terminators written with extra dashes (such as "---&gt;") are
    * recognized as well.
    *
    * @return	TT_COMMENT once the end of the comment has been consumed, or
    * 			TT_EOF if the input ends before the comment is closed.
    * @throws IOException	if reading from the underlying reader fails.
    */
   private int processComment()
      throws IOException
   {
      // consecutive '-' characters seen immediately before the current one, capped at 2
      int dashes = 0;
      while (true)
      {
         int ic = bufferedReader.read();
         if (ic == -1)
         {
            return TT_EOF;
         }

         char c = (char) ic;
         if (c == '-')
         {
            // a third (or later) dash must not forget the two already seen
            if (dashes < 2)
               dashes++;
         }
         else if ((c == '>') && (dashes == 2))
         {
            return TT_COMMENT;
         }
         else
         {
            dashes = 0;
         }
      }
   }
   
   /**
    * Gives access to the reader this scanner pulls its characters from.
    *
    * @return	the reader held in the bufferedReader field; the same instance
    * 			that scan() reads from, not a copy.
    */
   public BufferedReader bufferedReader()
   {
      return this.bufferedReader;
   }
}