/* * Copyright 1996-2002 by Andruid Kerne. All rights reserved. * CONFIDENTIAL. Use is subject to license terms. */ package ecologylab.bigsemantics.html.old; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import ecologylab.generic.Debug; import ecologylab.generic.StringBuilderPool; import ecologylab.generic.StringTools; /** * Hand written HTML scanner; provides tokens to {@link Parser the Parser}. */ public class Scan extends Debug implements ecologylab.serialization.CharacterConstants { public String urlString; // just convenient for debug messages ////////////// class wide ////////////// BufferedReader bufferedReader; /** * A *very* efficient lookup table for whitespace characters. */ static boolean whitespaceChars[]; static final char SPACE = ' '; // 0x20 static { whitespaceChars = new boolean[SPACE + 1]; whitespaceChars['\n'] = true; // 0x0a whitespaceChars['\t'] = true; // 0x09 whitespaceChars['\f'] = true; // 0x0c form feed whitespaceChars['\r'] = true; // 0x0d whitespaceChars[' '] = true; // 0x20 } //+++++++++++++++++ Scanner State ++++++++++++++++++++// int mode; // boolean ignoreMode; // i added & removed this 2/05 -- andruid static public final int BUFFER_SIZE = 512; //StringBuffer buffer = new StringBuffer(BUFFER_SIZE); static StringBuilderPool stringBuffersPool = new StringBuilderPool(16, 16, BUFFER_SIZE); StringBuilder buffer = stringBuffersPool.nextBuffer(); /** * the actual token that was returned by scan(). */ public String sval; String pushBackBuffer; int pushBackVal; /** * char that delimited the current/last quoted value */ char quoteChar = '"'; /** * Looking for entities (triggered by &), tags (triggered by start * tag), words, whitespace. */ static final int OUTSIDE_TAG_MODE = 0; /** * Looking for quoted vals, unquoted attr_vals, =, tag close. */ static final int INSIDE_TAG_MODE = 1; /** * triggered by = inside a tag */ static final int VAL_MODE = 2; /** * Looking for quoted vals, unquoted attr_vals, =, tag close. */ static final int QUOTED_VAL_MODE = 3; /** * Looking for end comment. */ static final int COMMENT_MODE = 4; /** * looking for comments in the javascript */ static final int SCRIPT_COMMENT_MODE = 5; // from StreamTokenizer /** * scan() return value for end of file. */ public static final int TT_EOF = -1; // i think that 0ax = 10 decimal // public static final int TT_EOL = '\n'; // public static final int TT_NUMBER = -2; /** * scan() return value for a word of text. */ public static final int TT_WORD = -3; /** * scan() return value for a word of text. */ public static final int TT_UNKNOWN = -4; /** * scan() return value for whitespace between tokens. */ public static final int TT_WHITESPACE = -5; /** * scan() return value for the start of an HTML comment. */ public static final int TT_COMMENT = -8; /** * scan() return value for end of an HTML comment. */ public static final int TT_END_COMMENT = -9; /** * scan() return value for an HTML tag (an xml element). The actual tag is in sval. */ public static final int TT_TAG = -10; // ??? or do we just return '>' = 62 /** * scan() return value for end of an HTML tag/element. */ public static final int TT_TAG_CLOSE = -11; /** * scan() return value for a named or numbered entity. */ public static final int TT_ENTITY = -21; /** * scan() return value for the name of an HTML/XML attribute. */ public static final int TT_ATTR = -22; /** * scan() return value for the value of an HTML/XML attribute that was specified without quotes. */ public static final int TT_UNQUOTED_VAL = -23; /** * scan() return value for the value of an HTML/XML attribute that was specified inside quotes. */ public static final int TT_QUOTED_VAL = -24; /** * Try a 64k buffer. */ static final int READER_BUFFER_SIZE = 1024 * 64; public Scan(InputStream stream) { //bufferedStream = new BufferedInputStream(streamArg, BUFFER_SIZE) try { bufferedReader = new BufferedReader(new InputStreamReader(stream, "unicode")); } catch (UnsupportedEncodingException e) { // TODO Auto-generated catch block error("uh oh! unsupported enconding. this means dnd is broken :("); e.printStackTrace(); } } char prevChar; boolean usePrev = false; /** * Scan the input for a token. Ignores characters less than 0x20 other than TAB, CR, and LF. * @return a code that indicates the kind of token that was found. * the token itself is returned in the variable sval. */ public int scan() throws IOException { int val = TT_UNKNOWN; int count = ((mode == COMMENT_MODE) && sval.endsWith("--")) ? 2 : 0; boolean todo = true; boolean reset = false; boolean breakTerm = false; // !!!this is totally implementation dependent!!! // it forces the buffer to be copied, and resized to // buffer.setLength(buffer.length() + 1); // force copy StringTools.clear(buffer); char c = (char) -1; int dashes = 0; while (todo) { if (usePrev) { c = prevChar; usePrev = false; } else { int ic = bufferedReader.read(); if (ic == -1) { val = TT_EOF; break; } c = (char) ic; } if (c < 0x20) switch (c) { case TAB: case CR: case LF: break; default: // ignore other control characters because the XML spec doesnt allow them // (), and we wont do anything useful with them, anyway. return scan(); } count++; switch (mode) { case OUTSIDE_TAG_MODE: // whitespace is significant // entity is significant // looking for whitespace, entity, word, tag // tag will cause transition -> INSIDE_TAG_MODE if ((c <= SPACE) && whitespaceChars[c]) { if(breakTerm) // break the term with spacial charater { breakTerm = false; todo = false; break; } switch (val) { case TT_UNKNOWN: val = TT_WHITESPACE; break; case TT_WHITESPACE: break; default: // could end a tag or a regular word reset = true; todo = false; } } else // process non-whitespace characters { if(breakTerm) // break the term with spacial charater { breakTerm = false; todo = false; reset = true; // use the previous character break; } switch (val) { case TT_UNKNOWN: switch (c) { case '<': val = TT_TAG; break; case '&': val = TT_ENTITY; break; default: buffer.append(c); val = TT_WORD; } break; case TT_TAG: if (buffer.length() == 0) { if (!(Character.isLetter(c) || (c == '/') || (c == '!'))) { // not a real tag buffer.append('<'); buffer.append(c); val = TT_WORD; } else { buffer.append(c); } } else if ((buffer.length() == 2) && (buffer.charAt(0) == '!') && (buffer.charAt(1) == '-') && (c == '-')) { return processComment(); } else { if (c == '>') { todo = false; reset= true; } else { buffer.append(c); } } break; case TT_WHITESPACE: // pushBack non whitespace & emit w no sval reset = true; todo = false; break; case TT_ENTITY: switch (c) { case '&': // funny ways to end an entity case '<': case '>': reset = true; // drop through! case ';': todo = false; break; default: // let Parser lookup find its just a word starting w & if (count >= 7) todo = false; buffer.append(c); } break; // already processing some type of word default: // TT_WORD switch (c) { case '<': // if (prevWasLessThan) // { // buffer.append(c); // break; // } case '&': todo = false; reset = true; break; // Parsing the terms such as abc/edf to 'abc/' 'edf' // This should display without "space after" on // by eunyee case '/': // parsing for the url terms such as http://www.abc.com or abc/def case '-': // parsing for the terms like abc-edf case ':': case ',': case '.': case '(': case ')': case '[': case ']': case '{': case '}': case '|': case '\\': case '\'': case '+': case '%': case '^': case '#': case '*': case '$': case '?': case '!': case '~': case '_': case '=': case '@': //email buffer.append(c); breakTerm = true; break; default: buffer.append(c); } } } break; case INSIDE_TAG_MODE: // whitespace signifies end of an attr // whitespace cannot be returned as val // looking for end tag, unquoted_attr_val, start of val // looking for words - attrs // space and equals = transitions switch (c) { case ' ': case '\n': case '\r': case '\t': if (buffer.length() > 0) { todo = false; val = TT_ATTR; } // else ignore leading whitespace and continue! break; case '>': if (buffer.length() > 0) // an attr is waiting to be emitted { reset = true; // pushback & return to else just below val = TT_ATTR; } else { val = TT_TAG_CLOSE; mode = OUTSIDE_TAG_MODE; } todo = false; break; case '=': if (buffer.length() > 0) // an attr is waiting to be emitted { reset = true; // pushback & return to else just below val = TT_ATTR; todo = false; } else mode = VAL_MODE; break; default: if (c > 0x20) { // toss control characters buffer.append(c); } } break; case VAL_MODE: // we've seen an equals sign. looking for words // start quote, space = transitions switch (c) { case ' ': case '\n': case '\r': case '\t': if (buffer.length() > 0) { val = TT_UNQUOTED_VAL; mode = INSIDE_TAG_MODE; todo = false; } // else ignore leading whitespace and continue! break; case '>': if (buffer.length() > 0) // an attr is waiting to be emitted { val = TT_UNQUOTED_VAL; mode = INSIDE_TAG_MODE; reset = true; // pushback & return to else above } else { val = TT_TAG_CLOSE; mode = OUTSIDE_TAG_MODE; } todo = false; break; case '\'': case '"': if (buffer.length() == 0) { quoteChar = c; mode = QUOTED_VAL_MODE; break; } // else fall through (a rather bogus attr name was coded) default: buffer.append(c); } break; case QUOTED_VAL_MODE: // end quote = only transition // whitespace is not significant // end tag ignored switch (c) { case ' ': case '\n': // still delete leading whitespace case '\r': case '\t': if (buffer.length() > 0) buffer.append(" "); // turn into plain space break; default: if (c == quoteChar) { todo = false; mode = INSIDE_TAG_MODE; val = TT_QUOTED_VAL; } else buffer.append(c); break; } case COMMENT_MODE: // whitespace is not significant // end comment = only transition // System.out.println("COMMENT " + c +" w sval=" + sval); if (sval.endsWith("--")) { dashes = 2; } switch (c) { case '-': dashes++; break; case '>': if (dashes >= 2) { val = TT_END_COMMENT; todo = false; mode = OUTSIDE_TAG_MODE; } break; default: dashes = 0; } break; case SCRIPT_COMMENT_MODE: switch(c) { case '\n': mode = OUTSIDE_TAG_MODE; todo = false; break; default: break; } break; default: // Env.error("Scan.scan() programmer error. Unknown mode="+mode); debug("scan() programmer error. Unknown mode="+mode); todo = false; } } if (reset) { prevChar = c; usePrev = true; } // find whitespace // find tag start // if inside tag, find tag end or = or double quote switch (val) { case TT_TAG: mode = INSIDE_TAG_MODE; // fall-through case TT_ENTITY: case TT_ATTR: StringTools.toLowerCase(buffer); sval = StringTools.toString(buffer); break; case TT_WORD: case TT_UNQUOTED_VAL: case TT_QUOTED_VAL: sval = StringTools.toString(buffer); break; case TT_END_COMMENT: // System.out.println("END_COMMENT"); // case TT_WHITESPACE: default: sval = null; } return val; } /* public static void main(String args[]) { String arg = args[0]; InputStream stream = null; if (arg.startsWith("http://")) { URL url = HTMLPage.newURL(null, arg, "", true, true); try { stream = url.openStream(); } catch (Exception e) { System.out.println(e); } } else try { stream = new FileInputStream(Files.newFile(arg)); } catch (Exception e) { System.out.println(e); } Scan scanner = new Scan(new BufferedInputStream(stream)); int val; try { do { val = scanner.scan(); System.out.println("token = " + val + "\t" + scanner.sval + "\tmode=" + scanner.mode); if ((val == TT_TAG) && scanner.sval.startsWith("!--")) { System.out.println("\nCOMMENT_MODE"); scanner.mode = COMMENT_MODE; } } while (val != TT_EOF); } catch (Exception e) { System.out.println(e); e.printStackTrace(); } } */ private int processComment() throws IOException { boolean dash1 = false; boolean dash2 = false; //println("start COMMENT, ignoring: "); do { int ic = bufferedReader.read();; if (ic == -1) { return TT_EOF; } char c = (char) ic; //System.err.print(c); if (dash2) { if (c == '>') { //println("\nreturn TT_COMMENT"); return TT_COMMENT; } else { dash1= false; dash2= false; } } else if (dash1) { if (c == '-') { dash2= true; } else { dash1= false; } } else { if (c == '-') { dash1= true; } } } while (true); } public BufferedReader bufferedReader() { return bufferedReader; } }