// HTMLFilterReader.java example
//
// Explorer
// RSSOwl-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.rssowl.core.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

/**
 * A Reader that wraps another reader and attempts to strip out HTML constructs.
 * Entities found in the text are replaced if possible.
 * <p>
 * This class is part of Apache Solr (named HTMLStripReader) and is versioned:
 * 773920 (2009-12-05)
 * </p>
 * As opposed to {@link HTMLStripReader}, this extended version allows to pass
 * in a set of HTML tags that are to be stripped from the content; tags that
 * are not part of that set are passed through untouched. Comments,
 * <code>&lt;!...&gt;</code> declarations and processing instructions are
 * stripped regardless of that set.
 * <p>
 * Every stripped construct is replaced by as many space characters as it
 * occupied in the input, and a replaced entity is padded with spaces up to the
 * length of its source text, so the output keeps the length of the input.
 * </p>
 * <p>
 * Instances keep parsing state in plain fields without any synchronization.
 * </p>
 *
 * @author bpasero did some modifications to support the filtering.
 */
public class HTMLFilterReader extends Reader {

  /* Some constants being used */
  private static final int MISMATCH = -2;
  private static final int MATCH = -3;
  private static final int DEFAULT_READ_AHEAD = 8192;

  /* Marker for "no low surrogate waiting to be returned" */
  private static final int NO_PENDING_CHAR = -1;

  /* Common Entities (name without '&' and ';' -> replacement character) */
  private static final Map<String, Character> fgEntityTable;
  private final boolean fReplaceEntities;

  private final Reader fIn;
  private final int fReadAheadLimit = DEFAULT_READ_AHEAD;

  /* Look-ahead is cut off a little before the mark limit of the wrapped reader */
  private final int fSafeReadAheadLimit = fReadAheadLimit - 3;

  /* Number of padding spaces read() still has to return */
  private int fNumWhitespace = 0;

  /* Number of read attempts made against the wrapped reader */
  private int fNumRead = 0;

  /* Value of fNumRead when the wrapped reader was marked the last time */
  private int fLastMark;

  /* Lower-case names of the tags to strip, null to strip every tag */
  private final Set<String> fEscapedTags;

  /* Second half of a surrogate pair produced by a numeric entity */
  private int fPendingLowSurrogate = NO_PENDING_CHAR;

  /* pushback buffer (last pushed character is returned first) */
  private final StringBuilder fPushed = new StringBuilder();

  /* temporary buffer */
  private final StringBuilder fSb = new StringBuilder();

  static {
    fgEntityTable = new HashMap<String, Character>();

    /* Entity Names */
    final String[] entityName = { "zwnj", "aring", "gt", "yen", "ograve", "Chi", "delta", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$
        "rang", "sup", "trade", "Ntilde", "xi", "upsih", "nbsp", "Atilde", "radic", "otimes", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$
        "aelig", "oelig", "equiv", "ni", "infin", "Psi", "auml", "cup", "Epsilon", "otilde", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$
        "lt", "Icirc", "Eacute", "Lambda", "sbquo", "Prime", "prime", "psi", "Kappa", "rsaquo", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$
        "Tau", "uacute", "ocirc", "lrm", "zwj", "cedil", "Alpha", "not", "amp", "AElig", "oslash", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "acute", "lceil", "alefsym", "laquo", "shy", "loz", "ge", "Igrave", "nu", "Ograve", "lsaquo", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "sube", "euro", "rarr", "sdot", "rdquo", "Yacute", "lfloor", "lArr", "Auml", "Dagger", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$
        "brvbar", "Otilde", "szlig", "clubs", "diams", "agrave", "Ocirc", "Iota", "Theta", "Pi", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$
        "zeta", "Scaron", "frac14", "egrave", "sub", "iexcl", "frac12", "ordf", "sum", "prop", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$
        "Uuml", "ntilde", "atilde", "asymp", "uml", "prod", "nsub", "reg", "rArr", "Oslash", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$
        "emsp", "THORN", "yuml", "aacute", "Mu", "hArr", "le", "thinsp", "dArr", "ecirc", "bdquo",  //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$//$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "Sigma", "Aring", "tilde", "nabla", "mdash", "uarr", "times", "Ugrave", "Eta", "Agrave", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$
        "chi", "real", "circ", "eth", "rceil", "iuml", "gamma", "lambda", "harr", "Egrave", "frac34", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "dagger", "divide", "Ouml", "image", "ndash", "hellip", "igrave", "Yuml", "ang", "alpha", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$
        "frasl", "ETH", "lowast", "Nu", "plusmn", "bull", "sup1", "sup2", "sup3", "Aacute", "cent", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "oline", "Beta", "perp", "Delta", "there4", "pi", "iota", "empty", "euml", "notin", "iacute", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "para", "epsilon", "weierp", "OElig", "uuml", "larr", "icirc", "Upsilon", "omicron", "upsilon", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$
        "copy", "Iuml", "Oacute", "Xi", "kappa", "ccedil", "Ucirc", "cap", "mu", "scaron", "lsquo", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "isin", "Zeta", "minus", "deg", "and", "tau", "pound", "curren", "int", "ucirc", "rfloor", "ensp", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$ //$NON-NLS-12$
        "crarr", "ugrave", "exist", "cong", "theta", "oplus", "permil", "Acirc", "piv", "Euml", "Phi",  //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$//$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "Iacute", "quot", "Uacute", "Omicron", "ne", "iquest", "eta", "rsquo", "yacute", "Rho", "darr", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "Ecirc", "Omega", "acirc", "sim", "phi", "sigmaf", "macr", "thetasym", "Ccedil", "ordm", "uArr", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "forall", "beta", "fnof", "rho", "micro", "eacute", "omega", "middot", "Gamma", "rlm", "lang", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "spades", "supe", "thorn", "ouml", "or", "raquo", "part", "sect", "ldquo", "hearts", "sigma", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "oacute", "apos" }; //$NON-NLS-1$ //$NON-NLS-2$

    /* Entity Values (same order as the names above) */
    final char[] entityVal = { 8204, 229, 62, 165, 242, 935, 948, 9002, 8835, 8482, 209, 958, 978, 160,
        195, 8730, 8855, 230, 339, 8801, 8715, 8734, 936, 228, 8746, 917, 245, 60, 206, 201, 923, 8218,
        8243, 8242, 968, 922, 8250, 932, 250, 244, 8206, 8205, 184, 913, 172, 38, 198, 248, 180, 8968,
        8501, 171, 173, 9674, 8805, 204, 957, 210, 8249, 8838, 8364, 8594, 8901, 8221, 221, 8970, 8656,
        196, 8225, 166, 213, 223, 9827, 9830, 224, 212, 921, 920, 928, 950, 352, 188, 232, 8834, 161,
        189, 170, 8721, 8733, 220, 241, 227, 8776, 168, 8719, 8836, 174, 8658, 216, 8195, 222, 255, 225,
        924, 8660, 8804, 8201, 8659, 234, 8222, 931, 197, 732, 8711, 8212, 8593, 215, 217, 919, 192, 967,
        8476, 710, 240, 8969, 239, 947, 955, 8596, 200, 190, 8224, 247, 214, 8465, 8211, 8230, 236, 376,
        8736, 945, 8260, 208, 8727, 925, 177, 8226, 185, 178, 179, 193, 162, 8254, 914, 8869, 916, 8756,
        960, 953, 8709, 235, 8713, 237, 182, 949, 8472, 338, 252, 8592, 238, 933, 959, 965, 169, 207, 211,
        926, 954, 231, 219, 8745, 956, 353, 8216, 8712, 918, 8722, 176, 8743, 964, 163, 164, 8747, 251,
        8971, 8194, 8629, 249, 8707, 8773, 952, 8853, 8240, 194, 982, 203, 934, 205, 34, 218, 927, 8800,
        191, 951, 8217, 253, 929, 8595, 202, 937, 226, 8764, 966, 962, 175, 977, 199, 186, 8657, 8704, 946,
        402, 961, 181, 233, 969, 183, 915, 8207, 9001, 9824, 8839, 254, 246, 8744, 187, 8706, 167, 8220,
        9829, 963, 243, 39 };

    /* Fill Entities */
    for (int i = 0; i < entityName.length; i++)
      fgEntityTable.put(entityName[i], Character.valueOf(entityVal[i]));

    /* Special-case nbsp to a simple space instead of 0xa0 */
    fgEntityTable.put("nbsp", Character.valueOf(' ')); //$NON-NLS-1$
  }

  /**
   * Creates a filtering reader on top of the given source.
   *
   * @param source the reader to take the HTML from. It is wrapped into a
   * {@link BufferedReader} if it does not support <code>mark()</code>.
   * @param escapedTags the names of the tags to strip. Tag names found in the
   * input are converted to lower case before being looked up, so the names in
   * this set have to be lower case. If <code>null</code>, every tag is
   * stripped.
   * @param replaceEntities if <code>true</code>, known named entities (e.g.
   * <code>&amp;amp;</code>) are replaced by their character. Numeric
   * character references (e.g. <code>&amp;#65;</code>) are decoded
   * independent of this flag.
   */
  public HTMLFilterReader(Reader source, Set<String> escapedTags, boolean replaceEntities) {
    super();
    fIn = source.markSupported() ? source : new BufferedReader(source);
    fEscapedTags = escapedTags;
    fReplaceEntities = replaceEntities;
  }

  /* Returns the next raw character: pushed back ones first, then the source */
  private int next() throws IOException {
    int len = fPushed.length();
    if (len > 0) {
      int ch = fPushed.charAt(len - 1);
      fPushed.setLength(len - 1);
      return ch;
    }
    fNumRead++;
    return fIn.read();
  }

  /* Returns the next raw character that is not whitespace */
  private int nextSkipWS() throws IOException {
    int ch = next();
    while (isSpace(ch))
      ch = next();
    return ch;
  }

  /* Returns the next raw character without consuming it */
  private int peek() throws IOException {
    int len = fPushed.length();
    if (len > 0)
      return fPushed.charAt(len - 1);

    // count the character here: the following next() takes it from the
    // pushback buffer and does not count it again
    fNumRead++;
    int ch = fIn.read();
    push(ch);
    return ch;
  }

  /* Pushes a character back so that the following next() returns it */
  private void push(int ch) {
    // end of stream can not be stored as a char; the source reports it again
    if (ch >= 0)
      fPushed.append((char) ch);
  }

  private boolean isSpace(int ch) {
    return ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t';
  }

  private boolean isHex(int ch) {
    return isDigit(ch) || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f');
  }

  private boolean isAlpha(int ch) {
    return ch >= 'a' && ch <= 'z' || ch >= 'A' && ch <= 'Z';
  }

  private boolean isDigit(int ch) {
    return ch >= '0' && ch <= '9';
  }

  /*
   * From HTML 4.0
   * [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
   * [5] Name ::= (Letter | '_' | ':') (NameChar)*
   * [6] Names ::= Name (#x20 Name)*
   * [7] Nmtoken ::= (NameChar)+
   * [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
   */

  // should I include all id chars allowable by HTML/XML here?
  // including accented chars, ':', etc?
  private boolean isIdChar(int ch) {
    // Character.isUnicodeIdentifierPart() doesn't include '-', so the
    // punctuation allowed in names is listed explicitly.
    return isAlpha(ch) || isDigit(ch) || ch == '.' || ch == '-' || ch == '_' || ch == ':' || Character.isLetter(ch);
  }

  private boolean isFirstIdChar(int ch) {
    return Character.isUnicodeIdentifierStart(ch);
  }

  /* Marks the source so that restoreState() can return to this position */
  private void saveState() throws IOException {
    fLastMark = fNumRead;
    fIn.mark(fReadAheadLimit);
  }

  /* Rewinds the source to the last saveState() and drops all pushed chars */
  private void restoreState() throws IOException {
    fIn.reset();
    fPushed.setLength(0);
  }

  /*
   * Decodes a numeric character reference, "&#" has already been read.
   * Returns the first char to output or MISMATCH. On a match fNumWhitespace
   * is set to the padding needed to keep the length of the input; for a code
   * point outside of the BMP the low surrogate is queued for the next read().
   */
  private int readNumericEntity() throws IOException {
    int ch = next();
    int base;
    int prefixLength;
    fSb.setLength(0);

    if (isDigit(ch)) {
      // decimal character entity, prefix is "&#"
      base = 10;
      prefixLength = 2;
      fSb.append((char) ch);
    } else if (ch == 'x' || ch == 'X') {
      // hex character entity, prefix is "&#x"
      base = 16;
      prefixLength = 3;
    } else {
      return MISMATCH;
    }

    for (int i = 0; i < 10; i++) {
      ch = next();
      boolean isDigitOfBase = (base == 10) ? isDigit(ch) : isHex(ch);
      if (!isDigitOfBase)
        break;
      fSb.append((char) ch);
    }

    // In older HTML, an entity may not have always been terminated
    // with a semicolon.  We'll also treat EOF or whitespace as terminating
    // the entity.
    boolean semicolon = (ch == ';');
    boolean whitespace = isSpace(ch);
    if (!semicolon && !whitespace && ch != -1)
      return MISMATCH;

    // Parse before touching any state: a failure must leave no padding behind
    // because the caller rolls the stream back to right after the '&'.
    int codePoint;
    try {
      codePoint = Integer.parseInt(fSb.toString(), base);
    } catch (NumberFormatException e) {
      return MISMATCH; // no digits at all or value too large
    }
    if (codePoint > Character.MAX_CODE_POINT)
      return MISMATCH;

    // if whitespace terminated the entity, we need to return
    // that whitespace on the next call to read().
    if (whitespace)
      push(ch);

    // chars of the input that are replaced: prefix, digits and the ';' if any
    int consumed = prefixLength + fSb.length() + (semicolon ? 1 : 0);

    if (codePoint >= Character.MIN_SUPPLEMENTARY_CODE_POINT) {
      // read() may only return a single char, so hand out a surrogate pair
      char[] pair = Character.toChars(codePoint);
      fPendingLowSurrogate = pair[1];
      fNumWhitespace = consumed - 2;
      return pair[0];
    }

    fNumWhitespace = consumed - 1; // take away 1 for the char we do output
    return codePoint;
  }

  /*
   * Decodes an entity, "&" has already been read. Returns the replacement
   * char or MISMATCH.
   */
  private int readEntity() throws IOException {
    int ch = next();
    if (ch == '#')
      return readNumericEntity();

    // read an entity reference

    // for an entity reference, require the ';' for safety.
    // otherwise we may try and convert part of some company
    // names to an entity.  "Alpha&Beta Corp" for instance.
    //
    // Perhaps I should special case some of the
    // more common ones like & to make the ';' optional...

    fSb.setLength(0);
    fSb.append((char) ch);

    // digits are part of some names as well: frac12, sup2, there4, ...
    for (int i = 0; i < fSafeReadAheadLimit; i++) {
      ch = next();
      if (!Character.isLetterOrDigit(ch))
        break;
      fSb.append((char) ch);
    }

    if (ch == ';' && fReplaceEntities) {
      String entity = fSb.toString();
      Character entityChar = fgEntityTable.get(entity);
      if (entityChar != null) {
        // '&' + name + ';' minus the one char being returned
        fNumWhitespace = entity.length() + 1;
        return entityChar.charValue();
      }
    }

    return MISMATCH;
  }

  /*
   * valid comments according to HTML specs
   *   <!-- Hello -->
   *   <!-- Hello -- -- Hello-->
   *   <!---->
   *   <!------ Hello -->
   *   <!>
   *   <!------> Hello -->
   * #comments inside of an entity decl:
   *   <!ENTITY amp CDATA "&" -- ampersand, U+0026 ISOnum -->
   * Turns out, IE & mozilla don't parse comments correctly. Since this is
   * meant to be a practical stripper, I'll just try and duplicate what the
   * browsers do.
   *   <!-- (stuff_including_markup)* -->
   *   <!FOO (stuff, not including markup) >
   *   <! (stuff, not including markup)* >
   */

  /* Reads a comment or declaration, "<!" has already been read */
  private int readBang(boolean inScript) throws IOException {
    if (readComment(inScript) == MATCH)
      return MATCH;

    if ((fNumRead - fLastMark) < fSafeReadAheadLimit || peek() == '>') {

      int ch = next();
      if (ch == '>')
        return MATCH;

      // if it starts with <! and isn't a comment, simply read until ">".
      // since we did readComment already, it may be the case that we are
      // already deep into the read ahead buffer so, we may need to abort
      // sooner
      while ((fNumRead - fLastMark) < fSafeReadAheadLimit) {
        ch = next();
        if (ch == '>')
          return MATCH;
        if (ch < 0)
          return MISMATCH;
      }
    }
    return MISMATCH;
  }

  // tries to read comments the way browsers do, not
  // strictly by the standards.
  //
  // GRRRR.  it turns out that in the wild, a <script> can have a HTML comment
  // that contains a script that contains a quoted comment.
  // <script><!-- document.write("<!--embedded comment-->") --></script>
  //
  private int readComment(boolean inScript) throws IOException {
    // at this point "<!" has  been read
    int ch = next();
    if (ch != '-') {
      // not a comment
      push(ch);
      return MISMATCH;
    }

    ch = next();
    if (ch != '-') {
      // not a comment
      push(ch);
      push('-');
      return MISMATCH;
    }

    /* two extra calls to next() here, so make sure we don't read past our mark */
    while ((fNumRead - fLastMark) < fSafeReadAheadLimit - 3) {
      ch = next();
      if (ch < 0)
        return MISMATCH;

      if (ch == '-') {
        ch = next();
        if (ch < 0)
          return MISMATCH;
        if (ch != '-') {
          push(ch);
          continue;
        }

        ch = next();
        if (ch < 0)
          return MISMATCH;
        if (ch != '>') {
          // keep "-" + ch so that "--->" is still seen as the end
          push(ch);
          push('-');
          continue;
        }

        return MATCH;
      } else if ((ch == '\'' || ch == '"') && inScript) {
        push(ch);
        readScriptString();
        // if this wasn't a string, there's not much we can do
        // at this point without having a stack of stream states in
        // order to "undo" just the latest.
      } else if (ch == '<') {
        eatSSI();
      }
    }
    return MISMATCH;
  }

  /*
   * Reads a start tag, '<' has already been read. Returns MATCH if the tag
   * (including the content of a script or style element) is to be stripped
   * and MISMATCH if this is no tag or a tag that is to be kept.
   */
  private int readTag() throws IOException {
    int ch = next();
    if (!isAlpha(ch)) {
      push(ch);
      return MISMATCH;
    }

    fSb.setLength(0);
    fSb.append((char) ch);
    boolean selfClosing = false;
    while ((fNumRead - fLastMark) < fSafeReadAheadLimit) {
      ch = next();
      if (isIdChar(ch)) {
        fSb.append((char) ch);
      } else {
        // Hmmm, a tag can close with "/>" as well as "/ >"
        selfClosing = (ch == '/');
        break;
      }
    }

    // Tags that are not in the set of tags to strip are kept. This has to be
    // decided before looking at "/>", otherwise "<br/>" would be stripped
    // where "<br>" is kept. The lookup is done locale independent ("TITLE"
    // must not turn into "t\u0131tle" with a turkish default locale).
    String name = fSb.toString();
    if (fEscapedTags != null && !fEscapedTags.contains(name.toLowerCase(Locale.ENGLISH)))
      return MISMATCH;

    // read end tag '/>' or '/ >', etc
    if (selfClosing)
      return nextSkipWS() == '>' ? MATCH : MISMATCH;

    // After the tag id, there needs to be either whitespace or
    // '>'
    if (!(ch == '>' || isSpace(ch)))
      return MISMATCH;

    if (ch != '>') {
      // process attributes
      while ((fNumRead - fLastMark) < fSafeReadAheadLimit) {
        ch = next();
        if (isSpace(ch)) {
          continue;
        } else if (isFirstIdChar(ch)) {
          push(ch);
          int ret = readAttr2();
          if (ret == MISMATCH)
            return ret;
        } else if (ch == '/' || ch == '"') {
          // read end tag '/>' or '/ >', etc
          return nextSkipWS() == '>' ? MATCH : MISMATCH;
        } else if (ch == '>') {
          break;
        } else {
          return MISMATCH;
        }
      }
      if ((fNumRead - fLastMark) >= fSafeReadAheadLimit)
        return MISMATCH; // exit out if we exceeded the buffer
    }

    // We only get to this point after we have read the
    // entire tag.  Now let's see if it's a special tag.
    if (name.equalsIgnoreCase("script") || name.equalsIgnoreCase("style")) { //$NON-NLS-1$ //$NON-NLS-2$
      // The content of script and style elements is
      //  CDATA in HTML 4 but PCDATA in XHTML.

      /*
       * From HTML4: Although the STYLE and SCRIPT elements use CDATA for their
       * data model, for these elements, CDATA must be handled differently by
       * user agents. Markup and entities must be treated as raw text and passed
       * to the application as is. The first occurrence of the character
       * sequence "</" (end-tag open delimiter) is treated as terminating the
       * end of the element's content. In valid documents, this would be the end
       * tag for the element.
       */

      // discard everything until endtag is hit (except
      // if it occurs in a comment.

      // reset the stream mark to here, since we know that we successfully
      // matched a tag, and if we can't find the end tag, this is where we
      // will want to roll back to.
      saveState();
      fPushed.setLength(0);
      return findEndTag();
    }
    return MATCH;
  }

  // find an end tag, but beware of comments...
  // <script><!-- </script> -->foo</script>
  // beware markup in script strings: </script>...document.write("</script>")foo</script>
  // Do I need to worry about CDATA sections "<![CDATA["  ?
  int findEndTag() throws IOException {

    while ((fNumRead - fLastMark) < fSafeReadAheadLimit) {
      int ch = next();
      if (ch == '<') {
        ch = next();
        // skip looking for end-tag in comments
        if (ch == '!') {
          // Whether or not this was a comment, we can't get back to the
          // state we were at, so just continue from where we are.
          readBang(true);
          continue;
        }
        // did we match "</"
        if (ch != '/') {
          push(ch);
          continue;
        }
        if (readName(false) == MISMATCH)
          return MISMATCH;
        return nextSkipWS() == '>' ? MATCH : MISMATCH;
      } else if (ch == '\'' || ch == '"') {
        // read javascript string to avoid a false match.
        push(ch);
        // what to do about a non-match (non-terminated string?)
        // play it safe and index the rest of the data I guess...
        if (readScriptString() == MISMATCH)
          return MISMATCH;
      } else if (ch < 0) {
        return MISMATCH;
      }
    }
    return MISMATCH;
  }

  // read a string escaped by backslashes
  private int readScriptString() throws IOException {
    int quoteChar = next();
    if (quoteChar != '\'' && quoteChar != '"')
      return MISMATCH;

    while ((fNumRead - fLastMark) < fSafeReadAheadLimit) {
      int ch = next();
      if (ch == quoteChar) {
        return MATCH;
      } else if (ch == '\\') {
        next(); // skip the escaped char
      } else if (ch < 0) {
        return MISMATCH;
      } else if (ch == '<') {
        eatSSI();
      }
    }
    return MISMATCH;
  }

  /*
   * Reads the name of an end tag, "</" has already been read. The char that
   * follows the name is pushed back. If checkEscaped is set, the name has to
   * be one of the tags to strip for this to be a MATCH.
   */
  private int readName(boolean checkEscaped) throws IOException {
    // The raw stream is read here (next(), not read()): read() would start to
    // parse a nested '<' or '&' and thereby move the mark of the source.
    StringBuilder name = (checkEscaped && fEscapedTags != null) ? new StringBuilder() : null;

    int ch = next();
    if (!isFirstIdChar(ch))
      return MISMATCH;

    do {
      if (name != null)
        name.append((char) ch);
      ch = next();
    } while (isIdChar(ch));

    push(ch);

    if (name != null && !fEscapedTags.contains(name.toString().toLowerCase(Locale.ENGLISH)))
      return MISMATCH;

    return MATCH;
  }

  /*
   * [10] AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'"
   * need to also handle unquoted attributes, and attributes w/o values:
   * <td id=msviGlobalToolbar height="22" nowrap align=left>
   */

  // This reads attributes and attempts to handle any
  // embedded server side includes that would otherwise
  // mess up the quote handling.
  //  <a href="a/<!--#echo "path"-->">
  private int readAttr2() throws IOException {
    if ((fNumRead - fLastMark) >= fSafeReadAheadLimit)
      return MISMATCH;

    // attribute name, taken from the raw stream (see readName)
    int ch = next();
    if (!isFirstIdChar(ch))
      return MISMATCH;
    ch = next();
    while (isIdChar(ch) && ((fNumRead - fLastMark) < fSafeReadAheadLimit))
      ch = next();

    if (isSpace(ch))
      ch = nextSkipWS();

    // attributes may not have a value at all!
    if (ch != '=') {
      push(ch);
      return MATCH;
    }

    int quoteChar = nextSkipWS();

    if (quoteChar == '"' || quoteChar == '\'') {
      while ((fNumRead - fLastMark) < fSafeReadAheadLimit) {
        ch = next();
        if (ch < 0)
          return MISMATCH;
        else if (ch == '<')
          eatSSI();
        else if (ch == quoteChar)
          return MATCH;
      }
    } else {
      // unquoted attribute
      while ((fNumRead - fLastMark) < fSafeReadAheadLimit) {
        ch = next();
        if (ch < 0) {
          return MISMATCH;
        } else if (isSpace(ch) || ch == '>') {
          push(ch);
          return MATCH;
        } else if (ch == '<') {
          eatSSI();
        }
      }
    }
    return MISMATCH;
  }

  // skip past server side include
  private int eatSSI() throws IOException {
    // at this point, only a "<" was read.
    // on a mismatch, push back the last char so that if it was
    // a quote that closes the attribute, it will be re-read and matched.
    int ch = next();
    if (ch != '!') {
      push(ch);
      return MISMATCH;
    }
    ch = next();
    if (ch != '-') {
      push(ch);
      return MISMATCH;
    }
    ch = next();
    if (ch != '-') {
      push(ch);
      return MISMATCH;
    }
    ch = next();
    if (ch != '#') {
      push(ch);
      return MISMATCH;
    }

    // hand "--#" back so that readComment() sees the start of a comment
    push('#');
    push('-');
    push('-');
    return readComment(false);
  }

  /* Reads a processing instruction, "<?" has already been read */
  private int readProcessingInstruction() throws IOException {
    while ((fNumRead - fLastMark) < fSafeReadAheadLimit) {
      int ch = next();
      if (ch == '?' && peek() == '>') {
        next();
        return MATCH;
      } else if (ch == -1) {
        return MISMATCH;
      }
    }
    return MISMATCH;
  }

  /**
   * Reads a single character of the filtered content. A stripped construct is
   * returned as a run of spaces of the same length, a replaced entity as its
   * character followed by spaces.
   *
   * @return the character read or <code>-1</code> at the end of the stream.
   * @throws IOException if reading from the wrapped reader fails.
   * @see java.io.Reader#read()
   */
  @Override
  public int read() throws IOException {
    // Do we ever want to preserve CDATA sections?
    // where do we have to worry about them?
    // <![ CDATA [ unescaped markup ]]>

    // second half of a numeric entity outside of the BMP comes first
    if (fPendingLowSurrogate != NO_PENDING_CHAR) {
      int low = fPendingLowSurrogate;
      fPendingLowSurrogate = NO_PENDING_CHAR;
      return low;
    }

    if (fNumWhitespace > 0) {
      fNumWhitespace--;
      return ' ';
    }

    int lastNumRead = fNumRead;
    int ch = next();

    if (ch == '&') {
      saveState();
      int entity = readEntity();
      if (entity >= 0)
        return entity;

      // not an entity, so roll back the stream state and just return '&'
      restoreState();
      return '&';
    }

    if (ch == '<') {
      saveState();
      ch = next();
      int ret = MISMATCH;
      if (ch == '!') {
        ret = readBang(false);
      } else if (ch == '/') {
        ret = readName(true);
        if (ret == MATCH)
          ret = nextSkipWS() == '>' ? MATCH : MISMATCH;
      } else if (isAlpha(ch)) {
        push(ch);
        ret = readTag();
      } else if (ch == '?') {
        ret = readProcessingInstruction();
      }

      // matched something to be discarded: replace it by whitespace
      if (ret == MATCH) {
        // tack on the -1 since we are returning a space right now
        fNumWhitespace = (fNumRead - lastNumRead) - 1;
        return ' ';
      }

      // didn't match any HTML constructs, so roll back
      // the stream state and just return '<'
      restoreState();
      return '<';
    }

    return ch;
  }

  /**
   * Fills the given buffer by calling {@link #read()} up to <code>len</code>
   * times.
   *
   * @return the number of characters stored, <code>0</code> if
   * <code>len</code> is 0 or <code>-1</code> if the end of the stream was
   * reached before any character was stored.
   * @throws IOException if reading from the wrapped reader fails.
   * @see java.io.Reader#read(char[], int, int)
   */
  @Override
  public int read(char cbuf[], int off, int len) throws IOException {
    int count = 0;
    while (count < len) {
      int ch = read();
      if (ch == -1)
        break;
      cbuf[off + count] = (char) ch;
      count++;
    }

    if (count == 0 && len != 0)
      return -1;
    return count;
  }

  /**
   * Closes the wrapped reader.
   *
   * @throws IOException if closing the wrapped reader fails.
   * @see java.io.Reader#close()
   */
  @Override
  public void close() throws IOException {
    fIn.close();
  }
}