HTMLStripReader.java example

Explorer
RSSOwl-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.rssowl.core.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;

/**
 * A Reader that wraps another reader and attempts to strip out HTML constructs
 * (tags, comments, declarations, processing instructions and the content of
 * <code>script</code> / <code>style</code> elements). Entities found in the
 * text are replaced if possible.
 * <p>
 * The implementation works speculatively: whenever a <code>&lt;</code> or
 * <code>&amp;</code> is seen, the wrapped reader is marked, the construct is
 * parsed, and if it turns out not to be markup the reader is reset and the
 * character is returned literally.
 * </p>
 * <p>
 * This class is part of Apache Solr and is versioned: 472574 (2006-11-08)
 * </p>
 */
public class HTMLStripReader extends Reader {

  /* Internal result code: the construct being parsed was not recognised */
  private static final int MISMATCH = -2;

  /* Internal result code: the construct was recognised and consumed */
  private static final int MATCH = -3;

  /*
   * Read-ahead limit handed to Reader.mark(). NOTE(review): constructs longer
   * than this may not be restorable on readers that enforce the limit.
   */
  private static final int READAHEAD = 4096;

  /*
   * Upper bound on the number of characters scanned for a named entity. The
   * longest name in the entity table has 8 characters, so scanning further can
   * never produce a match and would only risk exceeding the mark limit.
   */
  private static final int MAX_ENTITY_NAME_LENGTH = 32;

  /* Common Entities */
  private static final Map<String, Character> fgEntityTable;
  private final boolean fReplaceEntities;

  /* Wrapped Reader */
  private final Reader fIn;

  /* pushback buffer (used as a stack: the last pushed char is read first) */
  private final StringBuilder fPushed = new StringBuilder();

  /* temporary buffer */
  private final StringBuilder fStrBuf = new StringBuilder();

  /* Static Initializer: Cache Entities */
  static {
    fgEntityTable = new HashMap<String, Character>();

    /* Entity Names */
    final String[] entityName = { "zwnj", "aring", "gt", "yen", "ograve", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$
        "Chi", "delta", "rang", "sup", "trade", "Ntilde", "xi", "upsih", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "nbsp", "Atilde", "radic", "otimes", "aelig", "oelig", "equiv", "ni", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "infin", "Psi", "auml", "cup", "Epsilon", "otilde", "lt", "Icirc", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "Eacute", "Lambda", "sbquo", "Prime", "prime", "psi", "Kappa", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$
        "rsaquo", "Tau", "uacute", "ocirc", "lrm", "zwj", "cedil", "Alpha", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "not", "amp", "AElig", "oslash", "acute", "lceil", "alefsym", "laquo", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "shy", "loz", "ge", "Igrave", "nu", "Ograve", "lsaquo", "sube", "euro", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$
        "rarr", "sdot", "rdquo", "Yacute", "lfloor", "lArr", "Auml", "Dagger", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "brvbar", "Otilde", "szlig", "clubs", "diams", "agrave", "Ocirc", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$
        "Iota", "Theta", "Pi", "zeta", "Scaron", "frac14", "egrave", "sub", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "iexcl", "frac12", "ordf", "sum", "prop", "Uuml", "ntilde", "atilde", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "asymp", "uml", "prod", "nsub", "reg", "rArr", "Oslash", "emsp", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "THORN", "yuml", "aacute", "Mu", "hArr", "le", "thinsp", "dArr", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "ecirc", "bdquo", "Sigma", "Aring", "tilde", "nabla", "mdash", "uarr", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "times", "Ugrave", "Eta", "Agrave", "chi", "real", "circ", "eth", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "rceil", "iuml", "gamma", "lambda", "harr", "Egrave", "frac34", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$
        "dagger", "divide", "Ouml", "image", "ndash", "hellip", "igrave", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$
        "Yuml", "ang", "alpha", "frasl", "ETH", "lowast", "Nu", "plusmn", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "bull", "sup1", "sup2", "sup3", "Aacute", "cent", "oline", "Beta", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "perp", "Delta", "there4", "pi", "iota", "empty", "euml", "notin", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "iacute", "para", "epsilon", "weierp", "OElig", "uuml", "larr", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$
        "icirc", "Upsilon", "omicron", "upsilon", "copy", "Iuml", "Oacute", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$
        "Xi", "kappa", "ccedil", "Ucirc", "cap", "mu", "scaron", "lsquo", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "isin", "Zeta", "minus", "deg", "and", "tau", "pound", "curren", "int", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$
        "ucirc", "rfloor", "ensp", "crarr", "ugrave", "exist", "cong", "theta", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "oplus", "permil", "Acirc", "piv", "Euml", "Phi", "Iacute", "quot", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "Uacute", "Omicron", "ne", "iquest", "eta", "rsquo", "yacute", "Rho", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "darr", "Ecirc", "Omega", "acirc", "sim", "phi", "sigmaf", "macr", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "thetasym", "Ccedil", "ordm", "uArr", "forall", "beta", "fnof", "rho", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "micro", "eacute", "omega", "middot", "Gamma", "rlm", "lang", "spades", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "supe", "thorn", "ouml", "or", "raquo", "part", "sect", "ldquo", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "hearts", "sigma", "oacute", "apos" //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
     };

    /* Entity Values */
    final char[] entityVal = { 8204, 229, 62, 165, 242, 935, 948, 9002, 8835,
        8482, 209, 958, 978, 160, 195, 8730, 8855, 230, 339, 8801, 8715, 8734,
        936, 228, 8746, 917, 245, 60, 206, 201, 923, 8218, 8243, 8242, 968,
        922, 8250, 932, 250, 244, 8206, 8205, 184, 913, 172, 38, 198, 248, 180,
        8968, 8501, 171, 173, 9674, 8805, 204, 957, 210, 8249, 8838, 8364,
        8594, 8901, 8221, 221, 8970, 8656, 196, 8225, 166, 213, 223, 9827,
        9830, 224, 212, 921, 920, 928, 950, 352, 188, 232, 8834, 161, 189, 170,
        8721, 8733, 220, 241, 227, 8776, 168, 8719, 8836, 174, 8658, 216, 8195,
        222, 255, 225, 924, 8660, 8804, 8201, 8659, 234, 8222, 931, 197, 732,
        8711, 8212, 8593, 215, 217, 919, 192, 967, 8476, 710, 240, 8969, 239,
        947, 955, 8596, 200, 190, 8224, 247, 214, 8465, 8211, 8230, 236, 376,
        8736, 945, 8260, 208, 8727, 925, 177, 8226, 185, 178, 179, 193, 162,
        8254, 914, 8869, 916, 8756, 960, 953, 8709, 235, 8713, 237, 182, 949,
        8472, 338, 252, 8592, 238, 933, 959, 965, 169, 207, 211, 926, 954, 231,
        219, 8745, 956, 353, 8216, 8712, 918, 8722, 176, 8743, 964, 163, 164,
        8747, 251, 8971, 8194, 8629, 249, 8707, 8773, 952, 8853, 8240, 194,
        982, 203, 934, 205, 34, 218, 927, 8800, 191, 951, 8217, 253, 929, 8595,
        202, 937, 226, 8764, 966, 962, 175, 977, 199, 186, 8657, 8704, 946,
        402, 961, 181, 233, 969, 183, 915, 8207, 9001, 9824, 8839, 254, 246,
        8744, 187, 8706, 167, 8220, 9829, 963, 243, 39
    };

    /* Fill Entities */
    for (int i = 0; i < entityName.length; i++)
      fgEntityTable.put(entityName[i], Character.valueOf(entityVal[i]));

    /* Special-case nbsp to a simple space instead of 0xa0 */
    fgEntityTable.put("nbsp", Character.valueOf(' ')); //$NON-NLS-1$
  }

  /**
   * Creates a new <code>HTMLStripReader</code> that wraps another reader and
   * attempts to strip out HTML constructs. Entities are replaced.
   *
   * @param source The <code>Reader</code> to wrap around.
   */
  public HTMLStripReader(Reader source) {
    this(source, true);
  }

  /**
   * Creates a new <code>HTMLStripReader</code> that wraps another reader and
   * attempts to strip out HTML constructs. If the given reader does not
   * support <code>mark()</code> it is wrapped into a
   * <code>BufferedReader</code>.
   *
   * @param source The <code>Reader</code> to wrap around.
   * @param replaceEntities <code>true</code> to replace named entities (e.g.
   * <code>&amp;amp;</code>) and <code>false</code> to leave them untouched.
   * Numeric character references (e.g. <code>&amp;#65;</code>) are decoded
   * regardless of this flag.
   */
  public HTMLStripReader(Reader source, boolean replaceEntities) {
    super();
    fIn = source.markSupported() ? source : new BufferedReader(source);
    fReplaceEntities = replaceEntities;
  }

  /*
   * Returns the next raw character: the most recently pushed-back one if there
   * is any, otherwise the next one from the wrapped reader (-1 on EOF).
   */
  private int next() throws IOException {
    int len = fPushed.length();

    if (len > 0) {
      int ch = fPushed.charAt(len - 1);
      fPushed.setLength(len - 1);
      return ch;
    }

    return fIn.read();
  }

  /* Returns the next raw character that is not whitespace (or -1 on EOF) */
  private int nextSkipWS() throws IOException {
    int ch = next();

    while (isSpace(ch)) {
      ch = next();
    }

    return ch;
  }

  /* Returns the next raw character without consuming it (-1 on EOF) */
  private int peek() throws IOException {
    int len = fPushed.length();
    if (len > 0) {
      return fPushed.charAt(len - 1);
    }

    int ch = fIn.read();
    push(ch);

    return ch;
  }

  /*
   * Pushes a character back so that it is the next one returned by next().
   * End-of-stream markers (negative values) are ignored: storing them would
   * turn EOF into the character U+FFFF, and the wrapped reader is simply asked
   * again on the next call.
   */
  private void push(int ch) {
    if (ch >= 0) {
      fPushed.append((char) ch);
    }
  }

  private boolean isSpace(int ch) {
    switch (ch) {
      case ' ':
      case '\n':
      case '\r':
      case '\t':
        return true;
      default:
        return false;
    }
  }

  /* Only 0-9, A-F and a-f are hexadecimal digits */
  private boolean isHex(int ch) {
    return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f');
  }

  private boolean isAlpha(int ch) {
    return ch >= 'a' && ch <= 'z' || ch >= 'A' && ch <= 'Z';
  }

  private boolean isDigit(int ch) {
    return ch >= '0' && ch <= '9';
  }

  private boolean isIdChar(int ch) {
    return isAlpha(ch) || isDigit(ch) || ch == '.' || ch == '-' || ch == '_' || ch == ':' || Character.isLetter(ch);
  }

  private boolean isFirstIdChar(int ch) {
    return Character.isUnicodeIdentifierStart(ch);
  }

  /* Remembers the current position of the wrapped reader */
  private void saveState() throws IOException {
    fIn.mark(READAHEAD);
  }

  /* Rewinds the wrapped reader to the last saved position */
  private void restoreState() throws IOException {
    fIn.reset();
    fPushed.setLength(0);
  }

  /*
   * Reads a numeric character reference; "&#" has already been consumed.
   * Returns the decoded character, or MISMATCH if the input is not a well
   * formed reference to a valid code point. For code points outside the BMP
   * the high surrogate is returned and the low surrogate is pushed back so
   * that it is delivered by the following read.
   */
  private int readNumericEntity() throws IOException {
    int ch = next();
    int base = 10;
    fStrBuf.setLength(0);

    /* Decimal character entity */
    if (isDigit(ch)) {
      fStrBuf.append((char) ch);
      for (int i = 0; i < 10; i++) {
        ch = next();
        if (!isDigit(ch)) {
          break;
        }
        fStrBuf.append((char) ch);
      }
    }

    /* Hex character entity */
    else if (ch == 'x' || ch == 'X') {
      base = 16;
      for (int i = 0; i < 10; i++) {
        ch = next();
        if (!isHex(ch)) {
          break;
        }
        fStrBuf.append((char) ch);
      }
    } else {
      return MISMATCH;
    }

    /*
     * In older HTML, an entity may not have always been terminated with a
     * semicolon. We'll also treat EOF or whitespace as terminating the entity.
     */
    boolean endedByWhitespace = isSpace(ch);
    if (ch != ';' && ch != -1 && !endedByWhitespace) {
      return MISMATCH; // Not an entity...
    }

    /* No digits at all ("&#x;") or a value exceeding int: not an entity */
    int codePoint;
    try {
      codePoint = Integer.parseInt(fStrBuf.toString(), base);
    } catch (NumberFormatException e) {
      return MISMATCH;
    }

    if (!Character.isValidCodePoint(codePoint)) {
      return MISMATCH;
    }

    /*
     * if whitespace terminated the entity, we need to return that whitespace on
     * the next call to read().
     */
    if (endedByWhitespace) {
      push(ch);
    }

    /* A single char cannot hold a supplementary code point: split it up */
    if (Character.isSupplementaryCodePoint(codePoint)) {
      char[] surrogates = Character.toChars(codePoint);
      push(surrogates[1]);
      return surrogates[0];
    }

    return codePoint;
  }

  /*
   * Reads an entity; "&" has already been consumed. Returns the replacement
   * character or MISMATCH.
   */
  private int readEntity() throws IOException {
    int ch = next();
    if (ch == '#') {
      return readNumericEntity();
    }

    /*
     * read an entity reference for an entity reference, require the ';' for
     * safety. otherwise we may try and convert part of some company names to an
     * entity. "Alpha&Beta Corp" for instance.
     */
    fStrBuf.setLength(0);
    fStrBuf.append((char) ch);

    /* Digits are part of names such as "frac12", "sup2" or "there4" */
    for (int i = 0; i < MAX_ENTITY_NAME_LENGTH; i++) {
      ch = next();
      if (!Character.isLetterOrDigit(ch)) {
        break;
      }
      fStrBuf.append((char) ch);
    }

    if (ch == ';' && fReplaceEntities) {
      Character entityChar = fgEntityTable.get(fStrBuf.toString());
      if (entityChar != null) {
        return entityChar.charValue();
      }
    }

    return MISMATCH;
  }

  /*
   * Reads a comment or any other "<!...>" declaration. Returns MATCH if it was
   * consumed and MISMATCH if the end of the stream was hit first.
   */
  private int readBang(boolean inScript) throws IOException {

    /* at this point, "<!" has been read */
    if (readComment(inScript) == MATCH) {
      return MATCH;
    }

    int ch = next();
    if (ch == '>') {
      return MATCH;
    }

    /* if it starts with <! and isn't a comment, simply read until ">" */
    while (true) {
      ch = next();
      if (ch == '>') {
        return MATCH;
      } else if (ch < 0) {
        return MISMATCH;
      }
    }
  }

  /* Tries to read comments the way browsers do, not strictly by the standards */
  private int readComment(boolean inScript) throws IOException {

    /* at this point "<!" has been read */
    int ch = next();
    if (ch != '-') {
      push(ch);
      return MISMATCH;
    }

    ch = next();
    if (ch != '-') {
      push(ch);
      push('-');
      return MISMATCH;
    }

    /* Inside the comment: look for the terminating "-->" */
    while (true) {
      ch = next();
      if (ch < 0) {
        return MISMATCH;
      }

      if (ch == '-') {
        ch = next();
        if (ch < 0) {
          return MISMATCH;
        }
        if (ch != '-') {
          push(ch);
          continue;
        }

        ch = next();
        if (ch < 0) {
          return MISMATCH;
        }
        if (ch != '>') {
          /* "--x": give back "-x" so that "--->" is still recognised */
          push(ch);
          push('-');
          continue;
        }

        return MATCH;
      } else if ((ch == '\'' || ch == '"') && inScript) {
        push(ch);
        readScriptString();

        /*
         * if this wasn't a string, there's not much we can do at this point
         * without having a stack of stream states in order to "undo" just the
         * latest.
         */
      } else if (ch == '<') {
        eatSSI();
      }
    }
  }

  /*
   * Reads a start tag including its attributes; "<" has been consumed and the
   * next character is the first one of the tag name. For script and style
   * elements the whole element content up to the end tag is consumed as well.
   */
  private int readTag() throws IOException {
    int ch = next();
    if (!isAlpha(ch)) {
      push(ch);
      return MISMATCH;
    }

    fStrBuf.setLength(0);
    fStrBuf.append((char) ch);

    while (true) {
      ch = next();
      if (isIdChar(ch)) {
        fStrBuf.append((char) ch);
      } else if (ch == '/') {
        return nextSkipWS() == '>' ? MATCH : MISMATCH;
      } else {
        break;
      }
    }

    /* After the tag id, there needs to be either whitespace or '>' */
    if (ch != '>' && !isSpace(ch)) {
      return MISMATCH;
    }

    if (ch != '>') {
      while (true) {
        ch = next();
        if (isSpace(ch)) {
          continue;
        } else if (isFirstIdChar(ch)) {
          push(ch);
          if (readAttr2() == MISMATCH) {
            return MISMATCH;
          }
        } else if (ch == '/') {
          return nextSkipWS() == '>' ? MATCH : MISMATCH;
        } else if (ch == '>') {
          break;
        } else {
          return MISMATCH;
        }
      }
    }

    /*
     * We only get to this point after we have read the entire tag. Now let's
     * see if it's a special tag. HTML tag names are case-insensitive, so
     * <SCRIPT> and <Style> need the same treatment as their lower-case forms.
     */
    String name = fStrBuf.toString();
    if (name.equalsIgnoreCase("script") || name.equalsIgnoreCase("style")) { //$NON-NLS-1$ //$NON-NLS-2$
      // The content of script and style elements is
      //  CDATA in HTML 4 but PCDATA in XHTML.

      /*
       * From HTML4: Although the STYLE and SCRIPT elements use CDATA for their
       * data model, for these elements, CDATA must be handled differently by
       * user agents. Markup and entities must be treated as raw text and passed
       * to the application as is. The first occurrence of the character
       * sequence "</" (end-tag open delimiter) is treated as terminating the
       * end of the element's content. In valid documents, this would be the end
       * tag for the element.
       */

      // discard everything until the end tag is hit (except
      // if it occurs in a comment).
      // reset the stream mark to here, since we know that we successfully
      // matched a tag, and if we can't find the end tag, this is where we will
      // want to roll back to.
      saveState();
      fPushed.setLength(0);
      return findEndTag();
    }
    return MATCH;
  }

  /*
   * find an end tag, but beware of comments... <script><!-- </script> -->foo</script>
   * beware markup in script strings: </script>...document.write("</script>")foo</script>
   */
  int findEndTag() throws IOException {
    while (true) {
      int ch = next();
      if (ch == '<') {
        ch = next();

        // skip looking for end-tag in comments; whether the comment was
        // terminated or not, scanning simply goes on
        if (ch == '!') {
          readBang(true);
          continue;
        }

        // did we match "</"
        if (ch != '/') {
          push(ch);
          continue;
        }

        if (readName() == MISMATCH) {
          return MISMATCH;
        }
        return nextSkipWS() == '>' ? MATCH : MISMATCH;
      } else if (ch == '\'' || ch == '"') {
        // read javascript string to avoid a false match.
        push(ch);

        // what to do about a non-match (non-terminated string?)
        // play it safe and index the rest of the data I guess...
        if (readScriptString() == MISMATCH) {
          return MISMATCH;
        }
      } else if (ch < 0) {
        return MISMATCH;
      }
    }
  }

  /* Read a string escaped by backslashes */
  private int readScriptString() throws IOException {
    int quoteChar = next();
    if (quoteChar != '\'' && quoteChar != '"') {
      return MISMATCH;
    }

    while (true) {
      int ch = next();
      if (ch == quoteChar) {
        return MATCH;
      } else if (ch == '\\') {
        next(); // skip the escaped character
      } else if (ch < 0) {
        return MISMATCH;
      } else if (ch == '<') {
        eatSSI();
      }
    }
  }

  /*
   * Reads an element name. Raw characters are read via next(): going through
   * read() would strip markup nested inside the name and overwrite the mark
   * the caller relies on for rolling back.
   */
  private int readName() throws IOException {
    int ch = next();
    if (!isFirstIdChar(ch)) {
      return MISMATCH;
    }

    do {
      ch = next();
    } while (isIdChar(ch));

    push(ch);
    return MATCH;
  }

  /*
   * This reads attributes and attempts to handle any embedded server side
   * includes that would otherwise mess up the quote handling. <a href="a/<!--#echo
   * "path"-->">
   * Like readName(), it reads raw characters via next() so that the mark of
   * the caller stays intact.
   */
  private int readAttr2() throws IOException {
    int ch = next();
    if (!isFirstIdChar(ch)) {
      return MISMATCH;
    }

    do {
      ch = next();
    } while (isIdChar(ch));

    if (isSpace(ch)) {
      ch = nextSkipWS();
    }

    // attributes may not have a value at all!
    if (ch != '=') {
      push(ch);
      return MATCH;
    }

    int quoteChar = nextSkipWS();

    if (quoteChar == '"' || quoteChar == '\'') {
      while (true) {
        ch = next();
        if (ch < 0) {
          return MISMATCH;
        } else if (ch == '<') {
          eatSSI();
        } else if (ch == quoteChar) {
          return MATCH;
        }
      }
    }

    /* unquoted attribute */
    while (true) {
      ch = next();
      if (ch < 0) {
        return MISMATCH;
      } else if (isSpace(ch) || ch == '>') {
        push(ch);
        return MATCH;
      } else if (ch == '<') {
        eatSSI();
      }
    }
  }

  // skip past server side include
  // at this point, only a "<" was read.
  // on a mismatch, push back the last char so that if it was
  // a quote that closes the attribute, it will be re-read and matched.
  private int eatSSI() throws IOException {
    int ch = next();
    if (ch != '!') {
      push(ch);
      return MISMATCH;
    }
    ch = next();
    if (ch != '-') {
      push(ch);
      return MISMATCH;
    }
    ch = next();
    if (ch != '-') {
      push(ch);
      return MISMATCH;
    }
    ch = next();
    if (ch != '#') {
      push(ch);
      return MISMATCH;
    }

    /* Hand "--#" back (in reverse order) and let readComment() consume it */
    push('#');
    push('-');
    push('-');
    return readComment(false);
  }

  /* Reads a processing instruction up to "?>"; "<?" has been consumed */
  private int readProcessingInstruction() throws IOException {
    while (true) {
      int ch = next();
      if (ch == '?' && peek() == '>') {
        next();
        return MATCH;
      } else if (ch == -1) {
        return MISMATCH;
      }
    }
  }

  /**
   * Reads the next character of the text with HTML constructs stripped.
   *
   * @return the character read, or <code>-1</code> if the end of the stream
   * has been reached.
   * @throws IOException if the wrapped reader fails to read, mark or reset.
   * @see java.io.Reader#read()
   */
  @Override
  public int read() throws IOException {
    while (true) {
      int ch = next();

      switch (ch) {
        case '&':
          saveState();
          ch = readEntity();
          if (ch >= 0) {
            return ch;
          }
          if (ch == MISMATCH) {
            restoreState();
            return '&';
          }
          break;

        case '<':
          saveState();
          ch = next();
          int ret = MISMATCH;
          if (ch == '!') {
            ret = readBang(false);
          } else if (ch == '/') {
            ret = readName();
            if (ret == MATCH) {
              ch = nextSkipWS();
              ret = ch == '>' ? MATCH : MISMATCH;
            }
          } else if (isAlpha(ch)) {
            push(ch);
            ret = readTag();
          } else if (ch == '?') {
            ret = readProcessingInstruction();
          }

          /*
           * matched something to be discarded, so break from this case and
           * continue in the loop
           */
          if (ret == MATCH) {
            break;
          }

          /*
           * didn't match any HTML constructs, so roll back the stream state and
           * just return '<'
           */
          restoreState();
          return '<';

        default:
          return ch;
      }
    }
  }

  /**
   * Reads up to <code>len</code> stripped characters into the given array.
   *
   * @param cbuf the destination buffer.
   * @param off the offset at which to start storing characters.
   * @param len the maximum number of characters to read.
   * @return the number of characters read, <code>0</code> if
   * <code>len</code> is <code>0</code>, or <code>-1</code> if the end of the
   * stream has been reached before any character could be read.
   * @throws IOException if the wrapped reader fails to read, mark or reset.
   * @see java.io.Reader#read(char[], int, int)
   */
  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    int count = 0;
    while (count < len) {
      int ch = read();
      if (ch == -1) {
        break;
      }
      cbuf[off + count] = (char) ch;
      count++;
    }

    if (count == 0 && len != 0) {
      return -1;
    }
    return count;
  }

  /**
   * Closes the wrapped reader.
   *
   * @throws IOException if closing the wrapped reader fails.
   * @see java.io.Reader#close()
   */
  @Override
  public void close() throws IOException {
    fIn.close();
  }
}