/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.rssowl.core.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
/**
 * A Reader that wraps another reader and attempts to strip out HTML constructs.
 * Entities found in the Text are being replaced if possible.
 * <p>
 * This class is part of Apache Solr and is versioned: 472574 (2006-11-08)
 * </p>
 * <p>
 * Parsing is speculative: whenever a <code>&lt;</code> or <code>&amp;</code>
 * is encountered the wrapped reader is marked, the construct is parsed, and if
 * it turns out not to be a tag, comment, processing instruction or entity the
 * reader is reset and the character is returned literally.
 * </p>
 */
public class HTMLStripReader extends Reader {

  /* Result code: the input did not form the construct that was being parsed */
  private static final int MISMATCH = -2;

  /* Result code: a construct was recognized and fully consumed */
  private static final int MATCH = -3;

  /*
   * Read-ahead limit handed to Reader#mark(int) before speculative parsing.
   * NOTE(review): a wrapped reader with a bounded mark may fail to reset() if
   * an unterminated construct runs past this limit - confirm whether callers
   * can hit this.
   */
  private static final int READAHEAD = 4096;

  /*
   * Maximum number of characters scanned after the first character of a named
   * entity. Every name in the entity table is much shorter, so a longer run
   * can never match; bounding the scan keeps it well inside READAHEAD.
   */
  private static final int MAX_ENTITY_NAME_LENGTH = 32;

  /* Maximum number of characters scanned while reading a numeric entity */
  private static final int MAX_NUMERIC_ENTITY_LOOKAHEAD = 10;

  /* Common Entities */
  private static final Map<String, Character> fgEntityTable;

  /* Whether named entities are replaced by their character value */
  private final boolean fReplaceEntities;

  /* Wrapped Reader */
  private final Reader fIn;

  /* pushback buffer (used as a stack: last pushed is read first) */
  private final StringBuilder fPushed = new StringBuilder();

  /* temporary buffer (tag names, entity names and entity digits) */
  private final StringBuilder fStrBuf = new StringBuilder();

  /* Static Initializer: Cache Entities */
  static {
    fgEntityTable = new HashMap<String, Character>();

    /* Entity Names */
    final String[] entityName = { "zwnj", "aring", "gt", "yen", "ograve", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$
        "Chi", "delta", "rang", "sup", "trade", "Ntilde", "xi", "upsih", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "nbsp", "Atilde", "radic", "otimes", "aelig", "oelig", "equiv", "ni", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "infin", "Psi", "auml", "cup", "Epsilon", "otilde", "lt", "Icirc", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "Eacute", "Lambda", "sbquo", "Prime", "prime", "psi", "Kappa", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$
        "rsaquo", "Tau", "uacute", "ocirc", "lrm", "zwj", "cedil", "Alpha", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "not", "amp", "AElig", "oslash", "acute", "lceil", "alefsym", "laquo", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "shy", "loz", "ge", "Igrave", "nu", "Ograve", "lsaquo", "sube", "euro", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$
        "rarr", "sdot", "rdquo", "Yacute", "lfloor", "lArr", "Auml", "Dagger", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "brvbar", "Otilde", "szlig", "clubs", "diams", "agrave", "Ocirc", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$
        "Iota", "Theta", "Pi", "zeta", "Scaron", "frac14", "egrave", "sub", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "iexcl", "frac12", "ordf", "sum", "prop", "Uuml", "ntilde", "atilde", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "asymp", "uml", "prod", "nsub", "reg", "rArr", "Oslash", "emsp", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "THORN", "yuml", "aacute", "Mu", "hArr", "le", "thinsp", "dArr", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "ecirc", "bdquo", "Sigma", "Aring", "tilde", "nabla", "mdash", "uarr", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "times", "Ugrave", "Eta", "Agrave", "chi", "real", "circ", "eth", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "rceil", "iuml", "gamma", "lambda", "harr", "Egrave", "frac34", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$
        "dagger", "divide", "Ouml", "image", "ndash", "hellip", "igrave", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$
        "Yuml", "ang", "alpha", "frasl", "ETH", "lowast", "Nu", "plusmn", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "bull", "sup1", "sup2", "sup3", "Aacute", "cent", "oline", "Beta", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "perp", "Delta", "there4", "pi", "iota", "empty", "euml", "notin", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "iacute", "para", "epsilon", "weierp", "OElig", "uuml", "larr", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$
        "icirc", "Upsilon", "omicron", "upsilon", "copy", "Iuml", "Oacute", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$
        "Xi", "kappa", "ccedil", "Ucirc", "cap", "mu", "scaron", "lsquo", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "isin", "Zeta", "minus", "deg", "and", "tau", "pound", "curren", "int", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$
        "ucirc", "rfloor", "ensp", "crarr", "ugrave", "exist", "cong", "theta", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "oplus", "permil", "Acirc", "piv", "Euml", "Phi", "Iacute", "quot", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "Uacute", "Omicron", "ne", "iquest", "eta", "rsquo", "yacute", "Rho", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "darr", "Ecirc", "Omega", "acirc", "sim", "phi", "sigmaf", "macr", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "thetasym", "Ccedil", "ordm", "uArr", "forall", "beta", "fnof", "rho", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "micro", "eacute", "omega", "middot", "Gamma", "rlm", "lang", "spades", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "supe", "thorn", "ouml", "or", "raquo", "part", "sect", "ldquo", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$
        "hearts", "sigma", "oacute", "apos" //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
    };

    /* Entity Values (parallel to entityName: same index, same entity) */
    final char[] entityVal = { 8204, 229, 62, 165, 242, 935, 948, 9002, 8835,
        8482, 209, 958, 978, 160, 195, 8730, 8855, 230, 339, 8801, 8715, 8734,
        936, 228, 8746, 917, 245, 60, 206, 201, 923, 8218, 8243, 8242, 968,
        922, 8250, 932, 250, 244, 8206, 8205, 184, 913, 172, 38, 198, 248, 180,
        8968, 8501, 171, 173, 9674, 8805, 204, 957, 210, 8249, 8838, 8364,
        8594, 8901, 8221, 221, 8970, 8656, 196, 8225, 166, 213, 223, 9827,
        9830, 224, 212, 921, 920, 928, 950, 352, 188, 232, 8834, 161, 189, 170,
        8721, 8733, 220, 241, 227, 8776, 168, 8719, 8836, 174, 8658, 216, 8195,
        222, 255, 225, 924, 8660, 8804, 8201, 8659, 234, 8222, 931, 197, 732,
        8711, 8212, 8593, 215, 217, 919, 192, 967, 8476, 710, 240, 8969, 239,
        947, 955, 8596, 200, 190, 8224, 247, 214, 8465, 8211, 8230, 236, 376,
        8736, 945, 8260, 208, 8727, 925, 177, 8226, 185, 178, 179, 193, 162,
        8254, 914, 8869, 916, 8756, 960, 953, 8709, 235, 8713, 237, 182, 949,
        8472, 338, 252, 8592, 238, 933, 959, 965, 169, 207, 211, 926, 954, 231,
        219, 8745, 956, 353, 8216, 8712, 918, 8722, 176, 8743, 964, 163, 164,
        8747, 251, 8971, 8194, 8629, 249, 8707, 8773, 952, 8853, 8240, 194,
        982, 203, 934, 205, 34, 218, 927, 8800, 191, 951, 8217, 253, 929, 8595,
        202, 937, 226, 8764, 966, 962, 175, 977, 199, 186, 8657, 8704, 946,
        402, 961, 181, 233, 969, 183, 915, 8207, 9001, 9824, 8839, 254, 246,
        8744, 187, 8706, 167, 8220, 9829, 963, 243, 39
    };

    /* Fill Entities */
    for (int i = 0; i < entityName.length; i++)
      fgEntityTable.put(entityName[i], Character.valueOf(entityVal[i]));

    /* Special-case nbsp to a simple space instead of 0xa0 */
    fgEntityTable.put("nbsp", Character.valueOf(' ')); //$NON-NLS-1$
  }

  /**
   * Creates a new <code>HTMLStripReader</code> that wraps another reader and
   * attempts to strip out HTML constructs. Entities are replaced.
   *
   * @param source The <code>Reader</code> to wrap around.
   */
  public HTMLStripReader(Reader source) {
    this(source, true);
  }

  /**
   * Creates a new <code>HTMLStripReader</code> that wraps another reader and
   * attempts to strip out HTML constructs.
   *
   * @param source The <code>Reader</code> to wrap around. If it does not
   * support <code>mark()</code> it is wrapped into a
   * <code>BufferedReader</code>.
   * @param replaceEntities <code>true</code> to replace named entities and
   * <code>false</code> otherwise. Numeric character references are decoded
   * regardless of this flag.
   */
  public HTMLStripReader(Reader source, boolean replaceEntities) {
    super();
    fIn = source.markSupported() ? source : new BufferedReader(source);
    fReplaceEntities = replaceEntities;
  }

  /* Returns the next raw character: pushed-back ones first, then the stream */
  private int next() throws IOException {
    int len = fPushed.length();
    if (len > 0) {
      int ch = fPushed.charAt(len - 1);
      fPushed.setLength(len - 1);
      return ch;
    }

    return fIn.read();
  }

  /* Returns the next raw character that is not whitespace (or -1 on EOF) */
  private int nextSkipWS() throws IOException {
    int ch = next();
    while (isSpace(ch))
      ch = next();

    return ch;
  }

  /* Returns the next raw character without consuming it (or -1 on EOF) */
  private int peek() throws IOException {
    int len = fPushed.length();
    if (len > 0)
      return fPushed.charAt(len - 1);

    int ch = fIn.read();
    push(ch);
    return ch;
  }

  /*
   * Pushes a character back so that it is returned by the following next().
   * EOF (any negative value) is ignored: casting it to char would store
   * U+FFFF and a later next() would no longer report the end of the stream.
   */
  private void push(int ch) {
    if (ch >= 0)
      fPushed.append((char) ch);
  }

  private boolean isSpace(int ch) {
    switch (ch) {
      case ' ':
      case '\n':
      case '\r':
      case '\t':
        return true;
      default:
        return false;
    }
  }

  /* Only 0-9, A-F and a-f are hexadecimal digits */
  private boolean isHex(int ch) {
    return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f');
  }

  private boolean isAlpha(int ch) {
    return ch >= 'a' && ch <= 'z' || ch >= 'A' && ch <= 'Z';
  }

  private boolean isDigit(int ch) {
    return ch >= '0' && ch <= '9';
  }

  private boolean isIdChar(int ch) {
    return isAlpha(ch) || isDigit(ch) || ch == '.' || ch == '-' || ch == '_' || ch == ':' || Character.isLetter(ch);
  }

  private boolean isFirstIdChar(int ch) {
    return Character.isUnicodeIdentifierStart(ch);
  }

  /* Remembers the current position of the wrapped reader */
  private void saveState() throws IOException {
    fIn.mark(READAHEAD);
  }

  /* Rolls the wrapped reader back to the last saved position */
  private void restoreState() throws IOException {
    fIn.reset();
    fPushed.setLength(0);
  }

  /*
   * Reads a numeric character reference; at this point "&#" has been read.
   * Returns the code point, or MISMATCH if the input is not a well-formed
   * reference or does not denote a valid Unicode code point.
   */
  private int readNumericEntity() throws IOException {
    int ch = next();
    int base = 10;
    fStrBuf.setLength(0);

    /* Decimal character entity */
    if (isDigit(ch)) {
      fStrBuf.append((char) ch);
      for (int i = 0; i < MAX_NUMERIC_ENTITY_LOOKAHEAD; i++) {
        ch = next();
        if (isDigit(ch)) {
          fStrBuf.append((char) ch);
        } else {
          break;
        }
      }
    }

    /* Hex character entity (both "&#x" and "&#X" are accepted) */
    else if (ch == 'x' || ch == 'X') {
      base = 16;
      for (int i = 0; i < MAX_NUMERIC_ENTITY_LOOKAHEAD; i++) {
        ch = next();
        if (isHex(ch)) {
          fStrBuf.append((char) ch);
        } else {
          break;
        }
      }
    } else {
      return MISMATCH;
    }

    /*
     * In older HTML, an entity may not have always been terminated with a
     * semicolon. We'll also treat EOF or whitespace as terminating the entity.
     */
    boolean endedByWhitespace = isSpace(ch);
    if (ch != ';' && ch != -1 && !endedByWhitespace)
      return MISMATCH;

    /*
     * No digits at all ("&#x;") or a value that overflows an int is not an
     * entity: report a mismatch instead of letting a NumberFormatException
     * escape from read().
     */
    int codePoint;
    try {
      codePoint = Integer.parseInt(fStrBuf.toString(), base);
    } catch (NumberFormatException e) {
      return MISMATCH;
    }

    if (!Character.isValidCodePoint(codePoint))
      return MISMATCH;

    /*
     * if whitespace terminated the entity, we need to return that whitespace
     * on the next call to read().
     */
    if (endedByWhitespace)
      push(ch);

    return codePoint;
  }

  /*
   * Reads an entity; at this point "&" has been read. Returns the replacement
   * character (or code point for numeric references) or MISMATCH.
   */
  private int readEntity() throws IOException {
    int ch = next();
    if (ch == '#')
      return readNumericEntity();

    /* Every known entity name starts with an ASCII letter */
    if (!isAlpha(ch))
      return MISMATCH;

    /*
     * read an entity reference. for an entity reference, require the ';' for
     * safety. otherwise we may try and convert part of some company names to
     * an entity. "Alpha&Beta Corp" for instance.
     */
    fStrBuf.setLength(0);
    fStrBuf.append((char) ch);

    /* Names like "sup2" or "frac14" contain digits, so accept them as well */
    for (int i = 0; i < MAX_ENTITY_NAME_LENGTH; i++) {
      ch = next();
      if (Character.isLetterOrDigit(ch)) {
        fStrBuf.append((char) ch);
      } else {
        break;
      }
    }

    if (ch == ';' && fReplaceEntities) {
      Character entityChar = fgEntityTable.get(fStrBuf.toString());
      if (entityChar != null)
        return entityChar.charValue();
    }

    return MISMATCH;
  }

  /*
   * Reads a "<!" construct (comment, doctype, ...); at this point "<!" has
   * been read. Returns MATCH once the construct was consumed, MISMATCH on EOF.
   */
  private int readBang(boolean inScript) throws IOException {
    if (readComment(inScript) == MATCH)
      return MATCH;

    /* if it starts with <! and isn't a comment, simply read until ">" */
    while (true) {
      int ch = next();
      if (ch == '>')
        return MATCH;
      if (ch < 0)
        return MISMATCH;
    }
  }

  /* Tries to read comments the way browsers do, not strictly by the standards */
  private int readComment(boolean inScript) throws IOException {

    /* at this point "<!" has been read */
    int ch = next();
    if (ch != '-') {
      push(ch);
      return MISMATCH;
    }

    ch = next();
    if (ch != '-') {
      push(ch);
      push('-');
      return MISMATCH;
    }

    while (true) {
      ch = next();
      if (ch < 0)
        return MISMATCH;

      if (ch == '-') {
        ch = next();
        if (ch < 0)
          return MISMATCH;

        if (ch != '-') {
          push(ch);
          continue;
        }

        ch = next();
        if (ch < 0)
          return MISMATCH;

        if (ch != '>') {
          /* Still inside the comment: re-examine "-x" on the next round */
          push(ch);
          push('-');
          continue;
        }

        return MATCH;
      } else if ((ch == '\'' || ch == '"') && inScript) {
        push(ch);
        readScriptString();

        /*
         * if this wasn't a string, there's not much we can do at this point
         * without having a stack of stream states in order to "undo" just the
         * latest.
         */
      } else if (ch == '<') {
        eatSSI();
      }
    }
  }

  /*
   * Reads a start tag including its attributes; at this point "<" has been
   * read and the next character is the first one of the tag name. For script
   * and style elements the element content is consumed as well.
   */
  private int readTag() throws IOException {
    int ch = next();
    if (!isAlpha(ch)) {
      push(ch);
      return MISMATCH;
    }

    fStrBuf.setLength(0);
    fStrBuf.append((char) ch);

    while (true) {
      ch = next();
      if (isIdChar(ch)) {
        fStrBuf.append((char) ch);
      } else if (ch == '/') {
        return nextSkipWS() == '>' ? MATCH : MISMATCH;
      } else {
        break;
      }
    }

    /* After the tag id, there needs to be either whitespace or '>' */
    if (!(ch == '>' || isSpace(ch)))
      return MISMATCH;

    if (ch != '>') {
      while (true) {
        ch = next();
        if (isSpace(ch)) {
          continue;
        } else if (isFirstIdChar(ch)) {
          push(ch);
          if (readAttr2() == MISMATCH)
            return MISMATCH;
        } else if (ch == '/') {
          return nextSkipWS() == '>' ? MATCH : MISMATCH;
        } else if (ch == '>') {
          break;
        } else {
          return MISMATCH;
        }
      }
    }

    /*
     * We only get to this point after we have read the entire tag. Now let's
     * see if it's a special tag. Tag names are matched regardless of case so
     * that <SCRIPT> and <STYLE> are handled like their lower-case forms.
     */
    String name = fStrBuf.toString();
    if (name.equalsIgnoreCase("script") || name.equalsIgnoreCase("style")) { //$NON-NLS-1$ //$NON-NLS-2$

      /*
       * The content of script and style elements is CDATA in HTML 4 but
       * PCDATA in XHTML.
       *
       * From HTML4: Although the STYLE and SCRIPT elements use CDATA for their
       * data model, for these elements, CDATA must be handled differently by
       * user agents. Markup and entities must be treated as raw text and
       * passed to the application as is. The first occurrence of the character
       * sequence "</" (end-tag open delimiter) is treated as terminating the
       * end of the element's content. In valid documents, this would be the
       * end tag for the element.
       *
       * Discard everything until the end tag is hit (except if it occurs in a
       * comment). Reset the stream mark to here, since we know that we
       * successfully matched a tag, and if we can't find the end tag, this is
       * where we will want to roll back to.
       */
      saveState();
      fPushed.setLength(0);
      return findEndTag();
    }

    return MATCH;
  }

  /*
   * find an end tag, but beware of comments... <script><!-- </script> -->foo</script>
   * beware markup in script strings: </script>...document.write("</script>")foo</script>
   */
  int findEndTag() throws IOException {
    while (true) {
      int ch = next();
      if (ch == '<') {
        ch = next();

        /*
         * skip looking for end-tag in comments. If it was not a comment we
         * cannot get back to the previous state, so just continue from here
         * whatever the outcome was.
         */
        if (ch == '!') {
          readBang(true);
          continue;
        }

        /* did we match "</" */
        if (ch != '/') {
          push(ch);
          continue;
        }

        if (readName() == MISMATCH)
          return MISMATCH;

        return nextSkipWS() == '>' ? MATCH : MISMATCH;
      } else if (ch == '\'' || ch == '"') {

        /* read javascript string to avoid a false match. */
        push(ch);

        /*
         * what to do about a non-match (non-terminated string?) play it safe
         * and index the rest of the data I guess...
         */
        if (readScriptString() == MISMATCH)
          return MISMATCH;
      } else if (ch < 0) {
        return MISMATCH;
      }
    }
  }

  /* Read a string escaped by backslashes */
  private int readScriptString() throws IOException {
    int quoteChar = next();
    if (quoteChar != '\'' && quoteChar != '"')
      return MISMATCH;

    while (true) {
      int ch = next();
      if (ch == quoteChar)
        return MATCH;
      else if (ch == '\\') {
        /* Skip the escaped character so an escaped quote does not end the string */
        ch = next();
      } else if (ch < 0) {
        return MISMATCH;
      } else if (ch == '<') {
        eatSSI();
      }
    }
  }

  /*
   * Reads a name (e.g. of an end tag). Raw characters are read with next():
   * going through read() would strip markup inside the name and, worse,
   * re-mark the stream so that a later rollback ends up at the wrong position.
   */
  private int readName() throws IOException {
    int ch = next();
    if (!isFirstIdChar(ch))
      return MISMATCH;

    ch = next();
    while (isIdChar(ch))
      ch = next();

    if (ch != -1)
      push(ch);

    return MATCH;
  }

  /*
   * This reads attributes and attempts to handle any embedded server side
   * includes that would otherwise mess up the quote handling. <a href="a/<!--#echo
   * "path"-->">
   *
   * Raw characters are read with next() for the same reason as in readName():
   * a nested read() would overwrite both the stream mark and the tag name
   * that readTag() keeps in the temporary buffer.
   */
  private int readAttr2() throws IOException {
    int ch = next();
    if (!isFirstIdChar(ch))
      return MISMATCH;

    ch = next();
    while (isIdChar(ch))
      ch = next();

    if (isSpace(ch))
      ch = nextSkipWS();

    /* attributes may not have a value at all! */
    if (ch != '=') {
      push(ch);
      return MATCH;
    }

    int quoteChar = nextSkipWS();
    if (quoteChar == '"' || quoteChar == '\'') {
      while (true) {
        ch = next();
        if (ch < 0)
          return MISMATCH;
        else if (ch == '<') {
          eatSSI();
        } else if (ch == quoteChar) {
          return MATCH;
        }
      }
    }

    /* unquoted attribute */
    while (true) {
      ch = next();
      if (ch < 0)
        return MISMATCH;
      else if (isSpace(ch)) {
        push(ch);
        return MATCH;
      } else if (ch == '>') {
        push(ch);
        return MATCH;
      } else if (ch == '<') {
        eatSSI();
      }
    }
  }

  /*
   * skip past server side include. at this point, only a "<" was read. on a
   * mismatch, push back the last char so that if it was a quote that closes
   * the attribute, it will be re-read and matched.
   */
  private int eatSSI() throws IOException {
    int ch = next();
    if (ch != '!') {
      push(ch);
      return MISMATCH;
    }

    ch = next();
    if (ch != '-') {
      push(ch);
      return MISMATCH;
    }

    ch = next();
    if (ch != '-') {
      push(ch);
      return MISMATCH;
    }

    ch = next();
    if (ch != '#') {
      push(ch);
      return MISMATCH;
    }

    /* Re-offer "--#" (pushed in reverse) so it can be read as a comment */
    push('#');
    push('-');
    push('-');
    return readComment(false);
  }

  /* Reads up to and including "?>"; at this point "<?" has been read */
  private int readProcessingInstruction() throws IOException {
    while (true) {
      int ch = next();
      if (ch == '?' && peek() == '>') {
        next();
        return MATCH;
      } else if (ch == -1) {
        return MISMATCH;
      }
    }
  }

  /*
   * Returns the next character of the text with HTML constructs removed and
   * entities replaced, or -1 at the end of the stream.
   *
   * @see java.io.Reader#read()
   */
  @Override
  public int read() throws IOException {
    while (true) {
      int ch = next();

      switch (ch) {
        case '&':
          saveState();
          ch = readEntity();
          if (ch >= 0) {

            /*
             * A numeric reference beyond U+FFFF does not fit into a single
             * char: hand out the high surrogate now and the low surrogate on
             * the next call.
             */
            if (ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT) {
              char[] surrogates = Character.toChars(ch);
              push(surrogates[1]);
              return surrogates[0];
            }

            return ch;
          }

          if (ch == MISMATCH) {
            restoreState();
            return '&';
          }
          break;

        case '<':
          saveState();
          ch = next();
          int ret = MISMATCH;
          if (ch == '!') {
            ret = readBang(false);
          } else if (ch == '/') {
            ret = readName();
            if (ret == MATCH) {
              ch = nextSkipWS();
              ret = ch == '>' ? MATCH : MISMATCH;
            }
          } else if (isAlpha(ch)) {
            push(ch);
            ret = readTag();
          } else if (ch == '?') {
            ret = readProcessingInstruction();
          }

          /*
           * matched something to be discarded, so break from this case and
           * continue in the loop
           */
          if (ret == MATCH)
            break;

          /*
           * didn't match any HTML constructs, so roll back the stream state
           * and just return '<'
           */
          restoreState();
          return '<';

        default:
          return ch;
      }
    }
  }

  /*
   * Fills the buffer one character at a time through read(). Returns the
   * number of characters stored, 0 if len is 0, or -1 if the end of the
   * stream was reached before any character could be stored.
   *
   * @see java.io.Reader#read(char[], int, int)
   */
  @Override
  public int read(char cbuf[], int off, int len) throws IOException {
    int i = 0;
    for (i = 0; i < len; i++) {
      int ch = read();
      if (ch == -1)
        break;

      cbuf[off++] = (char) ch;
    }

    if (i == 0) {
      if (len == 0)
        return 0;

      return -1;
    }

    return i;
  }

  /*
   * Closes the wrapped reader.
   *
   * @see java.io.Reader#close()
   */
  @Override
  public void close() throws IOException {
    fIn.close();
  }
}