/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.rssowl.core.util; import java.io.BufferedReader; import java.io.IOException; import java.io.Reader; import java.util.HashMap; import java.util.Set; /** * A Reader that wraps another reader and attempts to strip out HTML constructs. * Entities found in the Text are being replaced if possible. * <p> * This class is part of Apache Solr (named HTMLStripReader) and is versioned: * 773920 (2009-12-05) * </p> * As opposed to {@link HTMLStripReader}, this extended version allows to pass * in a list of HTML tags that are to be stripped from the content. * * @author bpasero did some modifications to support the filtering. 
/**
 * A {@link Reader} that wraps another reader and strips HTML constructs
 * (tags, comments, processing instructions, {@code <!...>} declarations and
 * the content of {@code script}/{@code style} elements) from the characters
 * it delivers.
 * <p>
 * Every stripped construct is replaced by exactly as many blanks as it had
 * characters, so the length of the text is preserved. A decoded entity is
 * padded with blanks the same way.
 * </p>
 * <p>
 * If a set of tag names is supplied, only tags whose lower-cased name is
 * contained in that set are stripped; all other tags are passed through
 * unchanged. Without a set ({@code null}) every tag is stripped.
 * </p>
 * <p>
 * Named entities are only decoded when requested through the constructor;
 * numeric character references are always decoded.
 * </p>
 */
public class HTMLFilterReader extends Reader {

  /* Some constants being used */
  private static final int MISMATCH = -2;
  private static final int MATCH = -3;
  private static final int DEFAULT_READ_AHEAD = 8192;

  /* Common Entities */
  private static final HashMap<String, Character> fgEntityTable;

  private final boolean fReplaceEntities;
  private final Reader fIn;
  private final int fReadAheadLimit = DEFAULT_READ_AHEAD;
  private final int fSafeReadAheadLimit = fReadAheadLimit - 3;

  /* Number of blanks that still have to be delivered before reading on */
  private int fNumWhitespace = 0;

  /* Number of characters fetched from the wrapped reader so far */
  private int fNumRead = 0;

  /* Value of fNumRead at the time of the last saveState() */
  private int fLastMark;

  /* Lower-case names of the tags to strip, or null to strip all tags */
  private final Set<String> fEscapedTags;

  /* Second half of a surrogate pair that still has to be delivered, or -1 */
  private int fPendingLowSurrogate = -1;

  /* pushback buffer */
  private final StringBuilder fPushed = new StringBuilder();

  /* temporary buffer */
  private final StringBuilder fSb = new StringBuilder();

  static {
    fgEntityTable = new HashMap<String, Character>();

    /* Entity Names */
    final String[] entityName = {
        "zwnj", "aring", "gt", "yen", "ograve", "Chi", "delta", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$
        "rang", "sup", "trade", "Ntilde", "xi", "upsih", "nbsp", "Atilde", "radic", "otimes", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$
        "aelig", "oelig", "equiv", "ni", "infin", "Psi", "auml", "cup", "Epsilon", "otilde", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$
        "lt", "Icirc", "Eacute", "Lambda", "sbquo", "Prime", "prime", "psi", "Kappa", "rsaquo", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$
        "Tau", "uacute", "ocirc", "lrm", "zwj", "cedil", "Alpha", "not", "amp", "AElig", "oslash", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "acute", "lceil", "alefsym", "laquo", "shy", "loz", "ge", "Igrave", "nu", "Ograve", "lsaquo", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "sube", "euro", "rarr", "sdot", "rdquo", "Yacute", "lfloor", "lArr", "Auml", "Dagger", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$
        "brvbar", "Otilde", "szlig", "clubs", "diams", "agrave", "Ocirc", "Iota", "Theta", "Pi", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$
        "zeta", "Scaron", "frac14", "egrave", "sub", "iexcl", "frac12", "ordf", "sum", "prop", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$
        "Uuml", "ntilde", "atilde", "asymp", "uml", "prod", "nsub", "reg", "rArr", "Oslash", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$
        "emsp", "THORN", "yuml", "aacute", "Mu", "hArr", "le", "thinsp", "dArr", "ecirc", "bdquo", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "Sigma", "Aring", "tilde", "nabla", "mdash", "uarr", "times", "Ugrave", "Eta", "Agrave", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$
        "chi", "real", "circ", "eth", "rceil", "iuml", "gamma", "lambda", "harr", "Egrave", "frac34", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "dagger", "divide", "Ouml", "image", "ndash", "hellip", "igrave", "Yuml", "ang", "alpha", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$
        "frasl", "ETH", "lowast", "Nu", "plusmn", "bull", "sup1", "sup2", "sup3", "Aacute", "cent", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "oline", "Beta", "perp", "Delta", "there4", "pi", "iota", "empty", "euml", "notin", "iacute", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "para", "epsilon", "weierp", "OElig", "uuml", "larr", "icirc", "Upsilon", "omicron", "upsilon", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$
        "copy", "Iuml", "Oacute", "Xi", "kappa", "ccedil", "Ucirc", "cap", "mu", "scaron", "lsquo", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "isin", "Zeta", "minus", "deg", "and", "tau", "pound", "curren", "int", "ucirc", "rfloor", "ensp", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$ //$NON-NLS-12$
        "crarr", "ugrave", "exist", "cong", "theta", "oplus", "permil", "Acirc", "piv", "Euml", "Phi", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "Iacute", "quot", "Uacute", "Omicron", "ne", "iquest", "eta", "rsquo", "yacute", "Rho", "darr", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "Ecirc", "Omega", "acirc", "sim", "phi", "sigmaf", "macr", "thetasym", "Ccedil", "ordm", "uArr", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "forall", "beta", "fnof", "rho", "micro", "eacute", "omega", "middot", "Gamma", "rlm", "lang", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "spades", "supe", "thorn", "ouml", "or", "raquo", "part", "sect", "ldquo", "hearts", "sigma", //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ //$NON-NLS-9$ //$NON-NLS-10$ //$NON-NLS-11$
        "oacute", "apos" }; //$NON-NLS-1$ //$NON-NLS-2$

    /* Entity Values (same order and same row layout as the names above) */
    final char[] entityVal = {
        8204, 229, 62, 165, 242, 935, 948,
        9002, 8835, 8482, 209, 958, 978, 160, 195, 8730, 8855,
        230, 339, 8801, 8715, 8734, 936, 228, 8746, 917, 245,
        60, 206, 201, 923, 8218, 8243, 8242, 968, 922, 8250,
        932, 250, 244, 8206, 8205, 184, 913, 172, 38, 198, 248,
        180, 8968, 8501, 171, 173, 9674, 8805, 204, 957, 210, 8249,
        8838, 8364, 8594, 8901, 8221, 221, 8970, 8656, 196, 8225,
        166, 213, 223, 9827, 9830, 224, 212, 921, 920, 928,
        950, 352, 188, 232, 8834, 161, 189, 170, 8721, 8733,
        220, 241, 227, 8776, 168, 8719, 8836, 174, 8658, 216,
        8195, 222, 255, 225, 924, 8660, 8804, 8201, 8659, 234, 8222,
        931, 197, 732, 8711, 8212, 8593, 215, 217, 919, 192,
        967, 8476, 710, 240, 8969, 239, 947, 955, 8596, 200, 190,
        8224, 247, 214, 8465, 8211, 8230, 236, 376, 8736, 945,
        8260, 208, 8727, 925, 177, 8226, 185, 178, 179, 193, 162,
        8254, 914, 8869, 916, 8756, 960, 953, 8709, 235, 8713, 237,
        182, 949, 8472, 338, 252, 8592, 238, 933, 959, 965,
        169, 207, 211, 926, 954, 231, 219, 8745, 956, 353, 8216,
        8712, 918, 8722, 176, 8743, 964, 163, 164, 8747, 251, 8971, 8194,
        8629, 249, 8707, 8773, 952, 8853, 8240, 194, 982, 203, 934,
        205, 34, 218, 927, 8800, 191, 951, 8217, 253, 929, 8595,
        202, 937, 226, 8764, 966, 962, 175, 977, 199, 186, 8657,
        8704, 946, 402, 961, 181, 233, 969, 183, 915, 8207, 9001,
        9824, 8839, 254, 246, 8744, 187, 8706, 167, 8220, 9829, 963,
        243, 39 };

    /* Fill Entities */
    for (int i = 0; i < entityName.length; i++) {
      fgEntityTable.put(entityName[i], Character.valueOf(entityVal[i]));
    }

    /* Special-case nbsp to a simple space instead of 0xa0 */
    fgEntityTable.put("nbsp", Character.valueOf(' ')); //$NON-NLS-1$
  }

  /**
   * @param source the reader to take the HTML from. It is wrapped into a
   * {@link BufferedReader} if it does not support {@code mark()}.
   * @param escapedTags the lower-case names of the tags to strip, or
   * {@code null} to strip every tag.
   * @param replaceEntities {@code true} to replace named entities such as
   * {@code &amp;amp;} by the character they stand for. Numeric character
   * references are replaced regardless of this flag.
   */
  public HTMLFilterReader(Reader source, Set<String> escapedTags, boolean replaceEntities) {
    super();
    fIn = source.markSupported() ? source : new BufferedReader(source);
    fEscapedTags = escapedTags;
    fReplaceEntities = replaceEntities;
  }

  /* Lower-cases a tag name independent of the default locale ("TITLE" must match "title" everywhere) */
  private static String lowerCaseTagName(String name) {
    return name.toLowerCase(java.util.Locale.ENGLISH);
  }

  /* True while fewer characters than the safe read-ahead limit were consumed since the last mark */
  private boolean withinReadAhead() {
    return (fNumRead - fLastMark) < fSafeReadAheadLimit;
  }

  /* Returns the next raw character: the pushback buffer first, then the wrapped reader */
  private int next() throws IOException {
    int len = fPushed.length();
    if (len > 0) {
      int ch = fPushed.charAt(len - 1);
      fPushed.setLength(len - 1);
      return ch;
    }

    fNumRead++;
    return fIn.read();
  }

  /* Returns the next raw character that is not whitespace */
  private int nextSkipWS() throws IOException {
    int ch = next();
    while (isSpace(ch))
      ch = next();

    return ch;
  }

  /*
   * Returns the next raw character without consuming it. A character fetched
   * from the wrapped reader is counted here because next() does not count
   * characters it takes from the pushback buffer.
   */
  private int peek() throws IOException {
    int len = fPushed.length();
    if (len > 0)
      return fPushed.charAt(len - 1);

    int ch = fIn.read();
    if (ch >= 0) {
      fNumRead++;
      push(ch);
    }

    return ch;
  }

  /* Pushes a character back. End-of-stream is ignored: it cannot be stored as a char and is simply read again. */
  private void push(int ch) {
    if (ch >= 0)
      fPushed.append((char) ch);
  }

  private boolean isSpace(int ch) {
    switch (ch) {
      case ' ':
      case '\n':
      case '\r':
      case '\t':
        return true;
      default:
        return false;
    }
  }

  private boolean isHex(int ch) {
    return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f');
  }

  private boolean isAlpha(int ch) {
    return ch >= 'a' && ch <= 'z' || ch >= 'A' && ch <= 'Z';
  }

  private boolean isDigit(int ch) {
    return ch >= '0' && ch <= '9';
  }

  /***
   * From HTML 4.0
   * [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
   * [5] Name ::= (Letter | '_' | ':') (NameChar)*
   * [6] Names ::= Name (#x20 Name)*
   * [7] Nmtoken ::= (NameChar)+
   * [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
   ***/
  private boolean isIdChar(int ch) {
    return isAlpha(ch) || isDigit(ch) || ch == '.' || ch == '-' || ch == '_' || ch == ':' || Character.isLetter(ch);
  }

  private boolean isFirstIdChar(int ch) {
    return Character.isUnicodeIdentifierStart(ch);
  }

  /* Marks the wrapped reader so that restoreState() can return to this position */
  private void saveState() throws IOException {
    fLastMark = fNumRead;
    fIn.mark(fReadAheadLimit);
  }

  /* Rewinds the wrapped reader to the last mark and drops everything pushed back since */
  private void restoreState() throws IOException {
    fIn.reset();
    fPushed.setLength(0);
  }

  /*
   * Reads a numeric character reference; "&#" has already been read.
   * Returns the first char to deliver, or MISMATCH. Padding and the pending
   * low surrogate are only set once the reference is known to be valid, so
   * a rejected reference leaves no state behind.
   */
  private int readNumericEntity() throws IOException {
    int ch = next();
    int base = 10;
    int prefixLength = 2; // "&#"
    fSb.setLength(0);

    if (isDigit(ch)) {
      // decimal character entity
      fSb.append((char) ch);
      for (int i = 0; i < 10; i++) {
        ch = next();
        if (!isDigit(ch))
          break;
        fSb.append((char) ch);
      }
    } else if (ch == 'x' || ch == 'X') {
      // hex character entity
      base = 16;
      prefixLength = 3; // "&#x"
      for (int i = 0; i < 10; i++) {
        ch = next();
        if (!isHex(ch))
          break;
        fSb.append((char) ch);
      }
    } else {
      return MISMATCH;
    }

    // In older HTML, an entity may not have always been terminated
    // with a semicolon. We'll also treat EOF or whitespace as terminating
    // the entity.
    boolean semicolon = (ch == ';');
    if (!semicolon && ch != -1 && !isSpace(ch))
      return MISMATCH;

    int codePoint;
    try {
      codePoint = Integer.parseInt(fSb.toString(), base);
    } catch (NumberFormatException e) {
      // no digits at all, or too large for an int: not an entity
      return MISMATCH;
    }

    if (!Character.isValidCodePoint(codePoint))
      return MISMATCH;

    // if whitespace terminated the entity, we need to return
    // that whitespace on the next call to read().
    if (isSpace(ch))
      push(ch);

    int consumed = prefixLength + fSb.length() + (semicolon ? 1 : 0);
    if (Character.isSupplementaryCodePoint(codePoint)) {
      // does not fit into one char: deliver it as a surrogate pair
      char[] pair = Character.toChars(codePoint);
      fPendingLowSurrogate = pair[1];
      fNumWhitespace = consumed - 2;
      return pair[0];
    }

    fNumWhitespace = consumed - 1; // one char is delivered right away
    return codePoint;
  }

  /*
   * Reads an entity; "&" has already been read. Returns the first char to
   * deliver, or MISMATCH.
   */
  private int readEntity() throws IOException {
    int ch = next();
    if (ch == '#')
      return readNumericEntity();

    // read an entity reference
    // for an entity reference, require the ';' for safety.
    // otherwise we may try and convert part of some company
    // names to an entity. "Alpha&Beta Corp" for instance.
    fSb.setLength(0);
    fSb.append((char) ch);

    for (int i = 0; i < fSafeReadAheadLimit; i++) {
      ch = next();
      // digits are part of some entity names (frac12, sup2, there4, ...)
      if (!Character.isLetterOrDigit(ch))
        break;
      fSb.append((char) ch);
    }

    if (ch == ';' && fReplaceEntities) {
      String entity = fSb.toString();
      Character entityChar = fgEntityTable.get(entity);
      if (entityChar != null) {
        fNumWhitespace = entity.length() + 1;
        return entityChar.charValue();
      }
    }

    return MISMATCH;
  }

  /***
   * valid comments according to HTML specs
   * <!-- Hello -->
   * <!-- Hello -- -- Hello-->
   * <!---->
   * <!------ Hello -->
   * <!>
   * <!------> Hello -->
   *
   * #comments inside of an entity decl:
   * <!ENTITY amp CDATA "&" -- ampersand, U+0026 ISOnum -->
   *
   * Turns out, IE & mozilla don't parse comments correctly. Since this is
   * meant to be a practical stripper, I'll just try and duplicate what the
   * browsers do.
   *
   * <!-- (stuff_including_markup)* -->
   * <!FOO (stuff, not including markup) >
   * <! (stuff, not including markup)* >
   ***/
  private int readBang(boolean inScript) throws IOException {
    // at this point, "<!" has been read
    int ret = readComment(inScript);
    if (ret == MATCH)
      return MATCH;

    if (withinReadAhead() || peek() == '>') {
      int ch = next();
      if (ch == '>')
        return MATCH;

      // if it starts with <! and isn't a comment, simply read until ">".
      // since we did readComment already, it may be the case that we are
      // already deep into the read ahead buffer so, we may need to abort sooner
      while (withinReadAhead()) {
        ch = next();
        if (ch == '>')
          return MATCH;
        else if (ch < 0)
          return MISMATCH;
      }
    }

    return MISMATCH;
  }

  // tries to read comments the way browsers do, not
  // strictly by the standards.
  //
  // it turns out that in the wild, a <script> can have a HTML comment
  // that contains a script that contains a quoted comment.
  // <script><!-- document.write("<!--embedded comment-->") --></script>
  //
  private int readComment(boolean inScript) throws IOException {
    // at this point "<!" has been read
    int ch = next();
    if (ch != '-') {
      // not a comment
      push(ch);
      return MISMATCH;
    }

    ch = next();
    if (ch != '-') {
      // not a comment
      push(ch);
      push('-');
      return MISMATCH;
    }

    /* two extra calls to next() here, so make sure we don't read past our mark */
    while ((fNumRead - fLastMark) < fSafeReadAheadLimit - 3) {
      ch = next();
      if (ch < 0)
        return MISMATCH;

      if (ch == '-') {
        ch = next();
        if (ch < 0)
          return MISMATCH;
        if (ch != '-') {
          push(ch);
          continue;
        }

        ch = next();
        if (ch < 0)
          return MISMATCH;
        if (ch != '>') {
          push(ch);
          push('-');
          continue;
        }

        return MATCH;
      } else if ((ch == '\'' || ch == '"') && inScript) {
        push(ch);
        readScriptString();
        // if this wasn't a string, there's not much we can do
        // at this point without having a stack of stream states in
        // order to "undo" just the latest.
      } else if (ch == '<') {
        eatSSI();
      }
    }

    return MISMATCH;
  }

  /*
   * Reads an opening tag; '<' has already been read. Returns MATCH if the
   * tag (and, for script and style, the whole element) is to be stripped.
   */
  private int readTag() throws IOException {
    int ch = next();
    if (!isAlpha(ch)) {
      push(ch);
      return MISMATCH;
    }

    fSb.setLength(0);
    fSb.append((char) ch);

    while (withinReadAhead()) {
      ch = next();
      if (!isIdChar(ch))
        break;
      fSb.append((char) ch);
    }

    String name = fSb.toString();

    // if this is not one of the tags to strip, then keep it. This has to be
    // decided before looking at "/>", otherwise "<br/>" would be stripped
    // while "<br />" is kept.
    if (fEscapedTags != null && !fEscapedTags.contains(lowerCaseTagName(name)))
      return MISMATCH;

    // a tag can close with "/>" as well as "/ >"
    if (ch == '/')
      return nextSkipWS() == '>' ? MATCH : MISMATCH;

    // After the tag id, there needs to be either whitespace or '>'
    if (!(ch == '>' || isSpace(ch)))
      return MISMATCH;

    if (ch != '>') {
      // process attributes
      while (withinReadAhead()) {
        ch = next();
        if (isSpace(ch)) {
          continue;
        } else if (isFirstIdChar(ch)) {
          push(ch);
          if (readAttr2() == MISMATCH)
            return MISMATCH;
        } else if (ch == '/' || ch == '"') {
          // read end tag '/>' or '/ >', etc
          return nextSkipWS() == '>' ? MATCH : MISMATCH;
        } else if (ch == '>') {
          break;
        } else {
          return MISMATCH;
        }
      }

      if (!withinReadAhead())
        return MISMATCH; // exit out if we exceeded the buffer
    }

    // We only get to this point after we have read the
    // entire tag. Now let's see if it's a special tag.
    if (name.equalsIgnoreCase("script") || name.equalsIgnoreCase("style")) { //$NON-NLS-1$ //$NON-NLS-2$
      // The content of script and style elements is
      // CDATA in HTML 4 but PCDATA in XHTML.

      /*
       * From HTML4: Although the STYLE and SCRIPT elements use CDATA for their
       * data model, for these elements, CDATA must be handled differently by
       * user agents. Markup and entities must be treated as raw text and passed
       * to the application as is. The first occurrence of the character
       * sequence "</" (end-tag open delimiter) is treated as terminating the
       * end of the element's content. In valid documents, this would be the end
       * tag for the element.
       */

      // discard everything until endtag is hit (except
      // if it occurs in a comment.

      // reset the stream mark to here, since we know that we successfully matched
      // a tag, and if we can't find the end tag, this is where we will want
      // to roll back to.
      saveState();
      fPushed.setLength(0);
      return findEndTag();
    }

    return MATCH;
  }

  // find an end tag, but beware of comments...
  // <script><!-- </script> -->foo</script>
  // beware markup in script strings: </script>...document.write("</script>")foo</script>
  int findEndTag() throws IOException {
    while (withinReadAhead()) {
      int ch = next();
      if (ch == '<') {
        ch = next();

        // skip looking for end-tag in comments. If it wasn't a comment we
        // can't get back to the state we were at, so just continue from
        // where we are.
        if (ch == '!') {
          readBang(true);
          continue;
        }

        // did we match "</"
        if (ch != '/') {
          push(ch);
          continue;
        }

        if (readName(false) == MISMATCH)
          return MISMATCH;

        return nextSkipWS() == '>' ? MATCH : MISMATCH;
      } else if (ch == '\'' || ch == '"') {
        // read javascript string to avoid a false match.
        push(ch);

        // a non-terminated string: play it safe and keep the rest of the data
        if (readScriptString() == MISMATCH)
          return MISMATCH;
      } else if (ch < 0) {
        return MISMATCH;
      }
    }

    return MISMATCH;
  }

  // read a string escaped by backslashes
  private int readScriptString() throws IOException {
    int quoteChar = next();
    if (quoteChar != '\'' && quoteChar != '"')
      return MISMATCH;

    while (withinReadAhead()) {
      int ch = next();
      if (ch == quoteChar)
        return MATCH;
      else if (ch == '\\')
        ch = next();
      else if (ch < 0)
        return MISMATCH;
      else if (ch == '<')
        eatSSI();
    }

    return MISMATCH;
  }

  /*
   * Reads a tag name and pushes the character following it back. With
   * checkEscaped set and a set of tags to strip present, the name only
   * matches if it is contained in that set.
   *
   * Reads raw characters through next(): going through the public read()
   * would filter nested '&' and '<', move the stream mark and set padding
   * while the enclosing construct is still being parsed.
   */
  private int readName(boolean checkEscaped) throws IOException {
    StringBuilder name = (checkEscaped && fEscapedTags != null) ? new StringBuilder() : null;

    int ch = next();
    if (!isFirstIdChar(ch))
      return MISMATCH;

    if (name != null)
      name.append((char) ch);

    ch = next();
    while (isIdChar(ch)) {
      if (name != null)
        name.append((char) ch);
      ch = next();
    }

    if (ch != -1)
      push(ch);

    if (name != null && !fEscapedTags.contains(lowerCaseTagName(name.toString())))
      return MISMATCH;

    return MATCH;
  }

  /***
   * [10] AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'"
   *
   * need to also handle unquoted attributes, and attributes w/o values:
   * <td id=msviGlobalToolbar height="22" nowrap align=left>
   ***/

  // This reads attributes and attempts to handle any
  // embedded server side includes that would otherwise
  // mess up the quote handling.
  // <a href="a/<!--#echo "path"-->">
  //
  // Reads raw characters through next() for the same reason as readName().
  private int readAttr2() throws IOException {
    if (!withinReadAhead())
      return MISMATCH;

    int ch = next();
    if (!isFirstIdChar(ch))
      return MISMATCH;

    ch = next();
    while (isIdChar(ch) && withinReadAhead())
      ch = next();

    if (isSpace(ch))
      ch = nextSkipWS();

    // attributes may not have a value at all!
    if (ch != '=') {
      push(ch);
      return MATCH;
    }

    int quoteChar = nextSkipWS();
    if (quoteChar == '"' || quoteChar == '\'') {
      while (withinReadAhead()) {
        ch = next();
        if (ch < 0)
          return MISMATCH;
        else if (ch == '<')
          eatSSI();
        else if (ch == quoteChar)
          return MATCH;
      }
    } else {
      // unquoted attribute
      while (withinReadAhead()) {
        ch = next();
        if (ch < 0) {
          return MISMATCH;
        } else if (isSpace(ch) || ch == '>') {
          push(ch);
          return MATCH;
        } else if (ch == '<') {
          eatSSI();
        }
      }
    }

    return MISMATCH;
  }

  // skip past server side include
  private int eatSSI() throws IOException {
    // at this point, only a "<" was read.
    // on a mismatch, push back the last char so that if it was
    // a quote that closes the attribute, it will be re-read and matched.
    int ch = next();
    if (ch != '!') {
      push(ch);
      return MISMATCH;
    }

    ch = next();
    if (ch != '-') {
      push(ch);
      return MISMATCH;
    }

    ch = next();
    if (ch != '-') {
      push(ch);
      return MISMATCH;
    }

    ch = next();
    if (ch != '#') {
      push(ch);
      return MISMATCH;
    }

    // hand "--#" back (in reverse order) so that readComment() sees "<!--#"
    push('#');
    push('-');
    push('-');
    return readComment(false);
  }

  private int readProcessingInstruction() throws IOException {
    // "<?" has already been read
    while (withinReadAhead()) {
      int ch = next();
      if (ch == '?' && peek() == '>') {
        next();
        return MATCH;
      } else if (ch == -1) {
        return MISMATCH;
      }
    }

    return MISMATCH;
  }

  /*
   * @see java.io.Reader#read()
   */
  @Override
  public int read() throws IOException {

    /* Second half of a character that did not fit into a single char */
    if (fPendingLowSurrogate >= 0) {
      int low = fPendingLowSurrogate;
      fPendingLowSurrogate = -1;
      return low;
    }

    /* Padding for the construct that was stripped or decoded last */
    if (fNumWhitespace > 0) {
      fNumWhitespace--;
      return ' ';
    }

    // do not limit this one by the READAHEAD
    int lastNumRead = fNumRead;
    int ch = next();

    switch (ch) {
      case '&':
        saveState();
        ch = readEntity();
        if (ch >= 0)
          return ch;

        // not an entity, so roll back and just return '&'
        restoreState();
        return '&';

      case '<':
        saveState();
        ch = next();
        int ret = MISMATCH;
        if (ch == '!') {
          ret = readBang(false);
        } else if (ch == '/') {
          ret = readName(true);
          if (ret == MATCH)
            ret = (nextSkipWS() == '>') ? MATCH : MISMATCH;
        } else if (isAlpha(ch)) {
          push(ch);
          ret = readTag();
        } else if (ch == '?') {
          ret = readProcessingInstruction();
        }

        // matched something to be discarded: replace it by blanks
        if (ret == MATCH) {
          fNumWhitespace = (fNumRead - lastNumRead) - 1; // -1 since we are returning a space right now
          return ' ';
        }

        // didn't match any HTML constructs, so roll back
        // the stream state and just return '<'
        restoreState();
        return '<';

      default:
        return ch;
    }
  }

  /*
   * @see java.io.Reader#read(char[], int, int)
   */
  @Override
  public int read(char cbuf[], int off, int len) throws IOException {
    int count = 0;
    while (count < len) {
      int ch = read();
      if (ch == -1)
        break;
      cbuf[off + count] = (char) ch;
      count++;
    }

    if (count == 0 && len != 0)
      return -1;

    return count;
  }

  /*
   * @see java.io.Reader#close()
   */
  @Override
  public void close() throws IOException {
    fIn.close();
  }
}