/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */

package org.albite.io.html;

import org.albite.io.decoders.AlbiteStreamReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Hashtable;

/**
 * A {@link Reader} over an {@link AlbiteStreamReader} that pre-processes
 * XHTML text:
 * <ul>
 * <li>optionally consumes the XML declaration (applying its
 *     <code>encoding</code> to the underlying reader) and the DOCTYPE
 *     declaration (collecting simple internal entity declarations);</li>
 * <li>replaces raw '&lt;' and '&gt;' with <code>START_TAG_INT</code> and
 *     <code>END_TAG_INT</code>;</li>
 * <li>resolves named and numeric entity references to their code.</li>
 * </ul>
 *
 * @author albus
 */
public class XhtmlStreamReader extends Reader implements HTMLSubstitues {

    /** Number of chars examined when looking for the declarations. */
    private static final int SEARCH_BUFFER = 2048;

    /**
     * The underlying reader may need up to four bytes per char
     * (multibyte UTF-8), so every mark limit is scaled by this factor.
     */
    private static final int MAX_BYTES_PER_CHAR = 4;

    /** Highest valid Unicode code point. */
    private static final int MAX_CODE_POINT = 0x10FFFF;

    /** Predefined HTML entities: name (String) to code (Integer). */
    private static final Hashtable ENTITIES = new Hashtable(200);

    private final AlbiteStreamReader in;

    /** Scratch space for the name of the entity being read. */
    private final char[] buffer = new char[10];

    /** Entities declared in the document's DOCTYPE; created lazily. */
    private Hashtable customEntities;

    /**
     * Creates the reader and optionally consumes the document prolog.
     *
     * @param in the underlying reader
     * @param readXmlDecl whether to look for (and consume) the XML
     *        declaration
     * @param readDoctypeDecl whether to look for (and consume) the DOCTYPE
     *        declaration
     * @throws IOException if the underlying reader fails
     */
    public XhtmlStreamReader(
            final AlbiteStreamReader in,
            final boolean readXmlDecl,
            final boolean readDoctypeDecl) throws IOException {

        this.in = in;

        if (readXmlDecl) {
            xmldecl();
        }

        if (readDoctypeDecl) {
            doctypedecl();
        }
    }

    /**
     * Tries to read the XML declaration at the current position. If one is
     * found, its encoding (when present) is applied to the underlying reader
     * and the declaration is skipped; otherwise the stream is left untouched.
     */
    private void xmldecl() throws IOException {
        in.mark(SEARCH_BUFFER * MAX_BYTES_PER_CHAR);

        try {
            final char[] buf = new char[SEARCH_BUFFER];
            final int read = in.read(buf);

            if (read <= 0) {
                in.reset();
                return;
            }

            final String head = new String(buf, 0, read);
            final String prefix = "<?xml";

            /*
             * The prefix must be followed by whitespace, otherwise this is
             * some other processing instruction, e.g. <?xml-stylesheet ...?>
             */
            if (!head.startsWith(prefix)
                    || head.length() <= prefix.length()
                    || !isXmlWhitespace(head.charAt(prefix.length()))) {
                in.reset();
                return;
            }

            final int xend = head.indexOf("?>");
            if (xend == -1) {
                in.reset();
                return;
            }

            final String xmldecl = head.substring(0, xend + 2);

            final int[] encodingPosition = readAttribute(xmldecl, "encoding");
            if (encodingPosition != null) {
                final String encoding = xmldecl.substring(
                        encodingPosition[0],
                        encodingPosition[0] + encodingPosition[1]);

                try {
                    in.setEncoding(encoding);
                } catch (Exception e) {
                    /*
                     * Deliberately ignored: the reader will continue with
                     * its current settings
                     */
                }
            }

            /*
             * The declaration is consumed whether or not it named an
             * encoding, so that the content that follows is the same in
             * both cases.
             */
            in.reset();
            skip(xmldecl.length());
        } catch (StringIndexOutOfBoundsException e) {
            in.reset();
        } catch (IllegalArgumentException e) {
            in.reset();
        }
    }

    /**
     * Tries to read the DOCTYPE declaration. If one is found, entities
     * declared in its internal subset are stored and the stream is positioned
     * right after the declaration; otherwise the stream is left untouched.
     */
    private void doctypedecl() throws IOException {
        in.mark(SEARCH_BUFFER * MAX_BYTES_PER_CHAR);

        try {
            final char[] buf = new char[SEARCH_BUFFER];
            final int read = in.read(buf);

            if (read <= 0) {
                in.reset();
                return;
            }

            final String ddecl = new String(buf, 0, read);
            final String dstring = "<!DOCTYPE";

            final int dstart = ddecl.indexOf(dstring);
            if (dstart == -1) {
                in.reset();
                return;
            }

            final int searchFrom = dstart + dstring.length();

            int dend = ddecl.indexOf('>', searchFrom);
            if (dend == -1) {
                in.reset();
                return;
            }

            final int doptstart = ddecl.indexOf('[', searchFrom);
            if (doptstart == -1 || dend < doptstart) {
                /*
                 * No internal decl.
                 * Just skip the doctype
                 */
                in.reset();
                skip(dend + 1);
                return;
            }

            final int doptend = ddecl.indexOf(']', doptstart + 1);
            if (doptend == -1) {
                /*
                 * The internal subset is not closed within the searched
                 * part of the stream
                 */
                in.reset();
                return;
            }

            /*
             * The first '>' found belonged to the internal subset; the real
             * end of the doctype comes after the closing bracket.
             */
            dend = ddecl.indexOf('>', doptend + 1);
            if (dend == -1) {
                in.reset();
                return;
            }

            readInternalEntities(ddecl.substring(doptstart, doptend + 1));

            in.reset();
            skip(dend + 1);
        } catch (StringIndexOutOfBoundsException e) {
            in.reset();
        } catch (IllegalArgumentException e) {
            in.reset();
        }
    }

    /**
     * Stores the entities declared in a DOCTYPE internal subset.
     * Only declarations whose double-quoted value is a single entity or
     * character reference (e.g. "&amp;#160;" or "&amp;nbsp;") are supported;
     * everything else is skipped.
     *
     * @param intdecl the internal subset, brackets included
     */
    private void readInternalEntities(final String intdecl) {
        final String entstring = "<!ENTITY";
        int entstart = 0;

        while (true) {
            entstart = intdecl.indexOf(entstring, entstart);
            if (entstart == -1) {
                return;
            }

            entstart += entstring.length();

            final int entend = intdecl.indexOf('>', entstart);
            if (entend == -1) {
                return;
            }

            final int replstart = intdecl.indexOf('"', entstart);
            if (replstart == -1 || replstart > entend) {
                /*
                 * No double-quoted value inside this declaration
                 */
                continue;
            }

            final int replend = intdecl.indexOf('"', replstart + 1);
            if (replend == -1) {
                continue;
            }

            /*
             * The value must have the form &...; for the stripping of the
             * first and the last char below to be meaningful.
             */
            if (replend - replstart < 3
                    || intdecl.charAt(replstart + 1) != '&'
                    || intdecl.charAt(replend - 1) != ';') {
                continue;
            }

            final String entityName =
                    intdecl.substring(entstart, replstart).trim();

            if (entityName.length() == 0 || entityName.indexOf('%') != -1) {
                /*
                 * PEReference entities are not supported
                 */
                continue;
            }

            final int entityIntVal = processEntity(
                    intdecl.substring(replstart + 2, replend - 1));

            if (entityIntVal != 0) {
                if (customEntities == null) {
                    customEntities = new Hashtable(20);
                }

                customEntities.put(entityName, new Integer(entityIntVal));
            }
        }
    }

    /**
     * Skips the given number of chars of the underlying reader, stopping
     * early if the end of the stream is reached.
     */
    private void skip(int left) throws IOException {
        while (left > 0) {
            final long skipped = in.skip(left);

            if (skipped > 0) {
                left -= (int) skipped;
            } else {
                /*
                 * Nothing was skipped: consume a single char instead, so
                 * that an exhausted stream cannot cause an endless loop.
                 */
                if (in.read() == -1) {
                    return;
                }
                left--;
            }
        }
    }

    /**
     * Tells whether the char is XML whitespace (space, tab, CR or LF).
     */
    private static boolean isXmlWhitespace(final int ch) {
        return ch == 0x20 || ch == 0x9 || ch == 0xD || ch == 0xA;
    }

    /**
     * Locates the quoted value of an attribute inside a tag. The search is
     * case-insensitive and expects the form <code>name="value"</code> or
     * <code>name='value'</code>, with nothing between the name, the equals
     * sign and the opening quote.
     *
     * @param tagString the text of the tag
     * @param attribute the name of the attribute
     * @return a two-element array holding the start index and the length of
     *         the value within <code>tagString</code>, or <code>null</code>
     *         if the attribute was not found or is not properly quoted
     */
    public static int[] readAttribute(
            String tagString, String attribute) {

        final String haystack = tagString.toLowerCase();
        final String needle = attribute.toLowerCase() + "=";

        final int found = haystack.indexOf(needle);
        if (found == -1) {
            return null;
        }

        final int quotePosition = found + needle.length();
        if (quotePosition >= haystack.length()) {
            return null;
        }

        final char quote = haystack.charAt(quotePosition);
        if (quote != '"' && quote != '\'') {
            return null;
        }

        final int start = quotePosition + 1;
        final int end = haystack.indexOf(quote, start);
        if (end == -1) {
            return null;
        }

        return new int[] {start, end - start};
    }

    /**
     * Reads a single char.
     *
     * @return <code>START_TAG_INT</code> for a raw '&lt;',
     *         <code>END_TAG_INT</code> for a raw '&gt;', the code of the
     *         entity for an entity reference (0 if the entity is unknown),
     *         -1 at the end of the stream, or the char itself otherwise
     * @throws IOException if the underlying reader fails
     */
    public int read() throws IOException {
        int read = in.read();

        if (read == -1) {
            return -1;
        }

        if (read == '<') {
            return START_TAG_INT;
        }

        if (read == '>') {
            return END_TAG_INT;
        }

        if (read != '&') {
            return read;
        }

        /*
         * Possibly an entity reference. Up to buffer.length chars are read
         * ahead; the mark is scaled as these chars may be multibyte.
         */
        in.mark(buffer.length * MAX_BYTES_PER_CHAR);

        for (int len = 0; len < buffer.length; len++) {
            read = in.read();

            if (read == ';') {
                /*
                 * Found entity
                 */
                return processEntity(new String(buffer, 0, len));
            }

            if (read == -1 || isXmlWhitespace(read)) {
                /*
                 * Couldn't find entity's end before EOF
                 * or this is not a valid entity.
                 */
                break;
            }

            buffer[len] = (char) read;
        }

        /*
         * Not an entity: return the ampersand itself and continue reading
         * right after it.
         */
        in.reset();
        return '&';
    }

    /**
     * Resolves the name of an entity, i.e. the text between '&amp;' and ';'.
     * The predefined table is consulted first, then the entities declared in
     * the document, and finally the name is tried as a numeric character
     * reference.
     *
     * @param entityName the name of the entity, without '&amp;' and ';'
     * @return the code of the entity or 0 if it couldn't be resolved
     */
    private int processEntity(final String entityName) {
        Object entityValue = ENTITIES.get(entityName);
        if (entityValue != null) {
            /*
             * Entity found in the main table
             */
            return ((Integer) entityValue).intValue();
        }

        if (customEntities != null) {
            entityValue = customEntities.get(entityName);
            if (entityValue != null) {
                /*
                 * Found in document's table
                 */
                return ((Integer) entityValue).intValue();
            }
        }

        /*
         * Is it a number?
         * &#1234;
         * &#xABCD;
         */
        if (entityName.length() > 1 && entityName.charAt(0) == '#') {
            final char radixMark = entityName.charAt(1);
            final boolean hex = (radixMark == 'x' || radixMark == 'X');

            try {
                final int code = hex
                        ? Integer.parseInt(entityName.substring(2), 16)
                        : Integer.parseInt(entityName.substring(1), 10);

                /*
                 * Negative values must never get through: -1 would be
                 * mistaken for the end of the stream by the callers.
                 */
                if (code > 0 && code <= MAX_CODE_POINT) {
                    return code;
                }
            } catch (NumberFormatException e) {
                /*
                 * Not a number: fall through to the default value
                 */
            }
        }

        /*
         * The entity couldn't be read, so a default value
         * (the null char for now) will be returned.
         */
        return 0;
    }

    /**
     * Reads chars into a portion of an array, one {@link #read()} at a time.
     * Every value is narrowed to a <code>char</code> when it is stored.
     *
     * @param cbuf destination buffer
     * @param off offset at which to start storing chars
     * @param len maximum number of chars to read
     * @return the number of chars read, or -1 if the end of the stream was
     *         reached before any char could be read
     * @throws IOException if the underlying reader fails
     */
    public int read(char[] cbuf, int off, int len) throws IOException {
        for (int i = 0; i < len; i++) {
            final int read = read();

            if (read == -1) {
                /*
                 * EOF: report what has been read so far, or the end of the
                 * stream if there was nothing left at all.
                 */
                return (i == 0) ? -1 : i;
            }

            cbuf[i + off] = (char) read;
        }

        return len;
    }

    /**
     * Closes the underlying reader.
     *
     * @throws IOException if the underlying reader fails to close
     */
    public void close() throws IOException {
        in.close();
    }

    static {
        /*
         * HTML Entities. See the link
         * http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
         */

        /*
         * HTMLspecial
         */
        ENTITIES.put("quot", new Integer('"'));
        ENTITIES.put("amp", new Integer('&'));
        ENTITIES.put("apos", new Integer('\''));
        ENTITIES.put("lt", new Integer('<'));
        ENTITIES.put("gt", new Integer('>'));

        /*
         * HTMLlat1
         */
        ENTITIES.put("nbsp", new Integer('\u00A0'));
        ENTITIES.put("iexcl", new Integer('\u00A1'));
        ENTITIES.put("cent", new Integer('\u00A2'));
        ENTITIES.put("pound", new Integer('\u00A3'));
        ENTITIES.put("curren", new Integer('\u00A4'));
        ENTITIES.put("yen", new Integer('\u00A5'));
        ENTITIES.put("brvbar", new Integer('\u00A6'));
        ENTITIES.put("sect", new Integer('\u00A7'));
        ENTITIES.put("uml", new Integer('\u00A8'));
        ENTITIES.put("copy", new Integer('\u00A9'));
        ENTITIES.put("ordf", new Integer('\u00AA'));
        ENTITIES.put("laquo", new Integer('\u00AB'));
        ENTITIES.put("not", new Integer('\u00AC'));
        ENTITIES.put("shy", new Integer('\u00AD'));
        ENTITIES.put("reg", new Integer('\u00AE'));
        ENTITIES.put("macr", new Integer('\u00AF'));
        ENTITIES.put("deg", new Integer('\u00B0'));
        ENTITIES.put("plusmn", new Integer('\u00B1'));
        ENTITIES.put("sup2", new Integer('\u00B2'));
        ENTITIES.put("sup3", new Integer('\u00B3'));
        ENTITIES.put("acute", new Integer('\u00B4'));
        ENTITIES.put("micro", new Integer('\u00B5'));
        ENTITIES.put("para", new Integer('\u00B6'));
        ENTITIES.put("middot", new Integer('\u00B7'));
        ENTITIES.put("cedil", new Integer('\u00B8'));
        ENTITIES.put("sup1", new Integer('\u00B9'));
        ENTITIES.put("ordm", new Integer('\u00BA'));
        ENTITIES.put("raquo", new Integer('\u00BB'));
        ENTITIES.put("frac14", new Integer('\u00BC'));
        ENTITIES.put("frac12", new Integer('\u00BD'));
        ENTITIES.put("frac34", new Integer('\u00BE'));
        ENTITIES.put("iquest", new Integer('\u00BF'));
        ENTITIES.put("Agrave", new Integer('\u00C0'));
        ENTITIES.put("Aacute", new Integer('\u00C1'));
        ENTITIES.put("Acirc", new Integer('\u00C2'));
        ENTITIES.put("Atilde", new Integer('\u00C3'));
        ENTITIES.put("Auml", new Integer('\u00C4'));
        ENTITIES.put("Aring", new Integer('\u00C5'));
        ENTITIES.put("AElig", new Integer('\u00C6'));
        ENTITIES.put("Ccedil", new Integer('\u00C7'));
        ENTITIES.put("Egrave", new Integer('\u00C8'));
        ENTITIES.put("Eacute", new Integer('\u00C9'));
        ENTITIES.put("Ecirc", new Integer('\u00CA'));
        ENTITIES.put("Euml", new Integer('\u00CB'));
        ENTITIES.put("Igrave", new Integer('\u00CC'));
        ENTITIES.put("Iacute", new Integer('\u00CD'));
        ENTITIES.put("Icirc", new Integer('\u00CE'));
        ENTITIES.put("Iuml", new Integer('\u00CF'));
        ENTITIES.put("ETH", new Integer('\u00D0'));
        ENTITIES.put("Ntilde", new Integer('\u00D1'));
        ENTITIES.put("Ograve", new Integer('\u00D2'));
        ENTITIES.put("Oacute", new Integer('\u00D3'));
        ENTITIES.put("Ocirc", new Integer('\u00D4'));
        ENTITIES.put("Otilde", new Integer('\u00D5'));
        ENTITIES.put("Ouml", new Integer('\u00D6'));
        ENTITIES.put("times", new Integer('\u00D7'));
        ENTITIES.put("Oslash", new Integer('\u00D8'));
        ENTITIES.put("Ugrave", new Integer('\u00D9'));
        ENTITIES.put("Uacute", new Integer('\u00DA'));
        ENTITIES.put("Ucirc", new Integer('\u00DB'));
        ENTITIES.put("Uuml", new Integer('\u00DC'));
        ENTITIES.put("Yacute", new Integer('\u00DD'));
        ENTITIES.put("THORN", new Integer('\u00DE'));
        ENTITIES.put("szlig", new Integer('\u00DF'));
        ENTITIES.put("agrave", new Integer('\u00E0'));
        ENTITIES.put("aacute", new Integer('\u00E1'));
        ENTITIES.put("acirc", new Integer('\u00E2'));
        ENTITIES.put("atilde", new Integer('\u00E3'));
        ENTITIES.put("auml", new Integer('\u00E4'));
        ENTITIES.put("aring", new Integer('\u00E5'));
        ENTITIES.put("aelig", new Integer('\u00E6'));
        ENTITIES.put("ccedil", new Integer('\u00E7'));
        ENTITIES.put("egrave", new Integer('\u00E8'));
        ENTITIES.put("eacute", new Integer('\u00E9'));
        ENTITIES.put("ecirc", new Integer('\u00EA'));
        ENTITIES.put("euml", new Integer('\u00EB'));
        ENTITIES.put("igrave", new Integer('\u00EC'));
        ENTITIES.put("iacute", new Integer('\u00ED'));
        ENTITIES.put("icirc", new Integer('\u00EE'));
        ENTITIES.put("iuml", new Integer('\u00EF'));
        ENTITIES.put("eth", new Integer('\u00F0'));
        ENTITIES.put("ntilde", new Integer('\u00F1'));
        ENTITIES.put("ograve", new Integer('\u00F2'));
        ENTITIES.put("oacute", new Integer('\u00F3'));
        ENTITIES.put("ocirc", new Integer('\u00F4'));
        ENTITIES.put("otilde", new Integer('\u00F5'));
        ENTITIES.put("ouml", new Integer('\u00F6'));
        ENTITIES.put("divide", new Integer('\u00F7'));
        ENTITIES.put("oslash", new Integer('\u00F8'));
        ENTITIES.put("ugrave", new Integer('\u00F9'));
        ENTITIES.put("uacute", new Integer('\u00FA'));
        ENTITIES.put("ucirc", new Integer('\u00FB'));
        ENTITIES.put("uuml", new Integer('\u00FC'));
        ENTITIES.put("yacute", new Integer('\u00FD'));
        ENTITIES.put("thorn", new Integer('\u00FE'));
        ENTITIES.put("yuml", new Integer('\u00FF'));

        /*
         * HTMLspecial
         */
        ENTITIES.put("OElig", new Integer('\u0152'));
        ENTITIES.put("oelig", new Integer('\u0153'));
        ENTITIES.put("Scaron", new Integer('\u0160'));
        ENTITIES.put("scaron", new Integer('\u0161'));
        ENTITIES.put("Yuml", new Integer('\u0178'));
        ENTITIES.put("fnof", new Integer('\u0192'));
        ENTITIES.put("circ", new Integer('\u02C6'));
        ENTITIES.put("tilde", new Integer('\u02DC'));

        /*
         * HTMLsymbols
         */
        ENTITIES.put("Alpha", new Integer('\u0391'));
        ENTITIES.put("Beta", new Integer('\u0392'));
        ENTITIES.put("Gamma", new Integer('\u0393'));
        ENTITIES.put("Delta", new Integer('\u0394'));
        ENTITIES.put("Epsilon", new Integer('\u0395'));
        ENTITIES.put("Zeta", new Integer('\u0396'));
        ENTITIES.put("Eta", new Integer('\u0397'));
        ENTITIES.put("Theta", new Integer('\u0398'));
        ENTITIES.put("Iota", new Integer('\u0399'));
        ENTITIES.put("Kappa", new Integer('\u039A'));
        ENTITIES.put("Lambda", new Integer('\u039B'));
        ENTITIES.put("Mu", new Integer('\u039C'));
        ENTITIES.put("Nu", new Integer('\u039D'));
        ENTITIES.put("Xi", new Integer('\u039E'));
        ENTITIES.put("Omicron", new Integer('\u039F'));
        ENTITIES.put("Pi", new Integer('\u03A0'));
        ENTITIES.put("Rho", new Integer('\u03A1'));
        ENTITIES.put("Sigma", new Integer('\u03A3'));
        ENTITIES.put("Tau", new Integer('\u03A4'));
        ENTITIES.put("Upsilon", new Integer('\u03A5'));
        ENTITIES.put("Phi", new Integer('\u03A6'));
        ENTITIES.put("Chi", new Integer('\u03A7'));
        ENTITIES.put("Psi", new Integer('\u03A8'));
        ENTITIES.put("Omega", new Integer('\u03A9'));
        ENTITIES.put("alpha", new Integer('\u03B1'));
        ENTITIES.put("beta", new Integer('\u03B2'));
        ENTITIES.put("gamma", new Integer('\u03B3'));
        ENTITIES.put("delta", new Integer('\u03B4'));
        ENTITIES.put("epsilon", new Integer('\u03B5'));
        ENTITIES.put("zeta", new Integer('\u03B6'));
        ENTITIES.put("eta", new Integer('\u03B7'));
        ENTITIES.put("theta", new Integer('\u03B8'));
        ENTITIES.put("iota", new Integer('\u03B9'));
        ENTITIES.put("kappa", new Integer('\u03BA'));
        ENTITIES.put("lambda", new Integer('\u03BB'));
        ENTITIES.put("mu", new Integer('\u03BC'));
        ENTITIES.put("nu", new Integer('\u03BD'));
        ENTITIES.put("xi", new Integer('\u03BE'));
        ENTITIES.put("omicron", new Integer('\u03BF'));
        ENTITIES.put("pi", new Integer('\u03C0'));
        ENTITIES.put("rho", new Integer('\u03C1'));
        ENTITIES.put("sigmaf", new Integer('\u03C2'));
        ENTITIES.put("sigma", new Integer('\u03C3'));
        ENTITIES.put("tau", new Integer('\u03C4'));
        ENTITIES.put("upsilon", new Integer('\u03C5'));
        ENTITIES.put("phi", new Integer('\u03C6'));
        ENTITIES.put("chi", new Integer('\u03C7'));
        ENTITIES.put("psi", new Integer('\u03C8'));
        ENTITIES.put("omega", new Integer('\u03C9'));
        ENTITIES.put("thetasym",new Integer('\u03D1'));
        ENTITIES.put("upsih", new Integer('\u03D2'));
        ENTITIES.put("piv", new Integer('\u03D6'));

        /*
         * HTMLspecial
         */
        ENTITIES.put("ensp", new Integer('\u2002'));
        ENTITIES.put("emsp", new Integer('\u2003'));
        ENTITIES.put("thinsp", new Integer('\u2009'));
        ENTITIES.put("ndash", new Integer('\u2013'));
        ENTITIES.put("mdash", new Integer('\u2014'));
        ENTITIES.put("lsquo", new Integer('\u2018'));
        ENTITIES.put("rsquo", new Integer('\u2019'));
        ENTITIES.put("sbquo", new Integer('\u201A'));
        ENTITIES.put("ldquo", new Integer('\u201C'));
        ENTITIES.put("rdquo", new Integer('\u201D'));
        ENTITIES.put("bdquo", new Integer('\u201E'));
        ENTITIES.put("dagger", new Integer('\u2020'));
        ENTITIES.put("Dagger", new Integer('\u2021'));
        ENTITIES.put("bull", new Integer('\u2022'));
        ENTITIES.put("hellip", new Integer('\u2026'));
        ENTITIES.put("permil", new Integer('\u2030'));
        ENTITIES.put("prime", new Integer('\u2032'));
        ENTITIES.put("Prime", new Integer('\u2033'));
        ENTITIES.put("lsaquo", new Integer('\u2039'));
        ENTITIES.put("rsaquo", new Integer('\u203A'));

        /*
         * HTMLsymbol
         */
        ENTITIES.put("oline", new Integer('\u203E'));
        ENTITIES.put("frasl", new Integer('\u2044'));

        /*
         * HTMLspecial
         */
        ENTITIES.put("euro", new Integer('\u20AC'));

        /*
         * HTMLsymbol
         */
        ENTITIES.put("image", new Integer('\u2111'));
        ENTITIES.put("weierp", new Integer('\u2118'));
        ENTITIES.put("real", new Integer('\u211C'));
        ENTITIES.put("trade", new Integer('\u2122'));
        ENTITIES.put("alefsym", new Integer('\u2135'));
        ENTITIES.put("larr", new Integer('\u2190'));
        ENTITIES.put("uarr", new Integer('\u2191'));
        ENTITIES.put("rarr", new Integer('\u2192'));
        ENTITIES.put("darr", new Integer('\u2193'));
        ENTITIES.put("harr", new Integer('\u2194'));
        ENTITIES.put("crarr", new Integer('\u21B5'));
        ENTITIES.put("lArr", new Integer('\u21D0'));
        ENTITIES.put("uArr", new Integer('\u21D1'));
        ENTITIES.put("rArr", new Integer('\u21D2'));
        ENTITIES.put("dArr", new Integer('\u21D3'));
        ENTITIES.put("hArr", new Integer('\u21D4'));
        ENTITIES.put("forall", new Integer('\u2200'));
        ENTITIES.put("part", new Integer('\u2202'));
        ENTITIES.put("exist", new Integer('\u2203'));
        ENTITIES.put("empty", new Integer('\u2205'));
        ENTITIES.put("nabla", new Integer('\u2207'));
        ENTITIES.put("isin", new Integer('\u2208'));
        ENTITIES.put("notin", new Integer('\u2209'));
        ENTITIES.put("ni", new Integer('\u220B'));
        ENTITIES.put("prod", new Integer('\u220F'));
        ENTITIES.put("sum", new Integer('\u2211'));
        ENTITIES.put("minus", new Integer('\u2212'));
        ENTITIES.put("lowast", new Integer('\u2217'));
        ENTITIES.put("radic", new Integer('\u221A'));
        ENTITIES.put("prop", new Integer('\u221D'));
        ENTITIES.put("infin", new Integer('\u221E'));
        ENTITIES.put("ang", new Integer('\u2220'));
        ENTITIES.put("and", new Integer('\u2227'));
        ENTITIES.put("or", new Integer('\u2228'));
        ENTITIES.put("cap", new Integer('\u2229'));
        ENTITIES.put("cup", new Integer('\u222A'));
        ENTITIES.put("int", new Integer('\u222B'));
        ENTITIES.put("there4", new Integer('\u2234'));
        ENTITIES.put("sim", new Integer('\u223C'));
        ENTITIES.put("cong", new Integer('\u2245'));
        ENTITIES.put("asymp", new Integer('\u2248'));
        ENTITIES.put("ne", new Integer('\u2260'));
        ENTITIES.put("equiv", new Integer('\u2261'));
        ENTITIES.put("le", new Integer('\u2264'));
        ENTITIES.put("ge", new Integer('\u2265'));
        ENTITIES.put("sub", new Integer('\u2282'));
        ENTITIES.put("sup", new Integer('\u2283'));
        ENTITIES.put("nsub", new Integer('\u2284'));
        ENTITIES.put("sube", new Integer('\u2286'));
        ENTITIES.put("supe", new Integer('\u2287'));
        ENTITIES.put("oplus", new Integer('\u2295'));
        ENTITIES.put("otimes", new Integer('\u2297'));
        ENTITIES.put("perp", new Integer('\u22A5'));
        ENTITIES.put("sdot", new Integer('\u22C5'));
        ENTITIES.put("lceil", new Integer('\u2308'));
        ENTITIES.put("rceil", new Integer('\u2309'));
        ENTITIES.put("lfloor", new Integer('\u230A'));
        ENTITIES.put("rfloor", new Integer('\u230B'));
        ENTITIES.put("lang", new Integer('\u2329'));
        ENTITIES.put("rang", new Integer('\u232A'));
        ENTITIES.put("loz", new Integer('\u25CA'));
        ENTITIES.put("spades", new Integer('\u2660'));
        ENTITIES.put("clubs", new Integer('\u2663'));
        ENTITIES.put("hearts", new Integer('\u2665'));
        ENTITIES.put("diams", new Integer('\u2666'));
    }
}