// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan. // // TagSoup is licensed under the Apache License, // Version 2.0. You may obtain a copy of this license at // http://www.apache.org/licenses/LICENSE-2.0 . You may also have // additional legal rights not granted by this license. // // TagSoup is distributed in the hope that it will be useful, but // unless required by applicable law or agreed to in writing, TagSoup // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS // OF ANY KIND, either express or implied; not even the implied warranty // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. // // // The TagSoup parser package com.onegravity.rteditor.converter.tagsoup; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.io.UnsupportedEncodingException; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; import java.util.HashMap; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.DTDHandler; import org.xml.sax.EntityResolver; import org.xml.sax.ErrorHandler; import org.xml.sax.InputSource; import org.xml.sax.Locator; import org.xml.sax.SAXException; import org.xml.sax.SAXNotRecognizedException; import org.xml.sax.SAXNotSupportedException; import org.xml.sax.XMLReader; import org.xml.sax.ext.LexicalHandler; import org.xml.sax.helpers.DefaultHandler; import android.annotation.SuppressLint; /** * The SAX parser class. */ public class Parser extends DefaultHandler implements ScanHandler, XMLReader, LexicalHandler { // XMLReader implementation private ContentHandler theContentHandler = this; private LexicalHandler theLexicalHandler = this; private DTDHandler theDTDHandler = this; private ErrorHandler theErrorHandler = this; private EntityResolver theEntityResolver = this; private Schema theSchema; private Scanner theScanner; private AutoDetector theAutoDetector; // Default values for feature flags private static boolean DEFAULT_NAMESPACES = true; private static boolean DEFAULT_IGNORE_BOGONS = false; private static boolean DEFAULT_BOGONS_EMPTY = false; private static boolean DEFAULT_ROOT_BOGONS = true; private static boolean DEFAULT_DEFAULT_ATTRIBUTES = true; private static boolean DEFAULT_TRANSLATE_COLONS = false; private static boolean DEFAULT_RESTART_ELEMENTS = true; private static boolean DEFAULT_IGNORABLE_WHITESPACE = false; private static boolean DEFAULT_CDATA_ELEMENTS = true; // Feature flags. private boolean namespaces = DEFAULT_NAMESPACES; private boolean ignoreBogons = DEFAULT_IGNORE_BOGONS; private boolean bogonsEmpty = DEFAULT_BOGONS_EMPTY; private boolean rootBogons = DEFAULT_ROOT_BOGONS; private boolean defaultAttributes = DEFAULT_DEFAULT_ATTRIBUTES; private boolean translateColons = DEFAULT_TRANSLATE_COLONS; private boolean restartElements = DEFAULT_RESTART_ELEMENTS; private boolean ignorableWhitespace = DEFAULT_IGNORABLE_WHITESPACE; private boolean CDATAElements = DEFAULT_CDATA_ELEMENTS; /** * A value of "true" indicates namespace URIs and unprefixed local names for * element and attribute names will be available. */ public final static String namespacesFeature = "http://xml.org/sax/features/namespaces"; /** * A value of "true" indicates that XML qualified names (with prefixes) and * attributes (including xmlns* attributes) will be available. We don't * support this value. */ public final static String namespacePrefixesFeature = "http://xml.org/sax/features/namespace-prefixes"; /** * Reports whether this parser processes external general entities (it * doesn't). */ public final static String externalGeneralEntitiesFeature = "http://xml.org/sax/features/external-general-entities"; /** * Reports whether this parser processes external parameter entities (it * doesn't). */ public final static String externalParameterEntitiesFeature = "http://xml.org/sax/features/external-parameter-entities"; /** * May be examined only during a parse, after the startDocument() callback * has been completed; read-only. The value is true if the document * specified standalone="yes" in its XML declaration, and otherwise is * false. (It's always false.) */ public final static String isStandaloneFeature = "http://xml.org/sax/features/is-standalone"; /** * A value of "true" indicates that the LexicalHandler will report the * beginning and end of parameter entities (it won't). */ public final static String lexicalHandlerParameterEntitiesFeature = "http://xml.org/sax/features/lexical-handler/parameter-entities"; /** * A value of "true" indicates that system IDs in declarations will be * absolutized (relative to their base URIs) before reporting. (This returns * true but doesn't actually do anything.) */ public final static String resolveDTDURIsFeature = "http://xml.org/sax/features/resolve-dtd-uris"; /** * Has a value of "true" if all XML names (for elements, prefixes, * attributes, entities, notations, and local names), as well as Namespace * URIs, will have been interned using java.lang.String.intern. This * supports fast testing of equality/inequality against string constants, * rather than forcing slower calls to String.equals(). (We always intern.) */ public final static String stringInterningFeature = "http://xml.org/sax/features/string-interning"; /** * Returns "true" if the Attributes objects passed by this parser in * ContentHandler.startElement() implement the org.xml.sax.ext.Attributes2 * interface. (They don't.) */ public final static String useAttributes2Feature = "http://xml.org/sax/features/use-attributes2"; /** * Returns "true" if the Locator objects passed by this parser in * ContentHandler.setDocumentLocator() implement the * org.xml.sax.ext.Locator2 interface. (They don't.) */ public final static String useLocator2Feature = "http://xml.org/sax/features/use-locator2"; /** * Returns "true" if, when setEntityResolver is given an object implementing * the org.xml.sax.ext.EntityResolver2 interface, those new methods will be * used. (They won't be.) */ public final static String useEntityResolver2Feature = "http://xml.org/sax/features/use-entity-resolver2"; /** * Controls whether the parser is reporting all validity errors (We don't * report any validity errors.) */ public final static String validationFeature = "http://xml.org/sax/features/validation"; /** * Controls whether the parser reports Unicode normalization errors as * described in section 2.13 and Appendix B of the XML 1.1 Recommendation. * (We don't normalize.) */ public final static String unicodeNormalizationCheckingFeature = "http://xml.org/sax/features/unicode-normalization-checking"; /** * Controls whether, when the namespace-prefixes feature is set, the parser * treats namespace declaration attributes as being in the * http://www.w3.org/2000/xmlns/ namespace. (It doesn't.) */ public final static String xmlnsURIsFeature = "http://xml.org/sax/features/xmlns-uris"; /** * Returns "true" if the parser supports both XML 1.1 and XML 1.0. (Always * false.) */ public final static String XML11Feature = "http://xml.org/sax/features/xml-1.1"; /** * A value of "true" indicates that the parser will ignore unknown elements. */ public final static String ignoreBogonsFeature = "http://www.ccil.org/~cowan/tagsoup/features/ignore-bogons"; /** * A value of "true" indicates that the parser will give unknown elements a * content model of EMPTY; a value of "false", a content model of ANY. */ public final static String bogonsEmptyFeature = "http://www.ccil.org/~cowan/tagsoup/features/bogons-empty"; /** * A value of "true" indicates that the parser will allow unknown elements * to be the root element. */ public final static String rootBogonsFeature = "http://www.ccil.org/~cowan/tagsoup/features/root-bogons"; /** * A value of "true" indicates that the parser will return default attribute * values for missing attributes that have default values. */ public final static String defaultAttributesFeature = "http://www.ccil.org/~cowan/tagsoup/features/default-attributes"; /** * A value of "true" indicates that the parser will translate colons into * underscores in names. */ public final static String translateColonsFeature = "http://www.ccil.org/~cowan/tagsoup/features/translate-colons"; /** * A value of "true" indicates that the parser will attempt to restart the * restartable elements. */ public final static String restartElementsFeature = "http://www.ccil.org/~cowan/tagsoup/features/restart-elements"; /** * A value of "true" indicates that the parser will transmit whitespace in * element-only content via the SAX ignorableWhitespace callback. Normally * this is not done, because HTML is an SGML application and SGML suppresses * such whitespace. */ public final static String ignorableWhitespaceFeature = "http://www.ccil.org/~cowan/tagsoup/features/ignorable-whitespace"; /** * A value of "true" indicates that the parser will treat CDATA elements * specially. Normally true, since the input is by default HTML. */ public final static String CDATAElementsFeature = "http://www.ccil.org/~cowan/tagsoup/features/cdata-elements"; /** * Used to see some syntax events that are essential in some applications: * comments, CDATA delimiters, selected general entity inclusions, and the * start and end of the DTD (and declaration of document element name). The * Object must implement org.xml.sax.ext.LexicalHandler. */ public final static String lexicalHandlerProperty = "http://xml.org/sax/properties/lexical-handler"; /** * Specifies the Scanner object this Parser uses. */ public final static String scannerProperty = "http://www.ccil.org/~cowan/tagsoup/properties/scanner"; /** * Specifies the Schema object this Parser uses. */ public final static String schemaProperty = "http://www.ccil.org/~cowan/tagsoup/properties/schema"; /** * Specifies the AutoDetector (for encoding detection) this Parser uses. */ public final static String autoDetectorProperty = "http://www.ccil.org/~cowan/tagsoup/properties/auto-detector"; // Due to sucky Java order of initialization issues, these // entries are maintained separately from the initial values of // the corresponding instance variables, but care must be taken // to keep them in sync. private HashMap<String, Boolean> theFeatures = new HashMap<String, Boolean>(); { theFeatures.put(namespacesFeature, truthValue(DEFAULT_NAMESPACES)); theFeatures.put(namespacePrefixesFeature, Boolean.FALSE); theFeatures.put(externalGeneralEntitiesFeature, Boolean.FALSE); theFeatures.put(externalParameterEntitiesFeature, Boolean.FALSE); theFeatures.put(isStandaloneFeature, Boolean.FALSE); theFeatures.put(lexicalHandlerParameterEntitiesFeature, Boolean.FALSE); theFeatures.put(resolveDTDURIsFeature, Boolean.TRUE); theFeatures.put(stringInterningFeature, Boolean.TRUE); theFeatures.put(useAttributes2Feature, Boolean.FALSE); theFeatures.put(useLocator2Feature, Boolean.FALSE); theFeatures.put(useEntityResolver2Feature, Boolean.FALSE); theFeatures.put(validationFeature, Boolean.FALSE); theFeatures.put(xmlnsURIsFeature, Boolean.FALSE); theFeatures.put(xmlnsURIsFeature, Boolean.FALSE); theFeatures.put(XML11Feature, Boolean.FALSE); theFeatures.put(ignoreBogonsFeature, truthValue(DEFAULT_IGNORE_BOGONS)); theFeatures.put(bogonsEmptyFeature, truthValue(DEFAULT_BOGONS_EMPTY)); theFeatures.put(rootBogonsFeature, truthValue(DEFAULT_ROOT_BOGONS)); theFeatures.put(defaultAttributesFeature, truthValue(DEFAULT_DEFAULT_ATTRIBUTES)); theFeatures.put(translateColonsFeature, truthValue(DEFAULT_TRANSLATE_COLONS)); theFeatures.put(restartElementsFeature, truthValue(DEFAULT_RESTART_ELEMENTS)); theFeatures.put(ignorableWhitespaceFeature, truthValue(DEFAULT_IGNORABLE_WHITESPACE)); theFeatures.put(CDATAElementsFeature, truthValue(DEFAULT_CDATA_ELEMENTS)); } // Private clone of Boolean.valueOf that is guaranteed to return // Boolean.TRUE or Boolean.FALSE private static Boolean truthValue(boolean b) { return b ? Boolean.TRUE : Boolean.FALSE; } @Override public boolean getFeature(String name) throws SAXNotRecognizedException, SAXNotSupportedException { Boolean b = (Boolean) theFeatures.get(name); if (b == null) { throw new SAXNotRecognizedException("Unknown feature " + name); } return b.booleanValue(); } @Override public void setFeature(String name, boolean value) throws SAXNotRecognizedException, SAXNotSupportedException { Boolean b = (Boolean) theFeatures.get(name); if (b == null) { throw new SAXNotRecognizedException("Unknown feature " + name); } if (value) theFeatures.put(name, Boolean.TRUE); else theFeatures.put(name, Boolean.FALSE); if (name.equals(namespacesFeature)) namespaces = value; else if (name.equals(ignoreBogonsFeature)) ignoreBogons = value; else if (name.equals(bogonsEmptyFeature)) bogonsEmpty = value; else if (name.equals(rootBogonsFeature)) rootBogons = value; else if (name.equals(defaultAttributesFeature)) defaultAttributes = value; else if (name.equals(translateColonsFeature)) translateColons = value; else if (name.equals(restartElementsFeature)) restartElements = value; else if (name.equals(ignorableWhitespaceFeature)) ignorableWhitespace = value; else if (name.equals(CDATAElementsFeature)) CDATAElements = value; } @Override public Object getProperty(String name) throws SAXNotRecognizedException, SAXNotSupportedException { if (name.equals(lexicalHandlerProperty)) { return theLexicalHandler == this ? null : theLexicalHandler; } else if (name.equals(scannerProperty)) { return theScanner; } else if (name.equals(schemaProperty)) { return theSchema; } else if (name.equals(autoDetectorProperty)) { return theAutoDetector; } else { throw new SAXNotRecognizedException("Unknown property " + name); } } @Override public void setProperty(String name, Object value) throws SAXNotRecognizedException, SAXNotSupportedException { if (name.equals(lexicalHandlerProperty)) { if (value == null) { theLexicalHandler = this; } else if (value instanceof LexicalHandler) { theLexicalHandler = (LexicalHandler) value; } else { throw new SAXNotSupportedException("Your lexical handler is not a LexicalHandler"); } } else if (name.equals(scannerProperty)) { if (value instanceof Scanner) { theScanner = (Scanner) value; } else { throw new SAXNotSupportedException("Your scanner is not a Scanner"); } } else if (name.equals(schemaProperty)) { if (value instanceof Schema) { theSchema = (Schema) value; } else { throw new SAXNotSupportedException("Your schema is not a Schema"); } } else if (name.equals(autoDetectorProperty)) { if (value instanceof AutoDetector) { theAutoDetector = (AutoDetector) value; } else { throw new SAXNotSupportedException("Your auto-detector is not an AutoDetector"); } } else { throw new SAXNotRecognizedException("Unknown property " + name); } } @Override public void setEntityResolver(EntityResolver resolver) { theEntityResolver = (resolver == null) ? this : resolver; } @Override public EntityResolver getEntityResolver() { return (theEntityResolver == this) ? null : theEntityResolver; } @Override public void setDTDHandler(DTDHandler handler) { theDTDHandler = (handler == null) ? this : handler; } @Override public DTDHandler getDTDHandler() { return (theDTDHandler == this) ? null : theDTDHandler; } @Override public void setContentHandler(ContentHandler handler) { theContentHandler = (handler == null) ? this : handler; } @Override public ContentHandler getContentHandler() { return (theContentHandler == this) ? null : theContentHandler; } @Override public void setErrorHandler(ErrorHandler handler) { theErrorHandler = (handler == null) ? this : handler; } @Override public ErrorHandler getErrorHandler() { return (theErrorHandler == this) ? null : theErrorHandler; } @Override public void parse(String systemid) throws IOException, SAXException { parse(new InputSource(systemid)); } @Override public synchronized void parse(InputSource input) throws IOException, SAXException { setup(); Reader r = getReader(input); theContentHandler.startDocument(); theScanner.resetDocumentLocator(input.getPublicId(), input.getSystemId()); if (theScanner instanceof Locator) { theContentHandler.setDocumentLocator((Locator) theScanner); } if (!(theSchema.getURI().equals(""))) { theContentHandler.startPrefixMapping(theSchema.getPrefix(), theSchema.getURI()); } theScanner.scan(r, this); } // Sets up instance variables that haven't been set by setFeature private void setup() { if (theSchema == null) theSchema = new HTMLSchema(); if (theScanner == null) theScanner = new HTMLScanner(); if (theAutoDetector == null) { theAutoDetector = new AutoDetector() { public Reader autoDetectingReader(InputStream i) { return new InputStreamReader(i); } }; } theStack = new Element(theSchema.getElementType("<root>"), defaultAttributes); thePCDATA = new Element(theSchema.getElementType("<pcdata>"), defaultAttributes); theNewElement = null; theAttributeName = null; thePITarget = null; theSaved = null; theEntity = 0; virginStack = true; theDoctypeName = theDoctypePublicId = theDoctypeSystemId = null; } // Return a Reader based on the contents of an InputSource // Buffer both the InputStream and the Reader private Reader getReader(InputSource s) throws SAXException, IOException { Reader r = s.getCharacterStream(); InputStream i = s.getByteStream(); String encoding = s.getEncoding(); String publicid = s.getPublicId(); String systemid = s.getSystemId(); if (r == null) { if (i == null) i = getInputStream(publicid, systemid); // i = new BufferedInputStream(i); if (encoding == null) { r = theAutoDetector.autoDetectingReader(i); } else { try { r = new InputStreamReader(i, encoding); } catch (UnsupportedEncodingException e) { r = new InputStreamReader(i); } } } // r = new BufferedReader(r); return r; } // Get an InputStream based on a publicid and a systemid private InputStream getInputStream(String publicid, String systemid) throws IOException, SAXException { URL basis = new URL("file", "", System.getProperty("user.dir") + "/."); URL url = new URL(basis, systemid); URLConnection c = url.openConnection(); return c.getInputStream(); } // We don't process publicids (who uses them anyhow?) // ScanHandler implementation private Element theNewElement = null; private String theAttributeName = null; private boolean theDoctypeIsPresent = false; private String theDoctypePublicId = null; private String theDoctypeSystemId = null; private String theDoctypeName = null; private String thePITarget = null; private Element theStack = null; private Element theSaved = null; private Element thePCDATA = null; private int theEntity = 0; // needs to support chars past U+FFFF @Override public void adup(char[] buff, int offset, int length) throws SAXException { if (theNewElement != null && theAttributeName != null) { theNewElement.setAttribute(theAttributeName, null, theAttributeName); theAttributeName = null; } } @SuppressLint("DefaultLocale") @Override public void aname(char[] buff, int offset, int length) throws SAXException { if (theNewElement != null) { // currently we don't rely on Schema to canonicalize attribute names. theAttributeName = makeName(buff, offset, length).toLowerCase(); } } @Override public void aval(char[] buff, int offset, int length) throws SAXException { if (theNewElement != null && theAttributeName != null) { String value = new String(buff, offset, length); value = expandEntities(value); theNewElement.setAttribute(theAttributeName, null, value); theAttributeName = null; } } // Expand entity references in attribute values selectively. // Currently we expand a reference iff it is properly terminated // with a semicolon. private String expandEntities(String src) { int refStart = -1; int len = src.length(); char[] dst = new char[len]; int dstlen = 0; for (int i = 0; i < len; i++) { char ch = src.charAt(i); dst[dstlen++] = ch; if (ch == '&' && refStart == -1) { // start of a ref excluding & refStart = dstlen; } else if (refStart == -1) { // not in a ref } else if (Character.isLetter(ch) || Character.isDigit(ch) || ch == '#') { // valid entity char } else if (ch == ';') { // properly terminated ref int ent = lookupEntity(dst, refStart, dstlen - refStart - 1); if (ent > 0xFFFF) { ent -= 0x10000; dst[refStart - 1] = (char) ((ent >> 10) + 0xD800); dst[refStart] = (char) ((ent & 0x3FF) + 0xDC00); dstlen = refStart + 1; } else if (ent != 0) { dst[refStart - 1] = (char) ent; dstlen = refStart; } refStart = -1; } else { // improperly terminated ref refStart = -1; } } return new String(dst, 0, dstlen); } @Override public void entity(char[] buff, int offset, int length) throws SAXException { theEntity = lookupEntity(buff, offset, length); } // Process numeric character references, // deferring to the schema for named ones. private int lookupEntity(char[] buff, int offset, int length) { int result = 0; if (length < 1) return result; // length) + "]"); if (buff[offset] == '#') { if (length > 1 && (buff[offset + 1] == 'x' || buff[offset + 1] == 'X')) { try { return Integer.parseInt(new String(buff, offset + 2, length - 2), 16); } catch (NumberFormatException e) { return 0; } } try { return Integer.parseInt( new String(buff, offset + 1, length - 1), 10); } catch (NumberFormatException e) { return 0; } } return theSchema.getEntity(new String(buff, offset, length)); } @Override public void eof(char[] buff, int offset, int length) throws SAXException { if (virginStack) rectify(thePCDATA); while (theStack.next() != null) { pop(); } if (!(theSchema.getURI().equals(""))) theContentHandler.endPrefixMapping(theSchema.getPrefix()); theContentHandler.endDocument(); } @Override public void etag(char[] buff, int offset, int length) throws SAXException { if (etag_cdata(buff, offset, length)) return; etag_basic(buff, offset, length); } private static char[] etagchars = {'<', '/', '>'}; private boolean etag_cdata(char[] buff, int offset, int length) throws SAXException { String currentName = theStack.name(); // If this is a CDATA element and the tag doesn't match, // or isn't properly formed (junk after the name), // restart CDATA mode and process the tag as characters. if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) { boolean realTag = (length == currentName.length()); if (realTag) { for (int i = 0; i < length; i++) { if (Character.toLowerCase(buff[offset + i]) != Character .toLowerCase(currentName.charAt(i))) { realTag = false; break; } } } if (!realTag) { theContentHandler.characters(etagchars, 0, 2); theContentHandler.characters(buff, offset, length); theContentHandler.characters(etagchars, 2, 1); theScanner.startCDATA(); return true; } } return false; } private void etag_basic(char[] buff, int offset, int length) throws SAXException { theNewElement = null; String name; if (length != 0) { // Canonicalize case of name name = makeName(buff, offset, length); ElementType type = theSchema.getElementType(name); if (type == null) return; // mysterious end-tag name = type.name(); } else { name = theStack.name(); } Element sp; boolean inNoforce = false; for (sp = theStack; sp != null; sp = sp.next()) { if (sp.name().equals(name)) break; if ((sp.flags() & Schema.F_NOFORCE) != 0) inNoforce = true; } if (sp == null) return; // Ignore unknown etags if (sp.next() == null || sp.next().next() == null) return; if (inNoforce) { // inside an F_NOFORCE element? sp.preclose(); // preclose the matching element } else { // restartably pop everything above us while (theStack != sp) { restartablyPop(); } pop(); } // pop any preclosed elements now at the top while (theStack.isPreclosed()) { pop(); } restart(null); } // Push restartables on the stack if possible // e is the next element to be started, if we know what it is private void restart(Element e) throws SAXException { while (theSaved != null && theStack.canContain(theSaved) && (e == null || theSaved.canContain(e))) { Element next = theSaved.next(); push(theSaved); theSaved = next; } } // Pop the stack irrevocably private void pop() throws SAXException { if (theStack == null) return; // empty stack String name = theStack.name(); String localName = theStack.localName(); String namespace = theStack.namespace(); String prefix = prefixOf(name); if (!namespaces) namespace = localName = ""; theContentHandler.endElement(namespace, localName, name); if (foreign(prefix, namespace)) { theContentHandler.endPrefixMapping(prefix); // "] for elements to " + namespace); } Attributes atts = theStack.atts(); for (int i = atts.getLength() - 1; i >= 0; i--) { String attNamespace = atts.getURI(i); String attPrefix = prefixOf(atts.getQName(i)); if (foreign(attPrefix, attNamespace)) { theContentHandler.endPrefixMapping(attPrefix); // "] for attributes to " + attNamespace); } } theStack = theStack.next(); } // Pop the stack restartably private void restartablyPop() throws SAXException { Element popped = theStack; pop(); if (restartElements && (popped.flags() & Schema.F_RESTART) != 0) { popped.anonymize(); popped.setNext(theSaved); theSaved = popped; } } // Push element onto stack private boolean virginStack = true; private void push(Element e) throws SAXException { String name = e.name(); String localName = e.localName(); String namespace = e.namespace(); String prefix = prefixOf(name); e.clean(); if (!namespaces) namespace = localName = ""; if (virginStack && localName.equalsIgnoreCase(theDoctypeName)) { try { theEntityResolver.resolveEntity(theDoctypePublicId, theDoctypeSystemId); } catch (IOException ew) { } // Can't be thrown for root I believe. } if (foreign(prefix, namespace)) { theContentHandler.startPrefixMapping(prefix, namespace); // + namespace); } Attributes atts = e.atts(); int len = atts.getLength(); for (int i = 0; i < len; i++) { String attNamespace = atts.getURI(i); String attPrefix = prefixOf(atts.getQName(i)); if (foreign(attPrefix, attNamespace)) { theContentHandler.startPrefixMapping(attPrefix, attNamespace); // "] for attributes to " + attNamespace); } } theContentHandler.startElement(namespace, localName, name, e.atts()); e.setNext(theStack); theStack = e; virginStack = false; if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) { theScanner.startCDATA(); } } // Get the prefix from a QName private String prefixOf(String name) { int i = name.indexOf(':'); String prefix = ""; if (i != -1) prefix = name.substring(0, i); return prefix; } // Return true if we have a foreign name private boolean foreign(String prefix, String namespace) { // " for foreignness -- "); boolean foreign = !(prefix.equals("") || namespace.equals("") || namespace.equals(theSchema.getURI())); return foreign; } /** * Parsing the complete XML Document Type Definition is way too complex, but * for many simple cases we can extract something useful from it. * <p> * doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' * S?)? '>' DeclSep ::= PEReference | S intSubset ::= (markupdecl | * DeclSep)* markupdecl ::= elementdecl | AttlistDecl | EntityDecl | * NotationDecl | PI | Comment ExternalID ::= 'SYSTEM' S SystemLiteral | * 'PUBLIC' S PubidLiteral S SystemLiteral */ @Override public void decl(char[] buff, int offset, int length) throws SAXException { String s = new String(buff, offset, length); String name = null; String systemid = null; String publicid = null; String[] v = split(s); if (v.length > 0 && "DOCTYPE".equalsIgnoreCase(v[0])) { if (theDoctypeIsPresent) return; // one doctype only! theDoctypeIsPresent = true; if (v.length > 1) { name = v[1]; if (v.length > 3 && "SYSTEM".equals(v[2])) { systemid = v[3]; } else if (v.length > 3 && "PUBLIC".equals(v[2])) { publicid = v[3]; if (v.length > 4) { systemid = v[4]; } else { systemid = ""; } } } } publicid = trimquotes(publicid); systemid = trimquotes(systemid); if (name != null) { publicid = cleanPublicid(publicid); theLexicalHandler.startDTD(name, publicid, systemid); theLexicalHandler.endDTD(); theDoctypeName = name; theDoctypePublicId = publicid; if (theScanner instanceof Locator) { // Must resolve systemid theDoctypeSystemId = ((Locator) theScanner).getSystemId(); try { theDoctypeSystemId = new URL(new URL(theDoctypeSystemId), systemid).toString(); } catch (Exception ignore) { } } } } // If the String is quoted, trim the quotes. private static String trimquotes(String in) { if (in == null) return in; int length = in.length(); if (length == 0) return in; char s = in.charAt(0); char e = in.charAt(length - 1); if (s == e && (s == '\'' || s == '"')) { in = in.substring(1, in.length() - 1); } return in; } // Split the supplied String into words or phrases seperated by spaces. // Recognises quotes around a phrase and doesn't split it. private static String[] split(String val) throws IllegalArgumentException { val = val.trim(); if (val.length() == 0) { return new String[0]; } else { ArrayList<String> l = new ArrayList<String>(); int s = 0; int e = 0; boolean sq = false; // single quote boolean dq = false; // double quote char lastc = 0; int len = val.length(); for (e = 0; e < len; e++) { char c = val.charAt(e); if (!dq && c == '\'' && lastc != '\\') { sq = !sq; if (s < 0) s = e; } else if (!sq && c == '\"' && lastc != '\\') { dq = !dq; if (s < 0) s = e; } else if (!sq && !dq) { if (Character.isWhitespace(c)) { if (s >= 0) l.add(val.substring(s, e)); s = -1; } else if (s < 0 && c != ' ') { s = e; } } lastc = c; } l.add(val.substring(s, e)); return (String[]) l.toArray(new String[0]); } } // Replace junk in publicids with spaces private static String legal = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-'()+,./:=?;!*#@$_%"; private String cleanPublicid(String src) { if (src == null) return null; int len = src.length(); StringBuffer dst = new StringBuffer(len); boolean suppressSpace = true; for (int i = 0; i < len; i++) { char ch = src.charAt(i); if (legal.indexOf(ch) != -1) { // legal but not whitespace dst.append(ch); suppressSpace = false; } else if (suppressSpace) { // normalizable whitespace or junk ; } else { dst.append(' '); suppressSpace = true; } } return dst.toString().trim(); // trim any final junk whitespace } @Override public void gi(char[] buff, int offset, int length) throws SAXException { if (theNewElement != null) return; String name = makeName(buff, offset, length); if (name == null) return; ElementType type = theSchema.getElementType(name); if (type == null) { // Suppress unknown elements if ignore-bogons is on if (ignoreBogons) return; int bogonModel = bogonsEmpty ? Schema.M_EMPTY : Schema.M_ANY; int bogonMemberOf = rootBogons ? Schema.M_ANY : (Schema.M_ANY & ~Schema.M_ROOT); theSchema.elementType(name, bogonModel, bogonMemberOf, 0); if (!rootBogons) theSchema.parent(name, theSchema.rootElementType().name()); type = theSchema.getElementType(name); } theNewElement = new Element(type, defaultAttributes); } @Override public void cdsect(char[] buff, int offset, int length) throws SAXException { theLexicalHandler.startCDATA(); pcdata(buff, offset, length); theLexicalHandler.endCDATA(); } @Override public void pcdata(char[] buff, int offset, int length) throws SAXException { if (length == 0) return; boolean allWhite = true; for (int i = 0; i < length; i++) { if (!Character.isWhitespace(buff[offset + i])) { allWhite = false; } } if (allWhite && !theStack.canContain(thePCDATA)) { if (ignorableWhitespace) { theContentHandler.ignorableWhitespace(buff, offset, length); } } else { rectify(thePCDATA); theContentHandler.characters(buff, offset, length); } } @Override public void pitarget(char[] buff, int offset, int length) throws SAXException { if (theNewElement != null) return; thePITarget = makeName(buff, offset, length).replace(':', '_'); } @Override public void pi(char[] buff, int offset, int length) throws SAXException { if (theNewElement != null || thePITarget == null) return; if ("xml".equalsIgnoreCase(thePITarget)) return; // if (length > 0 && buff[length - 1] == '?') if (length > 0 && buff[length - 1] == '?') length--; // remove trailing ? theContentHandler.processingInstruction(thePITarget, new String(buff, offset, length)); thePITarget = null; } @Override public void stagc(char[] buff, int offset, int length) throws SAXException { if (theNewElement == null) return; rectify(theNewElement); if (theStack.model() == Schema.M_EMPTY) { // Force an immediate end tag etag_basic(buff, offset, length); } } @Override public void stage(char[] buff, int offset, int length) throws SAXException { if (theNewElement == null) return; rectify(theNewElement); // Force an immediate end tag etag_basic(buff, offset, length); } @Override public void cmnt(char[] buff, int offset, int length) throws SAXException { theLexicalHandler.comment(buff, offset, length); } // Rectify the stack, pushing and popping as needed // so that the argument can be safely pushed private void rectify(Element e) throws SAXException { Element sp; while (true) { for (sp = theStack; sp != null; sp = sp.next()) { if (sp.canContain(e)) break; } if (sp != null) break; ElementType parentType = e.parent(); if (parentType == null) break; Element parent = new Element(parentType, defaultAttributes); // parent.name()); parent.setNext(e); e = parent; } if (sp == null) return; // don't know what to do while (theStack != sp) { if (theStack == null || theStack.next() == null || theStack.next().next() == null) break; restartablyPop(); } while (e != null) { Element nexte = e.next(); if (!e.name().equals("<pcdata>")) push(e); e = nexte; restart(e); } theNewElement = null; } @Override public int getEntity() { return theEntity; } // Return the argument as a valid XML name // This no longer lowercases the result: we depend on Schema to // canonicalize case. private String makeName(char[] buff, int offset, int length) { StringBuffer dst = new StringBuffer(length + 2); boolean seenColon = false; boolean start = true; // String src = new String(buff, offset, length); // DEBUG for (; length-- > 0; offset++) { char ch = buff[offset]; if (Character.isLetter(ch) || ch == '_') { start = false; dst.append(ch); } else if (Character.isDigit(ch) || ch == '-' || ch == '.') { if (start) dst.append('_'); start = false; dst.append(ch); } else if (ch == ':' && !seenColon) { seenColon = true; if (start) dst.append('_'); start = true; dst.append(translateColons ? '_' : ch); } } int dstLength = dst.length(); if (dstLength == 0 || dst.charAt(dstLength - 1) == ':') dst.append('_'); return dst.toString().intern(); } // Default LexicalHandler implementation @Override public void comment(char[] ch, int start, int length) throws SAXException { } @Override public void endCDATA() throws SAXException { } @Override public void endDTD() throws SAXException { } @Override public void endEntity(String name) throws SAXException { } @Override public void startCDATA() throws SAXException { } @Override public void startDTD(String name, String publicid, String systemid) throws SAXException { } @Override public void startEntity(String name) throws SAXException { } }