/*******************************************************************************
 * Copyright (c) 2008 Scott Stanchfield.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *   Based on the ANTLR parser generator by Terence Parr, http://antlr.org
 *   Ric Klaren <klaren@cs.utwente.nl>
 *   Scott Stanchfield - Modifications for XML Parsing
 *******************************************************************************/
package com.javadude.antxr.scanner;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.SAXParser;

import org.xml.sax.Attributes;
import org.xml.sax.DTDHandler;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler;

import com.javadude.antxr.CommonToken;
import com.javadude.antxr.Token;
import com.javadude.antxr.TokenStream;
import com.javadude.antxr.TokenStreamException;

/**
 * An XML token stream. You can pass any SAX parser, with whatever configuration
 * you want for use as the scanner.
 *
 * <p>The SAX parse runs on its own daemon thread, started by the constructor.
 * The SAX callbacks turn tags and character data into tokens and place them in
 * a blocking queue; {@link #nextToken()} takes them off that queue. Any
 * exception thrown by the SAX parse is placed in the same queue and is
 * reported to the caller of {@link #nextToken()} as a
 * {@link TokenStreamException}.
 */
public class XMLTokenStream implements TokenStream {
    /** Matches a quoted tag token name: <code>"&lt;tag&gt;"</code> or <code>"&lt;namespace:tag&gt;"</code>. */
    private static final Pattern TAG_TOKEN_PATTERN = Pattern.compile("\"<((.*):)?(.*)>\"");

    /** Value of {@link #pcdataNum} when the grammar has no PCDATA token. */
    private static final int PCDATA_UNUSED = -99;

    /** Indexed by token id; true for ids that were registered as XML tags. */
    private boolean[] startTag;
    /** Namespace -> (tag name -> token id). Filled while reading the token names. */
    private final Map<String, Map<String, Integer>> namespaces = new HashMap<String, Map<String, Integer>>();
    /** Token name -> token id for every token name that is not an XML tag. */
    private final Map<String, Integer> tokens = new HashMap<String, Integer>();
    /** Holds tokens (or a Throwable from the SAX parse) until nextToken() asks for them. */
    private final BlockingQueue<Object> blockingQueue;
    /** Token id of PCDATA, or {@link #PCDATA_UNUSED} if the grammar does not use it. */
    private final int pcdataNum;
    /** Character data collected since the last tag; only touched by the SAX callbacks. */
    private final StringBuilder currentCharacters = new StringBuilder();
    /** Line recorded when the pending character data started, or -1 if none is pending. */
    private int currentCharactersLine = -1;
    /** Column recorded when the pending character data started, or -1 if none is pending. */
    private int currentCharactersColumn = -1;
    /** Token id of XML_END_TAG (stays 0 if the grammar does not define it). */
    private int endTagValue;
    /** Token id of OTHER_TAG, or -1 if the grammar does not define it. */
    private int otherTagValue = -1;

    /**
     * Create the xml token stream. This version does not gate the number of
     * tokens read by the SAX parser. <i>Note that this can cause the entire
     * XML to be read into memory!</i> If you have a small XML document to
     * parse, this is more efficient, but large XML documents can cause memory
     * problems. If you want to use a large XML file, call the other constructor
     * and pass it a maximumQueueSize and resumeQueueSize.
     * @param tokenNames An array of token names for your parser. You can get
     *                   this by passing YourParser._tokenNames, where YourParser
     *                   is an XML parser generated by ANTXR
     * @param namespaceMap A map of namespace/prefix mappings. You can get this
     *                     by passing YourParser.getNamespaceMap(), where
     *                     YourParser is an XML parser generated by ANTXR
     * @param in The XML InputSource containing the XML to parse
     * @param parser The SAX Parser that you want to use to scan (and possibly
     *               validate) your XML
     * @param entityResolver An XML Entity resolver for the SAX parse (if needed), or null
     * @param dtdHandler An XML DTD Handler for the SAX parse (if needed), or null
     */
    public XMLTokenStream(String[] tokenNames, Map<String, String> namespaceMap, InputSource in, SAXParser parser, EntityResolver entityResolver, DTDHandler dtdHandler) {
        this(tokenNames, namespaceMap, in, parser, entityResolver, dtdHandler, -1, -1);
    }

    /**
     * Create the xml token stream. This version gates the number of tokens
     * that the SAX parser may buffer ahead of the ANTXR parser, which keeps
     * large XML documents from being read into memory all at once.
     * @param tokenNames An array of token names for your parser. You can get
     *                   this by passing YourParser._tokenNames, where YourParser
     *                   is an XML parser generated by ANTXR
     * @param namespaceMap A map of namespace/prefix mappings. You can get this
     *                     by passing YourParser.getNamespaceMap(), where
     *                     YourParser is an XML parser generated by ANTXR.
     *                     May be null, in which case there is no default namespace.
     * @param in The XML InputSource containing the XML to parse
     * @param parser The SAX Parser that you want to use to scan (and possibly
     *               validate) your XML
     * @param entityResolver An XML Entity resolver for the SAX parse (if needed), or null
     * @param dtdHandler An XML DTD Handler for the SAX parse (if needed), or null
     * @param maximumQueueSize the maximum number of tokens you want to place
     *                         in the blocking queue ready for the ANTXR parser
     *                         to fetch. This will put the SAX parse on hold
     *                         until resumeQueueSize is reached.
     * @param resumeQueueSize The number of buffered tokens at which you will
     *                        resume the SAX parse
     */
    public XMLTokenStream(String[] tokenNames, Map<String, String> namespaceMap, InputSource in, SAXParser parser, EntityResolver entityResolver, DTDHandler dtdHandler, int maximumQueueSize, int resumeQueueSize) {
        readTokens(tokenNames, namespaceMap);

        // A grammar that never mentions PCDATA has no id for it; remember that
        // so the handler can skip collecting character data altogether.
        Integer tokenNum = tokens.get("PCDATA");
        pcdataNum = (tokenNum == null) ? PCDATA_UNUSED : tokenNum.intValue();

        // Everything the handler reads must be set up before the parse thread starts.
        blockingQueue = new BlockingQueue<Object>(maximumQueueSize, resumeQueueSize);
        parse(parser, in, entityResolver, dtdHandler);
    }

    /**
     * Set up the tokens to use when scanning. Token names that look like
     * quoted tags are registered as XML tags under their namespace; all other
     * names go into the plain token table.
     * @param tokenNames The names of the tokens in the grammar
     * @param namespaceMap A mapping that includes prefixes; only its
     *                     "$DEFAULT" entry is read here. May be null.
     */
    private void readTokens(String[] tokenNames, Map<String, String> namespaceMap) {
        startTag = new boolean[tokenNames.length];
        // Tags written without a namespace belong to the grammar's default namespace.
        String defaultNamespace = (namespaceMap == null) ? null : namespaceMap.get("$DEFAULT");

        for (int i = 0; i < tokenNames.length; i++) {
            String tokenName = tokenNames[i];
            Integer integerValue = Integer.valueOf(i);
            Matcher matcher = TAG_TOKEN_PATTERN.matcher(tokenName);
            if (matcher.matches()) {
                String namespace = matcher.group(2);
                String tag = matcher.group(3);
                if (namespace == null) {
                    namespace = defaultNamespace;
                }
                addTag(namespace, tag, integerValue);
            } else {
                tokens.put(tokenName, integerValue);
                if ("XML_END_TAG".equals(tokenName)) {
                    endTagValue = integerValue.intValue();
                }
                if ("OTHER_TAG".equals(tokenName)) {
                    otherTagValue = integerValue.intValue();
                }
            }
        }
    }

    // TODO if only one namespace, optimize further (no hashmap lookup)
    /**
     * Get the numerical token number for an XML tag. This is a pure lookup:
     * it does not add entries for namespaces it has not seen.
     * @param namespace The tag's namespace (null is treated as "")
     * @param tag The tag name
     * @return The tag's token id, or null if the tag is not known
     */
    private Integer getTokenValue(String namespace, String tag) {
        Map<String, Integer> tags = namespaces.get(namespace == null ? "" : namespace);
        return (tags == null) ? null : tags.get(tag);
    }

    /**
     * Add an XML tag to our mapping and mark its token id as a start tag.
     * @param namespace The namespace of the tag (null is treated as "")
     * @param tag The xml tag to store
     * @param integerValue The integer value of the tag
     */
    private void addTag(String namespace, String tag, Integer integerValue) {
        getTags(namespace).put(tag, integerValue);
        startTag[integerValue.intValue()] = true;
    }

    /**
     * State whether the given token is an XML start tag
     * @param token the token to check
     * @return true if the token's type was registered as an XML tag, false
     *         otherwise (including types outside the range of known token ids)
     */
    public boolean isStartTag(Token token) {
        int type = token.getType();
        return type >= 0 && type < startTag.length && startTag[type];
    }

    /**
     * Get all the tags defined in the given namespace, creating an empty
     * table for the namespace if there is none yet.
     * @param namespace The namespace to check (null is treated as "")
     * @return A map of tags to token ids
     */
    private Map<String, Integer> getTags(String namespace) {
        String key = (namespace == null) ? "" : namespace;
        Map<String, Integer> tags = namespaces.get(key);
        if (tags == null) {
            tags = new HashMap<String, Integer>();
            namespaces.put(key, tags);
        }
        return tags;
    }

    /**
     * Start parsing the XML on a separate daemon thread.
     * @param parser The SAX parser to use
     * @param in The XML to parse
     * @param entityResolver The user-defined entity resolver (or null)
     * @param dtdHandler The user-defined DTD handler (or null)
     */
    private void parse(final SAXParser parser, final InputSource in, EntityResolver entityResolver, DTDHandler dtdHandler) {
        final ANTXRXMLHandler handler = new ANTXRXMLHandler(entityResolver, dtdHandler);
        Thread saxParseThread = new Thread("saxParserCreatingXMLTokens") {
            @Override
            public void run() {
                try {
                    parser.parse(in, handler);
                } catch (Throwable t) {
                    // stuff any exceptions in the queue so nextToken() can report them
                    blockingQueue.enqueue(t);
                }
            }
        };
        saxParseThread.setDaemon(true);
        saxParseThread.start();
    }

    /**
     * Return the next token produced by the SAX parse.
     * @return the next token from the queue
     * @throws TokenStreamException if the SAX parse failed, or if taking the
     *         next entry off the queue failed; the message carries the stack
     *         trace of the underlying problem
     */
    public Token nextToken() throws TokenStreamException {
        Object next;
        try {
            next = blockingQueue.dequeue();
            if (!(next instanceof Throwable)) {
                return (Token) next;
            }
        } catch (Throwable e) {
            throw toTokenStreamException(e);
        }
        // The SAX thread queued a failure instead of a token.
        throw toTokenStreamException((Throwable) next);
    }

    /**
     * Wrap a failure in a TokenStreamException whose message contains the
     * failure's stack trace and, for SAX parse errors, the line and column.
     * @param e the failure to report
     * @return the exception to throw to the ANTXR parser
     */
    private static TokenStreamException toTokenStreamException(Throwable e) {
        StringWriter sw = new StringWriter();
        PrintWriter pw = new PrintWriter(sw);
        e.printStackTrace(pw);
        pw.close();

        String lineCol = "";
        if (e instanceof SAXParseException) {
            SAXParseException se = (SAXParseException) e;
            lineCol = " (line " + se.getLineNumber() + " col " + se.getColumnNumber() + ")";
        }
        return new TokenStreamException("Error during XML parse" + lineCol + ':' + sw);
    }

    /**
     * The SAX handler that glues the SAX parser to our blocking queue.
     * This class grabs notifications of tags from the SAX parser, creates
     * ANTXR tokens from them, and stuffs the tokens in the blocking queue.
     * The nextToken method returns tokens off the queue when asked.
     *
     * If the caller passes in a DTD and/or entity resolver, we delegate to
     * them when appropriate during the SAX parse.
     */
    class ANTXRXMLHandler extends DefaultHandler {
        /** Position source supplied by the parser; null if the parser never supplies one. */
        private Locator locator;
        private final EntityResolver entityResolver;
        private final DTDHandler dtdHandler;

        /**
         * Create the handler
         * @param entityResolver A user-defined entity resolver to delegate to
         * @param dtdHandler A user-defined dtd handler to delegate to
         */
        public ANTXRXMLHandler(EntityResolver entityResolver, DTDHandler dtdHandler) {
            this.entityResolver = entityResolver;
            this.dtdHandler = dtdHandler;
        }

        /** {@inheritDoc} */
        @Override
        public void setDocumentLocator(Locator locator) {
            this.locator = locator;
        }

        /**
         * @return the current line from the locator, or -1 when the parser
         *         did not supply a locator
         */
        private int currentLine() {
            return (locator == null) ? -1 : locator.getLineNumber();
        }

        /**
         * @return the current column from the locator, or -1 when the parser
         *         did not supply a locator
         */
        private int currentColumn() {
            return (locator == null) ? -1 : locator.getColumnNumber();
        }

        /** {@inheritDoc} */
        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            // collect all adjacent character chunks into a single PCDATA
            //   to return to the parser

            // if PCDATA isn't used in the parser, don't collect characters
            if (pcdataNum == PCDATA_UNUSED) {
                return;
            }

            // remember where the run of character data began
            if (currentCharactersLine == -1) {
                currentCharactersLine = currentLine();
                currentCharactersColumn = currentColumn();
            }
            currentCharacters.append(ch, start, length);
        }

        /**
         * Finish our PCDATA and send it to the parser. Character data that
         * is empty or consists only of whitespace is discarded.
         */
        protected void finishCharacters() {
            // if PCDATA isn't used in the parser, don't collect characters
            if (pcdataNum == PCDATA_UNUSED) {
                return;
            }

            int line = currentCharactersLine;
            int column = currentCharactersColumn;
            String characters = currentCharacters.toString();

            // reset the collector before queueing anything
            currentCharactersLine = -1;
            currentCharactersColumn = -1;
            currentCharacters.setLength(0);

            if ("".equals(characters.trim())) {
                return;
            }

            Token token = new CommonToken(pcdataNum, characters);
            token.setLine(line);
            token.setColumn(column);
            blockingQueue.enqueue(token);
        }

        /** {@inheritDoc} */
        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            finishCharacters(); // if we were working on a PCDATA, send it!

            // queue an XML_END_TAG token
            Token token = new CommonToken(endTagValue, "");
            token.setLine(currentLine());
            token.setColumn(currentColumn());
            blockingQueue.enqueue(token);
        }

        /** {@inheritDoc} */
        @Override
        public void endDocument() throws SAXException {
            finishCharacters(); // if we were working on a PCDATA, send it!

            // queue an EOF_TOKEN
            CommonToken eofToken = new CommonToken(Token.EOF_TYPE, "");
            eofToken.setLine(currentLine());
            eofToken.setColumn(currentColumn());
            blockingQueue.enqueue(eofToken);
        }

        /** {@inheritDoc} */
        @Override
        public void error(SAXParseException e) throws SAXException {
            finishCharacters(); // if we were working on a PCDATA, send it!
            throw e;
        }

        /** {@inheritDoc} */
        @Override
        public void fatalError(SAXParseException e) throws SAXException {
            finishCharacters(); // if we were working on a PCDATA, send it!
            throw e;
        }

        /** {@inheritDoc} */
        @Override
        public void warning(SAXParseException e) throws SAXException {
            finishCharacters(); // if we were working on a PCDATA, send it!
            // NOTE(review): warnings are rethrown just like errors, which ends
            // the parse - confirm this strictness is intended.
            throw e;
        }

        /** {@inheritDoc} */
        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
            finishCharacters(); // if we were working on a PCDATA, send it!

            // queue a start tag token for the tag; fall back to the qualified
            // name when the parser reports no local name
            if ("".equals(localName)) {
                localName = qName;
            }
            blockingQueue.enqueue(createXMLToken(uri, localName, attributes));
        }

        /**
         * Create an XML token.
         * @param uri The namespace of the tag
         * @param localName The local name of the tag
         * @param attributes The tag attributes
         * @return An XMLToken
         * @throws SAXException If the tag is not defined in the grammar and
         *         the grammar has no OTHER_TAG token to stand in for it
         */
        private XMLToken createXMLToken(String uri, String localName, Attributes attributes) throws SAXException {
            Integer id = getTokenValue(uri, localName);

            // token text is "namespace:tag", or just "tag" without a namespace
            String name = "";
            if (uri != null && !"".equals(uri.trim())) {
                name += uri + ":";
            }
            name += localName;

            int tokenValue;
            if (id != null) {
                tokenValue = id.intValue();
            } else if (otherTagValue != -1) {
                tokenValue = otherTagValue;
            } else {
                throw new SAXException("Tag '" + name + "' not defined in parser grammar");
            }

            List<Attribute> attributeList;
            if (attributes == null || attributes.getLength() == 0) {
                attributeList = Collections.emptyList();
            } else {
                int count = attributes.getLength();
                attributeList = new ArrayList<Attribute>(count);
                for (int i = 0; i < count; i++) {
                    String localAttributeName = attributes.getLocalName(i);
                    if ("".equals(localAttributeName)) {
                        localAttributeName = attributes.getQName(i);
                    }
                    String namespace = attributes.getURI(i);
                    String value = attributes.getValue(i);
                    String type = attributes.getType(i);
                    attributeList.add(new Attribute(namespace, localAttributeName, value, type));
                }
            }

            XMLToken token = new XMLToken(tokenValue, name, attributeList);
            token.setLine(currentLine());
            token.setColumn(currentColumn());
            return token;
        }

        /** {@inheritDoc} */
        @Override
        public void notationDecl(String name, String publicId, String systemId) throws SAXException {
            // If we have an explicit DTD handler, delegate to it
            if (dtdHandler != null) {
                dtdHandler.notationDecl(name, publicId, systemId);
            } else {
                super.notationDecl(name, publicId, systemId);
            }
        }

        /** {@inheritDoc} */
        @Override
        public void unparsedEntityDecl(String name, String publicId, String systemId, String notationName) throws SAXException {
            // If we have an explicit DTD handler, delegate to it
            if (dtdHandler != null) {
                dtdHandler.unparsedEntityDecl(name, publicId, systemId, notationName);
            } else {
                super.unparsedEntityDecl(name, publicId, systemId, notationName);
            }
        }

        /** {@inheritDoc} */
        @Override
        public InputSource resolveEntity(String publicId, String systemId) throws SAXException {
            // If we have an explicit entity resolver, delegate to it
            if (entityResolver != null) {
                try {
                    return entityResolver.resolveEntity(publicId, systemId);
                } catch (IOException e) {
                    throw new SAXException(e);
                }
            }

            try {
                return super.resolveEntity(publicId, systemId);
            } catch (SAXException e) {
                // already the declared type; do not bury it inside another SAXException
                throw e;
            } catch (Exception e) {
                throw new SAXException(e);
            }
        }
    }
}