/** * (The MIT License) * * Copyright (c) 2008 - 2012: * * * {Aaron Patterson}[http://tenderlovemaking.com] * * {Mike Dalessio}[http://mike.daless.io] * * {Charles Nutter}[http://blog.headius.com] * * {Sergio Arbeo}[http://www.serabe.com] * * {Patrick Mahoney}[http://polycrystal.org] * * {Yoko Harada}[http://yokolet.blogspot.com] * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * 'Software'), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ package nokogiri; import static nokogiri.internals.NokogiriHelpers.getNokogiriClass; import static nokogiri.internals.NokogiriHelpers.stringOrBlank; import java.io.IOException; import java.io.InputStream; import java.util.LinkedList; import java.util.List; import java.util.Stack; import nokogiri.internals.NokogiriEntityResolver; import nokogiri.internals.ParserContext; import nokogiri.internals.ParserContext.Options; import nokogiri.internals.ReaderNode; import nokogiri.internals.ReaderNode.ClosingNode; import nokogiri.internals.ReaderNode.ElementNode; import nokogiri.internals.ReaderNode.TextNode; import nokogiri.internals.UncloseableInputStream; import org.apache.xerces.impl.Constants; import org.apache.xerces.impl.xs.opti.DefaultXMLDocumentHandler; import org.apache.xerces.parsers.StandardParserConfiguration; import org.apache.xerces.util.EntityResolver2Wrapper; import org.apache.xerces.xni.Augmentations; import org.apache.xerces.xni.NamespaceContext; import org.apache.xerces.xni.QName; import org.apache.xerces.xni.XMLAttributes; import org.apache.xerces.xni.XMLLocator; import org.apache.xerces.xni.XMLResourceIdentifier; import org.apache.xerces.xni.XMLString; import org.apache.xerces.xni.XNIException; import org.apache.xerces.xni.parser.XMLErrorHandler; import org.apache.xerces.xni.parser.XMLInputSource; import org.apache.xerces.xni.parser.XMLParseException; import org.apache.xerces.xni.parser.XMLPullParserConfiguration; import org.jruby.Ruby; import org.jruby.RubyArray; import org.jruby.RubyBoolean; import org.jruby.RubyClass; import org.jruby.RubyFixnum; import org.jruby.RubyObject; import org.jruby.anno.JRubyClass; import org.jruby.anno.JRubyMethod; import org.jruby.exceptions.RaiseException; import org.jruby.runtime.Block; import org.jruby.runtime.ThreadContext; import org.jruby.runtime.builtin.IRubyObject; import org.jruby.util.IOInputStream; import org.xml.sax.InputSource; /** * Class for Nokogiri:XML::Reader * * @author sergio * @author Yoko Harada <yokolet@gmail.com> */ @JRubyClass(name="Nokogiri::XML::Reader") public class XmlReader extends RubyObject { private static final int XML_TEXTREADER_MODE_INITIAL = 0; private static final int XML_TEXTREADER_MODE_INTERACTIVE = 1; private static final int XML_TEXTREADER_MODE_ERROR = 2; private static final int XML_TEXTREADER_MODE_EOF = 3; private static final int XML_TEXTREADER_MODE_CLOSED = 4; private static final int XML_TEXTREADER_MODE_READING = 5; List<ReaderNode> nodeQueue; private int state; private int position = 0; private XMLPullParserConfiguration config; private boolean continueParsing = true; public XmlReader(Ruby runtime, RubyClass klazz) { super(runtime, klazz); } /** * Create and return a copy of this object. * * @return a clone of this object */ @Override public Object clone() throws CloneNotSupportedException { return super.clone(); } public void init(Ruby runtime) { nodeQueue = new LinkedList<ReaderNode>(); nodeQueue.add(new ReaderNode.EmptyNode(runtime)); } private void setInput(ThreadContext context, InputStream in, IRubyObject url, Options options){ this.setState(XML_TEXTREADER_MODE_READING); config = this.createReader(context.getRuntime(), options); InputSource inputSource = new InputSource(); ParserContext.setUrl(context, inputSource, url); XMLInputSource xmlInputSource = new XMLInputSource(inputSource.getPublicId(), inputSource.getSystemId(), null, in, null); try { config.setInputSource(xmlInputSource); } catch (IOException e) { throw context.getRuntime().newRuntimeError(e.getMessage()); } this.setState(XML_TEXTREADER_MODE_CLOSED); } private void setState(int state) { this.state = state; } @JRubyMethod public IRubyObject attribute(ThreadContext context, IRubyObject name) { return currentNode().getAttributeByName(name); } @JRubyMethod public IRubyObject attribute_at(ThreadContext context, IRubyObject index) { return currentNode().getAttributeByIndex(index); } @JRubyMethod public IRubyObject attribute_count(ThreadContext context) { return currentNode().getAttributeCount(); } @JRubyMethod public IRubyObject attribute_nodes(ThreadContext context) { return currentNode().getAttributesNodes(); } @JRubyMethod public IRubyObject attr_nodes(ThreadContext context) { return currentNode().getAttributesNodes(); } @JRubyMethod(name = "attributes?") public IRubyObject attributes_p(ThreadContext context) { return currentNode().hasAttributes(); } @JRubyMethod public IRubyObject base_uri(ThreadContext context) { return currentNode().getXmlBase(); } @JRubyMethod(name="default?") public IRubyObject default_p(ThreadContext context){ return currentNode().isDefault(); } @JRubyMethod public IRubyObject depth(ThreadContext context) { return currentNode().getDepth(); } @JRubyMethod(name = {"empty_element?", "self_closing?"}) public IRubyObject empty_element_p(ThreadContext context) { ReaderNode readerNode = currentNode(); ensureNodeClosed(context); if (readerNode == null) return context.getRuntime().getNil(); if (!(readerNode instanceof ElementNode)) context.getRuntime().getFalse(); return RubyBoolean.newBoolean(context.getRuntime(), !readerNode.hasChildren); } @JRubyMethod(meta = true, rest = true) public static IRubyObject from_io(ThreadContext context, IRubyObject cls, IRubyObject args[]) { // Only to pass the source test. Ruby runtime = context.getRuntime(); // Not nil allowed! if(args[0].isNil()) throw runtime.newArgumentError("io cannot be nil"); XmlReader reader = (XmlReader) NokogiriService.XML_READER_ALLOCATOR.allocate(runtime, getNokogiriClass(runtime, "Nokogiri::XML::Reader")); reader.init(runtime); reader.setInstanceVariable("@source", args[0]); reader.setInstanceVariable("@errors", runtime.newArray()); IRubyObject url = context.nil; if (args.length > 1) url = args[1]; if (args.length > 2) reader.setInstanceVariable("@encoding", args[2]); Options options; if (args.length > 3) { options = new ParserContext.Options((Long)args[3].toJava(Long.class)); } else { // use the default options RECOVER | NONET options = new ParserContext.Options(2048 | 1); } InputStream in = new UncloseableInputStream(new IOInputStream(args[0])); reader.setInput(context, in, url, options); return reader; } @JRubyMethod(meta = true, rest = true) public static IRubyObject from_memory(ThreadContext context, IRubyObject cls, IRubyObject args[]) { // args[0]: string, args[1]: url, args[2]: encoding, args[3]: options Ruby runtime = context.getRuntime(); // Not nil allowed! if(args[0].isNil()) throw runtime.newArgumentError("string cannot be nil"); XmlReader reader = (XmlReader) NokogiriService.XML_READER_ALLOCATOR.allocate(runtime, getNokogiriClass(runtime, "Nokogiri::XML::Reader")); reader.init(runtime); reader.setInstanceVariable("@source", args[0]); reader.setInstanceVariable("@errors", runtime.newArray()); IRubyObject url = context.nil; if (args.length > 1) url = args[1]; if (args.length > 2) reader.setInstanceVariable("@encoding", args[2]); Options options; if (args.length > 3) { options = new ParserContext.Options((Long)args[3].toJava(Long.class)); } else { // use the default options RECOVER | NONET options = new ParserContext.Options(2048 | 1); } IRubyObject stringIO = NokogiriService.getNokogiriClassCache(context.getRuntime()).get("StringIO").newInstance(context, args[0], Block.NULL_BLOCK); InputStream in = new UncloseableInputStream(new IOInputStream(stringIO)); reader.setInput(context, in, url, options); return reader; } @JRubyMethod public IRubyObject node_type(ThreadContext context) { IRubyObject node_type = currentNode().getNodeType(); return node_type == null ? RubyFixnum.zero(context.getRuntime()) : node_type; } @JRubyMethod public IRubyObject inner_xml(ThreadContext context) { ensureNodeClosed(context); return stringOrBlank(context.getRuntime(), getInnerXml(currentNode())); } private String getInnerXml(ReaderNode current) { if (current.depth < 0) return null; if (!current.hasChildren) return null; StringBuffer sb = new StringBuffer(); for (int i = current.startOffset + 1; i <= current.endOffset - 1; i++) { sb.append(nodeQueue.get(i).getString()); } return new String(sb); } @JRubyMethod public IRubyObject outer_xml(ThreadContext context) { ensureNodeClosed(context); return stringOrBlank(context.getRuntime(), getOuterXml()); } private String getOuterXml() { ReaderNode current = currentNode(); if (current.depth < 0) return null; if (current instanceof ClosingNode) { return "<" + current.name + "/>"; } StringBuffer sb = new StringBuffer(); for (int i = position; i <= current.endOffset; i++) { sb.append(nodeQueue.get(i).getString()); } return new String(sb); } @JRubyMethod public IRubyObject lang(ThreadContext context) { return currentNode().getLang(); } @JRubyMethod public IRubyObject local_name(ThreadContext context) { return currentNode().getLocalName(); } @JRubyMethod public IRubyObject name(ThreadContext context) { return currentNode().getName(); } @JRubyMethod public IRubyObject namespace_uri(ThreadContext context) { return currentNode().getUri(); } @JRubyMethod public IRubyObject namespaces(ThreadContext context) { return currentNode().getNamespaces(context); } @JRubyMethod public IRubyObject prefix(ThreadContext context) { return currentNode().getPrefix(); } private void readMoreData(ThreadContext context) { if (!continueParsing) { throw context.runtime.newRuntimeError("Cannot parse more data"); } try { continueParsing = config.parse(false); } catch (XNIException e) { Ruby ruby = context.runtime; XmlSyntaxError exception = (XmlSyntaxError) NokogiriService.XML_SYNTAXERROR_ALLOCATOR.allocate(ruby, getNokogiriClass(ruby, "Nokogiri::XML::SyntaxError")); throw new RaiseException(exception); } catch (IOException e) { throw context.getRuntime().newRuntimeError("Received IOException: " + e.getMessage()); } } private void ensureNodeClosed(ThreadContext context) { ReaderNode node = currentNode(); if (node instanceof TextNode) { return; } while (node.endOffset < 1) { readMoreData(context); } } @JRubyMethod public IRubyObject read(ThreadContext context) { position++; while (nodeQueue.size() <= position && continueParsing) { readMoreData(context); } if(currentNode() == null) { return context.nil; } else if(currentNode().isError()) { RubyArray errors = (RubyArray) this.getInstanceVariable("@errors"); errors.append(currentNode().toSyntaxError()); this.setInstanceVariable("@errors", errors); throw new RaiseException((XmlSyntaxError) currentNode().toSyntaxError()); } else { return this; } } private ReaderNode currentNode() { if (position >= nodeQueue.size()) return null; return nodeQueue.get(position); } @JRubyMethod public IRubyObject state(ThreadContext context) { return context.getRuntime().newFixnum(this.state); } @JRubyMethod public IRubyObject value(ThreadContext context) { return currentNode().getValue(); } @JRubyMethod(name = "value?") public IRubyObject value_p(ThreadContext context) { return currentNode().hasValue(); } @JRubyMethod public IRubyObject xml_version(ThreadContext context) { return currentNode().getXmlVersion(); } protected XMLPullParserConfiguration createReader(Ruby ruby, Options options) { StandardParserConfiguration config = new StandardParserConfiguration(); DocumentHandler handler = new DocumentHandler(ruby); // XMLReader reader = XMLReaderFactory.createXMLReader(); config.setDocumentHandler(handler); config.setDTDHandler(handler); config.setErrorHandler(handler); config.setEntityResolver(new EntityResolver2Wrapper(new NokogiriEntityResolver(ruby, null, options))); // config.setFeature("http://xml.org/sax/features/xmlns-uris", true); // config.setFeature("http://xml.org/sax/features/namespace-prefixes", true); config.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", options.dtdLoad || options.dtdValid); return config; } private class DocumentHandler extends DefaultXMLDocumentHandler implements XMLErrorHandler { Stack<String> langStack; int depth; Stack<String> xmlBaseStack; Stack<ReaderNode.ElementNode> elementStack; private final Ruby ruby; public DocumentHandler(Ruby ruby) { this.ruby = ruby; } @Override public void startGeneralEntity(String name, XMLResourceIdentifier identifier, String encoding, Augmentations augs) throws XNIException { Object entitySkipped; if (augs != null && (entitySkipped = augs.getItem(Constants.ENTITY_SKIPPED)) != null && ((Boolean) entitySkipped) == true) { nodeQueue.add(new ReaderNode.ExceptionNode(ruby, null)); } } @Override public void startDocument(XMLLocator locator, String encoding, NamespaceContext context, Augmentations augs) { depth = 0; langStack = new Stack<String>(); xmlBaseStack = new Stack<String>(); elementStack = new Stack<ReaderNode.ElementNode>(); } @Override public void endDocument(Augmentations augs) { langStack = null; xmlBaseStack = null; elementStack = null; } @Override public void startElement(QName element, XMLAttributes attrs, Augmentations augs) { commonElement(element, attrs, false); } @Override public void endElement(QName element, Augmentations augs) { String uri = element.uri; String localName = element.localpart; String qName = element.rawname; depth--; ElementNode startElementNode = elementStack.pop(); ReaderNode node = ReaderNode.createClosingNode(ruby, uri, localName, qName, depth, langStack, xmlBaseStack); startElementNode.endOffset = nodeQueue.size() - 1; if (startElementNode.endOffset != startElementNode.startOffset) { // this node isn't empty node.attributeList = startElementNode.attributeList; node.namespaces = startElementNode.namespaces; node.startOffset = startElementNode.startOffset; node.endOffset = ++startElementNode.endOffset; node.hasChildren = startElementNode.hasChildren = true; nodeQueue.add(node); } if (!langStack.isEmpty()) langStack.pop(); if (!xmlBaseStack.isEmpty()) xmlBaseStack.pop(); } @Override public void emptyElement(QName element, XMLAttributes attrs, Augmentations augs) { commonElement(element, attrs, true); } private void commonElement(QName element, XMLAttributes attrs, boolean isEmpty) { String qName = element.rawname; String uri = element.uri; String localName = element.localpart; ReaderNode readerNode = ReaderNode.createElementNode(ruby, uri, localName, qName, attrs, depth, langStack, xmlBaseStack); if (!elementStack.isEmpty()) { ElementNode parent = elementStack.peek(); parent.hasChildren = true; } nodeQueue.add(readerNode); readerNode.startOffset = nodeQueue.size() - 1; if (!isEmpty) { depth++; if (readerNode.lang != null) langStack.push(readerNode.lang); if (readerNode.xmlBase != null) xmlBaseStack.push(readerNode.xmlBase); elementStack.push((ReaderNode.ElementNode)readerNode); } else { readerNode.endOffset = readerNode.startOffset; readerNode.hasChildren = false; } } @Override public void characters(XMLString string, Augmentations augs) { ReaderNode.TextNode node = ReaderNode.createTextNode(ruby, string.toString(), depth, langStack, xmlBaseStack); nodeQueue.add(node); node.startOffset = node.endOffset = nodeQueue.size() - 1; } @Override public void error(String domain, String key, XMLParseException ex) { nodeQueue.add(new ReaderNode.ExceptionNode(ruby, ex)); throw ex; } @Override public void fatalError(String domain, String key, XMLParseException ex) { nodeQueue.add(new ReaderNode.ExceptionNode(ruby, ex)); throw ex; } @Override public void warning(String domain, String key, XMLParseException ex) { nodeQueue.add(new ReaderNode.ExceptionNode(ruby, ex)); throw ex; } }; }