/** * (The MIT License) * * Copyright (c) 2008 - 2011: * * * {Aaron Patterson}[http://tenderlovemaking.com] * * {Mike Dalessio}[http://mike.daless.io] * * {Charles Nutter}[http://blog.headius.com] * * {Sergio Arbeo}[http://www.serabe.com] * * {Patrick Mahoney}[http://polycrystal.org] * * {Yoko Harada}[http://yokolet.blogspot.com] * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * 'Software'), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ package nokogiri; import static nokogiri.internals.NokogiriHelpers.isWhitespaceText; import static org.jruby.javasupport.util.RuntimeHelpers.invoke; import java.io.IOException; import java.io.InputStream; import nokogiri.internals.NokogiriHandler; import nokogiri.internals.ParserContext; import nokogiri.internals.XmlSaxParser; import org.apache.xerces.parsers.AbstractSAXParser; import org.jruby.Ruby; import org.jruby.RubyClass; import org.jruby.RubyFixnum; import org.jruby.RubyModule; import org.jruby.RubyObjectAdapter; import org.jruby.anno.JRubyClass; import org.jruby.anno.JRubyMethod; import org.jruby.exceptions.RaiseException; import org.jruby.javasupport.JavaEmbedUtils; import org.jruby.runtime.ThreadContext; import org.jruby.runtime.builtin.IRubyObject; import org.xml.sax.ContentHandler; import org.xml.sax.ErrorHandler; import org.xml.sax.SAXException; import org.xml.sax.SAXNotRecognizedException; import org.xml.sax.SAXNotSupportedException; import org.xml.sax.SAXParseException; /** * Base class for the SAX parsers. * * @author Patrick Mahoney <pat@polycrystal.org> * @author Yoko Harada <yokolet@gmail.com> */ @JRubyClass(name="Nokogiri::XML::SAX::ParserContext") public class XmlSaxParserContext extends ParserContext { protected static final String FEATURE_NAMESPACES = "http://xml.org/sax/features/namespaces"; protected static final String FEATURE_NAMESPACE_PREFIXES = "http://xml.org/sax/features/namespace-prefixes"; protected static final String FEATURE_LOAD_EXTERNAL_DTD = "http://apache.org/xml/features/nonvalidating/load-external-dtd"; protected static final String FEATURE_CONTINUE_AFTER_FATAL_ERROR = "http://apache.org/xml/features/continue-after-fatal-error"; protected AbstractSAXParser parser; protected NokogiriHandler handler = null; private IRubyObject replaceEntities; private IRubyObject recovery; public XmlSaxParserContext(final Ruby ruby, RubyClass rubyClass) { super(ruby, rubyClass); } protected void initialize(Ruby runtime) { replaceEntities = runtime.getTrue(); recovery = runtime.getFalse(); try { parser = createParser(); } catch (SAXException se) { throw RaiseException.createNativeRaiseException(runtime, se); } } /** * Create and return a copy of this object. * * @return a clone of this object */ @Override public Object clone() throws CloneNotSupportedException { return super.clone(); } protected AbstractSAXParser createParser() throws SAXException { XmlSaxParser parser = new XmlSaxParser(); parser.setFeature(FEATURE_NAMESPACE_PREFIXES, true); parser.setFeature(FEATURE_LOAD_EXTERNAL_DTD, false); return parser; } /** * Create a new parser context that will parse the string * <code>data</code>. */ @JRubyMethod(name="memory", meta=true) public static IRubyObject parse_memory(ThreadContext context, IRubyObject klazz, IRubyObject data) { XmlSaxParserContext ctx = (XmlSaxParserContext) NokogiriService.XML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass) klazz); ctx.initialize(context.getRuntime()); ctx.setInputSource(context, data, context.getRuntime().getNil()); return ctx; } /** * Create a new parser context that will read from the file * <code>data</code> and parse. */ @JRubyMethod(name="file", meta=true) public static IRubyObject parse_file(ThreadContext context, IRubyObject klazz, IRubyObject data) { XmlSaxParserContext ctx = (XmlSaxParserContext) NokogiriService.XML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass) klazz); ctx.initialize(context.getRuntime()); ctx.setInputSourceFile(context, data); return ctx; } /** * Create a new parser context that will read from the IO or * StringIO <code>data</code> and parse. * * TODO: Currently ignores encoding <code>enc</code>. */ @JRubyMethod(name="io", meta=true) public static IRubyObject parse_io(ThreadContext context, IRubyObject klazz, IRubyObject data, IRubyObject enc) { //int encoding = (int)enc.convertToInteger().getLongValue(); XmlSaxParserContext ctx = (XmlSaxParserContext) NokogiriService.XML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass) klazz); ctx.initialize(context.getRuntime()); ctx.setInputSource(context, data, context.getRuntime().getNil()); return ctx; } /** * Create a new parser context that will read from a raw input * stream. Not a JRuby method. Meant to be run in a separate * thread by XmlSaxPushParser. */ public static IRubyObject parse_stream(ThreadContext context, IRubyObject klazz, InputStream stream) { XmlSaxParserContext ctx = (XmlSaxParserContext) NokogiriService.XML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass) klazz); ctx.initialize(context.getRuntime()); ctx.setInputSource(stream); return ctx; } /** * Set a property of the underlying parser. */ protected void setProperty(String key, Object val) throws SAXNotRecognizedException, SAXNotSupportedException { parser.setProperty(key, val); } protected void setContentHandler(ContentHandler handler) { parser.setContentHandler(handler); } protected void setErrorHandler(ErrorHandler handler) { parser.setErrorHandler(handler); } public NokogiriHandler getNokogiriHandler() { return handler; } /** * Perform any initialization prior to parsing with the handler * <code>handlerRuby</code>. Convenience hook for subclasses. */ protected void preParse(ThreadContext context, IRubyObject handlerRuby, NokogiriHandler handler) { ((XmlSaxParser) parser).setXmlDeclHandler(handler); if(recovery.isTrue()) { try { ((XmlSaxParser) parser).setFeature(FEATURE_CONTINUE_AFTER_FATAL_ERROR, true); } catch(Exception e) { throw RaiseException.createNativeRaiseException(context.getRuntime(), e); } } } protected void postParse(ThreadContext context, IRubyObject handlerRuby, NokogiriHandler handler) { // noop } protected void do_parse() throws SAXException, IOException { parser.parse(getInputSource()); } @JRubyMethod public IRubyObject parse_with(ThreadContext context, IRubyObject handlerRuby) { Ruby ruby = context.getRuntime(); if(!invoke(context, handlerRuby, "respond_to?", ruby.newSymbol("document")).isTrue()) { String msg = "argument must respond_to document"; throw ruby.newArgumentError(msg); } handler = new NokogiriHandler(ruby, handlerRuby); preParse(context, handlerRuby, handler); setContentHandler(handler); setErrorHandler(handler); try{ setProperty("http://xml.org/sax/properties/lexical-handler", handler); } catch(Exception ex) { throw ruby.newRuntimeError( "Problem while creating XML SAX Parser: " + ex.toString()); } try{ try { do_parse(); } catch(SAXParseException spe) { // A bad document (<foo><bar></foo>) should call the // error handler instead of raising a SAX exception. // However, an EMPTY document should raise a // RuntimeError. This is a bit kludgy, but AFAIK SAX // doesn't distinguish between empty and bad whereas // Nokogiri does. String message = spe.getMessage(); if ("Premature end of file.".matches(message) && stringDataSize < 1) { throw ruby.newRuntimeError( "couldn't parse document: " + message); } else { handler.error(spe); } } } catch(SAXException se) { throw RaiseException.createNativeRaiseException(ruby, se); } catch(IOException ioe) { throw ruby.newIOErrorFromException(ioe); } postParse(context, handlerRuby, handler); //maybeTrimLeadingAndTrailingWhitespace(context, handlerRuby); return ruby.getNil(); } /** * Can take a boolean assignment. * * @param context * @param value * @return */ @JRubyMethod(name = "replace_entities=") public IRubyObject set_replace_entities(ThreadContext context, IRubyObject value) { if (!value.isTrue()) replaceEntities = context.getRuntime().getFalse(); else replaceEntities = context.getRuntime().getTrue(); return this; } @JRubyMethod(name="replace_entities") public IRubyObject get_replace_entities(ThreadContext context) { return replaceEntities; } /** * Can take a boolean assignment. * * @param context * @param value * @return */ @JRubyMethod(name = "recovery=") public IRubyObject set_recovery(ThreadContext context, IRubyObject value) { if (!value.isTrue()) recovery = context.getRuntime().getFalse(); else recovery = context.getRuntime().getTrue(); return this; } @JRubyMethod(name="recovery") public IRubyObject get_recovery(ThreadContext context) { return recovery; } /** * If the handler's document is a FragmentHandler, attempt to trim * leading and trailing whitespace. * * This is a bit hackish and depends heavily on the internals of * FragmentHandler. */ protected void maybeTrimLeadingAndTrailingWhitespace(ThreadContext context, IRubyObject parser) { final String path = "Nokogiri::XML::FragmentHandler"; RubyObjectAdapter adapter = JavaEmbedUtils.newObjectAdapter(); RubyModule mod = context.getRuntime().getClassFromPath(path); IRubyObject handler = adapter.getInstanceVariable(parser, "@document"); if (handler == null || handler.isNil() || !adapter.isKindOf(handler, mod)) return; IRubyObject stack = adapter.getInstanceVariable(handler, "@stack"); if (stack == null || stack.isNil()) return; // doc is finally a DocumentFragment whose nodes we can check IRubyObject doc = adapter.callMethod(stack, "first"); if (doc == null || doc.isNil()) return; IRubyObject children; for (;;) { children = adapter.callMethod(doc, "children"); IRubyObject first = adapter.callMethod(children, "first"); if (isWhitespaceText(context, first)) adapter.callMethod(first, "unlink"); else break; } for (;;) { children = adapter.callMethod(doc, "children"); IRubyObject last = adapter.callMethod(children, "last"); if (isWhitespaceText(context, last)) adapter.callMethod(last, "unlink"); else break; } // While we have a document, normalize it. ((XmlNode) doc).normalize(); } @JRubyMethod(name="column") public IRubyObject column(ThreadContext context) { Integer number = handler.getColumn(); if (number == null) return context.getRuntime().getNil(); else return RubyFixnum.newFixnum(context.getRuntime(), number.longValue()); } @JRubyMethod(name="line") public IRubyObject line(ThreadContext context) { Integer number = handler.getLine(); if (number == null) return context.getRuntime().getNil(); else return RubyFixnum.newFixnum(context.getRuntime(), number.longValue()); } }