/** * (The MIT License) * * Copyright (c) 2008 - 2012: * * * {Aaron Patterson}[http://tenderlovemaking.com] * * {Mike Dalessio}[http://mike.daless.io] * * {Charles Nutter}[http://blog.headius.com] * * {Sergio Arbeo}[http://www.serabe.com] * * {Patrick Mahoney}[http://polycrystal.org] * * {Yoko Harada}[http://yokolet.blogspot.com] * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * 'Software'), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ package nokogiri; import nokogiri.internals.HtmlDomParserContext; import org.jruby.Ruby; import org.jruby.RubyClass; import org.jruby.anno.JRubyClass; import org.jruby.anno.JRubyMethod; import org.jruby.javasupport.util.RuntimeHelpers; import org.jruby.runtime.Arity; import org.jruby.runtime.ThreadContext; import org.jruby.runtime.builtin.IRubyObject; import org.w3c.dom.Attr; import org.w3c.dom.Document; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; /** * Class for Nokogiri::HTML::Document. * * @author sergio * @author Yoko Harada <yokolet@gmail.com> */ @JRubyClass(name="Nokogiri::HTML::Document", parent="Nokogiri::XML::Document") public class HtmlDocument extends XmlDocument { private static final String DEFAULT_CONTENT_TYPE = "html"; private static final String DEFAULT_PUBLIC_ID = "-//W3C//DTD HTML 4.01//EN"; private static final String DEFAULT_SYTEM_ID = "http://www.w3.org/TR/html4/strict.dtd"; private String parsed_encoding = null; public HtmlDocument(Ruby ruby, RubyClass klazz) { super(ruby, klazz); } public HtmlDocument(Ruby ruby, RubyClass klazz, Document doc) { super(ruby, klazz, doc); } @JRubyMethod(name="new", meta = true, rest = true, required=0) public static IRubyObject rbNew(ThreadContext context, IRubyObject klazz, IRubyObject[] args) { HtmlDocument htmlDocument = null; try { Document docNode = createNewDocument(); htmlDocument = (HtmlDocument) NokogiriService.HTML_DOCUMENT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass) klazz); htmlDocument.setDocumentNode(context, docNode); } catch (Exception ex) { throw context.getRuntime().newRuntimeError("couldn't create document: "+ex.toString()); } RuntimeHelpers.invoke(context, htmlDocument, "initialize", args); return htmlDocument; } public IRubyObject getInternalSubset(ThreadContext context) { IRubyObject internalSubset = super.getInternalSubset(context); // html documents are expected to have a default internal subset // the default values are the same ones used when the following // feature is turned on // "http://cyberneko.org/html/features/insert-doctype" // the reason we don't turn it on, is because it overrides the document's // declared doctype declaration. if (internalSubset.isNil()) { internalSubset = XmlDtd.newEmpty(context.getRuntime(), getDocument(), context.getRuntime().newString(DEFAULT_CONTENT_TYPE), context.getRuntime().newString(DEFAULT_PUBLIC_ID), context.getRuntime().newString(DEFAULT_SYTEM_ID)); setInternalSubset(internalSubset); } return internalSubset; } public static IRubyObject do_parse(ThreadContext context, IRubyObject klass, IRubyObject[] args) { Ruby ruby = context.getRuntime(); Arity.checkArgumentCount(ruby, args, 4, 4); HtmlDomParserContext ctx = new HtmlDomParserContext(ruby, args[2], args[3]); ctx.setInputSource(context, args[0], args[1]); return ctx.parse(context, klass, args[1]); } public void setDocumentNode(ThreadContext context, Node node) { super.setNode(context, node); Ruby runtime = context.getRuntime(); if (node != null) { Document document = (Document)node; document.normalize(); stabilzeAttrValue(document.getDocumentElement()); } setInstanceVariable("@decorators", runtime.getNil()); } private void stabilzeAttrValue(Node node) { if (node == null) return; if (node.hasAttributes()) { NamedNodeMap nodeMap = node.getAttributes(); for (int i=0; i<nodeMap.getLength(); i++) { Node n = nodeMap.item(i); if (n instanceof Attr) { Attr attr = (Attr)n; String attrName = attr.getName(); // not sure, but need to get value always before document is referred. // or lose attribute value String attrValue = attr.getValue(); // don't delete this line } } } NodeList children = node.getChildNodes(); for (int i=0; i<children.getLength(); i++) { stabilzeAttrValue(children.item(i)); } } public void setParsedEncoding(String encoding) { parsed_encoding = encoding; } public String getPraedEncoding() { return parsed_encoding; } /* * call-seq: * read_io(io, url, encoding, options) * * Read the HTML document from +io+ with given +url+, +encoding+, * and +options+. See Nokogiri::HTML.parse */ @JRubyMethod(meta = true, rest = true) public static IRubyObject read_io(ThreadContext context, IRubyObject cls, IRubyObject[] args) { return do_parse(context, cls, args); } /* * call-seq: * read_memory(string, url, encoding, options) * * Read the HTML document contained in +string+ with given +url+, +encoding+, * and +options+. See Nokogiri::HTML.parse */ @JRubyMethod(meta = true, rest = true) public static IRubyObject read_memory(ThreadContext context, IRubyObject cls, IRubyObject[] args) { return do_parse(context, cls, args); } }