/* * GNU LESSER GENERAL PUBLIC LICENSE Copyright (C) 2006 The Lobo Project * * This library is free software; you can redistribute it and/or modify it under * the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License * along with this library; if not, write to the Free Software Foundation, Inc., * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * * Contact info: lobochief@users.sourceforge.net */ /* * Created on Oct 15, 2005 */ package com.nvarghese.beowulf.common.cobra.html.parser; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.util.logging.Logger; import javax.xml.parsers.DocumentBuilder; import org.w3c.dom.DOMImplementation; import org.w3c.dom.Document; import org.xml.sax.EntityResolver; import org.xml.sax.ErrorHandler; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import com.nvarghese.beowulf.common.cobra.html.HtmlRendererContext; import com.nvarghese.beowulf.common.cobra.html.UserAgentContext; import com.nvarghese.beowulf.common.cobra.html.domimpl.DOMImplementationImpl; import com.nvarghese.beowulf.common.cobra.html.domimpl.HTMLDocumentImpl; import com.nvarghese.beowulf.common.cobra.html.io.WritableLineReader; /** * The <code>DocumentBuilderImpl</code> class is an HTML DOM parser that * implements the standard W3C <code>DocumentBuilder</code> interface. * * @author J. H. S. */ public class DocumentBuilderImpl extends DocumentBuilder { private static final Logger logger = Logger.getLogger(DocumentBuilderImpl.class.getName()); private EntityResolver resolver; private ErrorHandler errorHandler; private final UserAgentContext bcontext; private final HtmlRendererContext rcontext; /** * Constructs a <code>DocumentBuilderImpl</code>. This constructor should be * used when only the parsing functionality (without rendering) is required. * * @param context * An instance of * {@link com.nvarghese.beowulf.common.cobra.html.UserAgentContext} * , which may be an instance of * {@link com.nvarghese.beowulf.common.cobra.com.nvarghese.beowulf.common.cobra.html.test.SimpleUserAgentContext} * . */ public DocumentBuilderImpl(UserAgentContext context) { this.rcontext = null; this.bcontext = context; } /** * Constructs a <code>DocumentBuilderImpl</code>. This constructor should be * used when rendering is expected. * * @param ucontext * An instance of * {@link com.nvarghese.beowulf.common.cobra.html.UserAgentContext} * , which may be an instance of * {@link com.nvarghese.beowulf.common.cobra.com.nvarghese.beowulf.common.cobra.html.test.SimpleUserAgentContext} * . * @param rcontext * An instance of * {@link com.nvarghese.beowulf.common.cobra.html.HtmlRendererContext} * , which may be an instance of * {@link com.nvarghese.beowulf.common.cobra.com.nvarghese.beowulf.common.cobra.html.test.SimpleHtmlRendererContext} * . */ public DocumentBuilderImpl(UserAgentContext ucontext, HtmlRendererContext rcontext) { this.rcontext = rcontext; this.bcontext = ucontext; } /** * Constructs a <code>DocumentBuilderImpl</code>. This constructor should be * used when rendering is expected. * * @param rcontext * An instance of * {@link com.nvarghese.beowulf.common.cobra.html.HtmlRendererContext} * , which may be an instance of * {@link com.nvarghese.beowulf.common.cobra.com.nvarghese.beowulf.common.cobra.html.test.SimpleHtmlRendererContext} * . */ public DocumentBuilderImpl(HtmlRendererContext rcontext) { this.rcontext = rcontext; this.bcontext = rcontext.getUserAgentContext(); } /** * Parses an HTML document. Note that this method will read the entire input * source before returning a <code>Document</code> instance. * * @param is * The input source, which may be an instance of * {@link com.nvarghese.beowulf.common.cobra.com.nvarghese.beowulf.common.cobra.html.parser.InputSourceImpl} * . * @see #createDocument(InputSource) */ public Document parse(InputSource is) throws org.xml.sax.SAXException, IOException { HTMLDocumentImpl document = (HTMLDocumentImpl) this.createDocument(is); document.load(); return document; } /** * Creates a document without parsing the input provided, so the document * object can be used for incremental rendering. * * @param is * The input source, which may be an instance of * {@link com.nvarghese.beowulf.common.cobra.com.nvarghese.beowulf.common.cobra.html.parser.InputSourceImpl} * . The input source must provide either an input stream or a * reader. * @see HTMLDocumentImpl#load() */ public Document createDocument(InputSource is) throws SAXException, IOException { String encoding = is.getEncoding(); String charset = encoding; if (charset == null) { charset = "US-ASCII"; } String uri = is.getSystemId(); if (uri == null) { logger.warning("parse(): InputSource has no SystemId (URI); document item URLs will not be resolvable."); } WritableLineReader wis = null; Reader reader = is.getCharacterStream(); if (reader != null) { wis = new WritableLineReader(reader); } else { InputStream in = is.getByteStream(); if (in != null) { wis = new WritableLineReader(new InputStreamReader(in, charset)); } else if (uri != null) { /* * Disabling this feature as the request goes out of scanner's * contexts * * // To comply with the InputSource documentation, we need // * to do this: java.net.URLConnection connection = new * java.net.URL(uri).openConnection(); in = * connection.getInputStream(); if(encoding == null) { charset = * org.cobra_grendel.util.Urls.getCharset(connection); } wis = * new WritableLineReader(new InputStreamReader(in, charset)); */ } else { throw new IllegalArgumentException("The InputSource must have either a reader, an input stream or a URI."); } } HTMLDocumentImpl document = new HTMLDocumentImpl(this.bcontext, this.rcontext, wis, uri); return document; } public boolean isNamespaceAware() { return false; } public boolean isValidating() { return false; } public void setEntityResolver(EntityResolver er) { this.resolver = er; } public void setErrorHandler(ErrorHandler eh) { this.errorHandler = eh; } public Document newDocument() { return new HTMLDocumentImpl(this.bcontext); } private DOMImplementation domImplementation; public DOMImplementation getDOMImplementation() { synchronized (this) { if (this.domImplementation == null) { this.domImplementation = new DOMImplementationImpl(this.bcontext); } return this.domImplementation; } } public ErrorHandler getErrorHandler() { return errorHandler; } public EntityResolver getResolver() { return resolver; } }