HTMLParser.java example

Explorer

commoncrawl-crawler-master
- src
  - com
    - dappit
      - Dapper
        parser
        CompressedDomBuilder.java
        DebugDocumentBuilder.java
        DocumentBuilder.java
        DomDocumentBuilder.java
        EnviromentController.java
        HTMLParser.java
        InstructionsPool.java
        LinkExtractionDocumentBuilder.java
        MozillaParser.java
        ParserException.java
        ParserInitializationException.java
        ParserInstruction.java
  - org
    - commoncrawl

package com.dappit.Dapper.parser;

import java.io.FileOutputStream;
import java.io.IOException;

import javax.xml.parsers.ParserConfigurationException;
import org.dom4j.DocumentException;
import org.w3c.dom.Document;

/**
 * An HTML parser that can take a stream of bytes and return a DOM Document
 * object.
 *
 * @author Tony Novak
 */
public interface HTMLParser
{
	/**
	 * Parses an HTML document and returns it as a DOM document.
	 *
	 * @param htmlBytes
	 *            raw bytes to parse
	 * @param htmlEncoding
	 *            the character set (e.g. {@code UTF-8}) used to decode
	 *            {@code htmlBytes}. If {@code null} is passed, the default
	 *            HTTP encoding ({@code ISO-8859-1}) is used, unless the
	 *            character set is overridden by a {@code <META>} tag in the
	 *            HTML body. If a non-null value is passed, any such
	 *            {@code <META>} tag is ignored.
	 * @param optinalOutputStream
	 *            an optional output stream handed to the parser.
	 *            NOTE(review): the name suggests {@code null} is accepted and
	 *            that implementations may write to it, but what is written is
	 *            not specified by this interface; confirm against the
	 *            implementations.
	 * @return the DOM document built from the parsed HTML
	 * @throws DocumentException
	 *             declared by this contract; the triggering conditions are
	 *             implementation-specific
	 * @throws ParserException
	 *             declared by this contract; the triggering conditions are
	 *             implementation-specific
	 * @throws ParserConfigurationException
	 *             declared by this contract; the triggering conditions are
	 *             implementation-specific
	 * @throws IOException
	 *             declared by this contract; the triggering conditions are
	 *             implementation-specific
	 */
	// Interface methods are implicitly public, so the modifier is omitted.
	Document parse( byte[] htmlBytes, String htmlEncoding, FileOutputStream optinalOutputStream )
			throws DocumentException, ParserException, ParserConfigurationException, IOException;
}