HTMLDocParser.java example

Explorer
eclipse.platform.ua-master
/*******************************************************************************
 * Copyright (c) 2000, 2015 IBM Corporation and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     IBM Corporation - initial API and implementation
 *******************************************************************************/
package org.eclipse.help.internal.search;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Locale;
import java.util.StringTokenizer;

import org.apache.lucene.demo.html.HTMLParser;
import org.eclipse.help.internal.base.util.ProxyUtil;

/**
 * Parses HTML documents. Extracts the document encoding from the header, and
 * delegates to the Lucene HTML parser for extraction of title, summary, and
 * content.
 * <p>
 * Typical usage is {@link #openDocument(URL)}, followed by any of
 * {@link #getTitle()}, {@link #getSummary(String)} and
 * {@link #getContentReader()}, followed by {@link #closeDocument()}.
 */
public class HTMLDocParser {
	// maximum number of characters that will be searched
	// from the beginning of HTML document to charset declaration
	public static final int MAX_OFFSET = 2048;

	// elements, attributes and values constants
	final static String ELEMENT_META = "META"; //$NON-NLS-1$
	final static String ELEMENT_BODY = "body"; //$NON-NLS-1$
	final static String ELEMENT_HEAD = "head"; //$NON-NLS-1$
	final static String ATTRIBUTE_HTTP = "http-equiv"; //$NON-NLS-1$
	final static String ATTRIBUTE_HTTP_VALUE = "content-type"; //$NON-NLS-1$
	final static String ATTRIBUTE_CONTENT = "content"; //$NON-NLS-1$

	// states for parsing elements
	final static int STATE_ELEMENT_START = 0;
	final static int STATE_ELEMENT_AFTER_LT = 1;
	final static int STATE_ELEMENT_AFTER_LT_SLASH = 2;
	final static int STATE_ELEMENT_META = 3;
	// states for parsing HTTP-EQUIV attribute
	final static int STATE_HTTP_START = 0;
	final static int STATE_HTTP_AFTER_NAME = 1;
	final static int STATE_HTTP_AFTER_EQ = 2;
	final static int STATE_HTTP_DONE = 3;
	// states for parsing CONTENT attribute
	final static int STATE_CONTENT_START = 0;
	final static int STATE_CONTENT_AFTER_NAME = 1;
	final static int STATE_CONTENT_AFTER_EQ = 2;
	final static int STATE_CONTENT_DONE = 3;

	// name of the Content-Type parameter that carries the encoding,
	// including the separator that follows it
	private static final String CHARSET_PARAMETER = "charset="; //$NON-NLS-1$

	private HTMLParser htmlParser;
	private InputStream inputStream = null;

	/**
	 * Opens and parses the document at the given location. The document is
	 * read twice: a first pass looks for the encoding declared in the HTML
	 * header, and a second pass feeds the document to the HTML parser using
	 * that encoding. When no encoding is declared, or the declared encoding is
	 * not supported, the platform default encoding is used.
	 *
	 * @param url
	 *            location of the HTML document
	 * @throws IOException
	 *             if the document cannot be opened or parsed
	 */
	public void openDocument(URL url) throws IOException {
		// release the stream of a previously opened document, if the caller
		// did not call closeDocument(), so that it is not leaked
		closeQuietly(inputStream);
		inputStream = null;

		// first pass: detect encoding; always release this stream
		String encoding;
		InputStream detectionStream = ProxyUtil.getStream(url);
		try {
			encoding = getCharsetFromHTML(detectionStream);
		} finally {
			closeQuietly(detectionStream);
		}

		// second pass: parse the document from the beginning
		inputStream = ProxyUtil.getStream(url);
		htmlParser = new HTMLParser(createReader(inputStream, encoding));
		htmlParser.parse();
	}

	/**
	 * Releases resources (closes streams)
	 */
	public void closeDocument() {
		closeQuietly(inputStream);
		inputStream = null;
	}

	/**
	 * Returns the title extracted by the HTML parser.
	 *
	 * @return the title, or an empty string if the calling thread was
	 *         interrupted while waiting for it
	 * @throws IOException
	 *             if the parser fails to read the document
	 * @throws NullPointerException
	 *             if no document has been opened
	 */
	public String getTitle() throws IOException {
		checkDocumentOpened();
		try {
			return htmlParser.getTitle();
		} catch (InterruptedException ie) {
			// keep the interruption visible to the caller's thread
			Thread.currentThread().interrupt();
			return ""; //$NON-NLS-1$
		}
	}

	/**
	 * Returns the summary extracted by the HTML parser.
	 *
	 * @param title
	 *            not used by this implementation
	 * @return the summary, or an empty string if the calling thread was
	 *         interrupted while waiting for it
	 * @throws IOException
	 *             if the parser fails to read the document
	 * @throws NullPointerException
	 *             if no document has been opened
	 */
	public String getSummary(String title) throws IOException {
		checkDocumentOpened();
		try {
			return htmlParser.getSummary();
		} catch (InterruptedException ie) {
			// keep the interruption visible to the caller's thread
			Thread.currentThread().interrupt();
			return ""; //$NON-NLS-1$
		}
	}

	/**
	 * Returns the reader obtained from the HTML parser for the document
	 * content.
	 *
	 * @return reader supplied by the HTML parser
	 * @throws IOException
	 *             if the parser fails to read the document
	 * @throws NullPointerException
	 *             if no document has been opened
	 */
	public Reader getContentReader() throws IOException {
		checkDocumentOpened();
		return htmlParser.getReader();
	}

	/**
	 * Parses HTML to extract the document encoding specified in the HTTP
	 * equivalent META tag in the document header. Example of such META tag is
	 * {@code <META HTTP-EQUIV="content-type" CONTENT="text/html; charset=UTF-8">}
	 * <p>
	 * The reader wrapped around the given stream is closed before this method
	 * returns.
	 *
	 * @param is
	 *            stream positioned at the beginning of the HTML document
	 * @return String or null if encoding not found
	 */
	public static String getCharsetFromHTML(InputStream is) {
		// Set up an ascii reader for the document (documents should not use
		// other characters before encoding is defined)
		Reader asciiReader = new ASCIIReader(is, MAX_OFFSET);
		try {
			StreamTokenizer tokenizer = new StreamTokenizer(asciiReader);
			tokenizer.lowerCaseMode(false);
			// '"' stays the only quote character; an apostrophe is treated
			// as an ordinary character
			tokenizer.ordinaryChar('\'');
			// '/' must not start a comment, it is needed to detect </head>
			tokenizer.ordinaryChar('/');

			return getCharsetFromHTMLTokens(tokenizer);
		} finally {
			try {
				asciiReader.close();
			} catch (IOException ignored) {
				// best effort, nothing useful can be done here
			}
		}
	}

	/**
	 * Scans tokens of an HTML document looking for a META element that has
	 * both an HTTP-EQUIV attribute with value "content-type" and a quoted
	 * CONTENT attribute, in either order. Scanning stops when the charset is
	 * found, when a body element opens, when the head element closes, or at
	 * the end of input.
	 *
	 * @param tokenizer
	 *            tokenizer over the HTML text, configured so that '"' is the
	 *            only quote character and '/' is an ordinary character
	 * @return value of the charset parameter of the CONTENT attribute, or null
	 *         if not found or if reading fails
	 */
	public static String getCharsetFromHTMLTokens(StreamTokenizer tokenizer) {
		// keeps track of content attribute value until parsing
		// of the meta tag is complete
		String contentValue = null;

		// initialize states
		int stateContent = STATE_CONTENT_START;
		int stateElement = STATE_ELEMENT_START;
		int stateHttp = STATE_HTTP_START;

		try {
			// in the worst case, process tokens until end of file
			for (int token = tokenizer.nextToken(); token != StreamTokenizer.TT_EOF; token = tokenizer
					.nextToken()) {
				// process input depending on current state
				switch (stateElement) {
					case STATE_ELEMENT_START :
						if (token == '<') {
							stateElement = STATE_ELEMENT_AFTER_LT;
						} // else do nothing, cannot be beginning of META tag
						break;
					case STATE_ELEMENT_AFTER_LT :
						if (token == StreamTokenizer.TT_WORD) {
							// some element opened
							if (ELEMENT_META.equalsIgnoreCase(tokenizer.sval)) {
								// META element opened
								stateElement = STATE_ELEMENT_META;
								// initialize state of attributes
								stateHttp = STATE_HTTP_START;
								stateContent = STATE_CONTENT_START;
								contentValue = null;
							} else if (ELEMENT_BODY
									.equalsIgnoreCase(tokenizer.sval)) {
								// body element opened, we are too far, stop
								// processing input
								return null;
							} else {
								// some other element opened, start from initial
								// state
								stateElement = STATE_ELEMENT_START;
							}
						} else if (token == '/') {
							// can be beginning of head closing
							stateElement = STATE_ELEMENT_AFTER_LT_SLASH;
						} else {
							// not an element opened, could be opening of
							// declaration or element closing etc.
							stateElement = STATE_ELEMENT_START;
						}
						break;
					case STATE_ELEMENT_AFTER_LT_SLASH :
						if (token == StreamTokenizer.TT_WORD
								&& ELEMENT_HEAD
										.equalsIgnoreCase(tokenizer.sval)) {
							// head element closed, we are too far, stop
							// processing input
							return null;
						}
						stateElement = STATE_ELEMENT_START;
						break;
					default : // STATE_ELEMENT_META, inside a META tag
						switch (token) {
							case '>' :
								// no longer inside META, start from initial
								// state
								stateElement = STATE_ELEMENT_START;
								break;
							case StreamTokenizer.TT_WORD :
								// string inside META tag, can be attribute name
								if (ATTRIBUTE_HTTP
										.equalsIgnoreCase(tokenizer.sval)) {
									// found HTTP-EQUIV attribute name
									stateHttp = STATE_HTTP_AFTER_NAME;
								} else if (ATTRIBUTE_CONTENT
										.equalsIgnoreCase(tokenizer.sval)) {
									// found CONTENT attribute name
									stateContent = STATE_CONTENT_AFTER_NAME;
								} else if (stateHttp == STATE_HTTP_AFTER_EQ
										&& ATTRIBUTE_HTTP_VALUE
												.equalsIgnoreCase(tokenizer.sval)) {
									// value of HTTP-EQUIV attribute (unquoted)
									// we found <META ...
									// HTTP-EQUIV=content-type
									stateHttp = STATE_HTTP_DONE;
								} else {
									// some other attribute name or string,
									// reset states of sought attributes,
									// unless successfully processed earlier
									if (stateHttp != STATE_HTTP_DONE) {
										stateHttp = STATE_HTTP_START;
									}
									if (stateContent != STATE_CONTENT_DONE) {
										stateContent = STATE_CONTENT_START;
									}
								}
								break;
							case '=' :
								// = inside META tag, can separate the
								// attribute names we are interested in
								// from their values
								if (stateHttp == STATE_HTTP_AFTER_NAME) {
									// we have HTTP-EQUIV=
									stateHttp = STATE_HTTP_AFTER_EQ;
								} else if (stateContent == STATE_CONTENT_AFTER_NAME) {
									// we have CONTENT=
									stateContent = STATE_CONTENT_AFTER_EQ;
								} else {
									// equal sign after some other attribute
									// name or string,
									// reset states of sought attributes,
									// unless successfully processed earlier
									if (stateHttp != STATE_HTTP_DONE) {
										stateHttp = STATE_HTTP_START;
									}
									if (stateContent != STATE_CONTENT_DONE) {
										stateContent = STATE_CONTENT_START;
									}
								}
								break;
							case '\"' :
								// quoted string inside META tag, can be
								// attribute value
								if (stateHttp == STATE_HTTP_AFTER_EQ) {
									// value of HTTP-EQUIV attribute
									if (ATTRIBUTE_HTTP_VALUE
											.equalsIgnoreCase(tokenizer.sval)) {
										// we found <META ...
										// HTTP-EQUIV="content-type"
										stateHttp = STATE_HTTP_DONE;
									} else {
										// HTTP-EQUIV has some other value; its
										// value has been consumed, so the next
										// quoted string must not be compared
										// as an HTTP-EQUIV value again
										stateHttp = STATE_HTTP_START;
									}
								} else if (stateContent == STATE_CONTENT_AFTER_EQ) {
									// value of CONTENT attribute
									stateContent = STATE_CONTENT_DONE;
									// save the value of the attribute;
									// if attribute HTTP-EQUIV="content-type"
									// is found in the same META tag, this
									// value might have the Content-type
									// entity header
									contentValue = tokenizer.sval;
								} else {
									// value for the attribute is missing
									// reset states of sought attributes
									stateHttp = STATE_HTTP_START;
									stateContent = STATE_CONTENT_START;
								}
								break;
							default :
								// other unexpected token inside META tag
								// reset states of sought attributes,
								// unless successfully processed earlier
								if (stateHttp != STATE_HTTP_DONE) {
									stateHttp = STATE_HTTP_START;
								}
								if (stateContent != STATE_CONTENT_DONE) {
									stateContent = STATE_CONTENT_START;
								}
								break;
						}
						break;
				}
				if (contentValue != null && stateHttp == STATE_HTTP_DONE
						&& stateContent == STATE_CONTENT_DONE) {
					// <META HTTP-EQUIV="content-type" CONTENT="*******"
					// parse value of content attribute to extract encoding
					return getCharsetFromHTTP(contentValue);
				}

			}
		} catch (IOException ioe) {
			return null;
		}
		// end of file
		return null;
	}

	/**
	 * Parses HTTP1.1 Content-Type entity-header field for example,
	 * Content-Type: text/html; charset=ISO-8859-4, and extracts charset
	 * parameter value of the media sub type. A value enclosed in a matching
	 * pair of double or single quotes is returned without the quotes.
	 *
	 * @param contentValue
	 *            value of the Content-Type field, may be null
	 * @return value of charset parameter, for example ISO-8859-4 or null if
	 *         parameter does not exist or is empty
	 */
	public static String getCharsetFromHTTP(String contentValue) {
		if (contentValue == null) {
			return null;
		}
		StringTokenizer t = new StringTokenizer(contentValue, ";"); //$NON-NLS-1$
		while (t.hasMoreTokens()) {
			String parameter = t.nextToken().trim();
			if (parameter.toLowerCase(Locale.ENGLISH).startsWith(
					CHARSET_PARAMETER)) {
				String charset = stripQuotes(parameter.substring(
						CHARSET_PARAMETER.length()).trim());
				if (charset.length() > 0) {
					return charset;
				}
			}
		}
		return null;
	}

	/**
	 * Returns the exception reported by the HTML parser.
	 *
	 * @return value of the parser's getException(), or null if no document
	 *         has been opened
	 */
	public Exception getException() {
		if (htmlParser != null) {
			return htmlParser.getException();
		}
		return null;
	}

	/**
	 * Fails with NullPointerException when no document has been opened.
	 */
	private void checkDocumentOpened() {
		if (htmlParser == null) {
			throw new NullPointerException(
					"No document opened; call openDocument(URL) first"); //$NON-NLS-1$
		}
	}

	/**
	 * Creates a reader decoding the stream with the given encoding, falling
	 * back to the platform default encoding when the encoding is null or not
	 * supported.
	 */
	private static InputStreamReader createReader(InputStream stream,
			String encoding) {
		if (encoding != null) {
			try {
				return new InputStreamReader(stream, encoding);
			} catch (UnsupportedEncodingException uee) {
				// declared encoding is unusable, use the default below
			}
		}
		return new InputStreamReader(stream);
	}

	/**
	 * Closes the stream if it is not null, ignoring failures to close.
	 */
	private static void closeQuietly(InputStream stream) {
		if (stream != null) {
			try {
				stream.close();
			} catch (IOException ignored) {
				// best effort, nothing useful can be done here
			}
		}
	}

	/**
	 * Removes one matching pair of enclosing double or single quotes, if
	 * present, and trims the remainder.
	 */
	private static String stripQuotes(String value) {
		int length = value.length();
		if (length >= 2) {
			char first = value.charAt(0);
			if ((first == '"' || first == '\'')
					&& value.charAt(length - 1) == first) {
				return value.substring(1, length - 1).trim();
			}
		}
		return value;
	}
}