// HTMLConverter.java example
//
// Explorer
// lucida-master
// - lucida
package info.ephyra.util;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLConnection;

import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.util.ParserException;

/**
 * The <code>HTMLConverter</code> can be used to convert an HTML document to
 * plain text.
 * 
 * @author Nico Schlaefer
 * @version 2007-06-19
 */
public class HTMLConverter {
	/**
	 * Timeout for HTTP connections in milliseconds. The same value is used
	 * as connect timeout and as read timeout when a URL is fetched.
	 */
	private static final int TIMEOUT = 120000;  // 2 min
	
	/**
	 * Tests whether a string can be parsed as a URL.
	 * <p>
	 * The check is delegated to the constructor of {@link URL}: the string is
	 * accepted iff that constructor does not raise a
	 * {@link MalformedURLException}. No connection is opened.
	 * 
	 * @param s a string
	 * @return <code>true</code> iff the string is a URL
	 */
	public static boolean isUrl(String s) {
		boolean wellFormed;
		
		try {
			new URL(s);  // only the parsing matters, the object is discarded
			wellFormed = true;
		} catch (MalformedURLException e) {
			wellFormed = false;
		}
		
		return wellFormed;
	}
	
	/**
	 * Handles special characters in HTML documents by replacing sequences of
	 * the form <code>&...;</code> by the corresponding characters.
	 * 
	 * @param html html document
	 * @return transformed html document
	 */
	public static synchronized String replaceSpecialCharacters(String html) {
		html = html.replaceAll("	", " ");
		html = html.replaceAll("
", " ");
		html = html.replaceAll(" ", " ");
		html = html.replaceAll("!", "!");
		html = html.replaceAll("(?i)("|")", "\"");
		html = html.replaceAll("#", "#");
		html = html.replaceAll("$", "$");
		html = html.replaceAll("%", "%");
		html = html.replaceAll("(?i)(&|&)", "&");
		html = html.replaceAll("'", "'");
		html = html.replaceAll("(", "(");
		html = html.replaceAll(")", ")");
		html = html.replaceAll("*", "*");
		html = html.replaceAll("+", "+");
		html = html.replaceAll(",", ",");
		html = html.replaceAll("-", "-");
		html = html.replaceAll(".", ".");
		html = html.replaceAll("(?i)(/|⁄)", "/");
		html = html.replaceAll("0", "0");
		html = html.replaceAll("1", "1");
		html = html.replaceAll("2", "2");
		html = html.replaceAll("3", "3");
		html = html.replaceAll("4", "4");
		html = html.replaceAll("5", "5");
		html = html.replaceAll("6", "6");
		html = html.replaceAll("7", "7");
		html = html.replaceAll("8", "8");
		html = html.replaceAll("9", "9");
		html = html.replaceAll(":", ":");
		html = html.replaceAll(";", ";");
		html = html.replaceAll("(?i)(<|<)", "<");
		html = html.replaceAll("=", "=");
		html = html.replaceAll("(?i)(>|>)", ">");
		html = html.replaceAll("?", "?");
		html = html.replaceAll("@", "@");
		html = html.replaceAll("A", "A");
		html = html.replaceAll("B", "B");
		html = html.replaceAll("C", "C");
		html = html.replaceAll("D", "D");
		html = html.replaceAll("E", "E");
		html = html.replaceAll("F", "F");
		html = html.replaceAll("G", "G");
		html = html.replaceAll("H", "H");
		html = html.replaceAll("I", "I");
		html = html.replaceAll("J", "J");
		html = html.replaceAll("K", "K");
		html = html.replaceAll("L", "L");
		html = html.replaceAll("M", "M");
		html = html.replaceAll("N", "N");
		html = html.replaceAll("O", "O");
		html = html.replaceAll("P", "P");
		html = html.replaceAll("Q", "Q");
		html = html.replaceAll("R", "R");
		html = html.replaceAll("S", "S");
		html = html.replaceAll("T", "T");
		html = html.replaceAll("U", "U");
		html = html.replaceAll("V", "V");
		html = html.replaceAll("W", "W");
		html = html.replaceAll("X", "X");
		html = html.replaceAll("Y", "Y");
		html = html.replaceAll("Z", "Z");
		html = html.replaceAll("[", "[");
		html = html.replaceAll("\", "\\");
		html = html.replaceAll("]", "]");
		html = html.replaceAll("^", "^");
		html = html.replaceAll("_", "_");
		html = html.replaceAll("`", "`");
		html = html.replaceAll("a", "a");
		html = html.replaceAll("b", "b");
		html = html.replaceAll("c", "c");
		html = html.replaceAll("d", "d");
		html = html.replaceAll("e", "e");
		html = html.replaceAll("f", "f");
		html = html.replaceAll("g", "g");
		html = html.replaceAll("h", "h");
		html = html.replaceAll("i", "i");
		html = html.replaceAll("j", "j");
		html = html.replaceAll("k", "k");
		html = html.replaceAll("l", "l");
		html = html.replaceAll("m", "m");
		html = html.replaceAll("n", "n");
		html = html.replaceAll("o", "o");
		html = html.replaceAll("p", "p");
		html = html.replaceAll("q", "q");
		html = html.replaceAll("r", "r");
		html = html.replaceAll("s", "s");
		html = html.replaceAll("t", "t");
		html = html.replaceAll("u", "u");
		html = html.replaceAll("v", "v");
		html = html.replaceAll("w", "w");
		html = html.replaceAll("x", "x");
		html = html.replaceAll("y", "y");
		html = html.replaceAll("z", "z");
		html = html.replaceAll("{", "{");
		html = html.replaceAll("|", "|");
		html = html.replaceAll("}", "}");
		html = html.replaceAll("~", "~");
		html = html.replaceAll("(?i)(–|–)", "–");
		html = html.replaceAll("(?i)(—|—)", "—");
		html = html.replaceAll("(?i)( | )", " ");
		html = html.replaceAll("(?i)(¡|¡)", "¡");
		html = html.replaceAll("(?i)(¢|¢)", "¢");
		html = html.replaceAll("(?i)(£|£)", "£");
		html = html.replaceAll("(?i)(¤|¤)", "¤");
		html = html.replaceAll("(?i)(¥|¥)", "¥");
		html = html.replaceAll("(?i)(¦|¦|&brkbar;)", "¦");
		html = html.replaceAll("(?i)(§|§)", "§");
		html = html.replaceAll("(?i)(¨|¨|¨)", "¨");
		html = html.replaceAll("(?i)(©|©)", "©");
		html = html.replaceAll("(?i)(ª|ª)", "ª");
		html = html.replaceAll("(?i)(«|«)", "«");
		html = html.replaceAll("(?i)(¬|¬)", "¬");
		html = html.replaceAll("(?i)(|)", "");
		html = html.replaceAll("(?i)(®|®)", "®");
		html = html.replaceAll("(?i)(¯|¯|&hibar;)", "¯");
		// TODO complete this list
		// (http://www.webmonkey.com/reference/Special_Characters/)
		
		html = html.replaceAll("&#?+\\w*+;", "");  // drop invalid codes
		
		return html;
	}
	
	/**
	 * Turns an HTML snippet into plain text. Tags are removed first, then
	 * special characters are resolved with
	 * {@link #replaceSpecialCharacters(String)}, and finally each run of
	 * whitespace is reduced to a single blank and the result is trimmed.
	 * 
	 * @param snippet HTML snippet
	 * @return plain text
	 */
	public static synchronized String htmlsnippet2text(String snippet) {
		// everything between angle brackets is considered a tag
		String withoutTags = snippet.replaceAll("<[^>]*+>", "");
		
		// resolve character references
		String decoded = replaceSpecialCharacters(withoutTags);
		
		// normalize whitespace
		return decoded.replaceAll("\\s++", " ").trim();
	}
	
	/**
	 * Extracts the plain text from an HTML document that is given as a
	 * string.
	 * 
	 * @param html HTML document
	 * @return plain text (the empty string if the extractor yields no text)
	 *         or <code>null</code> if the conversion failed
	 */
	public static synchronized String html2text(String html) {
		// configure the text extractor
		StringBean extractor = new StringBean();
		extractor.setLinks(false);  // no links
		extractor.setReplaceNonBreakingSpaces(true);  // replace non-breaking spaces
		extractor.setCollapse(true);  // replace sequences of whitespaces
		
		// run the document through the parser
		Parser htmlParser = new Parser();
		try {
			htmlParser.setInputHTML(html);
			htmlParser.visitAllNodesWith(extractor);
		} catch (ParserException e) {
			return null;
		}
		
		String text = extractor.getStrings();
		return (text != null) ? text : "";  // no content
	}
	
	/**
	 * Loads an HTML document from a file and extracts its plain text.
	 * 
	 * @param filename name of file containing HTML documents
	 * @return plain text as delivered by the extractor or <code>null</code> if
	 *         the reading or conversion failed
	 */
	public static synchronized String file2text(String filename) {
		// configure the text extractor
		StringBean extractor = new StringBean();
		extractor.setLinks(false);  // no links
		extractor.setReplaceNonBreakingSpaces(true);  // replace non-breaking spaces
		extractor.setCollapse(true);  // replace sequences of whitespaces
		
		// read the file and run it through the parser
		Parser fileParser = new Parser();
		try {
			fileParser.setResource(filename);
			fileParser.visitAllNodesWith(extractor);
		} catch (ParserException e) {
			return null;
		}
		
		return extractor.getStrings();
	}
	
	/**
	 * Downloads an HTML document from a URL and extracts its plain text. Only
	 * HTTP connections are accepted; connect and read timeout are both set to
	 * {@link #TIMEOUT}.
	 * 
	 * @param url URL of HTML document
	 * @return plain text or <code>null</code> if the fetching or conversion failed
	 * @throws SocketTimeoutException declared in the signature.
	 *         NOTE(review): no statement in this method visibly throws it -
	 *         presumably meant for timeouts while reading, confirm
	 */
	public static synchronized String url2text(String url) throws SocketTimeoutException {
		// open the connection
		URLConnection conn;
		try {
			conn = new URL(url).openConnection();
		} catch (IOException e) {
			return null;
		}
		
		// only allow HTTP connections
		if (!(conn instanceof HttpURLConnection)) {
			return null;
		}
		
		conn.setRequestProperty("User-agent", "Mozilla/4.0");  // pretend to be a browser
		conn.setConnectTimeout(TIMEOUT);
		conn.setReadTimeout(TIMEOUT);
		
		// configure the text extractor, then let it fetch and convert the document
		StringBean extractor = new StringBean();
		extractor.setLinks(false);  // no links
		extractor.setReplaceNonBreakingSpaces(true);  // replace non-breaking spaces
		extractor.setCollapse(true);  // replace sequences of whitespaces
		extractor.setConnection(conn);
		
		return extractor.getStrings();
	}
//	// simple conversion using standard API components
//	public static String url2text(String url) {
//		EditorKit kit = new HTMLEditorKit();
//		Document doc = kit.createDefaultDocument();
//		doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);  // Document does not handle charsets properly
//		
//		try {
//			// create reader on HTML content
//			URLConnection conn = (HttpURLConnection) (new URL(url)).openConnection();
//			conn.setRequestProperty("User-agent","Mozilla/4.0");  // pretend to be a browser
//			BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), "ISO-8859-1"));//, "UTF-8"));
//			
//			// parse HTML content
//			kit.read(br, doc, 0);
//			
//			return doc.getText(0, doc.getLength());
//		} catch (Exception e) {
//			// print HTTP error message
//			MsgPrinter.printHttpError(e.toString());
//			return null;
//		}
//	}
}