package info.ephyra.util; import java.io.IOException; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.SocketTimeoutException; import java.net.URL; import java.net.URLConnection; import org.htmlparser.Parser; import org.htmlparser.beans.StringBean; import org.htmlparser.util.ParserException; /** * The <code>HTMLConverter</code> can be used to convert an HTML document to * plain text. * * @author Nico Schlaefer * @version 2007-06-19 */ public class HTMLConverter { /** Timeout for HTTP connections in milliseconds. */ private static final int TIMEOUT = 120000; // 2 min /** * Checks if the given string is a URL. * * @param s a string * @return <code>true</code> iff the string is a URL */ public static boolean isUrl(String s) { try { new URL(s); } catch (MalformedURLException e) { return false; } return true; } /** * Handles special characters in HTML documents by replacing sequences of * the form <code>&...;</code> by the corresponding characters. * * @param html html document * @return transformed html document */ public static synchronized String replaceSpecialCharacters(String html) { html = html.replaceAll(" ", " "); html = html.replaceAll(" ", " "); html = html.replaceAll(" ", " "); html = html.replaceAll("!", "!"); html = html.replaceAll("(?i)("|")", "\""); html = html.replaceAll("#", "#"); html = html.replaceAll("$", "$"); html = html.replaceAll("%", "%"); html = html.replaceAll("(?i)(&|&)", "&"); html = html.replaceAll("'", "'"); html = html.replaceAll("(", "("); html = html.replaceAll(")", ")"); html = html.replaceAll("*", "*"); html = html.replaceAll("+", "+"); html = html.replaceAll(",", ","); html = html.replaceAll("-", "-"); html = html.replaceAll(".", "."); html = html.replaceAll("(?i)(/|⁄)", "/"); html = html.replaceAll("0", "0"); html = html.replaceAll("1", "1"); html = html.replaceAll("2", "2"); html = html.replaceAll("3", "3"); html = html.replaceAll("4", "4"); html = html.replaceAll("5", "5"); html = html.replaceAll("6", "6"); html = html.replaceAll("7", "7"); html = html.replaceAll("8", "8"); html = html.replaceAll("9", "9"); html = html.replaceAll(":", ":"); html = html.replaceAll(";", ";"); html = html.replaceAll("(?i)(<|<)", "<"); html = html.replaceAll("=", "="); html = html.replaceAll("(?i)(>|>)", ">"); html = html.replaceAll("?", "?"); html = html.replaceAll("@", "@"); html = html.replaceAll("A", "A"); html = html.replaceAll("B", "B"); html = html.replaceAll("C", "C"); html = html.replaceAll("D", "D"); html = html.replaceAll("E", "E"); html = html.replaceAll("F", "F"); html = html.replaceAll("G", "G"); html = html.replaceAll("H", "H"); html = html.replaceAll("I", "I"); html = html.replaceAll("J", "J"); html = html.replaceAll("K", "K"); html = html.replaceAll("L", "L"); html = html.replaceAll("M", "M"); html = html.replaceAll("N", "N"); html = html.replaceAll("O", "O"); html = html.replaceAll("P", "P"); html = html.replaceAll("Q", "Q"); html = html.replaceAll("R", "R"); html = html.replaceAll("S", "S"); html = html.replaceAll("T", "T"); html = html.replaceAll("U", "U"); html = html.replaceAll("V", "V"); html = html.replaceAll("W", "W"); html = html.replaceAll("X", "X"); html = html.replaceAll("Y", "Y"); html = html.replaceAll("Z", "Z"); html = html.replaceAll("[", "["); html = html.replaceAll("\", "\\"); html = html.replaceAll("]", "]"); html = html.replaceAll("^", "^"); html = html.replaceAll("_", "_"); html = html.replaceAll("`", "`"); html = html.replaceAll("a", "a"); html = html.replaceAll("b", "b"); html = html.replaceAll("c", "c"); html = html.replaceAll("d", "d"); html = html.replaceAll("e", "e"); html = html.replaceAll("f", "f"); html = html.replaceAll("g", "g"); html = html.replaceAll("h", "h"); html = html.replaceAll("i", "i"); html = html.replaceAll("j", "j"); html = html.replaceAll("k", "k"); html = html.replaceAll("l", "l"); html = html.replaceAll("m", "m"); html = html.replaceAll("n", "n"); html = html.replaceAll("o", "o"); html = html.replaceAll("p", "p"); html = html.replaceAll("q", "q"); html = html.replaceAll("r", "r"); html = html.replaceAll("s", "s"); html = html.replaceAll("t", "t"); html = html.replaceAll("u", "u"); html = html.replaceAll("v", "v"); html = html.replaceAll("w", "w"); html = html.replaceAll("x", "x"); html = html.replaceAll("y", "y"); html = html.replaceAll("z", "z"); html = html.replaceAll("{", "{"); html = html.replaceAll("|", "|"); html = html.replaceAll("}", "}"); html = html.replaceAll("~", "~"); html = html.replaceAll("(?i)(–|–)", "–"); html = html.replaceAll("(?i)(—|—)", "—"); html = html.replaceAll("(?i)( | )", " "); html = html.replaceAll("(?i)(¡|¡)", "¡"); html = html.replaceAll("(?i)(¢|¢)", "¢"); html = html.replaceAll("(?i)(£|£)", "£"); html = html.replaceAll("(?i)(¤|¤)", "¤"); html = html.replaceAll("(?i)(¥|¥)", "¥"); html = html.replaceAll("(?i)(¦|¦|&brkbar;)", "¦"); html = html.replaceAll("(?i)(§|§)", "§"); html = html.replaceAll("(?i)(¨|¨|¨)", "¨"); html = html.replaceAll("(?i)(©|©)", "©"); html = html.replaceAll("(?i)(ª|ª)", "ª"); html = html.replaceAll("(?i)(«|«)", "«"); html = html.replaceAll("(?i)(¬|¬)", "¬"); html = html.replaceAll("(?i)(­|­)", ""); html = html.replaceAll("(?i)(®|®)", "®"); html = html.replaceAll("(?i)(¯|¯|&hibar;)", "¯"); // TODO complete this list // (http://www.webmonkey.com/reference/Special_Characters/) html = html.replaceAll("&#?+\\w*+;", ""); // drop invalid codes return html; } /** * Converts a snippet with HTML tags and special characters into plain text. * * @param snippet HTML snippet * @return plain text */ public static synchronized String htmlsnippet2text(String snippet) { // drop HTML tags snippet = snippet.replaceAll("<[^>]*+>", ""); // handle special characters snippet = replaceSpecialCharacters(snippet); // replace sequences of whitespaces by single blanks and trim snippet = snippet.replaceAll("\\s++", " ").trim(); return snippet; } /** * Converts an HTML document into plain text. * * @param html HTML document * @return plain text or <code>null</code> if the conversion failed */ public static synchronized String html2text(String html) { // convert HTML document StringBean sb = new StringBean(); sb.setLinks(false); // no links sb.setReplaceNonBreakingSpaces (true); // replace non-breaking spaces sb.setCollapse(true); // replace sequences of whitespaces Parser parser = new Parser(); try { parser.setInputHTML(html); parser.visitAllNodesWith(sb); } catch (ParserException e) { return null; } String docText = sb.getStrings(); if (docText == null) docText = ""; // no content return docText; } /** * Reads an HTML document from a file and converts it into plain text. * * @param filename name of file containing HTML documents * @return plain text or <code>null</code> if the reading or conversion failed */ public static synchronized String file2text(String filename) { // read from file and convert HTML document StringBean sb = new StringBean(); sb.setLinks(false); // no links sb.setReplaceNonBreakingSpaces (true); // replace non-breaking spaces sb.setCollapse(true); // replace sequences of whitespaces Parser parser = new Parser(); try { parser.setResource(filename); parser.visitAllNodesWith(sb); } catch (ParserException e) { return null; } String docText = sb.getStrings(); return docText; } /** * Fetches an HTML document from a URL and converts it into plain text. * * @param url URL of HTML document * @return plain text or <code>null</code> if the fetching or conversion failed */ public static synchronized String url2text(String url) throws SocketTimeoutException { // connect to URL URLConnection conn = null; try { conn = (new URL(url)).openConnection(); if (!(conn instanceof HttpURLConnection)) return null; // only allow HTTP connections } catch (IOException e) { return null; } conn.setRequestProperty("User-agent","Mozilla/4.0"); // pretend to be a browser conn.setConnectTimeout(TIMEOUT); conn.setReadTimeout(TIMEOUT); // fetch URL and convert HTML document StringBean sb = new StringBean(); sb.setLinks(false); // no links sb.setReplaceNonBreakingSpaces(true); // replace non-breaking spaces sb.setCollapse(true); // replace sequences of whitespaces sb.setConnection(conn); String docText = sb.getStrings(); return docText; } // // simple conversion using standard API components // public static String url2text(String url) { // EditorKit kit = new HTMLEditorKit(); // Document doc = kit.createDefaultDocument(); // doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE); // Document does not handle charsets properly // // try { // // create reader on HTML content // URLConnection conn = (HttpURLConnection) (new URL(url)).openConnection(); // conn.setRequestProperty("User-agent","Mozilla/4.0"); // pretend to be a browser // BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), "ISO-8859-1"));//, "UTF-8")); // // // parse HTML content // kit.read(br, doc, 0); // // return doc.getText(0, doc.getLength()); // } catch (Exception e) { // // print HTTP error message // MsgPrinter.printHttpError(e.toString()); // return null; // } // } }