package info.ephyra.util;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.util.ParserException;
/**
* The <code>HTMLConverter</code> can be used to convert an HTML document to
* plain text.
*
* @author Nico Schlaefer
* @version 2007-06-19
*/
public class HTMLConverter {
/** Timeout for HTTP connections in milliseconds. */
private static final int TIMEOUT = 120000; // 2 min
/**
* Checks if the given string is a URL.
*
* @param s a string
* @return <code>true</code> iff the string is a URL
*/
public static boolean isUrl(String s) {
try {
new URL(s);
} catch (MalformedURLException e) {
return false;
}
return true;
}
/**
* Handles special characters in HTML documents by replacing sequences of
* the form <code>&...;</code> by the corresponding characters.
*
* @param html html document
* @return transformed html document
*/
public static synchronized String replaceSpecialCharacters(String html) {
html = html.replaceAll(" ", " ");
html = html.replaceAll("
", " ");
html = html.replaceAll(" ", " ");
html = html.replaceAll("!", "!");
html = html.replaceAll("(?i)("|")", "\"");
html = html.replaceAll("#", "#");
html = html.replaceAll("$", "$");
html = html.replaceAll("%", "%");
html = html.replaceAll("(?i)(&|&)", "&");
html = html.replaceAll("'", "'");
html = html.replaceAll("(", "(");
html = html.replaceAll(")", ")");
html = html.replaceAll("*", "*");
html = html.replaceAll("+", "+");
html = html.replaceAll(",", ",");
html = html.replaceAll("-", "-");
html = html.replaceAll(".", ".");
html = html.replaceAll("(?i)(/|⁄)", "/");
html = html.replaceAll("0", "0");
html = html.replaceAll("1", "1");
html = html.replaceAll("2", "2");
html = html.replaceAll("3", "3");
html = html.replaceAll("4", "4");
html = html.replaceAll("5", "5");
html = html.replaceAll("6", "6");
html = html.replaceAll("7", "7");
html = html.replaceAll("8", "8");
html = html.replaceAll("9", "9");
html = html.replaceAll(":", ":");
html = html.replaceAll(";", ";");
html = html.replaceAll("(?i)(<|<)", "<");
html = html.replaceAll("=", "=");
html = html.replaceAll("(?i)(>|>)", ">");
html = html.replaceAll("?", "?");
html = html.replaceAll("@", "@");
html = html.replaceAll("A", "A");
html = html.replaceAll("B", "B");
html = html.replaceAll("C", "C");
html = html.replaceAll("D", "D");
html = html.replaceAll("E", "E");
html = html.replaceAll("F", "F");
html = html.replaceAll("G", "G");
html = html.replaceAll("H", "H");
html = html.replaceAll("I", "I");
html = html.replaceAll("J", "J");
html = html.replaceAll("K", "K");
html = html.replaceAll("L", "L");
html = html.replaceAll("M", "M");
html = html.replaceAll("N", "N");
html = html.replaceAll("O", "O");
html = html.replaceAll("P", "P");
html = html.replaceAll("Q", "Q");
html = html.replaceAll("R", "R");
html = html.replaceAll("S", "S");
html = html.replaceAll("T", "T");
html = html.replaceAll("U", "U");
html = html.replaceAll("V", "V");
html = html.replaceAll("W", "W");
html = html.replaceAll("X", "X");
html = html.replaceAll("Y", "Y");
html = html.replaceAll("Z", "Z");
html = html.replaceAll("[", "[");
html = html.replaceAll("\", "\\");
html = html.replaceAll("]", "]");
html = html.replaceAll("^", "^");
html = html.replaceAll("_", "_");
html = html.replaceAll("`", "`");
html = html.replaceAll("a", "a");
html = html.replaceAll("b", "b");
html = html.replaceAll("c", "c");
html = html.replaceAll("d", "d");
html = html.replaceAll("e", "e");
html = html.replaceAll("f", "f");
html = html.replaceAll("g", "g");
html = html.replaceAll("h", "h");
html = html.replaceAll("i", "i");
html = html.replaceAll("j", "j");
html = html.replaceAll("k", "k");
html = html.replaceAll("l", "l");
html = html.replaceAll("m", "m");
html = html.replaceAll("n", "n");
html = html.replaceAll("o", "o");
html = html.replaceAll("p", "p");
html = html.replaceAll("q", "q");
html = html.replaceAll("r", "r");
html = html.replaceAll("s", "s");
html = html.replaceAll("t", "t");
html = html.replaceAll("u", "u");
html = html.replaceAll("v", "v");
html = html.replaceAll("w", "w");
html = html.replaceAll("x", "x");
html = html.replaceAll("y", "y");
html = html.replaceAll("z", "z");
html = html.replaceAll("{", "{");
html = html.replaceAll("|", "|");
html = html.replaceAll("}", "}");
html = html.replaceAll("~", "~");
html = html.replaceAll("(?i)(|–)", "–");
html = html.replaceAll("(?i)(|—)", "—");
html = html.replaceAll("(?i)( | )", " ");
html = html.replaceAll("(?i)(¡|¡)", "¡");
html = html.replaceAll("(?i)(¢|¢)", "¢");
html = html.replaceAll("(?i)(£|£)", "£");
html = html.replaceAll("(?i)(¤|¤)", "¤");
html = html.replaceAll("(?i)(¥|¥)", "¥");
html = html.replaceAll("(?i)(¦|¦|&brkbar;)", "¦");
html = html.replaceAll("(?i)(§|§)", "§");
html = html.replaceAll("(?i)(¨|¨|¨)", "¨");
html = html.replaceAll("(?i)(©|©)", "©");
html = html.replaceAll("(?i)(ª|ª)", "ª");
html = html.replaceAll("(?i)(«|«)", "«");
html = html.replaceAll("(?i)(¬|¬)", "¬");
html = html.replaceAll("(?i)(|)", "");
html = html.replaceAll("(?i)(®|®)", "®");
html = html.replaceAll("(?i)(¯|¯|&hibar;)", "¯");
// TODO complete this list
// (http://www.webmonkey.com/reference/Special_Characters/)
html = html.replaceAll("?+\\w*+;", ""); // drop invalid codes
return html;
}
/**
* Converts a snippet with HTML tags and special characters into plain text.
*
* @param snippet HTML snippet
* @return plain text
*/
public static synchronized String htmlsnippet2text(String snippet) {
// drop HTML tags
snippet = snippet.replaceAll("<[^>]*+>", "");
// handle special characters
snippet = replaceSpecialCharacters(snippet);
// replace sequences of whitespaces by single blanks and trim
snippet = snippet.replaceAll("\\s++", " ").trim();
return snippet;
}
/**
* Converts an HTML document into plain text.
*
* @param html HTML document
* @return plain text or <code>null</code> if the conversion failed
*/
public static synchronized String html2text(String html) {
// convert HTML document
StringBean sb = new StringBean();
sb.setLinks(false); // no links
sb.setReplaceNonBreakingSpaces (true); // replace non-breaking spaces
sb.setCollapse(true); // replace sequences of whitespaces
Parser parser = new Parser();
try {
parser.setInputHTML(html);
parser.visitAllNodesWith(sb);
} catch (ParserException e) {
return null;
}
String docText = sb.getStrings();
if (docText == null) docText = ""; // no content
return docText;
}
/**
* Reads an HTML document from a file and converts it into plain text.
*
* @param filename name of file containing HTML documents
* @return plain text or <code>null</code> if the reading or conversion failed
*/
public static synchronized String file2text(String filename) {
// read from file and convert HTML document
StringBean sb = new StringBean();
sb.setLinks(false); // no links
sb.setReplaceNonBreakingSpaces (true); // replace non-breaking spaces
sb.setCollapse(true); // replace sequences of whitespaces
Parser parser = new Parser();
try {
parser.setResource(filename);
parser.visitAllNodesWith(sb);
} catch (ParserException e) {
return null;
}
String docText = sb.getStrings();
return docText;
}
/**
* Fetches an HTML document from a URL and converts it into plain text.
*
* @param url URL of HTML document
* @return plain text or <code>null</code> if the fetching or conversion failed
*/
public static synchronized String url2text(String url) throws SocketTimeoutException {
// connect to URL
URLConnection conn = null;
try {
conn = (new URL(url)).openConnection();
if (!(conn instanceof HttpURLConnection)) return null; // only allow HTTP connections
} catch (IOException e) {
return null;
}
conn.setRequestProperty("User-agent","Mozilla/4.0"); // pretend to be a browser
conn.setConnectTimeout(TIMEOUT);
conn.setReadTimeout(TIMEOUT);
// fetch URL and convert HTML document
StringBean sb = new StringBean();
sb.setLinks(false); // no links
sb.setReplaceNonBreakingSpaces(true); // replace non-breaking spaces
sb.setCollapse(true); // replace sequences of whitespaces
sb.setConnection(conn);
String docText = sb.getStrings();
return docText;
}
// // simple conversion using standard API components
// public static String url2text(String url) {
// EditorKit kit = new HTMLEditorKit();
// Document doc = kit.createDefaultDocument();
// doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE); // Document does not handle charsets properly
//
// try {
// // create reader on HTML content
// URLConnection conn = (HttpURLConnection) (new URL(url)).openConnection();
// conn.setRequestProperty("User-agent","Mozilla/4.0"); // pretend to be a browser
// BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), "ISO-8859-1"));//, "UTF-8"));
//
// // parse HTML content
// kit.read(br, doc, 0);
//
// return doc.getText(0, doc.getLength());
// } catch (Exception e) {
// // print HTTP error message
// MsgPrinter.printHttpError(e.toString());
// return null;
// }
// }
}