package org.meaningfulweb.util; import java.util.Arrays; import java.util.HashSet; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.StringUtils; public class HtmlExtractUtils { private static final Pattern HEAD = Pattern.compile("<head.*?>.*?</head>", Pattern.CASE_INSENSITIVE); private static final Pattern STYLE_SHEETS = Pattern.compile( "<style.*?>.*?</style>", Pattern.CASE_INSENSITIVE); private static final Pattern SCRIPTS = Pattern.compile( "<script.*?>.*?</script>", Pattern.CASE_INSENSITIVE); private static final Pattern TAGS = Pattern.compile("<.*?>"); private static final Pattern COMMENTS = Pattern.compile("<!--.*?-->"); private static final Pattern SPECIAL = Pattern.compile("&.*?;"); private static final Pattern NEWLINES = Pattern.compile("\n+"); private static final Pattern WHITESPACE = Pattern.compile("\\s+"); private static final Set<String> attributes = new HashSet<String>(); static { String[] validAttrs = new String[]{"abbr", "accept-charset", "accept", "accesskey", "action", "align", "alink", "alt", "archive", "axis", "background", "bgcolor", "border", "cellpadding", "cellspacing", "char", "charoff", "charset", "checked", "cite", "class", "classid", "clear", "code", "codebase", "codetype", "color", "cols", "colspan", "compact", "content", "coords", "data", "datetime", "declare", "defer", "dir", "disabled", "enctype", "face", "for", "frame", "frameborder", "headers", "height", "href", "hreflang", "hspace", "http-equiv", "id", "ismap", "label", "lang", "language", "link", "longdesc", "marginheight", "marginwidth", "maxlength", "media", "method", "multiple", "name", "nohref", "noresize", "noshade", "nowrap", "object", "onblur", "onchange", "onclick", "ondblclick", "onfocus", "onkeydown", "onkeypress", "onkeyup", "onload", "onmousedown", "onmousemove", "onmouseout", "onmouseover", "onmouseup", "onreset", "onselect", "onsubmit", "onunload", "profile", "prompt", "property", "readonly", "rel", "rev", "rows", "rowspan", "rules", "scheme", "scope", "scrolling", "selected", "shape", "size", "span", "src", "standby", "start", "style", "summary", "tabindex", "target", "text", "title", "type", "usemap", "valign", "value", "valuetype", "version", "vlink", "vspace", "width"}; attributes.addAll(Arrays.asList(validAttrs)); } public static String removeHead(String content) { // Remove any contiguous whitespace and replace with single space Matcher head = HEAD.matcher(content); while (head.find()) { content = head.replaceAll(" "); } return content; } public static String removeNewlines(String content) { // Remove new line characters, replace with spaces Matcher mnLines = NEWLINES.matcher(content); while (mnLines.find()) { content = mnLines.replaceAll(" "); } return content; } public static String removeStyleSheets(String content) { // Remove style tags & inclusive content Matcher mstyles = STYLE_SHEETS.matcher(content); while (mstyles.find()) { content = mstyles.replaceAll(""); } return content; } public static String removeScripts(String content) { // Remove script tags & inclusive content Matcher mscripts = SCRIPTS.matcher(content); while (mscripts.find()) { content = mscripts.replaceAll(""); } return content; } public static String removeTags(String content) { // Remove primary HTML tags Matcher mtags = TAGS.matcher(content); while (mtags.find()) { content = mtags.replaceAll(" "); } return content; } public static String removeComments(String content) { // Remove comment tags & inclusive content Matcher mcomments = COMMENTS.matcher(content); while (mcomments.find()) { content = mcomments.replaceAll(" "); } return content; } public static String removeSpecialCharacters(String content) { // Remove special characters, such as   Matcher msChars = SPECIAL.matcher(content); while (msChars.find()) { content = msChars.replaceAll(""); } return content; } public static String removeContiguousWhitespace(String content) { // Remove any contiguous whitespace and replace with single space Matcher endWhites = WHITESPACE.matcher(content); while (endWhites.find()) { content = endWhites.replaceAll(" "); } return StringUtils.trim(content); } public static String extractAllText(byte[] htmlBytes) { String content = new String(htmlBytes); content = removeNewlines(content); content = removeStyleSheets(content); content = removeScripts(content); content = removeTags(content); content = removeComments(content); content = removeSpecialCharacters(content); return content; } public static String extractBodyText(byte[] htmlBytes) { String content = new String(htmlBytes); content = removeNewlines(content); content = removeHead(content); content = removeStyleSheets(content); content = removeScripts(content); content = removeTags(content); content = removeComments(content); content = removeSpecialCharacters(content); return extractAllText(content.getBytes()); } public static boolean isValidAttribute(String name) { return attributes.contains(StringUtils.lowerCase(name)); } }