package org.caudexorigo.text;

import java.io.InputStream;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.caudexorigo.io.IOUtils;

/**
 * Converts an HTML fragment or document into plain text by removing markup,
 * decoding HTML entities and normalizing whitespace.
 */
public class HtmlStripper
{
	/**
	 * Openings of tags that should start a new line in the text output.
	 * The match is a plain prefix match, so e.g. {@code <p} also matches {@code <pre}.
	 */
	private static final Pattern BREAKER = Pattern.compile("(<blockquote|<center|<div|<p|<br|<h\\d|<ul|<dl|<ol|<hr|<table)", Pattern.CASE_INSENSITIVE);

	/** Whole xml/style/script elements (content included), then any remaining tag. */
	private static final Pattern MARKUP_CLEANER = Pattern.compile("<xml.*?xml>|<style.*?style>|<script.*?script>|<.*?>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

	/** Unicode space characters, plus tab and vertical tab, that are folded into a plain ASCII space. */
	private static final Pattern SPACE_CLEANER = Pattern.compile("[\\u2002-\\u200a\\u00a0\\u1680\\u000b\\u0020\\u202f\\u205f\\u3000\\u0009]");

	/** Line terminator characters that are folded into {@code \n}. */
	private static final Pattern NL_CLEANER = Pattern.compile("[\\u000a\\u000c\\u000d\\u0085\\u2028\\u2029]");

	/** Leading and trailing spaces/tabs of every line. */
	private static final Pattern TRIM_SPACE = Pattern.compile("^[ \t]+|[ \t]+$", Pattern.MULTILINE);

	/** Runs of two or more spaces/tabs. */
	private static final Pattern MULTI_SPACE = Pattern.compile("[ \t]{2,}");

	/** Runs of two or more line breaks. */
	private static final Pattern MULTI_LN = Pattern.compile("[\\x0B\n]{2,}");

	/**
	 * Strips the markup from {@code html} and returns the remaining text.
	 *
	 * <p>The steps are, in order: insert a line break before each block-level tag
	 * opening, replace markup with a space, unescape HTML entities, normalize
	 * space characters, normalize line terminators, trim every line, collapse
	 * repeated line breaks, collapse repeated spaces and trim the result.
	 *
	 * @param html the HTML text to process; may be {@code null}
	 * @return the extracted text, or an empty string when {@code html} is blank
	 *         according to {@link StringUtils#isBlank(CharSequence)}
	 */
	public static String strip(String html)
	{
		if (StringUtils.isBlank(html))
		{
			return "";
		}

		// "$1" re-inserts the tag opening that was actually matched at each position.
		// Building the replacement from the first match's group would rewrite every
		// block tag to the first one found. With no match the input is returned as is.
		Matcher blockTags = BREAKER.matcher(html);
		String withBreaks = blockTags.replaceAll("\n$1");

		String noMarkup = MARKUP_CLEANER.matcher(withBreaks).replaceAll(" ");

		// Entities are decoded only after the tags are gone, so text such as
		// "&lt;b&gt;" survives as literal "<b>" in the output.
		String unescaped = StringEscapeUtils.unescapeHtml4(noMarkup);

		String plainSpaces = SPACE_CLEANER.matcher(unescaped).replaceAll(" ");
		String plainNewlines = NL_CLEANER.matcher(plainSpaces).replaceAll("\n");
		String trimmedLines = TRIM_SPACE.matcher(plainNewlines).replaceAll("");
		String singleNewlines = MULTI_LN.matcher(trimmedLines).replaceAll("\n");

		return MULTI_SPACE.matcher(singleNewlines).replaceAll(" ").trim();
	}

	/**
	 * Small manual demo: downloads a fixed page and prints its stripped text
	 * to standard output.
	 *
	 * @param args ignored
	 * @throws Throwable if the page cannot be downloaded or read
	 */
	public static void main(String[] args) throws Throwable
	{
		String link = "http://technotes.blogs.sapo.pt/";
		URL url = new URL(link);

		String html;
		// Close the network stream even when reading fails.
		try (InputStream in = url.openStream())
		{
			html = IOUtils.toString(in);
		}

		System.out.println(strip(html));
	}
}