// BlogBridge -- RSS feed reader, manager, and web based service // Copyright (C) 2002-2006 by R. Pito Salas // // This program is free software; you can redistribute it and/or modify it under // the terms of the GNU General Public License as published by the Free Software Foundation; // either version 2 of the License, or (at your option) any later version. // // This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; // without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. // See the GNU General Public License for more details. // // You should have received a copy of the GNU General Public License along with this program; // if not, write to the Free Software Foundation, Inc., 59 Temple Place, // Suite 330, Boston, MA 02111-1307 USA // // Contact: R. Pito Salas // mailto:pitosalas@users.sourceforge.net // More information: about BlogBridge // http://www.blogbridge.com // http://sourceforge.net/projects/blogbridge // // $Id: TextProcessor.java,v 1.15 2008/04/04 14:03:27 spyromus Exp $ // package com.salas.bb.utils.swinghtml; import com.salas.bb.utils.Constants; import com.salas.bb.utils.htmlparser.HtmlParser; import com.salas.bb.utils.htmlparser.IHtmlParserListener; import com.salas.bb.utils.htmlparser.utils.StringBuilderListener; import java.io.IOException; import java.io.StringReader; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Simple text processor utility which is using <code>HtmlParser</code> and company to prepare * any HTML source for displaing. * <p> * There's Swing and non-Swing modes available. When in Swing mode the processor will prepare * text to be correctly displayed in Swing components, while when in non-Swing mode it will * simply clean and straighten things up. */ public final class TextProcessor { private static final Pattern PAT_ALL_TAGS = Pattern.compile("<[^>]+>"); private static final Pattern PAT_ENTITIES = Pattern.compile("&[a-zA-Z]{2,6};", Pattern.CASE_INSENSITIVE); private static final Pattern PAT_NUMERIC_ENTITIES = Pattern.compile("&#(x?[0-9a-fA-F]{1,4})(;|(\\s)|$)", Pattern.CASE_INSENSITIVE); private static final Pattern PAT_QUOT = Pattern.compile(""(;|(\\s)|$)", Pattern.CASE_INSENSITIVE); private static final Pattern PAT_NBSP = Pattern.compile(" (;|(\\s)|$)", Pattern.CASE_INSENSITIVE); private static final Pattern PAT_APOS = Pattern.compile("&apos(;|(\\s)|$)", Pattern.CASE_INSENSITIVE); private static final Pattern PAT_AMP = Pattern.compile("&(;|(\\s)|$)", Pattern.CASE_INSENSITIVE); private static final Pattern PAT_LT = Pattern.compile("<(;|(\\s)|$)", Pattern.CASE_INSENSITIVE); private static final Pattern PAT_GT = Pattern.compile(">(;|(\\s)|$)", Pattern.CASE_INSENSITIVE); public static final Pattern PAT_BACKGROUND_ATTR = Pattern.compile("(<[^>]+)background\\s*=\\s*('[\\s]*'|\"[\\s]*\")([^>]*>)", Pattern.CASE_INSENSITIVE); private static final Pattern PATTERN = Pattern.compile("&((nbsp)|(lt)|(gt)|(apos)|(quot)|(amp));"); private static final String[] REPLACEMENT = { " ", "<", ">", "'", "\"", "&" }; /** * Hidden utility class constructor. */ private TextProcessor() { } /** * Process text to return clean HTML text no longer than limit. * * @param text text. * @param sizeLimit limit in characters. * * @return HTML text. */ public static String processHTML(String text, int sizeLimit) { return process(text, sizeLimit, true); } /** * Process text to return clean plain text no longer than limit. * * @param text text. * @param sizeLimit limit in characters. * * @return plain text. */ public static String processPlain(String text, int sizeLimit) { return process(text, sizeLimit, false); } /** * Process source text with several flags. * * @param aText original text. * @param sizeLimit size limitation in chars. * @param html FALSE to remove styling markup, but leave structural (p, blockquotes...). * * @return result. */ static String process(String aText, int sizeLimit, boolean html) { if (aText == null) return null; IHtmlParserListener listener; HtmlParser parser = new HtmlParser(true); StringBuilderListener bufListener = new StringBuilderListener(aText.length(), sizeLimit); listener = html ? new SwingHtmlFilter(bufListener) : new SwingPlainFilter(bufListener); try { parser.parse(new StringReader(aText), listener); } catch (IOException e) { // OK. Buffer will be empty. } String result = bufListener.toString(); result = PAT_BACKGROUND_ATTR.matcher(result).replaceAll("$1$3".intern()); result = convertEntity(result, "&mdash(;|(\\s)|$)".intern(), "\u2014".intern()); // of course we have to put here more entities, but it isn't clear which of them return result; } /** * Converts numeric entities (&#YYYY, &#YY, &#xYYYY or others) into Unicode chars. * * @param text original text. * * @return output. */ public static String convertNumericHTMLEntities(String text) { if (text == null) return null; Matcher m = PAT_NUMERIC_ENTITIES.matcher(text); StringBuffer buf = new StringBuffer(); while (m.find()) { String value = m.group(1); if (value.length() > 0) { char c; String replacement; try { if (value.toLowerCase().charAt(0) == 'x') { c = (char)Integer.parseInt(value.substring(1), 16); } else { c = (char)Integer.parseInt(value); } char[] chars; if (c == 92 || c == 36) { chars = new char[] { '\\', c }; } else { chars = new char[] { c }; } replacement = new String(chars); } catch (NumberFormatException e) { replacement = Constants.EMPTY_STRING; } String tail = m.group(3); if (tail != null) replacement += tail; m.appendReplacement(buf, replacement); } } m.appendTail(buf); return buf.toString(); } /** * Converts &amp;, &apos;, &nbsp; and &quot; entities in quivalents. * * @param text source text. * * @return result. */ public static String convertHTMLEntities(String text) { if (text == null) return null; text = convertEntity(text, PAT_AMP, "&"); text = convertEntity(text, PAT_APOS, "'"); text = convertEntity(text, PAT_NBSP, " "); text = convertEntity(text, PAT_QUOT, "\""); text = convertEntity(text, PAT_LT, "<"); text = convertEntity(text, PAT_GT, ">"); return text; } /** * Replaces entity with a corresponding replacement. The tail is taken from second group. * * @param text source text. * @param pattern pattern. * @param replacement replacement. * * @return result. */ private static String convertEntity(String text, String pattern, String replacement) { return convertEntity(text, Pattern.compile(pattern), replacement); } /** * Replaces entity with a corresponding replacement. The tail is taken from second group. * * @param text source text. * @param pattern pattern. * @param replacement replacement. * * @return result. */ private static String convertEntity(String text, Pattern pattern, String replacement) { if (text == null) return null; Matcher m = pattern.matcher(text); StringBuffer buf = new StringBuffer(); while (m.find()) { String rep = replacement; String tail = m.group(2); if (tail != null) rep += tail; m.appendReplacement(buf, rep); } m.appendTail(buf); return buf.toString(); } /** * Replaces all detected entities with space. * * @param text source text. * * @return result. */ public static String removeHTMLEntities(String text) { return text == null ? null : PAT_ENTITIES.matcher(text).replaceAll(" ".intern()); } /** * Removes all found tags from text. * * @param text source text. * * @return result. */ public static String removeTags(String text) { return text == null ? null : PAT_ALL_TAGS.matcher(text).replaceAll(Constants.EMPTY_STRING); } /** * Returns the excerpt from text. The excerpt is a text, having no more than defined number * of distinct words and doesn't spanning across multiple lines. The exceprpt may be terminated * with new line or natural sentense terminator: '.', '!' or '?'. * * @param text text to get excerpt from. * @param words maximum number of words. * * @return result. */ public static String getExcerpt(String text, int words) { if (text == null) return null; int i = 0; int length = text.length(); // Find letter or digit boolean found = false; while (i < length && !(found = !Character.isWhitespace(text.charAt(i)))) i++; String excerpt = null; if (found) { int start = i; int end = -1; int count = 0; boolean isInWord = false; // Spin until we encounter new line, sentense terminator, the end of text or // number of words will match the limit. while (excerpt == null && count <= words && i < length) { char ch = text.charAt(i); if (ch == 0x0a || ch == 0x0d) { // New line found -- get the excerpt excerpt = text.substring(start, i) + "..."; } else if (ch == '.' || ch == '!' || ch == '?') { // Sentence terminator found -- get the excerpt excerpt = text.substring(start, i + 1); } else { // Some other character if (Character.isLetterOrDigit(ch)) { if (!isInWord) { count++; isInWord = true; } } else { isInWord = false; end = i; } } i++; } // If we still don't have the excerpt it means that we have reached the end // of text. Take everything from it and finish. if (excerpt == null) excerpt = text.substring(start, i == length ? i : end); if (count > words) excerpt += "..."; } else excerpt = Constants.EMPTY_STRING; return excerpt; } /** * Cleans the title of the article. Converts known HTML entities into strings, * removes unknown HTML entities and tags. If title isn't present, some words * from the head of the text will be returned. * * @param title title to process. * @param text article text to get title from if title isn't defined. * * @return title of the article. */ public static String filterTitle(String title, String text) { boolean excerptRequired = false; if (title == null && text != null) { title = text; excerptRequired = true; } if (title != null) { title = removeTags(title); title = convertNumericHTMLEntities(title); title = convertHTMLEntities(title); title = removeHTMLEntities(title); if (excerptRequired) { title = getExcerpt(title, Constants.WORDS_IN_EXCERPT); } } return title; } /** * Cleans the text of the article. Removes all garbage and constructs which * aren't understood by Swing and its renderers. If text isn't present null * will be returned. * * @param text original text. * * @return filtered text. */ public static String filterText(String text) { if (text == null) return null; text = text.replaceAll("<style[^>]*>[^<]*</style>", ""); text = processHTML(text, Constants.ARTICLE_SIZE_LIMIT); text = removeLeadingParagraphs(text); return text; } /** * Removes all leading paragraph signs to avoid unnecessary spacing. * * @param text text to process. * * @return results. */ static String removeLeadingParagraphs(String text) { return text.replaceFirst("^(\\s*<[pP]>)+", "").trim(); } /** * Converts raw text to plain. * * @param html raw HTML. * * @return plain text version. */ public static String toPlainText(String html) { String result; Matcher m = PATTERN.matcher(html); if (m.find()) { StringBuffer sb = new StringBuffer(); do { int matchIndex = findMatchIndex(m); m.appendReplacement(sb, REPLACEMENT[matchIndex]); } while (m.find()); m.appendTail(sb); result = sb.toString(); } else { result = html; } return result; } /** * Finds the matching group number. * * @param m matcher. * * @return group index. */ private static int findMatchIndex(Matcher m) { for (int i = 2; i <= m.groupCount(); i++) { String val = m.group(i); if (val != null) return i - 2; } return 0; } }