/* * SIP Communicator, the OpenSource Java VoIP and Instant Messaging client. * * Distributable under LGPL license. * See terms of license at gnu.org. */ package net.java.sip.communicator.util; import java.io.*; import javax.swing.text.html.*; import javax.swing.text.html.parser.*; /** * A utility class that allows to extract the text content of an html page * stripped from all formatting tags. * * @author Emil Ivov <emcho at sip-communicator.org> * @author Yana Stamcheva */ public class Html2Text { private static final Logger logger = Logger.getLogger(Html2Text.class); private static HTMLParserCallBack parser; /** * A utility method that allows to extract the text content of an html page * stripped from all formatting tags. Method is synchronized to avoid * concurrent access to the underlying html editor kit. * * @param html the html string that we will extract the text from. * @return the text content of the <tt>html</tt> parameter. */ public static synchronized String extractText(String html) { if (parser == null) parser = new HTMLParserCallBack(); try { StringReader in = new StringReader(html); parser.parse(in); in.close(); return parser.getText(); } catch (Exception exc) { logger.info("Failed to extract plain text from html="+html, exc); return html; } } /** * The ParserCallback that will parse the html. */ private static class HTMLParserCallBack extends HTMLEditorKit.ParserCallback { StringBuffer s; /** * Parses the text contained in the given reader. * * @param in the reader to parse. * @throws IOException thrown if we fail to parse the reader. */ public void parse(Reader in) throws IOException { s = new StringBuffer(); ParserDelegator delegator = new ParserDelegator(); // the third parameter is TRUE to ignore charset directive delegator.parse(in, this, Boolean.TRUE); } /** * Appends the given text to the string buffer. */ public void handleText(char[] text, int pos) { s.append(text); } /** * Returns the parsed text. */ public String getText() { return s.toString(); } } }