// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2004 Derrick Oswald // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans/StringBean.java,v $ // $Author: derrickoswald $ // $Date: 2005/05/15 11:49:03 $ // $Revision: 1.44 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.beans; import java.beans.PropertyChangeListener; import java.beans.PropertyChangeSupport; import java.io.Serializable; import java.net.URLConnection; import org.htmlparser.Parser; import org.htmlparser.Text; import org.htmlparser.tags.LinkTag; import org.htmlparser.Tag; import org.htmlparser.util.ParserException; import org.htmlparser.util.EncodingChangeException; import org.htmlparser.util.Translate; import org.htmlparser.visitors.NodeVisitor; /** * Extract strings from a URL. * <p>Text within <SCRIPT></SCRIPT> tags is removed.</p> * <p>The text within <PRE></PRE> tags is not altered.</p> * <p>The property <code>Strings</code>, which is the output property is null * until a URL is set. So a typical usage is:</p> * <pre> * StringBean sb = new StringBean (); * sb.setLinks (false); * sb.setReplaceNonBreakingSpaces (true); * sb.setCollapse (true); * sb.setURL ("http://www.netbeans.org"); // the HTTP is performed here * String s = sb.getStrings (); * </pre> * You can also use the StringBean as a NodeVisitor on your own parser, * in which case you have to refetch your page if you change one of the * properties because it resets the Strings property:</p> * <pre> * StringBean sb = new StringBean (); * Parser parser = new Parser ("http://cbc.ca"); * parser.visitAllNodesWith (sb); * String s = sb.getStrings (); * sb.setLinks (true); * parser.reset (); * parser.visitAllNodesWith (sb); * String sl = sb.getStrings (); * </pre> * According to Nick Burch, who contributed the patch, this is handy if you * don't want StringBean to wander off and get the content itself, either * because you already have it, it's not on a website etc. */ public class StringBean extends NodeVisitor implements Serializable { /** * Property name in event where the URL contents changes. */ public static final String PROP_STRINGS_PROPERTY = "strings"; /** * Property name in event where the 'embed links' state changes. */ public static final String PROP_LINKS_PROPERTY = "links"; /** * Property name in event where the URL changes. */ public static final String PROP_URL_PROPERTY = "URL"; /** * Property name in event where the 'replace non-breaking spaces' * state changes. */ public static final String PROP_REPLACE_SPACE_PROPERTY = "replaceNonBreakingSpaces"; /** * Property name in event where the 'collapse whitespace' state changes. */ public static final String PROP_COLLAPSE_PROPERTY = "collapse"; /** * Property name in event where the connection changes. */ public static final String PROP_CONNECTION_PROPERTY = "connection"; /** * A newline. */ private static final String NEWLINE = System.getProperty ("line.separator"); /** * The length of the NEWLINE. */ private static final int NEWLINE_SIZE = NEWLINE.length (); /** * Bound property support. */ protected PropertyChangeSupport mPropertySupport; /** * The parser used to extract strings. */ protected Parser mParser; /** * The strings extracted from the URL. */ protected String mStrings; /** * If <code>true</code> the link URLs are embedded in the text output. */ protected boolean mLinks; /** * If <code>true</code> regular space characters are substituted for * non-breaking spaces in the text output. */ protected boolean mReplaceSpace; /** * If <code>true</code> sequences of whitespace characters are replaced * with a single space character. */ protected boolean mCollapse; /** * The buffer text is stored in while traversing the HTML. */ protected StringBuffer mBuffer; /** * Set <code>true</code> when traversing a SCRIPT tag. */ protected boolean mIsScript; /** * Set <code>true</code> when traversing a PRE tag. */ protected boolean mIsPre; /** * Set <code>true</code> when traversing a STYLE tag. */ protected boolean mIsStyle; /** * Create a StringBean object. * Default property values are set to 'do the right thing': * <p><code>Links</code> is set <code>false</code> so text appears like a * browser would display it, albeit without the colour or underline clues * normally associated with a link.</p> * <p><code>ReplaceNonBreakingSpaces</code> is set <code>true</code>, so * that printing the text works, but the extra information regarding these * formatting marks is available if you set it false.</p> * <p><code>Collapse</code> is set <code>true</code>, so text appears * compact like a browser would display it.</p> */ public StringBean () { super (true, true); mPropertySupport = new PropertyChangeSupport (this); mParser = new Parser (); mStrings = null; mLinks = false; mReplaceSpace = true; mCollapse = true; mBuffer = new StringBuffer (4096); mIsScript = false; mIsPre = false; mIsStyle = false; } // // internals // /** * Appends a newline to the buffer if there isn't one there already. * Except if the buffer is empty. */ protected void carriageReturn () { int length; length = mBuffer.length (); if ((0 != length) // don't append newlines to the beginning of a buffer && ((NEWLINE_SIZE <= length) // not enough chars to hold a NEWLINE && (!mBuffer.substring ( length - NEWLINE_SIZE, length).equals (NEWLINE)))) mBuffer.append (NEWLINE); } /** * Add the given text collapsing whitespace. * Use a little finite state machine: * <pre> * state 0: whitepace was last emitted character * state 1: in whitespace * state 2: in word * A whitespace character moves us to state 1 and any other character * moves us to state 2, except that state 0 stays in state 0 until * a non-whitespace and going from whitespace to word we emit a space * before the character: * input: whitespace other-character * state\next * 0 0 2 * 1 1 space then 2 * 2 1 2 * </pre> * @param buffer The buffer to append to. * @param string The string to append. */ protected void collapse (StringBuffer buffer, String string) { int chars; int length; int state; char character; chars = string.length (); if (0 != chars) { length = buffer.length (); state = ((0 == length) || (buffer.charAt (length - 1) == ' ') || ((NEWLINE_SIZE <= length) && buffer.substring ( length - NEWLINE_SIZE, length).equals (NEWLINE))) ? 0 : 1; for (int i = 0; i < chars; i++) { character = string.charAt (i); switch (character) { // see HTML specification section 9.1 White space // http://www.w3.org/TR/html4/struct/text.html#h-9.1 case '\u0020': case '\u0009': case '\u000C': case '\u200B': case '\r': case '\n': if (0 != state) state = 1; break; default: if (1 == state) buffer.append (' '); state = 2; buffer.append (character); } } } } /** * Extract the text from a page. * @return The textual contents of the page. * @exception ParserException If a parse error occurs. */ protected String extractStrings () throws ParserException { String ret; mParser.visitAllNodesWith (this); ret = mBuffer.toString (); mBuffer = new StringBuffer(4096); return (ret); } /** * Assign the <code>Strings</code> property, firing the property change. * @param strings The new value of the <code>Strings</code> property. */ protected void updateStrings (String strings) { String oldValue; if ((null == mStrings) || !mStrings.equals (strings)) { oldValue = mStrings; mStrings = strings; mPropertySupport.firePropertyChange ( PROP_STRINGS_PROPERTY, oldValue, strings); } } /** * Fetch the URL contents. * Only do work if there is a valid parser with it's URL set. */ protected void setStrings () { if (null != getURL ()) try { try { mParser.visitAllNodesWith (this); updateStrings (mBuffer.toString ()); } finally { mBuffer = new StringBuffer (4096); } } catch (EncodingChangeException ece) { mIsPre = false; mIsScript = false; mIsStyle = false; try { // try again with the encoding now in force mParser.reset (); mBuffer = new StringBuffer (4096); mParser.visitAllNodesWith (this); updateStrings (mBuffer.toString ()); } catch (ParserException pe) { updateStrings (pe.toString ()); } finally { mBuffer = new StringBuffer (4096); } } catch (ParserException pe) { updateStrings (pe.toString ()); } else { // reset in case this StringBean is used as a visitor // on another parser, not it's own mStrings = null; mBuffer = new StringBuffer (4096); } } /** * Refetch the URL contents. * Only need to worry if there is already a valid parser and it's * been spent fetching the string contents. */ private void resetStrings () { if (null != mStrings) try { mParser.setURL (getURL ()); setStrings (); } catch (ParserException pe) { updateStrings (pe.toString ()); } } // // Property change support. // /** * Add a PropertyChangeListener to the listener list. * The listener is registered for all properties. * @param listener The PropertyChangeListener to be added. */ public void addPropertyChangeListener (PropertyChangeListener listener) { mPropertySupport.addPropertyChangeListener (listener); } /** * Remove a PropertyChangeListener from the listener list. * This removes a registered PropertyChangeListener. * @param listener The PropertyChangeListener to be removed. */ public void removePropertyChangeListener (PropertyChangeListener listener) { mPropertySupport.removePropertyChangeListener (listener); } // // Properties // /** * Return the textual contents of the URL. * This is the primary output of the bean. * @return The user visible (what would be seen in a browser) text. */ public String getStrings () { if (null == mStrings) if (0 == mBuffer.length ()) setStrings (); else updateStrings (mBuffer.toString ()); return (mStrings); } /** * Get the current 'include links' state. * @return <code>true</code> if link text is included in the text extracted * from the URL, <code>false</code> otherwise. */ public boolean getLinks () { return (mLinks); } /** * Set the 'include links' state. * If the setting is changed after the URL has been set, the text from the * URL will be reacquired, which is possibly expensive. * @param links Use <code>true</code> if link text is to be included in the * text extracted from the URL, <code>false</code> otherwise. */ public void setLinks (boolean links) { boolean oldValue = mLinks; if (oldValue != links) { mLinks = links; mPropertySupport.firePropertyChange ( PROP_LINKS_PROPERTY, oldValue, links); resetStrings (); } } /** * Get the current URL. * @return The URL from which text has been extracted, or <code>null</code> * if this property has not been set yet. */ public String getURL () { return ((null != mParser) ? mParser.getURL () : null); } /** * Set the URL to extract strings from. * The text from the URL will be fetched, which may be expensive, so this * property should be set last. * @param url The URL that text should be fetched from. */ public void setURL (String url) { String old; URLConnection conn; old = getURL (); conn = getConnection (); if (((null == old) && (null != url)) || ((null != old) && !old.equals (url))) { try { if (null == mParser) mParser = new Parser (url); else mParser.setURL (url); mPropertySupport.firePropertyChange ( PROP_URL_PROPERTY, old, getURL ()); mPropertySupport.firePropertyChange ( PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ()); setStrings (); } catch (ParserException pe) { updateStrings (pe.toString ()); } } } /** * Get the current 'replace non breaking spaces' state. * @return <code>true</code> if non-breaking spaces (character '\u00a0', * numeric character reference &#160; or character entity * reference &nbsp;) are to be replaced with normal * spaces (character '\u0020'). */ public boolean getReplaceNonBreakingSpaces () { return (mReplaceSpace); } /** * Set the 'replace non breaking spaces' state. * If the setting is changed after the URL has been set, the text from the * URL will be reacquired, which is possibly expensive. * @param replace <code>true</code> if non-breaking spaces * (character '\u00a0', numeric character reference &#160; * or character entity reference &nbsp;) are to be replaced with normal * spaces (character '\u0020'). */ public void setReplaceNonBreakingSpaces (boolean replace) { boolean oldValue = mReplaceSpace; if (oldValue != replace) { mReplaceSpace = replace; mPropertySupport.firePropertyChange (PROP_REPLACE_SPACE_PROPERTY, oldValue, replace); resetStrings (); } } /** * Get the current 'collapse whitespace' state. * If set to <code>true</code> this emulates the operation of browsers * in interpretting text where <quote>user agents should collapse input * white space sequences when producing output inter-word space</quote>. * See HTML specification section 9.1 White space * <a href="http://www.w3.org/TR/html4/struct/text.html#h-9.1"> * http://www.w3.org/TR/html4/struct/text.html#h-9.1</a>. * @return <code>true</code> if sequences of whitespace (space '\u0020', * tab '\u0009', form feed '\u000C', zero-width space '\u200B', * carriage-return '\r' and NEWLINE '\n') are to be replaced with a single * space. */ public boolean getCollapse () { return (mCollapse); } /** * Set the current 'collapse whitespace' state. * If the setting is changed after the URL has been set, the text from the * URL will be reacquired, which is possibly expensive. * @param collapse If <code>true</code>, sequences of whitespace * will be reduced to a single space. */ public void setCollapse (boolean collapse) { boolean oldValue = mCollapse; if (oldValue != collapse) { mCollapse = collapse; mPropertySupport.firePropertyChange ( PROP_COLLAPSE_PROPERTY, oldValue, collapse); resetStrings (); } } /** * Get the current connection. * @return The connection that the parser has or <code>null</code> if it * hasn't been set or the parser hasn't been constructed yet. */ public URLConnection getConnection () { return ((null != mParser) ? mParser.getConnection () : null); } /** * Set the parser's connection. * The text from the URL will be fetched, which may be expensive, so this * property should be set last. * @param connection New value of property Connection. */ public void setConnection (URLConnection connection) { String url; URLConnection conn; url = getURL (); conn = getConnection (); if (((null == conn) && (null != connection)) || ((null != conn) && !conn.equals (connection))) { try { if (null == mParser) mParser = new Parser (connection); else mParser.setConnection (connection); mPropertySupport.firePropertyChange ( PROP_URL_PROPERTY, url, getURL ()); mPropertySupport.firePropertyChange ( PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ()); setStrings (); } catch (ParserException pe) { updateStrings (pe.toString ()); } } } // // NodeVisitor overrides // /** * Appends the text to the output. * @param string The text node. */ public void visitStringNode (Text string) { if (!mIsScript && !mIsStyle) { String text = string.getText (); if (!mIsPre) { text = Translate.decode (text); if (getReplaceNonBreakingSpaces ()) text = text.replace ('\u00a0', ' '); if (getCollapse ()) collapse (mBuffer, text); else mBuffer.append (text); } else mBuffer.append (text); } } /** * Appends a NEWLINE to the output if the tag breaks flow, and * possibly sets the state of the PRE and SCRIPT flags. * @param tag The tag to examine. */ public void visitTag (Tag tag) { String name; if (tag instanceof LinkTag) if (getLinks ()) { // appends the link as text between angle brackets to the output. mBuffer.append ("<"); mBuffer.append (((LinkTag)tag).getLink ()); mBuffer.append (">"); } name = tag.getTagName (); if (name.equalsIgnoreCase ("PRE")) mIsPre = true; else if (name.equalsIgnoreCase ("SCRIPT")) mIsScript = true; else if (name.equalsIgnoreCase ("STYLE")) mIsStyle = true; if (tag.breaksFlow ()) carriageReturn (); } /** * Resets the state of the PRE and SCRIPT flags. * @param tag The end tag to process. */ public void visitEndTag (Tag tag) { String name; name = tag.getTagName (); if (name.equalsIgnoreCase ("PRE")) mIsPre = false; else if (name.equalsIgnoreCase ("SCRIPT")) mIsScript = false; else if (name.equalsIgnoreCase ("STYLE")) mIsStyle = false; } /** * Unit test. * @param args Pass arg[0] as the URL to process. */ public static void main (String[] args) { if (0 >= args.length) System.out.println ("Usage: java -classpath htmlparser.jar" + " org.htmlparser.beans.StringBean <http://whatever_url>"); else { StringBean sb = new StringBean (); sb.setLinks (false); sb.setReplaceNonBreakingSpaces (true); sb.setCollapse (true); sb.setURL (args[0]); System.out.println (sb.getStrings ()); } } }