// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2004 Derrick Oswald // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans/FilterBean.java,v $ // $Author: derrickoswald $ // $Date: 2005/09/18 23:40:44 $ // $Revision: 1.4 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.beans; import java.beans.PropertyChangeListener; import java.beans.PropertyChangeSupport; import java.io.Serializable; import java.net.URLConnection; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import org.htmlparser.util.EncodingChangeException; /** * Extract nodes from a URL using a filter. * <pre> * <code> * FilterBean fb = new FilterBean ("http://cbc.ca"); * fb.setFilters (new NodeFilter[] { new TagNameFilter ("META") }); * fb.setURL ("http://cbc.ca"); * System.out.println (fb.getNodes ().toHtml ()); * </code> * </pre> */ public class FilterBean implements Serializable { /** * Property name in event where the URL contents changes. */ public static final String PROP_NODES_PROPERTY = "nodes"; /** * Property name in event where the URL contents changes. */ public static final String PROP_TEXT_PROPERTY = "text"; /** * Property name in event where the URL changes. */ public static final String PROP_URL_PROPERTY = "URL"; /** * Property name in event where the connection changes. */ public static final String PROP_CONNECTION_PROPERTY = "connection"; /** * Bound property support. */ protected PropertyChangeSupport mPropertySupport; /** * The parser used to filter. */ protected Parser mParser; /** * The filter set. */ protected NodeFilter[] mFilters; /** * The nodes extracted from the URL. */ protected NodeList mNodes; /** * The recursion behaviour for elements of the filter array. * If <code>true</code> the filters are applied recursively. * @see org.htmlparser.util.NodeList#extractAllNodesThatMatch(NodeFilter, boolean). */ protected boolean mRecursive; /** * Create a FilterBean object. */ public FilterBean () { mPropertySupport = new PropertyChangeSupport (this); mParser = new Parser (); mFilters = null; mNodes = null; mRecursive = true; } // // internals // /** * Assign the <code>Nodes</code> property, firing the property change. * @param nodes The new value of the <code>Nodes</code> property. */ protected void updateNodes (NodeList nodes) { NodeList oldValue; String oldText; String newText; if ((null == mNodes) || !mNodes.equals (nodes)) { oldValue = mNodes; if (null != oldValue) oldText = getText (); else oldText = ""; if (null == oldText) oldText = ""; mNodes = nodes; if (null != mNodes) // TODO: fix this null problem newText = getText (); else // StringBean finds no nodes newText = ""; if (null == newText) newText = ""; mPropertySupport.firePropertyChange ( PROP_NODES_PROPERTY, oldValue, nodes); if (!newText.equals (oldText)) mPropertySupport.firePropertyChange ( PROP_TEXT_PROPERTY, oldText, newText); } } /** * Apply each of the filters. * The first filter is applied to the output of the parser. * Subsequent filters are applied to the output of the prior filter. * @return A list of nodes passed through all filters. * If there are no filters, returns the entire page. * @throws ParserException If an encoding change occurs * or there is some other problem. */ protected NodeList applyFilters () throws ParserException { NodeFilter[] filters; NodeList ret; ret = mParser.parse (null); filters = getFilters (); if (null != filters) for (int i = 0; i < filters.length; i++) ret = ret.extractAllNodesThatMatch (filters[i], mRecursive); return (ret); } /** * Fetch the URL contents and filter it. * Only do work if there is a valid parser with it's URL set. */ protected void setNodes () { NodeList list; if (null != getURL ()) try { list = applyFilters (); updateNodes (list); } catch (EncodingChangeException ece) { try { // try again with the encoding now in force mParser.reset (); list = applyFilters (); updateNodes (list); } catch (ParserException pe) { updateNodes (new NodeList ()); } } catch (ParserException pe) { updateNodes (new NodeList ()); } } // // Property change support. // /** * Add a PropertyChangeListener to the listener list. * The listener is registered for all properties. * @param listener The PropertyChangeListener to be added. */ public void addPropertyChangeListener (PropertyChangeListener listener) { mPropertySupport.addPropertyChangeListener (listener); } /** * Remove a PropertyChangeListener from the listener list. * This removes a registered PropertyChangeListener. * @param listener The PropertyChangeListener to be removed. */ public void removePropertyChangeListener (PropertyChangeListener listener) { mPropertySupport.removePropertyChangeListener (listener); } // // Properties // /** * Return the nodes of the URL matching the filter. * This is the primary output of the bean. * @return The nodes from the URL matching the current filter. */ public NodeList getNodes () { if (null == mNodes) setNodes (); return (mNodes); } /** * Get the current URL. * @return The URL from which text has been extracted, or <code>null</code> * if this property has not been set yet. */ public String getURL () { return ((null != mParser) ? mParser.getURL () : null); } /** * Set the URL to extract strings from. * The text from the URL will be fetched, which may be expensive, so this * property should be set last. * @param url The URL that text should be fetched from. */ public void setURL (String url) { String old; URLConnection conn; old = getURL (); conn = getConnection (); if (((null == old) && (null != url)) || ((null != old) && !old.equals (url))) { try { if (null == mParser) mParser = new Parser (url); else mParser.setURL (url); mPropertySupport.firePropertyChange ( PROP_URL_PROPERTY, old, getURL ()); mPropertySupport.firePropertyChange ( PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ()); setNodes (); } catch (ParserException pe) { updateNodes (new NodeList ()); } } } /** * Get the current connection. * @return The connection that the parser has or <code>null</code> if it * hasn't been set or the parser hasn't been constructed yet. */ public URLConnection getConnection () { return ((null != mParser) ? mParser.getConnection () : null); } /** * Set the parser's connection. * The text from the URL will be fetched, which may be expensive, so this * property should be set last. * @param connection New value of property Connection. */ public void setConnection (URLConnection connection) { String url; URLConnection conn; url = getURL (); conn = getConnection (); if (((null == conn) && (null != connection)) || ((null != conn) && !conn.equals (connection))) { try { if (null == mParser) mParser = new Parser (connection); else mParser.setConnection (connection); mPropertySupport.firePropertyChange ( PROP_URL_PROPERTY, url, getURL ()); mPropertySupport.firePropertyChange ( PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ()); setNodes (); } catch (ParserException pe) { updateNodes (new NodeList ()); } } } /** * Get the current filter set. * @return The current filters. */ public NodeFilter[] getFilters () { return (mFilters); } /** * Set the filters for the bean. * If the parser has been set, it is reset and * the nodes are refetched with the new filters. * @param filters The filter set to use. */ public void setFilters (NodeFilter[] filters) { mFilters = filters; if (null != getParser ()) { getParser ().reset (); setNodes (); } } /** * Get the parser used to fetch nodes. * @return The parser used by the bean. */ public Parser getParser () { return (mParser); } /** * Set the parser for the bean. * The parser is used immediately to fetch the nodes, * which for a null filter means all the nodes * @param parser The parser to use. */ public void setParser (Parser parser) { mParser = parser; if (null != getFilters ()) setNodes (); } /** * Convenience method to apply a {@link StringBean} to the filter results. * This may yield duplicate or multiple text elements if the node list * contains nodes from two or more levels in the same nested tag heirarchy, * but if the node list contains only one tag, it provides access to the * text within the node. * @return The textual contents of the nodes that pass through the filter set, * as collected by the StringBean. */ public String getText () { NodeList list; StringBean sb; String ret; list = getNodes (); if (0 != list.size ()) { sb = new StringBean (); for (int i = 0; i < list.size (); i++) list.elementAt (i).accept (sb); ret = sb.getStrings (); } else ret = ""; return (ret); } /** * Get the current recursion behaviour. * @return The recursion (applies to children, children's children, etc) * behavior currently being used. */ public boolean getRecursive () { return (mRecursive); } /** * Set the recursion behaviour. * @param recursive If <code>true</code> the * <code>extractAllNodesThatMatch()</code> call is performed recursively. * @see org.htmlparser.util.NodeList#extractAllNodesThatMatch(NodeFilter, boolean). */ public void setRecursive (boolean recursive) { mRecursive = recursive; } /** * Unit test. * @param args Pass arg[0] as the URL to process, * and optionally a node name for filtering. */ public static void main (String[] args) { if (0 >= args.length) System.out.println ("Usage: java -classpath htmlparser.jar org.htmlparser.beans.FilterBean <http://whatever_url> [node name]"); else { FilterBean fb = new FilterBean (); if (1 < args.length) fb.setFilters (new NodeFilter[] { new org.htmlparser.filters.TagNameFilter (args[1]) }); fb.setURL (args[0]); //System.out.println (fb.getNodes ().toHtml ()); System.out.println (fb.getText ()); } } }