// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans/FilterBean.java,v $
// $Author: derrickoswald $
// $Date: 2005/09/18 23:40:44 $
// $Revision: 1.4 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
package org.htmlparser.beans;
import java.beans.PropertyChangeListener;
import java.beans.PropertyChangeSupport;
import java.io.Serializable;
import java.net.URLConnection;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.EncodingChangeException;
/**
 * Extract nodes from a URL using a filter.
 * <pre>
 * <code>
 *     FilterBean fb = new FilterBean ();
 *     fb.setFilters (new NodeFilter[] { new TagNameFilter ("META") });
 *     fb.setURL ("http://cbc.ca");
 *     System.out.println (fb.getNodes ().toHtml ());
 * </code>
 * </pre>
 * The filters are set before the URL because setting the URL
 * (or the connection) triggers the fetch.
 */
public class FilterBean
    implements
        Serializable
{
    /**
     * Property name in event where the filtered node list changes.
     */
    public static final String PROP_NODES_PROPERTY = "nodes";

    /**
     * Property name in event where the text of the filtered nodes changes.
     */
    public static final String PROP_TEXT_PROPERTY = "text";

    /**
     * Property name in event where the URL changes.
     */
    public static final String PROP_URL_PROPERTY = "URL";

    /**
     * Property name in event where the connection changes.
     */
    public static final String PROP_CONNECTION_PROPERTY = "connection";

    /**
     * Bound property support.
     */
    protected PropertyChangeSupport mPropertySupport;

    /**
     * The parser used to filter.
     */
    protected Parser mParser;

    /**
     * The filter set. May be <code>null</code>, meaning no filtering.
     */
    protected NodeFilter[] mFilters;

    /**
     * The nodes extracted from the URL,
     * or <code>null</code> if nothing has been fetched yet.
     */
    protected NodeList mNodes;

    /**
     * The recursion behaviour for elements of the filter array.
     * If <code>true</code> the filters are applied recursively.
     * @see org.htmlparser.util.NodeList#extractAllNodesThatMatch(NodeFilter, boolean)
     */
    protected boolean mRecursive;

    /**
     * Create a FilterBean object.
     * The bean starts with a default parser, no filters, no nodes
     * and recursive filtering turned on.
     */
    public FilterBean ()
    {
        mPropertySupport = new PropertyChangeSupport (this);
        mParser = new Parser ();
        mFilters = null;
        mNodes = null;
        mRecursive = true;
    }

    //
    // internals
    //

    /**
     * Assign the <code>Nodes</code> property, firing the property change.
     * A {@link #PROP_NODES_PROPERTY} event is fired whenever the stored list
     * is replaced; a {@link #PROP_TEXT_PROPERTY} event follows only if the
     * text of the new list differs from the text of the old one.
     * @param nodes The new value of the <code>Nodes</code> property.
     */
    protected void updateNodes (NodeList nodes)
    {
        NodeList oldValue;
        String oldText;
        String newText;

        if ((null == mNodes) || !mNodes.equals (nodes))
        {
            oldValue = mNodes;
            // getText() goes through getNodes(), which starts a fetch when
            // mNodes is null, so it is only consulted for a non-null list
            oldText = (null != oldValue) ? getText () : "";
            if (null == oldText)
                oldText = "";
            mNodes = nodes;
            newText = (null != mNodes) ? getText () : "";
            if (null == newText)
                newText = "";
            mPropertySupport.firePropertyChange (
                PROP_NODES_PROPERTY, oldValue, nodes);
            if (!newText.equals (oldText))
                mPropertySupport.firePropertyChange (
                    PROP_TEXT_PROPERTY, oldText, newText);
        }
    }

    /**
     * Apply each of the filters.
     * The first filter is applied to the output of the parser.
     * Subsequent filters are applied to the output of the prior filter.
     * @return A list of nodes passed through all filters.
     * If there are no filters, returns the entire page.
     * @throws ParserException If an encoding change occurs
     * or there is some other problem.
     */
    protected NodeList applyFilters ()
        throws
            ParserException
    {
        NodeFilter[] filters;
        NodeList ret;

        // a null filter asks the parser for everything
        ret = mParser.parse (null);
        filters = getFilters ();
        if (null != filters)
        {
            for (int i = 0; i < filters.length; i++)
                ret = ret.extractAllNodesThatMatch (filters[i], mRecursive);
        }

        return (ret);
    }

    /**
     * Fetch the URL contents and filter it.
     * Only do work if there is a valid parser with its URL set.
     * If the encoding changes part way through, the parser is reset and the
     * filters are applied once more; any other (or repeated) parser failure
     * results in an empty node list.
     */
    protected void setNodes ()
    {
        NodeList list;

        if (null == getURL ())
            return;

        try
        {
            try
            {
                list = applyFilters ();
            }
            catch (EncodingChangeException ece)
            {
                // try again with the encoding now in force
                mParser.reset ();
                list = applyFilters ();
            }
        }
        catch (ParserException pe)
        {
            // best effort: a page that cannot be parsed yields no nodes
            list = new NodeList ();
        }
        updateNodes (list);
    }

    //
    // Property change support.
    //

    /**
     * Add a PropertyChangeListener to the listener list.
     * The listener is registered for all properties.
     * @param listener The PropertyChangeListener to be added.
     */
    public void addPropertyChangeListener (PropertyChangeListener listener)
    {
        mPropertySupport.addPropertyChangeListener (listener);
    }

    /**
     * Remove a PropertyChangeListener from the listener list.
     * This removes a registered PropertyChangeListener.
     * @param listener The PropertyChangeListener to be removed.
     */
    public void removePropertyChangeListener (PropertyChangeListener listener)
    {
        mPropertySupport.removePropertyChangeListener (listener);
    }

    //
    // Properties
    //

    /**
     * Return the nodes of the URL matching the filter.
     * This is the primary output of the bean.
     * If no nodes have been fetched yet, a fetch is attempted first.
     * @return The nodes from the URL matching the current filter,
     * or <code>null</code> if there is no URL to fetch them from.
     */
    public NodeList getNodes ()
    {
        if (null == mNodes)
            setNodes ();

        return (mNodes);
    }

    /**
     * Get the current URL.
     * @return The URL from which text has been extracted, or <code>null</code>
     * if this property has not been set yet.
     */
    public String getURL ()
    {
        return ((null != mParser) ? mParser.getURL () : null);
    }

    /**
     * Set the URL to extract strings from.
     * The text from the URL will be fetched, which may be expensive, so this
     * property should be set last.
     * Nothing happens if the URL is the same as the current one.
     * If the parser cannot be pointed at the URL the node list becomes empty.
     * @param url The URL that text should be fetched from.
     */
    public void setURL (String url)
    {
        String old;
        URLConnection conn;

        old = getURL ();
        conn = getConnection ();
        if (((null == old) && (null != url)) || ((null != old)
            && !old.equals (url)))
        {
            try
            {
                if (null == mParser)
                    mParser = new Parser (url);
                else
                    mParser.setURL (url);
                mPropertySupport.firePropertyChange (
                    PROP_URL_PROPERTY, old, getURL ());
                mPropertySupport.firePropertyChange (
                    PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ());
                setNodes ();
            }
            catch (ParserException pe)
            {
                // best effort: an unusable URL yields no nodes
                updateNodes (new NodeList ());
            }
        }
    }

    /**
     * Get the current connection.
     * @return The connection that the parser has or <code>null</code> if it
     * hasn't been set or the parser hasn't been constructed yet.
     */
    public URLConnection getConnection ()
    {
        return ((null != mParser) ? mParser.getConnection () : null);
    }

    /**
     * Set the parser's connection.
     * The text from the URL will be fetched, which may be expensive, so this
     * property should be set last.
     * Nothing happens if the connection equals the current one.
     * If the parser cannot use the connection the node list becomes empty.
     * @param connection New value of property Connection.
     */
    public void setConnection (URLConnection connection)
    {
        String url;
        URLConnection conn;

        url = getURL ();
        conn = getConnection ();
        if (((null == conn) && (null != connection)) || ((null != conn)
            && !conn.equals (connection)))
        {
            try
            {
                if (null == mParser)
                    mParser = new Parser (connection);
                else
                    mParser.setConnection (connection);
                mPropertySupport.firePropertyChange (
                    PROP_URL_PROPERTY, url, getURL ());
                mPropertySupport.firePropertyChange (
                    PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ());
                setNodes ();
            }
            catch (ParserException pe)
            {
                // best effort: an unusable connection yields no nodes
                updateNodes (new NodeList ());
            }
        }
    }

    /**
     * Get the current filter set.
     * @return The current filters, or <code>null</code> if none are set.
     * The bean's own array is returned, not a copy.
     */
    public NodeFilter[] getFilters ()
    {
        return (mFilters);
    }

    /**
     * Set the filters for the bean.
     * If the parser has been set, it is reset and
     * the nodes are refetched with the new filters.
     * @param filters The filter set to use.
     * The array is stored as is, not copied.
     */
    public void setFilters (NodeFilter[] filters)
    {
        mFilters = filters;
        if (null != getParser ())
        {
            getParser ().reset ();
            setNodes ();
        }
    }

    /**
     * Get the parser used to fetch nodes.
     * @return The parser used by the bean,
     * or <code>null</code> if it has been set to <code>null</code>.
     */
    public Parser getParser ()
    {
        return (mParser);
    }

    /**
     * Set the parser for the bean.
     * If a filter set is in place, the parser is used immediately to fetch
     * the nodes. With no filter set nothing is fetched here and any
     * previously fetched nodes are left as they are.
     * @param parser The parser to use.
     */
    public void setParser (Parser parser)
    {
        mParser = parser;
        if (null != getFilters ())
            setNodes ();
    }

    /**
     * Convenience method to apply a {@link StringBean} to the filter results.
     * This may yield duplicate or multiple text elements if the node list
     * contains nodes from two or more levels in the same nested tag hierarchy,
     * but if the node list contains only one tag, it provides access to the
     * text within the node.
     * @return The textual contents of the nodes that pass through the filter set,
     * as collected by the StringBean. The empty string is returned when there
     * are no nodes, including the case where no URL has been set yet.
     */
    public String getText ()
    {
        NodeList list;
        StringBean sb;
        String ret;

        ret = "";
        list = getNodes ();
        // getNodes() yields null when there is no URL to fetch from
        if ((null != list) && (0 != list.size ()))
        {
            sb = new StringBean ();
            for (int i = 0; i < list.size (); i++)
                list.elementAt (i).accept (sb);
            ret = sb.getStrings ();
        }

        return (ret);
    }

    /**
     * Get the current recursion behaviour.
     * @return The recursion (applies to children, children's children, etc)
     * behavior currently being used.
     */
    public boolean getRecursive ()
    {
        return (mRecursive);
    }

    /**
     * Set the recursion behaviour.
     * The new setting is used the next time the filters are applied;
     * nodes that have already been fetched are not refiltered here.
     * @param recursive If <code>true</code> the
     * <code>extractAllNodesThatMatch()</code> call is performed recursively.
     * @see org.htmlparser.util.NodeList#extractAllNodesThatMatch(NodeFilter, boolean)
     */
    public void setRecursive (boolean recursive)
    {
        mRecursive = recursive;
    }

    /**
     * Unit test.
     * Prints the text of the nodes fetched from the URL.
     * @param args Pass arg[0] as the URL to process,
     * and optionally a node name for filtering.
     */
    public static void main (String[] args)
    {
        if (0 >= args.length)
            System.out.println ("Usage: java -classpath htmlparser.jar org.htmlparser.beans.FilterBean <http://whatever_url> [node name]");
        else
        {
            FilterBean fb = new FilterBean ();
            // filters go in first so that setting the URL fetches only once
            if (1 < args.length)
                fb.setFilters (new NodeFilter[] { new org.htmlparser.filters.TagNameFilter (args[1]) });
            fb.setURL (args[0]);
            System.out.println (fb.getText ());
        }
    }
}