// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2003 Derrick Oswald // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/filters/RegexFilter.java,v $ // $Author: derrickoswald $ // $Date: 2005/05/15 11:49:04 $ // $Revision: 1.4 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.filters; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.Text; /** * This filter accepts all string nodes matching a regular expression. * Because this searches {@link org.htmlparser.Text Text} nodes. it is * only useful for finding small fragments of text, where it is * unlikely to be broken up by a tag. To find large fragments of text * you should convert the page to plain text with something like the * {@link org.htmlparser.beans.StringBean StringBean} and then apply * the regular expression. * <p> * For example, to look for dates use: * <pre> * (19|20)\d\d([- \\/.](0[1-9]|1[012])[- \\/.](0[1-9]|[12][0-9]|3[01]))? * </pre> * as in: * <pre> * Parser parser = new Parser ("http://cbc.ca"); * RegexFilter filter = new RegexFilter ("(19|20)\\d\\d([- \\\\/.](0[1-9]|1[012])[- \\\\/.](0[1-9]|[12][0-9]|3[01]))?"); * NodeIterator iterator = parser.extractAllNodesThatMatch (filter).elements (); * </pre> * which matches a date in yyyy-mm-dd format between 1900-01-01 and 2099-12-31, * with a choice of five separators, either a dash, a space, either kind of * slash or a period. * The year is matched by (19|20)\d\d which uses alternation to allow the * either 19 or 20 as the first two digits. The round brackets are mandatory. * The month is matched by 0[1-9]|1[012], again enclosed by round brackets * to keep the two options together. By using character classes, the first * option matches a number between 01 and 09, and the second * matches 10, 11 or 12. * The last part of the regex consists of three options. The first matches * the numbers 01 through 09, the second 10 through 29, and the third matches 30 or 31. * The day and month are optional, but must occur together because of the ()? * bracketing after the year. */ public class RegexFilter implements NodeFilter { /** * Use match() matching strategy. */ public static final int MATCH = 1; /** * Use lookingAt() match strategy. */ public static final int LOOKINGAT = 2; /** * Use find() match strategy. */ public static final int FIND = 3; /** * The regular expression to search for. */ protected String mPatternString; /** * The compiled regular expression to search for. */ protected Pattern mPattern; /** * The match strategy. * @see #RegexFilter(String, int) */ protected int mStrategy; /** * Creates a new instance of RegexFilter that accepts string nodes matching * the regular expression ".*" using the FIND strategy. */ public RegexFilter () { this (".*", FIND); } /** * Creates a new instance of RegexFilter that accepts string nodes matching * a regular expression using the FIND strategy. * @param pattern The pattern to search for. */ public RegexFilter (String pattern) { this (pattern, FIND); } /** * Creates a new instance of RegexFilter that accepts string nodes matching * a regular expression. * @param pattern The pattern to search for. * @param strategy The type of match: * <ol> * <li>{@link #MATCH} use matches() method: attempts to match * the entire input sequence against the pattern</li> * <li>{@link #LOOKINGAT} use lookingAt() method: attempts to match * the input sequence, starting at the beginning, against the pattern</li> * <li>{@link #FIND} use find() method: scans the input sequence looking * for the next subsequence that matches the pattern</li> * </ol> */ public RegexFilter (String pattern, int strategy) { setPattern (pattern); setStrategy (strategy); } /** * Get the search pattern. * @return Returns the pattern. */ public String getPattern () { return (mPatternString); } /** * Set the search pattern. * @param pattern The pattern to set. */ public void setPattern (String pattern) { mPatternString = pattern; mPattern = Pattern.compile (pattern); } /** * Get the search strategy. * @return Returns the strategy. */ public int getStrategy () { return (mStrategy); } /** * Set the search pattern. * @param strategy The strategy to use. One of MATCH, LOOKINGAT or FIND. */ public void setStrategy (int strategy) { if ((strategy != MATCH) && (strategy != LOOKINGAT) && (strategy != FIND)) throw new IllegalArgumentException ("illegal strategy (" + strategy + ")"); mStrategy = strategy; } /** * Accept string nodes that match the regular expression. * @param node The node to check. * @return <code>true</code> if the regular expression matches the * text of the node, <code>false</code> otherwise. */ public boolean accept (Node node) { String string; Matcher matcher; boolean ret; ret = false; if (node instanceof Text) { string = ((Text)node).getText (); matcher = mPattern.matcher (string); switch (mStrategy) { case MATCH: ret = matcher.matches (); break; case LOOKINGAT: ret = matcher.lookingAt (); break; case FIND: default: ret = matcher.find (); break; } } return (ret); } }