RegexFilter.java example

Explorer
EclipseTrader-master
// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2003 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/filters/RegexFilter.java,v $
// $Author: derrickoswald $
// $Date: 2005/05/15 11:49:04 $
// $Revision: 1.4 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.filters;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Text;

/**
 * This filter accepts all string nodes matching a regular expression.
 * Because this searches {@link org.htmlparser.Text Text} nodes, it is
 * only useful for finding small fragments of text, where it is
 * unlikely to be broken up by a tag. To find large fragments of text
 * you should convert the page to plain text with something like the
 * {@link org.htmlparser.beans.StringBean StringBean} and then apply
 * the regular expression.
 * <p>
 * For example, to look for dates use:
 * <pre>
 *   (19|20)\d\d([- \\/.](0[1-9]|1[012])[- \\/.](0[1-9]|[12][0-9]|3[01]))?
 * </pre>
 * as in:
 * <pre>
 * Parser parser = new Parser ("http://cbc.ca");
 * RegexFilter filter = new RegexFilter ("(19|20)\\d\\d([- \\\\/.](0[1-9]|1[012])[- \\\\/.](0[1-9]|[12][0-9]|3[01]))?");
 * NodeIterator iterator = parser.extractAllNodesThatMatch (filter).elements ();
 * </pre>
 * which matches a date in yyyy-mm-dd format between 1900-01-01 and 2099-12-31,
 * with a choice of five separators, either a dash, a space, either kind of
 * slash or a period.
 * The year is matched by (19|20)\d\d which uses alternation to allow
 * either 19 or 20 as the first two digits. The round brackets are mandatory.
 * The month is matched by 0[1-9]|1[012], again enclosed by round brackets
 * to keep the two options together. By using character classes, the first
 * option matches a number between 01 and 09, and the second
 * matches 10, 11 or 12.
 * The last part of the regex consists of three options. The first matches
 * the numbers 01 through 09, the second 10 through 29, and the third matches 30 or 31.
 * The day and month are optional, but must occur together because of the ()?
 * bracketing after the year.
 */
public class RegexFilter implements NodeFilter
{
    /**
     * Use matches() match strategy: the pattern must match the
     * entire text of the node.
     */
    public static final int MATCH = 1;

    /**
     * Use lookingAt() match strategy: the pattern must match starting
     * at the beginning of the text of the node.
     */
    public static final int LOOKINGAT = 2;

    /**
     * Use find() match strategy: the pattern may match any subsequence
     * of the text of the node.
     */
    public static final int FIND = 3;

    /**
     * The regular expression to search for, as supplied to
     * {@link #setPattern(String)}.
     */
    protected String mPatternString;

    /**
     * The compiled form of {@link #mPatternString}.
     */
    protected Pattern mPattern;

    /**
     * The match strategy, one of {@link #MATCH}, {@link #LOOKINGAT}
     * or {@link #FIND}.
     * @see #RegexFilter(String, int)
     */
    protected int mStrategy;

    /**
     * Creates a new instance of RegexFilter that accepts string nodes matching
     * the regular expression ".*" using the FIND strategy.
     */
    public RegexFilter ()
    {
        this (".*", FIND);
    }

    /**
     * Creates a new instance of RegexFilter that accepts string nodes matching
     * a regular expression using the FIND strategy.
     * @param pattern The pattern to search for.
     * @throws java.util.regex.PatternSyntaxException If the pattern's
     * syntax is invalid.
     */
    public RegexFilter (String pattern)
    {
        this (pattern, FIND);
    }

    /**
     * Creates a new instance of RegexFilter that accepts string nodes matching
     * a regular expression.
     * @param pattern The pattern to search for.
     * @param strategy The type of match:
     * <ol>
     * <li>{@link #MATCH} use matches() method: attempts to match
     * the entire input sequence against the pattern</li>
     * <li>{@link #LOOKINGAT} use lookingAt() method: attempts to match
     * the input sequence, starting at the beginning, against the pattern</li>
     * <li>{@link #FIND} use find() method: scans the input sequence looking
     * for the next subsequence that matches the pattern</li>
     * </ol>
     * @throws java.util.regex.PatternSyntaxException If the pattern's
     * syntax is invalid.
     * @throws IllegalArgumentException If the strategy is not one of
     * MATCH, LOOKINGAT or FIND.
     */
    public RegexFilter (String pattern, int strategy)
    {
        setPattern (pattern);
        setStrategy (strategy);
    }

    /**
     * Get the search pattern.
     * @return Returns the pattern.
     */
    public String getPattern ()
    {
        return (mPatternString);
    }

    /**
     * Set the search pattern.
     * The pattern is compiled before any state is changed, so if compilation
     * fails the previously set pattern (if any) remains in effect and
     * {@link #getPattern()} continues to agree with the pattern actually
     * used by {@link #accept(Node)}.
     * @param pattern The pattern to set.
     * @throws java.util.regex.PatternSyntaxException If the pattern's
     * syntax is invalid.
     * @throws NullPointerException If the pattern is <code>null</code>.
     */
    public void setPattern (String pattern)
    {
        Pattern compiled;

        // Compile first: Pattern.compile() throws on a bad or null pattern,
        // and the two fields must never describe different expressions.
        compiled = Pattern.compile (pattern);
        mPatternString = pattern;
        mPattern = compiled;
    }

    /**
     * Get the search strategy.
     * @return Returns the strategy.
     */
    public int getStrategy ()
    {
        return (mStrategy);
    }

    /**
     * Set the search strategy.
     * @param strategy The strategy to use. One of MATCH, LOOKINGAT or FIND.
     * @throws IllegalArgumentException If the strategy is not one of
     * MATCH, LOOKINGAT or FIND; the current strategy is left unchanged.
     */
    public void setStrategy (int strategy)
    {
        if ((strategy != MATCH) && (strategy != LOOKINGAT)
            && (strategy != FIND))
            throw new IllegalArgumentException ("illegal strategy ("
                + strategy + ")");
        mStrategy = strategy;
    }

    /**
     * Accept string nodes that match the regular expression.
     * Nodes that are not {@link org.htmlparser.Text Text} nodes
     * (including <code>null</code>) are never accepted.
     * @param node The node to check.
     * @return <code>true</code> if the regular expression matches the
     * text of the node, <code>false</code> otherwise.
     */
    public boolean accept (Node node)
    {
        String string;
        Matcher matcher;
        boolean ret;

        ret = false;
        if (node instanceof Text)
        {
            string = ((Text)node).getText ();
            matcher = mPattern.matcher (string);
            switch (mStrategy)
            {
                case MATCH:
                    // whole text must match
                    ret = matcher.matches ();
                    break;
                case LOOKINGAT:
                    // match anchored at the start of the text only
                    ret = matcher.lookingAt ();
                    break;
                case FIND:
                default:
                    // match anywhere in the text
                    ret = matcher.find ();
                    break;
            }
        }

        return (ret);
    }
}