FilterTest.java example

Explorer
NewsSpeakServer-master
- libs
- src
  - com
    - vn
      - newsspeak
        ArticleParser.java
        ArticleParserFactory.java
        ArticleServlet.java
        Email.java
        ExtractTextXMLHandler.java
        FeaturedSourcesServlet.java
        FeedDataStorePopulator.java
        FeedServlet.java
        MailHandlerServlet.java
        NewsSource.java
        PMF.java
        - parsers
          CNNParser.java
          DailyBeastParser.java
          EconomicTimesParser.java
          EngadgetParser.java
          HuffPostParser.java
          IndiaTodayParser.java
          LATimesParser.java
          MashableParser.java
          NYDailyNewsParser.java
          NYTimesParser.java
          ReadWriteWebParser.java
          TOIParser.java
          TechCrunchParser.java
          TheHinduParser.java
          USATodayParser.java
          WSJParser.java
          WashPostParser.java
// HTMLParser Library $Name: v1_6 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2003 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/filterTests/FilterTest.java,v $
// $Author: derrickoswald $
// $Date: 2004/07/02 00:49:29 $
// $Revision: 1.7 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.tests.filterTests;

import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.CssSelectorNodeFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.NotFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.RegexFilter;
import org.htmlparser.filters.StringFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.Text;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tests.ParserTestCase;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Test the operation of filters.
 */
public class FilterTest extends ParserTestCase
{
    static
    {
        System.setProperty ("org.htmlparser.tests.filterTests.FilterTest", "FilterTest");
    }

    public FilterTest (String name)
    {
        super (name);
    }

    /**
     * Test node class filtering.
     */
    public void testNodeClass () throws ParserException
    {
        String guts;
        String html;
        NodeList list;

        guts = "<body>Now is the time for all good men..</body>";
        html = "<html>" + guts + "</html>";
        createParser (html);
        list = parser.extractAllNodesThatMatch (new NodeClassFilter (BodyTag.class));
        assertEquals ("only one element", 1, list.size ());
        assertType ("should be BodyTag", BodyTag.class, list.elementAt (0));
        BodyTag body = (BodyTag)list.elementAt (0);
        assertEquals ("only one child", 1, body.getChildCount ());
        assertSuperType ("should be Text", Text.class, body.getChildren ().elementAt (0));
        assertStringEquals("html", guts, body.toHtml ());
    }


    /**
     * Test tag name filtering.
     */
    public void testTagName () throws ParserException
    {
        String guts;
        String html;
        NodeList list;

        guts = "<booty>Now is the time for all good men..</booty>";
        html = "<html>" + guts + "</html>";
        createParser (html);
        list = parser.extractAllNodesThatMatch (new TagNameFilter ("booty"));
        assertEquals ("only one element", 1, list.size ());
        assertSuperType ("should be Tag", Tag.class, list.elementAt (0));
        assertStringEquals("name", "BOOTY", ((Tag)(list.elementAt (0))).getTagName ());
    }

    /**
     * Test string filtering.
     */
    public void testString () throws ParserException
    {
        String guts;
        String html;
        NodeList list;

        guts = "<body>Now is the <a id=target><b>time</b></a> for all good <time>men</time>..</body>";
        html = "<html>" + guts + "</html>";
        createParser (html);
        list = parser.extractAllNodesThatMatch (new StringFilter ("Time"));
        assertEquals ("only one element", 1, list.size ());
        assertSuperType ("should be String", Text.class, list.elementAt (0));
        assertStringEquals("name", "time", ((Text)list.elementAt (0)).getText ());
        // test case sensitivity
        list = parser.extractAllNodesThatMatch (new StringFilter ("Time", true));
        assertEquals ("should be no elements", 0, list.size ());
    }

    /**
     * Test child filtering.
     */
    public void testChild () throws ParserException
    {
        String guts;
        String html;
        NodeList list;

        guts = "<body>Now is the <a id=target><b>time</b></a> for all good <a href=http://bongo.com>men</a>..</body>";
        html = "<html>" + guts + "</html>";
        createParser (html);
        list = parser.extractAllNodesThatMatch (new HasChildFilter (new TagNameFilter ("b")));
        assertEquals ("only one element", 1, list.size ());
        assertType ("should be LinkTag", LinkTag.class, list.elementAt (0));
        LinkTag link = (LinkTag)list.elementAt (0);
        assertEquals ("three children", 3, link.getChildCount ());
        assertSuperType ("should be TagNode", Tag.class, link.getChildren ().elementAt (0));
        Tag tag = (Tag)link.getChildren ().elementAt (0);
        assertStringEquals("name", "B", tag.getTagName ());
    }

    /**
     * Test attribute filtering.
     */
    public void testAttribute () throws ParserException
    {
        String guts;
        String html;
        NodeList list;

        guts = "<body>Now is the <a id=target><b>time</b></a> for all good <a href=http://bongo.com>men</a>..</body>";
        html = "<html>" + guts + "</html>";
        createParser (html);
        list = parser.extractAllNodesThatMatch (new HasAttributeFilter ("id"));
        assertEquals ("only one element", 1, list.size ());
        assertType ("should be LinkTag", LinkTag.class, list.elementAt (0));
        LinkTag link = (LinkTag)list.elementAt (0);
        assertEquals ("attribute value", "target", link.getAttribute ("id"));
    }

    /**
     * Test and filtering.
     */
    public void testAnd () throws ParserException
    {
        String guts;
        String html;
        NodeList list;

        guts = "<body>Now is the <a id=one><b>time</b></a> for all good <a id=two><b>men</b></a>..</body>";
        html = "<html>" + guts + "</html>";
        createParser (html);
        list = parser.extractAllNodesThatMatch (
            new AndFilter (
                new HasChildFilter (
                    new TagNameFilter ("b")),
                new HasChildFilter (
                    new StringFilter ("men")))
                );
        assertEquals ("only one element", 1, list.size ());
        assertType ("should be LinkTag", LinkTag.class, list.elementAt (0));
        LinkTag link = (LinkTag)list.elementAt (0);
        assertEquals ("attribute value", "two", link.getAttribute ("id"));
    }

    /**
     * Test or filtering.
     */
    public void testOr () throws ParserException
    {
        String guts;
        String html;
        NodeList list;

        guts = "<body>Now is the <a id=one><b>time</b></a> for <a id=two><b>all</b></a> good <a id=three><b>men</b></a>..</body>";
        html = "<html>" + guts + "</html>";
        createParser (html);
        list = parser.extractAllNodesThatMatch (
            new OrFilter (
                new HasChildFilter (
                    new StringFilter ("time")),
                new HasChildFilter (
                    new StringFilter ("men")))
                );
        assertEquals ("two elements", 2, list.size ());
        assertType ("should be LinkTag", LinkTag.class, list.elementAt (0));
        LinkTag link = (LinkTag)list.elementAt (0);
        assertEquals ("attribute value", "one", link.getAttribute ("id"));
        assertType ("should be LinkTag", LinkTag.class, list.elementAt (1));
        link = (LinkTag)list.elementAt (1);
        assertEquals ("attribute value", "three", link.getAttribute ("id"));
    }

    /**
     * Test not filtering.
     */
    public void testNot () throws ParserException
    {
        String guts;
        String html;
        NodeList list;

        guts = "<body>Now is the <a id=one><b>time</b></a> for <a id=two><b>all</b></a> good <a id=three><b>men</b></a>..</body>";
        html = "<html>" + guts + "</html>";
        createParser (html);
        list = parser.extractAllNodesThatMatch (
            new AndFilter (
                new HasChildFilter (
                    new TagNameFilter ("b")),
                new NotFilter (
                    new HasChildFilter (
                        new StringFilter ("all"))))
                );
        assertEquals ("two elements", 2, list.size ());
        assertType ("should be LinkTag", LinkTag.class, list.elementAt (0));
        LinkTag link = (LinkTag)list.elementAt (0);
        assertEquals ("attribute value", "one", link.getAttribute ("id"));
        assertType ("should be LinkTag", LinkTag.class, list.elementAt (1));
        link = (LinkTag)list.elementAt (1);
        assertEquals ("attribute value", "three", link.getAttribute ("id"));
    }

    public void testEscape() throws Exception
    {
        assertEquals ("douchebag", CssSelectorNodeFilter.unescape ("doucheba\\g").toString ());
    }

    public void testSelectors() throws Exception
    {
        String html = "<html><head><title>sample title</title></head><body inserterr=\"true\" yomama=\"false\"><h3 id=\"heading\">big </invalid>heading</h3><ul id=\"things\"><li><br word=\"broken\"/>>moocow<li><applet/>doohickey<li class=\"last\"><b class=\"item\">final<br>item</b></ul></body></html>";
        Lexer l;
        Parser p;
        CssSelectorNodeFilter it;
        NodeIterator i;
        int count;

        l = new Lexer (html);
        p = new Parser (l);
        it = new CssSelectorNodeFilter ("li + li");
        count = 0;
        for (i = p.extractAllNodesThatMatch (it).elements (); i.hasMoreNodes ();)
        {
            assertEquals ("tag name wrong", "LI", ((Tag)i.nextNode()).getTagName());
            count++;
        }
        assertEquals ("wrong count", 2, count);
    }

    /**
     * Test regular expression matching:
     */
    public void testRegularExpression () throws Exception
    {
        String target =
              "\n"
            + "\n"
            + "Most recently, in the Western Conference final, the Flames knocked off \n"
            + "the San Jose Sharks, the Pacific Division champions, to become the first \n"
            + "Canadian team to reach the Stanley Cup Championship series since 1994.";
            
        String html =
              "<html><head><title>CBC Sports Online: NHL Playoffs</title></head>"
            + "<body><h1>CBC SPORTS ONLINE</h1>\n"
            + "The Calgary Flames have already defeated three NHL division winners \n"
            + "during their improbable playoff run. If they are to hoist the Stanley \n"
            + "Cup they'll have to go through one more. <p><table ALIGN=\"Right\" width=196 CELLPADDING=0 cellspacing=0 hspace=4> <tr><td><img src=\"/gfx/topstory/sports/iginla_j0524.jpg\" width=194 height=194 hspace=3 border=1><br>\n"
            + "\n"
            + "<font SIZE=\"1\" FACE=\"verdana,arial\">\n"
            + "Jarome Iginla skates during the Flames' practice on Monday. Calgary takes on the Tampa Bay Lightning in the Stanley Cup finals beginning Tuesday night in Tampa\n"
            + "</font></td></tr></table>\n"
            + "\n"
            + "\n"
            + "In the post-season's first round, the Flames defeated the Vancouver \n"
            + "Canucks, the Northwest Division winners, in seven tough games. <p>\n"
            + "\n"
            + "In Round 2 it was the Detroit Red Wings, who not only won the Central \n"
            + "Division, but also boasted the NHL's best overall record during the \n"
            + "regular season, who fell to the Flames. <p>"
            + target
            + "<p>\n"
            + "\n"
            + "Up next for the Flames is the Tampa Bay Lighting -- the runaway winners \n"
            + "of the NHL's Southeast Division and the Eastern Conference's best team \n"
            + "during the regular season. <p>\n"
            + "\n"
            + "The Lighting advanced by beating the Philadelphia Flyers in the Eastern \n"
            + "Conference final. <p>\n"
            + "</body></html>\n";
        Lexer lexer;
        Parser parser;
        RegexFilter filter;
        NodeIterator iterator;
        int count;

        lexer = new Lexer (html);
        parser = new Parser (lexer);
        filter = new RegexFilter ("(19|20)\\d\\d([- \\\\/.](0[1-9]|1[012])[- \\\\/.](0[1-9]|[12][0-9]|3[01]))?");
        count = 0;
        for (iterator = parser.extractAllNodesThatMatch (filter).elements (); iterator.hasMoreNodes ();)
        {
            assertEquals ("text wrong", target, iterator.nextNode ().toHtml ());
            count++;
        }
        assertEquals ("wrong count", 1, count);
    }
}