// HTMLParser Library $Name: v1_6 $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2003 Derrick Oswald // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/filterTests/FilterTest.java,v $ // $Author: derrickoswald $ // $Date: 2004/07/02 00:49:29 $ // $Revision: 1.7 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.tests.filterTests; import org.htmlparser.Parser; import org.htmlparser.Tag; import org.htmlparser.filters.AndFilter; import org.htmlparser.filters.CssSelectorNodeFilter; import org.htmlparser.filters.HasAttributeFilter; import org.htmlparser.filters.HasChildFilter; import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.filters.NotFilter; import org.htmlparser.filters.OrFilter; import org.htmlparser.filters.RegexFilter; import org.htmlparser.filters.StringFilter; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.lexer.Lexer; import org.htmlparser.Text; import org.htmlparser.tags.BodyTag; import org.htmlparser.tags.LinkTag; import org.htmlparser.tests.ParserTestCase; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; /** * Test the operation of filters. */ public class FilterTest extends ParserTestCase { static { System.setProperty ("org.htmlparser.tests.filterTests.FilterTest", "FilterTest"); } public FilterTest (String name) { super (name); } /** * Test node class filtering. */ public void testNodeClass () throws ParserException { String guts; String html; NodeList list; guts = "<body>Now is the time for all good men..</body>"; html = "<html>" + guts + "</html>"; createParser (html); list = parser.extractAllNodesThatMatch (new NodeClassFilter (BodyTag.class)); assertEquals ("only one element", 1, list.size ()); assertType ("should be BodyTag", BodyTag.class, list.elementAt (0)); BodyTag body = (BodyTag)list.elementAt (0); assertEquals ("only one child", 1, body.getChildCount ()); assertSuperType ("should be Text", Text.class, body.getChildren ().elementAt (0)); assertStringEquals("html", guts, body.toHtml ()); } /** * Test tag name filtering. */ public void testTagName () throws ParserException { String guts; String html; NodeList list; guts = "<booty>Now is the time for all good men..</booty>"; html = "<html>" + guts + "</html>"; createParser (html); list = parser.extractAllNodesThatMatch (new TagNameFilter ("booty")); assertEquals ("only one element", 1, list.size ()); assertSuperType ("should be Tag", Tag.class, list.elementAt (0)); assertStringEquals("name", "BOOTY", ((Tag)(list.elementAt (0))).getTagName ()); } /** * Test string filtering. */ public void testString () throws ParserException { String guts; String html; NodeList list; guts = "<body>Now is the <a id=target><b>time</b></a> for all good <time>men</time>..</body>"; html = "<html>" + guts + "</html>"; createParser (html); list = parser.extractAllNodesThatMatch (new StringFilter ("Time")); assertEquals ("only one element", 1, list.size ()); assertSuperType ("should be String", Text.class, list.elementAt (0)); assertStringEquals("name", "time", ((Text)list.elementAt (0)).getText ()); // test case sensitivity list = parser.extractAllNodesThatMatch (new StringFilter ("Time", true)); assertEquals ("should be no elements", 0, list.size ()); } /** * Test child filtering. */ public void testChild () throws ParserException { String guts; String html; NodeList list; guts = "<body>Now is the <a id=target><b>time</b></a> for all good <a href=http://bongo.com>men</a>..</body>"; html = "<html>" + guts + "</html>"; createParser (html); list = parser.extractAllNodesThatMatch (new HasChildFilter (new TagNameFilter ("b"))); assertEquals ("only one element", 1, list.size ()); assertType ("should be LinkTag", LinkTag.class, list.elementAt (0)); LinkTag link = (LinkTag)list.elementAt (0); assertEquals ("three children", 3, link.getChildCount ()); assertSuperType ("should be TagNode", Tag.class, link.getChildren ().elementAt (0)); Tag tag = (Tag)link.getChildren ().elementAt (0); assertStringEquals("name", "B", tag.getTagName ()); } /** * Test attribute filtering. */ public void testAttribute () throws ParserException { String guts; String html; NodeList list; guts = "<body>Now is the <a id=target><b>time</b></a> for all good <a href=http://bongo.com>men</a>..</body>"; html = "<html>" + guts + "</html>"; createParser (html); list = parser.extractAllNodesThatMatch (new HasAttributeFilter ("id")); assertEquals ("only one element", 1, list.size ()); assertType ("should be LinkTag", LinkTag.class, list.elementAt (0)); LinkTag link = (LinkTag)list.elementAt (0); assertEquals ("attribute value", "target", link.getAttribute ("id")); } /** * Test and filtering. */ public void testAnd () throws ParserException { String guts; String html; NodeList list; guts = "<body>Now is the <a id=one><b>time</b></a> for all good <a id=two><b>men</b></a>..</body>"; html = "<html>" + guts + "</html>"; createParser (html); list = parser.extractAllNodesThatMatch ( new AndFilter ( new HasChildFilter ( new TagNameFilter ("b")), new HasChildFilter ( new StringFilter ("men"))) ); assertEquals ("only one element", 1, list.size ()); assertType ("should be LinkTag", LinkTag.class, list.elementAt (0)); LinkTag link = (LinkTag)list.elementAt (0); assertEquals ("attribute value", "two", link.getAttribute ("id")); } /** * Test or filtering. */ public void testOr () throws ParserException { String guts; String html; NodeList list; guts = "<body>Now is the <a id=one><b>time</b></a> for <a id=two><b>all</b></a> good <a id=three><b>men</b></a>..</body>"; html = "<html>" + guts + "</html>"; createParser (html); list = parser.extractAllNodesThatMatch ( new OrFilter ( new HasChildFilter ( new StringFilter ("time")), new HasChildFilter ( new StringFilter ("men"))) ); assertEquals ("two elements", 2, list.size ()); assertType ("should be LinkTag", LinkTag.class, list.elementAt (0)); LinkTag link = (LinkTag)list.elementAt (0); assertEquals ("attribute value", "one", link.getAttribute ("id")); assertType ("should be LinkTag", LinkTag.class, list.elementAt (1)); link = (LinkTag)list.elementAt (1); assertEquals ("attribute value", "three", link.getAttribute ("id")); } /** * Test not filtering. */ public void testNot () throws ParserException { String guts; String html; NodeList list; guts = "<body>Now is the <a id=one><b>time</b></a> for <a id=two><b>all</b></a> good <a id=three><b>men</b></a>..</body>"; html = "<html>" + guts + "</html>"; createParser (html); list = parser.extractAllNodesThatMatch ( new AndFilter ( new HasChildFilter ( new TagNameFilter ("b")), new NotFilter ( new HasChildFilter ( new StringFilter ("all")))) ); assertEquals ("two elements", 2, list.size ()); assertType ("should be LinkTag", LinkTag.class, list.elementAt (0)); LinkTag link = (LinkTag)list.elementAt (0); assertEquals ("attribute value", "one", link.getAttribute ("id")); assertType ("should be LinkTag", LinkTag.class, list.elementAt (1)); link = (LinkTag)list.elementAt (1); assertEquals ("attribute value", "three", link.getAttribute ("id")); } public void testEscape() throws Exception { assertEquals ("douchebag", CssSelectorNodeFilter.unescape ("doucheba\\g").toString ()); } public void testSelectors() throws Exception { String html = "<html><head><title>sample title</title></head><body inserterr=\"true\" yomama=\"false\"><h3 id=\"heading\">big </invalid>heading</h3><ul id=\"things\"><li><br word=\"broken\"/>>moocow<li><applet/>doohickey<li class=\"last\"><b class=\"item\">final<br>item</b></ul></body></html>"; Lexer l; Parser p; CssSelectorNodeFilter it; NodeIterator i; int count; l = new Lexer (html); p = new Parser (l); it = new CssSelectorNodeFilter ("li + li"); count = 0; for (i = p.extractAllNodesThatMatch (it).elements (); i.hasMoreNodes ();) { assertEquals ("tag name wrong", "LI", ((Tag)i.nextNode()).getTagName()); count++; } assertEquals ("wrong count", 2, count); } /** * Test regular expression matching: */ public void testRegularExpression () throws Exception { String target = "\n" + "\n" + "Most recently, in the Western Conference final, the Flames knocked off \n" + "the San Jose Sharks, the Pacific Division champions, to become the first \n" + "Canadian team to reach the Stanley Cup Championship series since 1994."; String html = "<html><head><title>CBC Sports Online: NHL Playoffs</title></head>" + "<body><h1>CBC SPORTS ONLINE</h1>\n" + "The Calgary Flames have already defeated three NHL division winners \n" + "during their improbable playoff run. If they are to hoist the Stanley \n" + "Cup they'll have to go through one more. <p><table ALIGN=\"Right\" width=196 CELLPADDING=0 cellspacing=0 hspace=4> <tr><td><img src=\"/gfx/topstory/sports/iginla_j0524.jpg\" width=194 height=194 hspace=3 border=1><br>\n" + "\n" + "<font SIZE=\"1\" FACE=\"verdana,arial\">\n" + "Jarome Iginla skates during the Flames' practice on Monday. Calgary takes on the Tampa Bay Lightning in the Stanley Cup finals beginning Tuesday night in Tampa\n" + "</font></td></tr></table>\n" + "\n" + "\n" + "In the post-season's first round, the Flames defeated the Vancouver \n" + "Canucks, the Northwest Division winners, in seven tough games. <p>\n" + "\n" + "In Round 2 it was the Detroit Red Wings, who not only won the Central \n" + "Division, but also boasted the NHL's best overall record during the \n" + "regular season, who fell to the Flames. <p>" + target + "<p>\n" + "\n" + "Up next for the Flames is the Tampa Bay Lighting -- the runaway winners \n" + "of the NHL's Southeast Division and the Eastern Conference's best team \n" + "during the regular season. <p>\n" + "\n" + "The Lighting advanced by beating the Philadelphia Flyers in the Eastern \n" + "Conference final. <p>\n" + "</body></html>\n"; Lexer lexer; Parser parser; RegexFilter filter; NodeIterator iterator; int count; lexer = new Lexer (html); parser = new Parser (lexer); filter = new RegexFilter ("(19|20)\\d\\d([- \\\\/.](0[1-9]|1[012])[- \\\\/.](0[1-9]|[12][0-9]|3[01]))?"); count = 0; for (iterator = parser.extractAllNodesThatMatch (filter).elements (); iterator.hasMoreNodes ();) { assertEquals ("text wrong", target, iterator.nextNode ().toHtml ()); count++; } assertEquals ("wrong count", 1, count); } }