// HTMLParser Library $Name: v1_6 $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2004 Somik Raha // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/parserHelperTests/StringParserTest.java,v $ // $Author: derrickoswald $ // $Date: 2004/09/02 02:28:15 $ // $Revision: 1.50 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.tests.parserHelperTests; import org.htmlparser.PrototypicalNodeFactory; import org.htmlparser.Remark; import org.htmlparser.Text; import org.htmlparser.tags.HeadTag; import org.htmlparser.tags.Html; import org.htmlparser.tags.LinkTag; import org.htmlparser.tags.MetaTag; import org.htmlparser.tests.ParserTestCase; import org.htmlparser.util.ParserException; public class StringParserTest extends ParserTestCase { static { System.setProperty ("org.htmlparser.tests.parserHelperTests.StringParserTest", "StringParserTest"); } public StringParserTest(String name) { super(name); } /** * The bug being reproduced is this : <BR> * <HTML><HEAD><TITLE>Google</TITLE> <BR> * The above line is incorrectly parsed in that, the text Google is missed. * The presence of this bug is typically when some tag is identified before the string node is. (usually seen * with the end tag). The bug lies in NodeReader.readElement(). * Creation date: (6/17/2001 4:01:06 PM) */ public void testTextBug1() throws ParserException { createParser("<HTML><HEAD><TITLE>Google</TITLE>"); parser.setNodeFactory (new PrototypicalNodeFactory (true)); parseAndAssertNodeCount(5); // The fourth node should be a Text- with the text - Google assertTrue("Fourth node should be a Text",node[3] instanceof Text); Text stringNode = (Text)node[3]; assertEquals("Text of the Text","Google",stringNode.getText()); } /** * Test string containing link. * Bug reported by Kaarle Kaila of Nokia<br> * For the following HTML : * view these documents, you must have <A href='http://www.adobe.com'>Adobe <br> * Acrobat Reader</A> installed on your computer.<br> * The first string before the link is not identified, and the space after the link is also not identified * Creation date: (8/2/2001 2:07:32 AM) */ public void testTextBug2() throws ParserException { // Register the link scanner createParser("view these documents, you must have <A href='http://www.adobe.com'>Adobe \n"+ "Acrobat Reader</A> installed on your computer."); parseAndAssertNodeCount(3); // The first node should be a Text- with the text - view these documents, you must have assertTrue("First node should be a Text",node[0] instanceof Text); Text stringNode = (Text)node[0]; assertEquals("Text of the Text","view these documents, you must have ",stringNode.getText()); assertTrue("Second node should be a link node",node[1] instanceof LinkTag); LinkTag linkNode = (LinkTag)node[1]; assertEquals("Link is","http://www.adobe.com",linkNode.getLink()); assertEquals("Link text is","Adobe \nAcrobat Reader",linkNode.getLinkText()); assertTrue("Third node should be a string node",node[2] instanceof Text); Text stringNode2 = (Text)node[2]; assertEquals("Contents of third node"," installed on your computer.",stringNode2.getText()); } /** * Bug reported by Roger Sollberger<br> * For the following HTML : * <a href="http://asgard.ch">[< ASGARD ></a><br> * The string node is not correctly identified */ public void testTagCharsInText() throws ParserException { createParser("<a href=\"http://asgard.ch\">[> ASGARD <]</a>"); parseAndAssertNodeCount(1); assertTrue("Node identified must be a link tag",node[0] instanceof LinkTag); LinkTag linkTag = (LinkTag) node[0]; assertEquals("[> ASGARD <]",linkTag.getLinkText()); assertEquals("http://asgard.ch",linkTag.getLink()); } public void testToPlainTextString() throws ParserException { createParser("<HTML><HEAD><TITLE>This is the Title</TITLE></HEAD><BODY>Hello World, this is the HTML Parser</BODY></HTML>"); parser.setNodeFactory (new PrototypicalNodeFactory (true)); parseAndAssertNodeCount(10); assertTrue("Fourth Node identified must be a string node",node[3] instanceof Text); Text stringNode = (Text)node[3]; assertEquals("First String Node","This is the Title",stringNode.toPlainTextString()); assertTrue("Eighth Node identified must be a string node",node[7] instanceof Text); stringNode = (Text)node[7]; assertEquals("Second string node","Hello World, this is the HTML Parser",stringNode.toPlainTextString()); } public void testToHTML() throws ParserException { createParser("<HTML><HEAD><TITLE>This is the Title</TITLE></HEAD><BODY>Hello World, this is the HTML Parser</BODY></HTML>"); parser.setNodeFactory (new PrototypicalNodeFactory (true)); parseAndAssertNodeCount(10); assertTrue("Fourth Node identified must be a string node",node[3] instanceof Text); Text stringNode = (Text)node[3]; assertEquals("First String Node","This is the Title",stringNode.toHtml()); assertTrue("Eighth Node identified must be a string node",node[7] instanceof Text); stringNode = (Text)node[7]; assertEquals("Second string node","Hello World, this is the HTML Parser",stringNode.toHtml()); } public void testEmptyLines() throws ParserException { createParser( "David Nirenberg (Center for Advanced Study in the Behavorial Sciences, Stanford).<br>\n"+ " \n"+ "<br>" ); parser.setNodeFactory (new PrototypicalNodeFactory (true)); parseAndAssertNodeCount(4); assertTrue("Third Node identified must be a string node",node[2] instanceof Text); } /** * This is a bug reported by John Zook (586222), where the first few chars * before a remark is being missed, if its on the same line. */ public void testStringBeingMissedBug() throws ParserException { createParser( "Before Comment <!-- Comment --> After Comment" ); parser.setNodeFactory (new PrototypicalNodeFactory (true)); parseAndAssertNodeCount(3); assertTrue("First node should be Text",node[0] instanceof Text); assertTrue("Second node should be Remark",node[1] instanceof Remark); assertTrue("Third node should be Text",node[2] instanceof Text); Text stringNode = (Text)node[0]; assertEquals("First String node contents","Before Comment ",stringNode.getText()); Text stringNode2 = (Text)node[2]; assertEquals("Second String node contents"," After Comment",stringNode2.getText()); Remark remarkNode = (Remark)node[1]; assertEquals("Remark Node contents"," Comment ",remarkNode.getText()); } /** * Based on a bug report submitted by Cedric Rosa, if the last line contains a single character, * Text does not return the string node correctly. */ public void testLastLineWithOneChar() throws ParserException { createParser("a"); parser.setNodeFactory (new PrototypicalNodeFactory (true)); parseAndAssertNodeCount(1); assertTrue("First node should be Text",node[0] instanceof Text); Text stringNode = (Text)node[0]; assertEquals("First String node contents","a",stringNode.getText()); } public void testStringWithEmptyLine() throws ParserException { String text = "a\n\nb"; createParser(text); parser.setNodeFactory (new PrototypicalNodeFactory (true)); parseAndAssertNodeCount(1); assertTrue("First node should be Text",node[0] instanceof Text); Text stringNode = (Text)node[0]; assertStringEquals("First String node contents",text,stringNode.getText()); } /** * An attempt to reproduce bug 677176, which passes. * @throws Exception */ public void testStringParserBug() throws Exception { createParser( "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 " + "Transitional//EN\">" + "<html>" + "<head>" + "<title>Untitled Document</title>" + "<meta http-equiv=\"Content-Type\" content=\"text/html; " + "charset=iso-8859-1\">" + "</head>" + "<script language=\"JavaScript\" type=\"text/JavaScript\">" + "// if this fails, output a 'hello' \n" + "if (true) " + "{ " + "//something good...\n" + "} " + "</script>" + "<body>" + "</body>" + "</html>" ); parseAndAssertNodeCount(2); assertTrue(node[1] instanceof Html); Html htmlTag = (Html)node[1]; assertTrue("The HTML tag should have 3 nodes", 3 == htmlTag.getChildCount ()); assertTrue("The first child should be a HEAD tag",htmlTag.getChild(0) instanceof HeadTag); HeadTag headTag = (HeadTag)htmlTag.getChild(0); assertTrue("The HEAD tag should have 2 nodes", 2 == headTag.getChildCount ()); assertTrue("The second child should be a META tag",headTag.getChild(1) instanceof MetaTag); MetaTag metaTag = (MetaTag)headTag.getChild(1); assertStringEquals( "content", "text/html; charset=iso-8859-1", metaTag.getAttribute("CONTENT") ); } public void testStringWithLineBreaks() throws Exception { String text = "Testing &\nRefactoring"; createParser(text); parser.setNodeFactory (new PrototypicalNodeFactory (true)); parseAndAssertNodeCount(1); assertType("first node",Text.class,node[0]); Text stringNode = (Text)node[0]; assertStringEquals("text",text,stringNode.toPlainTextString()); } }