StringParserTest.java example

Explorer
NewsSpeakServer-master
- libs
- src
  - com
    - vn
      - newsspeak
        ArticleParser.java
        ArticleParserFactory.java
        ArticleServlet.java
        Email.java
        ExtractTextXMLHandler.java
        FeaturedSourcesServlet.java
        FeedDataStorePopulator.java
        FeedServlet.java
        MailHandlerServlet.java
        NewsSource.java
        PMF.java
        parsers
        CNNParser.java
        DailyBeastParser.java
        EconomicTimesParser.java
        EngadgetParser.java
        HuffPostParser.java
        IndiaTodayParser.java
        LATimesParser.java
        MashableParser.java
        NYDailyNewsParser.java
        NYTimesParser.java
        ReadWriteWebParser.java
        TOIParser.java
        TechCrunchParser.java
        TheHinduParser.java
        USATodayParser.java
        WSJParser.java
        WashPostParser.java
// HTMLParser Library $Name: v1_6 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Somik Raha
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/parserHelperTests/StringParserTest.java,v $
// $Author: derrickoswald $
// $Date: 2004/09/02 02:28:15 $
// $Revision: 1.50 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.tests.parserHelperTests;

import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.Remark;
import org.htmlparser.Text;
import org.htmlparser.tags.HeadTag;
import org.htmlparser.tags.Html;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.tests.ParserTestCase;
import org.htmlparser.util.ParserException;

public class StringParserTest extends ParserTestCase {

    static
    {
        System.setProperty ("org.htmlparser.tests.parserHelperTests.StringParserTest", "StringParserTest");
    }

    public StringParserTest(String name) {
        super(name);
    }

    /**
     * The bug being reproduced is this : <BR>
     * <HTML><HEAD><TITLE>Google</TITLE> <BR>
     * The above line is incorrectly parsed in that, the text Google is missed.
     * The presence of this bug is typically when some tag is identified before the string node is. (usually seen
     * with the end tag). The bug lies in NodeReader.readElement().
     * Creation date: (6/17/2001 4:01:06 PM)
     */
    public void testTextBug1() throws ParserException {
        createParser("<HTML><HEAD><TITLE>Google</TITLE>");
        parser.setNodeFactory (new PrototypicalNodeFactory (true));
        parseAndAssertNodeCount(5);
        // The fourth node should be a Text-  with the text - Google
        assertTrue("Fourth node should be a Text",node[3] instanceof Text);
        Text stringNode = (Text)node[3];
        assertEquals("Text of the Text","Google",stringNode.getText());
    }

    /**
     * Test string containing link.
     * Bug reported by Kaarle Kaila of Nokia<br>
     * For the following HTML :
     * view these documents, you must have <A href='http://www.adobe.com'>Adobe <br>
     * Acrobat Reader</A> installed on your computer.<br>
     * The first string before the link is not identified, and the space after the link is also not identified
     * Creation date: (8/2/2001 2:07:32 AM)
     */
    public void testTextBug2() throws ParserException {
        // Register the link scanner

        createParser("view these documents, you must have <A href='http://www.adobe.com'>Adobe \n"+
            "Acrobat Reader</A> installed on your computer.");
        parseAndAssertNodeCount(3);
        // The first node should be a Text-  with the text - view these documents, you must have
        assertTrue("First node should be a Text",node[0] instanceof Text);
        Text stringNode = (Text)node[0];
        assertEquals("Text of the Text","view these documents, you must have ",stringNode.getText());
        assertTrue("Second node should be a link node",node[1] instanceof LinkTag);
        LinkTag linkNode = (LinkTag)node[1];
        assertEquals("Link is","http://www.adobe.com",linkNode.getLink());
        assertEquals("Link text is","Adobe \nAcrobat Reader",linkNode.getLinkText());

        assertTrue("Third node should be a string node",node[2] instanceof Text);
        Text stringNode2 = (Text)node[2];
        assertEquals("Contents of third node"," installed on your computer.",stringNode2.getText());
    }

    /**
     * Bug reported by Roger Sollberger<br>
     * For the following HTML :
     * <a href="http://asgard.ch">[< ASGARD ></a><br>
     * The string node is not correctly identified
     */
    public void testTagCharsInText() throws ParserException {
        createParser("<a href=\"http://asgard.ch\">[> ASGARD <]</a>");
        parseAndAssertNodeCount(1);
        assertTrue("Node identified must be a link tag",node[0] instanceof LinkTag);
        LinkTag linkTag = (LinkTag) node[0];
        assertEquals("[> ASGARD <]",linkTag.getLinkText());
        assertEquals("http://asgard.ch",linkTag.getLink());
    }

    public void testToPlainTextString() throws ParserException {
        createParser("<HTML><HEAD><TITLE>This is the Title</TITLE></HEAD><BODY>Hello World, this is the HTML Parser</BODY></HTML>");
        parser.setNodeFactory (new PrototypicalNodeFactory (true));
        parseAndAssertNodeCount(10);
        assertTrue("Fourth Node identified must be a string node",node[3] instanceof Text);
        Text stringNode = (Text)node[3];
        assertEquals("First String Node","This is the Title",stringNode.toPlainTextString());
        assertTrue("Eighth Node identified must be a string node",node[7] instanceof Text);
        stringNode = (Text)node[7];
        assertEquals("Second string node","Hello World, this is the HTML Parser",stringNode.toPlainTextString());
    }

    public void testToHTML() throws ParserException {
        createParser("<HTML><HEAD><TITLE>This is the Title</TITLE></HEAD><BODY>Hello World, this is the HTML Parser</BODY></HTML>");
        parser.setNodeFactory (new PrototypicalNodeFactory (true));
        parseAndAssertNodeCount(10);
        assertTrue("Fourth Node identified must be a string node",node[3] instanceof Text);
        Text stringNode = (Text)node[3];
        assertEquals("First String Node","This is the Title",stringNode.toHtml());
        assertTrue("Eighth Node identified must be a string node",node[7] instanceof Text);
        stringNode = (Text)node[7];
        assertEquals("Second string node","Hello World, this is the HTML Parser",stringNode.toHtml());
    }

    public void testEmptyLines() throws ParserException {
        createParser(
        "David Nirenberg (Center for Advanced Study in the Behavorial Sciences, Stanford).<br>\n"+
        "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      \n"+
        "<br>"
        );
        parser.setNodeFactory (new PrototypicalNodeFactory (true));
        parseAndAssertNodeCount(4);
        assertTrue("Third Node identified must be a string node",node[2] instanceof Text);
    }

    /**
     * This is a bug reported by John Zook (586222), where the first few chars
     * before a remark is being missed, if its on the same line.
     */
    public void testStringBeingMissedBug() throws ParserException {
        createParser(
        "Before Comment <!-- Comment --> After Comment"
        );
        parser.setNodeFactory (new PrototypicalNodeFactory (true));
        parseAndAssertNodeCount(3);
        assertTrue("First node should be Text",node[0] instanceof Text);
        assertTrue("Second node should be Remark",node[1] instanceof Remark);
        assertTrue("Third node should be Text",node[2] instanceof Text);
        Text stringNode = (Text)node[0];
        assertEquals("First String node contents","Before Comment ",stringNode.getText());
        Text stringNode2 = (Text)node[2];
        assertEquals("Second String node contents"," After Comment",stringNode2.getText());
        Remark remarkNode = (Remark)node[1];
        assertEquals("Remark Node contents"," Comment ",remarkNode.getText());

    }

    /**
     * Based on a bug report submitted by Cedric Rosa, if the last line contains a single character,
     * Text does not return the string node correctly.
     */
    public void testLastLineWithOneChar() throws ParserException {
        createParser("a");
        parser.setNodeFactory (new PrototypicalNodeFactory (true));
        parseAndAssertNodeCount(1);
        assertTrue("First node should be Text",node[0] instanceof Text);
        Text stringNode = (Text)node[0];
        assertEquals("First String node contents","a",stringNode.getText());
    }

    public void testStringWithEmptyLine() throws ParserException {
        String text = "a\n\nb";
        createParser(text);
        parser.setNodeFactory (new PrototypicalNodeFactory (true));
        parseAndAssertNodeCount(1);
        assertTrue("First node should be Text",node[0] instanceof Text);
        Text stringNode = (Text)node[0];
        assertStringEquals("First String node contents",text,stringNode.getText());
    }

    /**
     * An attempt to reproduce bug 677176, which passes.
     * @throws Exception
     */
    public void testStringParserBug() throws Exception {
        createParser(
            "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 " +
            "Transitional//EN\">" +
            "<html>" +
            "<head>" +
            "<title>Untitled Document</title>" +
            "<meta http-equiv=\"Content-Type\" content=\"text/html; " +
            "charset=iso-8859-1\">" +
            "</head>" +
            "<script language=\"JavaScript\" type=\"text/JavaScript\">" +
            "// if this fails, output a 'hello' \n" +
            "if (true) " +
            "{ " +
            "//something good...\n" +
            "} " +
            "</script>" +
            "<body>" +
            "</body>" +
            "</html>"
        );
        parseAndAssertNodeCount(2);
        assertTrue(node[1] instanceof Html);
        Html htmlTag = (Html)node[1];
        assertTrue("The HTML tag should have 3 nodes", 3 == htmlTag.getChildCount ());
        assertTrue("The first child should be a HEAD tag",htmlTag.getChild(0) instanceof HeadTag);
        HeadTag headTag = (HeadTag)htmlTag.getChild(0);
        assertTrue("The HEAD tag should have 2 nodes", 2 == headTag.getChildCount ());
        assertTrue("The second child should be a META tag",headTag.getChild(1) instanceof MetaTag);
        MetaTag metaTag = (MetaTag)headTag.getChild(1);

        assertStringEquals(
            "content",
            "text/html; charset=iso-8859-1",
            metaTag.getAttribute("CONTENT")
        );
    }

    public void testStringWithLineBreaks() throws Exception {
        String text = "Testing &\nRefactoring";
        createParser(text);
        parser.setNodeFactory (new PrototypicalNodeFactory (true));
        parseAndAssertNodeCount(1);
        assertType("first node",Text.class,node[0]);
        Text stringNode = (Text)node[0];
        assertStringEquals("text",text,stringNode.toPlainTextString());
    }

}