LexerTests.java example

Explorer
NewsSpeakServer-master
- libs
- src
  - com
    - vn
      - newsspeak
        ArticleParser.java
        ArticleParserFactory.java
        ArticleServlet.java
        Email.java
        ExtractTextXMLHandler.java
        FeaturedSourcesServlet.java
        FeedDataStorePopulator.java
        FeedServlet.java
        MailHandlerServlet.java
        NewsSource.java
        PMF.java
        parsers
        CNNParser.java
        DailyBeastParser.java
        EconomicTimesParser.java
        EngadgetParser.java
        HuffPostParser.java
        IndiaTodayParser.java
        LATimesParser.java
        MashableParser.java
        NYDailyNewsParser.java
        NYTimesParser.java
        ReadWriteWebParser.java
        TOIParser.java
        TechCrunchParser.java
        TheHinduParser.java
        USATodayParser.java
        WSJParser.java
        WashPostParser.java
// HTMLParser Library $Name: v1_6 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/LexerTests.java,v $
// $Author: derrickoswald $
// $Date: 2006/05/27 17:06:28 $
// $Revision: 1.31 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.tests.lexerTests;

import java.io.IOException;
import java.net.URL;
import java.util.HashSet;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.Remark;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.tags.StyleTag;
import org.htmlparser.tests.ParserTestCase;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class LexerTests extends ParserTestCase
{

    static
    {
        System.setProperty ("org.htmlparser.tests.lexerTests.LexerTests", "LexerTests");
    }

    /**
     * Create a test case for the Lexer class.
     * @param name The test name, handed unchanged to the
     * {@link ParserTestCase} constructor.
     */
    public LexerTests (String name)
    {
        super (name);
    }

    /**
     * Markup-free input: the first node produced by the lexer must be
     * a text node whose contents equal the input.
     * @exception ParserException If the lexer fails.
     */
    public void testPureText () throws ParserException
    {
        final String input = "Hello world";
        final Text text = (Text)new Lexer (input).nextNode ();

        assertEquals ("Text contents wrong", input, text.getText ());
    }

    /**
     * A Unix line ending (\n) inside the text must be kept verbatim in
     * the text node.
     * @exception ParserException If the lexer fails.
     */
    public void testUnixEOL () throws ParserException
    {
        final String input = "Hello\nworld";
        final Text text = (Text)new Lexer (input).nextNode ();

        assertEquals ("Text contents wrong", input, text.getText ());
    }

    /**
     * A Dos line ending (\r\n) and a lone carriage return (\r) inside
     * the text must both be kept verbatim in the text node.
     * @exception ParserException If the lexer fails.
     */
    public void testDosEOL () throws ParserException
    {
        final String[] inputs =
        {
            "Hello\r\nworld",
            "Hello\rworld",
        };

        for (int i = 0; i < inputs.length; i++)
        {
            Text text = (Text)new Lexer (inputs[i]).nextNode ();
            assertEquals ("Text contents wrong", inputs[i], text.getText ());
        }
    }

    /**
     * Line endings (\n, \r and \r\n) that are the very last characters
     * of the input must be kept verbatim in the text node.
     * @exception ParserException If the lexer fails.
     */
    public void testEOF_EOL () throws ParserException
    {
        final String[] inputs =
        {
            "Hello world\n",
            "Hello world\r",
            "Hello world\r\n",
        };

        for (int i = 0; i < inputs.length; i++)
        {
            Text text = (Text)new Lexer (inputs[i]).nextNode ();
            assertEquals ("Text contents wrong", inputs[i], text.getText ());
        }
    }

    /**
     * Every kind of markup (open tag, end tag, JSP tag, processing
     * instruction, remark) must terminate the preceding text node, so
     * the first node holds only the leading text.
     * @exception ParserException If the lexer fails.
     */
    public void testTagStops () throws ParserException
    {
        // leading text, with and without each style of line ending
        final String[] texts =
        {
            "Hello world",
            "Hello world\n",
            "Hello world\r\n",
            "Hello world\r",
        };
        // markup appended directly after the text
        final String[] markup =
        {
            "<head>",
            "</head>",
            "<%=head%>",
            "<?php ?>",
            "<!--head-->",
        };

        for (int t = 0; t < texts.length; t++)
        {
            final String expected = texts[t];
            for (int m = 0; m < markup.length; m++)
            {
                Lexer lexer = new Lexer (expected + markup[m]);
                Text first = (Text)lexer.nextNode ();
                assertEquals ("Text contents wrong", expected, first.getText ());
            }
        }
    }

    /**
     * Input consisting only of tags: each tag must be returned as its
     * own node and reproduce its source text through toHtml().
     * @exception ParserException If the lexer fails.
     */
    public void testPureTag () throws ParserException
    {
        final String head = "<head>";
        final String body = "<body>";
        Lexer lexer;

        // a single tag on its own
        lexer = new Lexer (head);
        assertEquals ("Tag contents wrong", head, lexer.nextNode ().toHtml ());

        // two adjacent tags come back one after the other
        lexer = new Lexer (head + body);
        assertEquals ("Tag contents wrong", head, lexer.nextNode ().toHtml ());
        assertEquals ("Tag contents wrong", body, lexer.nextNode ().toHtml ());
    }

    /**
     * A tag carrying single-quoted, unquoted and double-quoted
     * attributes (with a newline between two of them) must reproduce
     * its source text through toHtml().
     * @exception ParserException If the lexer fails.
     */
    public void testAttributedTag () throws ParserException
    {
        final String tag = "<head lang='en_US' dir=ltr\nprofile=\"http://htmlparser.sourceforge.org/dictionary.html\">";
        final Node first = new Lexer (tag).nextNode ();

        assertEquals ("Tag contents wrong", tag, first.toHtml ());
    }

    /**
     * Remarks (comments) must be returned as Remark nodes that
     * reproduce their source text, both when the remark is the whole
     * input and when it is followed by a tag.
     * @exception ParserException If the lexer fails.
     */
    public void testRemark () throws ParserException
    {
        final String[] remarks =
        {
            "<!-- This is a comment -->",
            "<!-- This is a comment --  >",
            "<!-- This is a\nmultiline comment -->",
        };
        // first pass: remark alone; second pass: remark followed by a tag
        final String[] suffixes =
        {
            "",
            "<head>",
        };

        for (int s = 0; s < suffixes.length; s++)
        {
            for (int r = 0; r < remarks.length; r++)
            {
                Lexer lexer = new Lexer (remarks[r] + suffixes[s]);
                Remark remark = (Remark)lexer.nextNode ();
                assertEquals ("Tag contents wrong", remarks[r], remark.toHtml ());
            }
        }
    }

//    /**
//     * Try a real page.
//     */
//    public void testReal () throws ParserException, IOException
//    {
//        Lexer lexer;
//        Node node;
//
//        URL url = new URL ("http://sourceforge.net/projects/htmlparser");
//        lexer = new Lexer (url.openConnection ());
//        while (null != (node = lexer.nextNode ()))
//            System.out.println (node.toString ());
//    }

    /**
     * Test the fidelity of the toHtml() method.
     * Lexes the page at http://sourceforge.net (so network access is
     * needed), checks that each node starts where the previous one
     * ended, and checks that the concatenated toHtml() output of all
     * nodes is character-for-character the text held by the page.
     * @exception ParserException If the lexer fails.
     * @exception IOException If the connection cannot be opened.
     */
    public void testFidelity () throws ParserException, IOException
    {
        Lexer lexer;
        Node node;
        int position;
        StringBuffer buffer;
        String string;
        char[] ref;
        char[] test;

        URL url = new URL ("http://sourceforge.net");
        lexer = new Lexer (url.openConnection ());
        position = 0;
        buffer = new StringBuffer (80000);
        while (null != (node = lexer.nextNode ()))
        {
            string = node.toHtml ();
            // each node must begin exactly where the previous one ended
            if (position != node.getStartPosition ())
                fail ("non-contiguous: expected node to start at " + position
                    + " but it starts at " + node.getStartPosition () + ": " + string);
            buffer.append (string);
            position = node.getEndPosition ();
            // the regenerated text must be as long as the source consumed so far
            if (buffer.length () != position)
                fail ("text length differed after encountering node " + string
                    + ": accumulated " + buffer.length () + " characters but node ends at " + position);
        }
        ref = lexer.getPage ().getText ().toCharArray ();
        test = buffer.toString ().toCharArray ();
        assertEquals ("different amounts of text", ref.length, test.length);
        for (int i = 0; i < ref.length; i++)
            if (ref[i] != test[i])
                fail ("character differs at position " + i + ", expected <" + ref[i] + "> but was <" + test[i] + ">");
    }

//    /**
//     * Test the relative speed reading from a string parsing tags too.
//     */
//    public void testSpeedStringWithoutTags () throws ParserException, IOException
//    {
//        final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html";
//        URL url;
//        URLConnection connection;
//        Source source;
//        StringBuffer buffer;
//        int i;
//        String html;
//
//        long old_total;
//        long new_total;
//        long begin;
//        long end;
//        StringReader reader;
//        NodeReader nodes;
//        Parser parser;
//        int nodecount;
//        Node node;
//        int charcount;
//
//        url = new URL (link);
//        connection = url.openConnection ();
//        connection.connect ();
//        source = new Source (new Stream (connection.getInputStream ()));
//        buffer = new StringBuffer (350000);
//        while (-1 != (i = source.read ()))
//            buffer.append ((char)i);
//        source.close ();
//        html = buffer.toString ();
//        old_total = 0;
//        new_total = 0;
//        for (i = 0; i < 5; i++)
//        {
//            System.gc ();
//            begin = System.currentTimeMillis ();
//            Lexer lexer = new Lexer (html);
//            nodecount = 0;
//            while (null != (node = lexer.nextNode ()))
//                nodecount++;
//            end = System.currentTimeMillis ();
//            System.out.println ("     lexer: " + (end - begin) + " msec, " + nodecount + " nodes");
//            if (0 != i) // the first timing is way different
//                new_total += (end - begin);
//
//            System.gc ();
//            begin = System.currentTimeMillis ();
//            reader = new StringReader (html);
//            nodes =  new NodeReader (new BufferedReader (reader), 350000);
//            parser = new Parser (nodes, null);
//            nodecount = 0;
//            while (null != (node = nodes.readElement ()))
//                nodecount++;
//            end = System.currentTimeMillis ();
//            System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes");
//            if (0 != i) // the first timing is way different
//                old_total += (end - begin);
//        }
//        assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total);
//        System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster");
//    }
//
//    /**
//     * Test the relative speed reading from a string parsing tags too.
//     */
//    public void testSpeedStringWithTags () throws ParserException, IOException
//    {
//        final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html";
//        URL url;
//        URLConnection connection;
//        Source source;
//        StringBuffer buffer;
//        int i;
//        String html;
//
//        long old_total;
//        long new_total;
//        long begin;
//        long end;
//        StringReader reader;
//        NodeReader nodes;
//        Parser parser;
//        int nodecount;
//        Node node;
//        int charcount;
//
//        url = new URL (link);
//        connection = url.openConnection ();
//        connection.connect ();
//        source = new Source (new Stream (connection.getInputStream ()));
//        buffer = new StringBuffer (350000);
//        while (-1 != (i = source.read ()))
//            buffer.append ((char)i);
//        source.close ();
//        html = buffer.toString ();
//        old_total = 0;
//        new_total = 0;
//        for (i = 0; i < 5; i++)
//        {
//            System.gc ();
//            begin = System.currentTimeMillis ();
//            Lexer lexer = new Lexer (html);
//            nodecount = 0;
//            while (null != (node = lexer.nextNode ()))
//            {
//                nodecount++;
//                if (node instanceof TagNode)
//                    ((TagNode)node).getAttributes ();
//            }
//            end = System.currentTimeMillis ();
//            System.out.println ("     lexer: " + (end - begin) + " msec, " + nodecount + " nodes");
//            if (0 != i) // the first timing is way different
//                new_total += (end - begin);
//
//            System.gc ();
//            begin = System.currentTimeMillis ();
//            reader = new StringReader (html);
//            nodes =  new NodeReader (new BufferedReader (reader), 350000);
//            parser = new Parser (nodes, null);
//            nodecount = 0;
//            while (null != (node = nodes.readElement ()))
//            {
//                nodecount++;
//                if (node instanceof Tag)
//                    ((Tag)node).getAttributes ();
//            }
//            end = System.currentTimeMillis ();
//            System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes");
//            if (0 != i) // the first timing is way different
//                old_total += (end - begin);
//        }
//        assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total);
//        System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster");
//    }
//
//    public void testSpeedStreamWithoutTags () throws ParserException, IOException
//    {
//        final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html";
//        URL url;
//        URLConnection connection;
//        Source source;
//        StringBuffer buffer;
//        int i;
//        String html;
//        InputStream stream;
//
//        long old_total;
//        long new_total;
//        long begin;
//        long end;
//        InputStreamReader reader;
//        NodeReader nodes;
//        Parser parser;
//        int nodecount;
//        Node node;
//        int charcount;
//
//        url = new URL (link);
//        connection = url.openConnection ();
//        connection.connect ();
//        source = new Source (new Stream (connection.getInputStream ()));
//        buffer = new StringBuffer (350000);
//        while (-1 != (i = source.read ()))
//            buffer.append ((char)i);
//        source.close ();
//        html = buffer.toString ();
//        old_total = 0;
//        new_total = 0;
//
//        for (i = 0; i < 5; i++)
//        {
//
//            System.gc ();
//            begin = System.currentTimeMillis ();
//            stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET));
//            Lexer lexer = new Lexer (new Page (stream, Page.DEFAULT_CHARSET));
//            nodecount = 0;
//            while (null != (node = lexer.nextNode ()))
//                nodecount++;
//            end = System.currentTimeMillis ();
//            System.out.println ("     lexer: " + (end - begin) + " msec, " + nodecount + " nodes");
//            if (0 != i) // the first timing is way different
//                new_total += (end - begin);
//
//            System.gc ();
//            begin = System.currentTimeMillis ();
//            stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET));
//            reader = new InputStreamReader (stream);
//            nodes =  new NodeReader (reader, 350000);
//            parser = new Parser (nodes, null);
//            nodecount = 0;
//            while (null != (node = nodes.readElement ()))
//                nodecount++;
//            end = System.currentTimeMillis ();
//            System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes");
//            if (0 != i) // the first timing is way different
//                old_total += (end - begin);
//
//        }
//        assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total);
//        System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster");
//    }
//
//    public void testSpeedStreamWithTags () throws ParserException, IOException
//    {
//        final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html";
//        URL url;
//        URLConnection connection;
//        Source source;
//        StringBuffer buffer;
//        int i;
//        String html;
//        InputStream stream;
//
//        long old_total;
//        long new_total;
//        long begin;
//        long end;
//        InputStreamReader reader;
//        NodeReader nodes;
//        Parser parser;
//        int nodecount;
//        Node node;
//        int charcount;
//
//        url = new URL (link);
//        connection = url.openConnection ();
//        connection.connect ();
//        source = new Source (new Stream (connection.getInputStream ()));
//        buffer = new StringBuffer (350000);
//        while (-1 != (i = source.read ()))
//            buffer.append ((char)i);
//        source.close ();
//        html = buffer.toString ();
//        old_total = 0;
//        new_total = 0;
//
//        for (i = 0; i < 5; i++)
//        {
//
//            System.gc ();
//            begin = System.currentTimeMillis ();
//            stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET));
//            Lexer lexer = new Lexer (new Page (stream, Page.DEFAULT_CHARSET));
//            nodecount = 0;
//            while (null != (node = lexer.nextNode ()))
//            {
//                nodecount++;
//                if (node instanceof TagNode)
//                    ((TagNode)node).getAttributes ();
//            }
//            end = System.currentTimeMillis ();
//            System.out.println ("     lexer: " + (end - begin) + " msec, " + nodecount + " nodes");
//            if (0 != i) // the first timing is way different
//                new_total += (end - begin);
//
//            System.gc ();
//            begin = System.currentTimeMillis ();
//            stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET));
//            reader = new InputStreamReader (stream);
//            nodes =  new NodeReader (reader, 350000);
//            parser = new Parser (nodes, null);
//            nodecount = 0;
//            while (null != (node = nodes.readElement ()))
//            {
//                nodecount++;
//                if (node instanceof Tag)
//                    ((Tag)node).getAttributes ();
//            }
//            end = System.currentTimeMillis ();
//            System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes");
//            if (0 != i) // the first timing is way different
//                old_total += (end - begin);
//        }
//        assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total);
//        System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster");
//    }

//    public static void main (String[] args) throws ParserException, IOException
//    {
//        LexerTests tests = new LexerTests ("hallow");
//        tests.testSpeedStreamWithTags ();
//    }

    /**
     * Upper case tag names that checkTagNames() accepts.
     */
    static final HashSet mAcceptable;
    static
    {
        final String[] names =
        {
            "A", "BODY", "BR", "CENTER", "FONT",
            "HEAD", "HR", "HTML", "IMG", "P",
            "TABLE", "TD", "TITLE", "TR", "META",
            "STRONG", "FORM", "INPUT", "!DOCTYPE", "TBODY",
            "B", "DIV", "SCRIPT", "NOSCRIPT", "STYLE",
            "SPAN", "UL", "LI", "IFRAME", "LINK",
            "H1", "H3", "OBJECT", "PARAM", "EMBED",
        };

        mAcceptable = new HashSet ();
        for (int i = 0; i < names.length; i++)
            mAcceptable.add (names[i]);
    }

    /**
     * Test case for bug #789439 Japanese page causes OutOfMemory Exception
     * No exception is thrown in the current version of the parser,
     * however, the problem is that ISO-2022-JP (aka JIS) encoding sometimes
     * causes spurious tags.
     * The root cause is characters bracketed by [esc]$B and [esc](J (contrary
     * to what is indicated in the j_s_nightingale analysis of the problem) that
     * sometimes have an angle bracket (< or 0x3c) embedded in them. These
     * are taken to be tags by the parser, instead of being considered strings.
     * <p>
     * The URL referenced has an ISO-8859-1 encoding (the default), but
     * Japanese characters intermixed on the page with English, using the JIS
     * encoding. We detect failure by looking for weird tag names which were
     * not correctly handled as string nodes.
     * <p>
     * Here is a partial dump of the page with escape sequences:
     * <pre>
     * 0002420 1b 24 42 3f 79 4a 42 25 47 25 38 25 2b 25 61 43
     * 0002440 35 44 65 43 44 1b 28 4a 20 77 69 74 68 20 43 61
     * ..
     * 0002720 6c 22 3e 4a 53 6b 79 1b 24 42 42 50 31 7e 25 5a
     * 0002740 21 3c 25 38 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a 3c
     * ..
     * 0003060 20 69 1b 24 42 25 62 21 3c 25 49 42 50 31 7e 25
     * 0003100 5a 21 3c 25 38 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a
     * ..
     * 0003220 1b 24 42 25 2d 25 3f 25 5e 25 2f 25 69 24 4e 25
     * 0003240 5b 21 3c 25 60 25 5a 21 3c 25 38 1b 28 4a 3c 2f
     * ..
     * 0003320 6e 65 31 2e 70 6c 22 3e 1b 24 42 3d 60 48 77 43
     * 0003340 66 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a 2d 2d 2d 2d
     * ..
     * 0004400 46 6f 72 75 6d 20 30 30 39 20 28 1b 24 42 3e 21
     * 0004420 3c 6a 24 4b 31 4a 4a 21 44 2e 24 4a 24 49 1b 28
     * 0004440 4a 29 3c 2f 41 3e 3c 49 4d 47 20 53 52 43 3d 22
     * </pre>
     * <p>
     * The fix proposed by j_s_nightingale is implemented to swallow JIS
     * escape sequences in the string parser.
     * Apparently the fix won't help EUC-JP and Shift-JIS though, so this may
     * still be a problem.
     * It's theoretically possible that JIS encoding, or another one,
     * could be used as attribute names or values within tags as well,
     * but this is considered improbable and is therefore not handled in
     * the tag parser state machine.
     */
    public void testJIS ()
        throws ParserException
    {
        final Parser parser = new Parser ("http://www.009.com/");
        NodeIterator nodes;

        try
        {
            // first attempt with the encoding the parser started with
            for (nodes = parser.elements (); nodes.hasMoreNodes (); )
                checkTagNames (nodes.nextNode ());
        }
        catch (EncodingChangeException ece)
        {
            // the encoding changed part way through: reset and go again
            parser.reset ();
            for (nodes = parser.elements (); nodes.hasMoreNodes (); )
                checkTagNames (nodes.nextNode ());
        }
    }

    /**
     * Fail the test if the node is a tag whose name is not in
     * mAcceptable, then apply the same check to each of its children.
     * Nodes that are not tags are ignored.
     * @param node The node to check.
     */
    public void checkTagNames (Node node)
    {
        if (!(node instanceof Tag))
            return;

        final Tag tag = (Tag)node;
        final String name = tag.getTagName ();
        if (!mAcceptable.contains (name))
            fail ("unrecognized tag name \"" + name + "\"");

        final NodeList children = tag.getChildren ();
        if (null == children)
            return;
        for (int i = 0; i < children.size (); i++)
            checkTagNames (children.elementAt (i));
    }

    /**
     * See bug #825820 Words conjoined.
     * The plain text of three pages that differ only in where a single
     * newline sits (inside the title, after the title, after the body
     * tag) must be the same string in every case.
     * @exception ParserException If the parse fails.
     */
    public void testConjoined ()
        throws
            ParserException
    {
        final String expected = "The Title\nThis is the body.";
        final String[] pages =
        {
            // newline inside the title
            "<html><title>The Title\n</title>" +
                "<body>This is <a href=\"foo.html\">the body</a>.</body></html>",
            // newline between the title and the body
            "<html><title>The Title</title>\n" +
                "<body>This is <a href=\"foo.html\">the body</a>.</body></html>",
            // newline right after the body tag
            "<html><title>The Title</title>" +
                "<body>\nThis is <a href=\"foo.html\">the body</a>.</body></html>",
        };

        for (int i = 0; i < pages.length; i++)
        {
            createParser (pages[i]);
            StringBuffer plain = new StringBuffer ();
            NodeIterator nodes = parser.elements ();
            while (nodes.hasMoreNodes ())
                plain.append (nodes.nextNode ().toPlainTextString ());
            assertStringEquals ("conjoined text", expected, plain.toString ());
        }
    }

    /**
     * Check for StackOverflow error.
     * Parses three spellings of an empty anchor tag and verifies that
     * every node returned reproduces the input through toHtml().
     * @exception ParserException If the parse fails.
     */
    public void testStackOverflow ()
        throws
            ParserException
    {
        final String[] inputs =
        {
            "<a href = \"http://test.com\" />",
            "<a href=\"http://test.com\"/>",
            "<a href = \"http://test.com\"/>",
        };

        for (int i = 0; i < inputs.length; i++)
        {
            createParser (inputs[i]);
            NodeIterator nodes = parser.elements ();
            while (nodes.hasMoreNodes ())
            {
                String rendered = nodes.nextNode ().toHtml ();
                assertStringEquals ("no overflow", inputs[i], rendered);
            }
        }
    }

    /**
     * See bug #880283 Character ">" erroneously inserted by Lexer.
     * A JSP expression containing a quoted tag must come back as
     * exactly one node that reproduces the input.
     * @exception ParserException If the lexer fails.
     */
    public void testJsp () throws ParserException
    {
        final String html = "<% out.urlEncode('abc') + \"<br>\" + out.urlEncode('xyz') %>";
        final Lexer lexer = new Lexer (html);
        final Node first = lexer.nextNode ();

        if (null == first)
            fail ("too few nodes");
        assertStringEquals ("bad html", html, first.toHtml());
        assertNull ("too many nodes", lexer.nextNode ());
    }

    /**
     * Unit test for new PI parsing code.
     * A processing instruction containing quoted markup must come back
     * as exactly one node that reproduces the input.
     * @exception ParserException If the lexer fails.
     */
    public void testPI() throws ParserException
    {
        final String html = "<?php print(\"<p>Hello World!</p>\"); ?>";
        final Lexer lexer = new Lexer(html);
        final Node first = lexer.nextNode();

        if (null == first)
            fail ("too few nodes");
        assertStringEquals("bad html", html, first.toHtml());
        assertNull("too many nodes", lexer.nextNode());
    }

    /**
     * See bug #899413 bug in javascript end detection.
     * Script text holding an escaped quote, followed by an end tag,
     * must yield the script text as the first node and exactly one
     * more node after it; all nodes are fetched with nextNode(true).
     * @exception ParserException If the lexer fails.
     */
    public void testEscapedQuote () throws ParserException
    {
        final String script = "\na='\\'';\n";
        final String html = script + "</script>";
        final Lexer lexer = new Lexer (html);
        final Node first = lexer.nextNode (true);

        if (null == first)
            fail ("too few nodes");
        assertStringEquals ("bad string", script, first.toHtml());
        assertNotNull ("too few nodes", lexer.nextNode (true));
        assertNull ("too many nodes", lexer.nextNode (true));
    }

    /**
     * See bug #1227213 Particular SCRIPT tags close too late.
     * A script whose code is wrapped in an HTML comment must parse to
     * a single ScriptTag that reproduces the input and exposes the
     * comment as its script code.
     * @exception ParserException If the parse fails.
     */
    public void testCommentInScript () throws ParserException
    {
        final String tag = "<script>";
        final String cdata = "<!--document.write(\"en\");// -->";
        final String endtag = "</script>";
        final String html = tag + cdata + endtag;

        final Parser parser = new Parser ();
        parser.setInputHTML (html);
        final NodeIterator nodes = parser.elements ();
        final Node first = nodes.nextNode ();

        if (null == first)
            fail ("too few nodes");
        assertStringEquals ("bad parse", html, first.toHtml());
        assertTrue (first instanceof ScriptTag);
        assertStringEquals ("bad cdata", cdata, ((ScriptTag)first).getScriptCode ());
        assertNull ("too many nodes", nodes.nextNode ());
    }

    /**
     * See bug #1227213 Particular SCRIPT tags close too late.
     * This was actually working prior to the patch, since the
     * ScriptScanner didn't use smartquote processing.
     * I'm not sure why jwilsonsprings1 said the patch worked
     * for him. I can only assume he was mistaken in thinking
     * it was the URL that caused the failure.
     * <p>
     * A style element whose rule contains a url(...) value must parse
     * to a single StyleTag that reproduces the input and exposes the
     * rule as its style code.
     * @exception ParserException If the parse fails.
     */
    public void testUrlInStyle () throws ParserException
    {
        final String tag = "<style>";
        final String cdata = ".eSDot {background-image:" +
            "url(http://di.image.eshop.msn.com/img/sys/dot.gif)}";
        final String endtag = "</style>";
        final String html = tag + cdata + endtag;

        final Parser parser = new Parser ();
        parser.setInputHTML (html);
        final NodeIterator nodes = parser.elements ();
        final Node first = nodes.nextNode ();

        if (null == first)
            fail ("too few nodes");
        assertStringEquals ("bad parse", html, first.toHtml());
        assertTrue (first instanceof StyleTag);
        assertStringEquals ("bad cdata", cdata, ((StyleTag)first).getStyleCode ());
        assertNull ("too many nodes", nodes.nextNode ());
    }

    /**
     * See bug #1493884 Lexer returns a TagNode with a 'null' name.
     * A tag whose opening "&lt;!" is immediately followed by a Dos line
     * ending must still be returned as a Tag with the name "!".
     * @exception ParserException If the parse fails.
     */
    public void testDosLineEndingInName () throws ParserException
    {
        final String html = "<!\r\nMSIE->";

        // uses the parser field rather than a local, as createParser() does
        parser = new Parser ();
        parser.setInputHTML (html);
        final NodeIterator nodes = parser.elements ();
        final Node first = nodes.nextNode ();

        if (null == first)
            fail ("too few nodes");
        assertNotNull ("null node", first);
        assertTrue (first instanceof Tag);
        final Tag tag = (Tag)first;
        assertNotNull ("null name", tag.getTagName ());
        assertStringEquals ("bad parse", "!", tag.getTagName ());
    }
}