// HTMLParser Library $Name: v1_6 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/LexerTests.java,v $
// $Author: derrickoswald $
// $Date: 2006/05/27 17:06:28 $
// $Revision: 1.31 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
package org.htmlparser.tests.lexerTests;
import java.io.IOException;
import java.net.URL;
import java.util.HashSet;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.Remark;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.tags.StyleTag;
import org.htmlparser.tests.ParserTestCase;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
public class LexerTests extends ParserTestCase
{
static
{
System.setProperty ("org.htmlparser.tests.lexerTests.LexerTests", "LexerTests");
}
/**
* Create a lexer test case.
* The name is handed unchanged to the superclass constructor.
* @param name The name passed on to the superclass constructor.
*/
public LexerTests (String name)
{
super (name);
}
/**
 * Test operation without tags.
 * The first node lexed from tag-free input must be a text node whose
 * text equals the input.
 * @exception ParserException If the lexer fails.
 */
public void testPureText() throws ParserException
{
    final String input = "Hello world";
    final Text first = (Text) new Lexer(input).nextNode();
    assertEquals("Text contents wrong", input, first.getText());
}
/**
 * Test operation with Unix line endings.
 * A line feed inside the text must come back unchanged in the first text node.
 * @exception ParserException If the lexer fails.
 */
public void testUnixEOL() throws ParserException
{
    final String input = "Hello\nworld";
    final Text first = (Text) new Lexer(input).nextNode();
    assertEquals("Text contents wrong", input, first.getText());
}
/**
 * Test operation with Dos line endings.
 * Covers both a CR-LF pair and a lone CR inside the text; each must come
 * back unchanged in the first text node.
 * @exception ParserException If the lexer fails.
 */
public void testDosEOL() throws ParserException
{
    final String[] inputs =
    {
        "Hello\r\nworld",
        "Hello\rworld",
    };
    for (int i = 0; i < inputs.length; i++)
    {
        final Text first = (Text) new Lexer(inputs[i]).nextNode();
        assertEquals("Text contents wrong", inputs[i], first.getText());
    }
}
/**
 * Test operation with line endings near the end of input.
 * The input ends with LF, CR and CR-LF in turn; the trailing line ending
 * must be part of the first text node each time.
 * @exception ParserException If the lexer fails.
 */
public void testEOF_EOL() throws ParserException
{
    final String[] inputs =
    {
        "Hello world\n",
        "Hello world\r",
        "Hello world\r\n",
    };
    for (int i = 0; i < inputs.length; i++)
    {
        final Text first = (Text) new Lexer(inputs[i]).nextNode();
        assertEquals("Text contents wrong", inputs[i], first.getText());
    }
}
/**
 * Test that tags stop string nodes.
 * Every leading text is combined with every kind of following markup
 * (open tag, end tag, JSP expression, processing instruction, remark);
 * the first node must contain the leading text only.
 * @exception ParserException If the lexer fails.
 */
public void testTagStops() throws ParserException
{
    final String[] texts =
    {
        "Hello world",
        "Hello world\n",
        "Hello world\r\n",
        "Hello world\r",
    };
    final String[] followers =
    {
        "<head>",
        "</head>",
        "<%=head%>",
        "<?php ?>",
        "<!--head-->",
    };
    for (int t = 0; t < texts.length; t++)
    {
        final String expected = texts[t];
        for (int f = 0; f < followers.length; f++)
        {
            final Text first = (Text) new Lexer(expected + followers[f]).nextNode();
            assertEquals("Text contents wrong", expected, first.getText());
        }
    }
}
/**
 * Test operation with only tags.
 * Checks a lone tag first, then two adjacent tags which must come back
 * as two separate nodes in input order.
 * @exception ParserException If the lexer fails.
 */
public void testPureTag() throws ParserException
{
    final String head = "<head>";
    final String body = "<body>";

    // a single tag is the whole input
    Lexer lexer = new Lexer(head);
    assertEquals("Tag contents wrong", head, lexer.nextNode().toHtml());

    // two tags back to back
    lexer = new Lexer(head + body);
    assertEquals("Tag contents wrong", head, lexer.nextNode().toHtml());
    assertEquals("Tag contents wrong", body, lexer.nextNode().toHtml());
}
/**
 * Test operation with attributed tags.
 * The tag mixes single-quoted, unquoted and double-quoted attribute
 * values and contains a line feed; toHtml() must reproduce it verbatim.
 * @exception ParserException If the lexer fails.
 */
public void testAttributedTag() throws ParserException
{
    final String tag = "<head lang='en_US' dir=ltr\nprofile=\"http://htmlparser.sourceforge.org/dictionary.html\">";
    final Node first = new Lexer(tag).nextNode();
    assertEquals("Tag contents wrong", tag, first.toHtml());
}
/**
 * Test operation with comments.
 * Three remark forms (plain, with a space before the closing bracket,
 * and multiline) are lexed first on their own and then followed by a
 * tag; the first node must always be a remark reproducing the comment.
 * @exception ParserException If the lexer fails.
 */
public void testRemark() throws ParserException
{
    final String[] remarks =
    {
        "<!-- This is a comment -->",
        "<!-- This is a comment -- >",
        "<!-- This is a\nmultiline comment -->",
    };
    // first pass: nothing after the remark; second pass: a tag follows it
    final String[] trailers = { "", "<head>" };
    for (int t = 0; t < trailers.length; t++)
    {
        for (int r = 0; r < remarks.length; r++)
        {
            final Remark remark = (Remark) new Lexer(remarks[r] + trailers[t]).nextNode();
            assertEquals("Tag contents wrong", remarks[r], remark.toHtml());
        }
    }
}
// /**
// * Try a real page.
// */
// public void testReal () throws ParserException, IOException
// {
// Lexer lexer;
// Node node;
//
// URL url = new URL ("http://sourceforge.net/projects/htmlparser");
// lexer = new Lexer (url.openConnection ());
// while (null != (node = lexer.nextNode ()))
// System.out.println (node.toString ());
// }
/**
 * Test the fidelity of the toHtml() method.
 * Lexes a live page and checks that the nodes tile the page text exactly:
 * each node must start where the previous one ended, the accumulated
 * toHtml() output must be as long as the node's end position, and the
 * concatenated output must equal the page text character for character.
 * <p>
 * Note: this test opens a connection to http://sourceforge.net, so it
 * needs network access.
 * @exception ParserException If the lexer fails.
 * @exception IOException If the URL is malformed or the connection cannot be opened.
 */
public void testFidelity() throws ParserException, IOException
{
    final URL url = new URL("http://sourceforge.net");
    final Lexer lexer = new Lexer(url.openConnection());
    final StringBuffer rebuilt = new StringBuffer(80000);
    int position = 0;
    Node node;

    while (null != (node = lexer.nextNode()))
    {
        final String html = node.toHtml();
        // a gap or overlap between consecutive nodes means text was lost or duplicated
        if (position != node.getStartPosition())
            fail("non-contiguous node: expected start position " + position
                + " but was " + node.getStartPosition() + ", node: " + html);
        rebuilt.append(html);
        position = node.getEndPosition();
        // the regenerated text must be exactly as long as the source consumed so far
        if (rebuilt.length() != position)
            fail("text length differed after encountering node " + html
                + ": regenerated " + rebuilt.length() + " characters but node ends at " + position);
    }

    final char[] ref = lexer.getPage().getText().toCharArray();
    final char[] test = rebuilt.toString().toCharArray();
    assertEquals("different amounts of text", ref.length, test.length);
    for (int i = 0; i < ref.length; i++)
        if (ref[i] != test[i])
            fail("character differs at position " + i + ", expected <" + ref[i] + "> but was <" + test[i] + ">");
}
// /**
// * Test the relative speed reading from a string parsing tags too.
// */
// public void testSpeedStringWithoutTags () throws ParserException, IOException
// {
// final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html";
// URL url;
// URLConnection connection;
// Source source;
// StringBuffer buffer;
// int i;
// String html;
//
// long old_total;
// long new_total;
// long begin;
// long end;
// StringReader reader;
// NodeReader nodes;
// Parser parser;
// int nodecount;
// Node node;
// int charcount;
//
// url = new URL (link);
// connection = url.openConnection ();
// connection.connect ();
// source = new Source (new Stream (connection.getInputStream ()));
// buffer = new StringBuffer (350000);
// while (-1 != (i = source.read ()))
// buffer.append ((char)i);
// source.close ();
// html = buffer.toString ();
// old_total = 0;
// new_total = 0;
// for (i = 0; i < 5; i++)
// {
// System.gc ();
// begin = System.currentTimeMillis ();
// Lexer lexer = new Lexer (html);
// nodecount = 0;
// while (null != (node = lexer.nextNode ()))
// nodecount++;
// end = System.currentTimeMillis ();
// System.out.println (" lexer: " + (end - begin) + " msec, " + nodecount + " nodes");
// if (0 != i) // the first timing is way different
// new_total += (end - begin);
//
// System.gc ();
// begin = System.currentTimeMillis ();
// reader = new StringReader (html);
// nodes = new NodeReader (new BufferedReader (reader), 350000);
// parser = new Parser (nodes, null);
// nodecount = 0;
// while (null != (node = nodes.readElement ()))
// nodecount++;
// end = System.currentTimeMillis ();
// System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes");
// if (0 != i) // the first timing is way different
// old_total += (end - begin);
// }
// assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total);
// System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster");
// }
//
// /**
// * Test the relative speed reading from a string parsing tags too.
// */
// public void testSpeedStringWithTags () throws ParserException, IOException
// {
// final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html";
// URL url;
// URLConnection connection;
// Source source;
// StringBuffer buffer;
// int i;
// String html;
//
// long old_total;
// long new_total;
// long begin;
// long end;
// StringReader reader;
// NodeReader nodes;
// Parser parser;
// int nodecount;
// Node node;
// int charcount;
//
// url = new URL (link);
// connection = url.openConnection ();
// connection.connect ();
// source = new Source (new Stream (connection.getInputStream ()));
// buffer = new StringBuffer (350000);
// while (-1 != (i = source.read ()))
// buffer.append ((char)i);
// source.close ();
// html = buffer.toString ();
// old_total = 0;
// new_total = 0;
// for (i = 0; i < 5; i++)
// {
// System.gc ();
// begin = System.currentTimeMillis ();
// Lexer lexer = new Lexer (html);
// nodecount = 0;
// while (null != (node = lexer.nextNode ()))
// {
// nodecount++;
// if (node instanceof TagNode)
// ((TagNode)node).getAttributes ();
// }
// end = System.currentTimeMillis ();
// System.out.println (" lexer: " + (end - begin) + " msec, " + nodecount + " nodes");
// if (0 != i) // the first timing is way different
// new_total += (end - begin);
//
// System.gc ();
// begin = System.currentTimeMillis ();
// reader = new StringReader (html);
// nodes = new NodeReader (new BufferedReader (reader), 350000);
// parser = new Parser (nodes, null);
// nodecount = 0;
// while (null != (node = nodes.readElement ()))
// {
// nodecount++;
// if (node instanceof Tag)
// ((Tag)node).getAttributes ();
// }
// end = System.currentTimeMillis ();
// System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes");
// if (0 != i) // the first timing is way different
// old_total += (end - begin);
// }
// assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total);
// System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster");
// }
//
// public void testSpeedStreamWithoutTags () throws ParserException, IOException
// {
// final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html";
// URL url;
// URLConnection connection;
// Source source;
// StringBuffer buffer;
// int i;
// String html;
// InputStream stream;
//
// long old_total;
// long new_total;
// long begin;
// long end;
// InputStreamReader reader;
// NodeReader nodes;
// Parser parser;
// int nodecount;
// Node node;
// int charcount;
//
// url = new URL (link);
// connection = url.openConnection ();
// connection.connect ();
// source = new Source (new Stream (connection.getInputStream ()));
// buffer = new StringBuffer (350000);
// while (-1 != (i = source.read ()))
// buffer.append ((char)i);
// source.close ();
// html = buffer.toString ();
// old_total = 0;
// new_total = 0;
//
// for (i = 0; i < 5; i++)
// {
//
// System.gc ();
// begin = System.currentTimeMillis ();
// stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET));
// Lexer lexer = new Lexer (new Page (stream, Page.DEFAULT_CHARSET));
// nodecount = 0;
// while (null != (node = lexer.nextNode ()))
// nodecount++;
// end = System.currentTimeMillis ();
// System.out.println (" lexer: " + (end - begin) + " msec, " + nodecount + " nodes");
// if (0 != i) // the first timing is way different
// new_total += (end - begin);
//
// System.gc ();
// begin = System.currentTimeMillis ();
// stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET));
// reader = new InputStreamReader (stream);
// nodes = new NodeReader (reader, 350000);
// parser = new Parser (nodes, null);
// nodecount = 0;
// while (null != (node = nodes.readElement ()))
// nodecount++;
// end = System.currentTimeMillis ();
// System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes");
// if (0 != i) // the first timing is way different
// old_total += (end - begin);
//
// }
// assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total);
// System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster");
// }
//
// public void testSpeedStreamWithTags () throws ParserException, IOException
// {
// final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html";
// URL url;
// URLConnection connection;
// Source source;
// StringBuffer buffer;
// int i;
// String html;
// InputStream stream;
//
// long old_total;
// long new_total;
// long begin;
// long end;
// InputStreamReader reader;
// NodeReader nodes;
// Parser parser;
// int nodecount;
// Node node;
// int charcount;
//
// url = new URL (link);
// connection = url.openConnection ();
// connection.connect ();
// source = new Source (new Stream (connection.getInputStream ()));
// buffer = new StringBuffer (350000);
// while (-1 != (i = source.read ()))
// buffer.append ((char)i);
// source.close ();
// html = buffer.toString ();
// old_total = 0;
// new_total = 0;
//
// for (i = 0; i < 5; i++)
// {
//
// System.gc ();
// begin = System.currentTimeMillis ();
// stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET));
// Lexer lexer = new Lexer (new Page (stream, Page.DEFAULT_CHARSET));
// nodecount = 0;
// while (null != (node = lexer.nextNode ()))
// {
// nodecount++;
// if (node instanceof TagNode)
// ((TagNode)node).getAttributes ();
// }
// end = System.currentTimeMillis ();
// System.out.println (" lexer: " + (end - begin) + " msec, " + nodecount + " nodes");
// if (0 != i) // the first timing is way different
// new_total += (end - begin);
//
// System.gc ();
// begin = System.currentTimeMillis ();
// stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET));
// reader = new InputStreamReader (stream);
// nodes = new NodeReader (reader, 350000);
// parser = new Parser (nodes, null);
// nodecount = 0;
// while (null != (node = nodes.readElement ()))
// {
// nodecount++;
// if (node instanceof Tag)
// ((Tag)node).getAttributes ();
// }
// end = System.currentTimeMillis ();
// System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes");
// if (0 != i) // the first timing is way different
// old_total += (end - begin);
// }
// assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total);
// System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster");
// }
// public static void main (String[] args) throws ParserException, IOException
// {
// LexerTests tests = new LexerTests ("hallow");
// tests.testSpeedStreamWithTags ();
// }
/**
 * Upper-case tag names accepted by checkTagNames(); any tag whose name
 * is not in this set makes that check fail.
 */
static final HashSet mAcceptable;
static
{
    final String[] names =
    {
        "A", "BODY", "BR", "CENTER", "FONT",
        "HEAD", "HR", "HTML", "IMG", "P",
        "TABLE", "TD", "TITLE", "TR", "META",
        "STRONG", "FORM", "INPUT", "!DOCTYPE", "TBODY",
        "B", "DIV", "SCRIPT", "NOSCRIPT", "STYLE",
        "SPAN", "UL", "LI", "IFRAME", "LINK",
        "H1", "H3", "OBJECT", "PARAM", "EMBED",
    };
    mAcceptable = new HashSet();
    for (int i = 0; i < names.length; i++)
        mAcceptable.add(names[i]);
}
/**
* Test case for bug #789439 Japanese page causes OutOfMemory Exception
* No exception is thrown in the current version of the parser,
* however, the problem is that ISO-2022-JP (aka JIS) encoding sometimes
* causes spurious tags.
* The root cause is characters bracketed by [esc]$B and [esc](J (contrary
* to what is indicated in the j_s_nightingale analysis of the problem) that
* sometimes have an angle bracket (< or 0x3c) embedded in them. These
* are taken to be tags by the parser, instead of being considered strings.
* <p>
* The URL referenced has an ISO-8859-1 encoding (the default), but
* Japanese characters intermixed on the page with English, using the JIS
* encoding. We detect failure by looking for weird tag names which were
* not correctly handled as string nodes.
* <p>
* Here is a partial dump of the page with escape sequences:
* <pre>
* 0002420 1b 24 42 3f 79 4a 42 25 47 25 38 25 2b 25 61 43
* 0002440 35 44 65 43 44 1b 28 4a 20 77 69 74 68 20 43 61
* ..
* 0002720 6c 22 3e 4a 53 6b 79 1b 24 42 42 50 31 7e 25 5a
* 0002740 21 3c 25 38 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a 3c
* ..
* 0003060 20 69 1b 24 42 25 62 21 3c 25 49 42 50 31 7e 25
* 0003100 5a 21 3c 25 38 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a
* ..
* 0003220 1b 24 42 25 2d 25 3f 25 5e 25 2f 25 69 24 4e 25
* 0003240 5b 21 3c 25 60 25 5a 21 3c 25 38 1b 28 4a 3c 2f
* ..
* 0003320 6e 65 31 2e 70 6c 22 3e 1b 24 42 3d 60 48 77 43
* 0003340 66 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a 2d 2d 2d 2d
* ..
* 0004400 46 6f 72 75 6d 20 30 30 39 20 28 1b 24 42 3e 21
* 0004420 3c 6a 24 4b 31 4a 4a 21 44 2e 24 4a 24 49 1b 28
* 0004440 4a 29 3c 2f 41 3e 3c 49 4d 47 20 53 52 43 3d 22
* </pre>
* <p>
* The fix proposed by j_s_nightingale is implemented to swallow JIS
* escape sequences in the string parser.
* Apparently the fix won't help EUC-JP and Shift-JIS though, so this may
* still be a problem.
* It's theoretically possible that JIS encoding, or another one,
* could be used as attribute names or values within tags as well,
* but this is considered improbable and is therefore not handled in
* the tag parser state machine.
*/
public void testJIS()
    throws ParserException
{
    final Parser jisParser = new Parser("http://www.009.com/");
    try
    {
        checkAllTagNames(jisParser);
    }
    catch (EncodingChangeException ece)
    {
        // the encoding changed part way through: rewind and run the
        // whole check once more; a second failure is not caught
        jisParser.reset();
        checkAllTagNames(jisParser);
    }
}

/**
 * Run checkTagNames() over every top level node the parser delivers.
 * @param source The parser to pull nodes from.
 * @exception ParserException If the parser fails while iterating.
 */
private void checkAllTagNames(Parser source)
    throws ParserException
{
    for (NodeIterator nodes = source.elements(); nodes.hasMoreNodes(); )
        checkTagNames(nodes.nextNode());
}
/**
 * Check that a tag's name is one of the names in mAcceptable, failing
 * the test otherwise, then apply the same check to each child.
 * Nodes that are not tags are ignored.
 * @param node The node to check.
 */
public void checkTagNames(Node node)
{
    if (!(node instanceof Tag))
        return;

    final Tag tag = (Tag) node;
    final String name = tag.getTagName();
    if (!mAcceptable.contains(name))
        fail("unrecognized tag name \"" + name + "\"");

    final NodeList kids = tag.getChildren();
    if (null == kids)
        return;
    int index = 0;
    while (index < kids.size())
    {
        checkTagNames(kids.elementAt(index));
        index++;
    }
}
/**
 * See bug #825820 Words conjoined.
 * The same document is parsed with its single line feed in three
 * different places (inside the title, after the title, at the start of
 * the body); the concatenated plain text of the top level nodes must be
 * identical in all three cases.
 * @exception ParserException If the parser fails.
 */
public void testConjoined()
    throws
        ParserException
{
    final String expected = "The Title\nThis is the body.";
    final String[] pages =
    {
        // line feed inside the title
        "<html><title>The Title\n</title>" +
        "<body>This is <a href=\"foo.html\">the body</a>.</body></html>",
        // line feed between title and body
        "<html><title>The Title</title>\n" +
        "<body>This is <a href=\"foo.html\">the body</a>.</body></html>",
        // line feed at the start of the body
        "<html><title>The Title</title>" +
        "<body>\nThis is <a href=\"foo.html\">the body</a>.</body></html>",
    };

    for (int i = 0; i < pages.length; i++)
    {
        // 'parser' is not declared in this method; presumably createParser()
        // sets the inherited field - confirm against ParserTestCase
        createParser(pages[i]);
        final StringBuffer text = new StringBuffer();
        final NodeIterator nodes = parser.elements();
        while (nodes.hasMoreNodes())
            text.append(nodes.nextNode().toPlainTextString());
        assertStringEquals("conjoined text", expected, text.toString());
    }
}
/**
 * Check for StackOverflow error.
 * Parses a self-closing anchor written with three different spacings
 * around the equals sign and the closing slash; every node delivered
 * must reproduce the input exactly.
 * @exception ParserException If the parser fails.
 */
public void testStackOverflow()
    throws
        ParserException
{
    final String[] anchors =
    {
        "<a href = \"http://test.com\" />",
        "<a href=\"http://test.com\"/>",
        "<a href = \"http://test.com\"/>",
    };

    for (int i = 0; i < anchors.length; i++)
    {
        final String html = anchors[i];
        // 'parser' is not declared in this method; presumably createParser()
        // sets the inherited field - confirm against ParserTestCase
        createParser(html);
        final NodeIterator nodes = parser.elements();
        while (nodes.hasMoreNodes())
        {
            final Node node = nodes.nextNode();
            assertStringEquals("no overflow", html, node.toHtml());
        }
    }
}
/**
 * See bug #880283 Character ">" erroneously inserted by Lexer.
 * A JSP block containing a quoted tag must come back as exactly one
 * node whose html equals the input.
 * @exception ParserException If the lexer fails.
 */
public void testJsp() throws ParserException
{
    final String jsp = "<% out.urlEncode('abc') + \"<br>\" + out.urlEncode('xyz') %>";
    final Lexer lexer = new Lexer(jsp);
    final Node first = lexer.nextNode();
    if (null == first)
        fail("too few nodes");
    assertStringEquals("bad html", jsp, first.toHtml());
    assertNull("too many nodes", lexer.nextNode());
}
/**
 * Unit test for new PI parsing code.
 * A processing instruction containing quoted markup must come back as
 * exactly one node whose html equals the input.
 * @exception ParserException If the lexer fails.
 */
public void testPI() throws ParserException
{
    final String pi = "<?php print(\"<p>Hello World!</p>\"); ?>";
    final Lexer lexer = new Lexer(pi);
    final Node first = lexer.nextNode();
    if (null == first)
        fail("too few nodes");
    assertStringEquals("bad html", pi, first.toHtml());
    assertNull("too many nodes", lexer.nextNode());
}
/**
 * See bug #899413 bug in javascript end detection.
 * Script text holding an escaped quote inside a quoted string is
 * followed by an end tag; with nextNode(true) the first node must be
 * the script text alone, followed by exactly one more node.
 * @exception ParserException If the lexer fails.
 */
public void testEscapedQuote() throws ParserException
{
    final String script = "\na='\\'';\n";
    final Lexer lexer = new Lexer(script + "</script>");
    final Node first = lexer.nextNode(true);
    if (null == first)
        fail("too few nodes");
    assertStringEquals("bad string", script, first.toHtml());
    assertNotNull("too few nodes", lexer.nextNode(true));
    assertNull("too many nodes", lexer.nextNode(true));
}
/**
 * See bug #1227213 Particular SCRIPT tags close too late.
 * A script whose content is an html comment wrapping javascript must
 * parse to a single ScriptTag reproducing the input, with the comment
 * as its script code.
 * @exception ParserException If the parser fails.
 */
public void testCommentInScript() throws ParserException
{
    final String open = "<script>";
    final String code = "<!--document.write(\"en\");// -->";
    final String close = "</script>";
    final String html = open + code + close;

    final Parser scriptParser = new Parser();
    scriptParser.setInputHTML(html);
    final NodeIterator nodes = scriptParser.elements();
    final Node first = nodes.nextNode();
    if (null == first)
        fail("too few nodes");
    assertStringEquals("bad parse", html, first.toHtml());
    assertTrue(first instanceof ScriptTag);
    assertStringEquals("bad cdata", code, ((ScriptTag) first).getScriptCode());
    assertNull("too many nodes", nodes.nextNode());
}
/**
 * See bug #1227213 Particular SCRIPT tags close too late.
 * This was actually working prior to the patch, since the
 * ScriptScanner didn't use smartquote processing.
 * I'm not sure why jwilsonsprings1 said the patch worked
 * for him. I can only assume he was mistaken in thinking
 * it was the URL that caused the failure.
 * <p>
 * A style element whose rule contains a url() must parse to a single
 * StyleTag reproducing the input, with the rule as its style code.
 * @exception ParserException If the parser fails.
 */
public void testUrlInStyle() throws ParserException
{
    final String open = "<style>";
    final String rule = ".eSDot {background-image:" +
        "url(http://di.image.eshop.msn.com/img/sys/dot.gif)}";
    final String close = "</style>";
    final String html = open + rule + close;

    final Parser styleParser = new Parser();
    styleParser.setInputHTML(html);
    final NodeIterator nodes = styleParser.elements();
    final Node first = nodes.nextNode();
    if (null == first)
        fail("too few nodes");
    assertStringEquals("bad parse", html, first.toHtml());
    assertTrue(first instanceof StyleTag);
    assertStringEquals("bad cdata", rule, ((StyleTag) first).getStyleCode());
    assertNull("too many nodes", nodes.nextNode());
}
/**
 * See bug #1493884 Lexer returns a TagNode with a 'null' name.
 * A tag whose opening "<!" is immediately followed by a CR-LF must
 * parse to a tag with the non-null name "!".
 * @exception ParserException If the parser fails.
 */
public void testDosLineEndingInName() throws ParserException
{
    final String html = "<!\r\nMSIE->";

    // no local declaration here: this assigns the 'parser' member,
    // presumably inherited from ParserTestCase - confirm
    parser = new Parser();
    parser.setInputHTML(html);
    final NodeIterator nodes = parser.elements();
    final Node first = nodes.nextNode();
    if (null == first)
        fail("too few nodes");
    assertNotNull("null node", first);
    assertTrue(first instanceof Tag);
    assertNotNull("null name", ((Tag) first).getTagName());
    assertStringEquals("bad parse", "!", ((Tag) first).getTagName());
}
}