package org.archive.format.text.html;
import org.archive.format.text.html.CDATALexer;
import org.archive.format.text.html.NodeUtils;
import org.htmlparser.Node;
import org.htmlparser.lexer.Page;
//import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util.ParserException;
import junit.framework.TestCase;
/**
 * Tests for {@link CDATALexer}: checks the sequence of nodes it produces and
 * the values reported by {@code inCSS()} / {@code inJS()} while stepping
 * through plain markup, {@code <script>} content and {@code <style>} content.
 */
public class CDATALexerTest extends TestCase {

    /** Lexer under test; re-created for every document that is parsed. */
    CDATALexer l;

    /** Node most recently returned by {@link #l}. */
    Node n;

    /**
     * Creates a lexer positioned at the start of the given markup.
     *
     * @param html markup to tokenize
     * @return a new lexer whose page wraps {@code html}
     */
    private CDATALexer makeLexer(String html) {
        CDATALexer lexer = new CDATALexer();
        Page page = new Page(html);
        lexer.setPage(page);
        return lexer;
    }

    /**
     * Asserts the CSS/JS flags currently reported by the lexer, CSS first.
     *
     * @param expectCss expected value of {@code inCSS()}
     * @param expectJs expected value of {@code inJS()}
     */
    private void assertLexerState(boolean expectCss, boolean expectJs) {
        assertTrue(l.inCSS() == expectCss);
        assertTrue(l.inJS() == expectJs);
    }

    /**
     * Lexes a document consisting of a single element and checks that it
     * comes back as exactly open tag, one text node, close tag, with the
     * CSS/JS flags only set while the text node is the current node.
     *
     * @param html the complete document
     * @param tagName upper-case tag name expected on the open and close tags
     * @param expectedText text expected in the middle node
     * @param cssInside expected {@code inCSS()} after reading the text node
     * @param jsInside expected {@code inJS()} after reading the text node
     * @throws ParserException if the lexer fails
     */
    private void assertRawTextElement(String html, String tagName,
            String expectedText, boolean cssInside, boolean jsInside)
            throws ParserException {
        l = makeLexer(html);
        assertLexerState(false, false);

        // Open tag: the flags are still clear once it has been returned.
        n = l.nextNode();
        assertLexerState(false, false);
        assertTrue(NodeUtils.isNonEmptyOpenTagNodeNamed(n, tagName));

        // Element content arrives as a single text node.
        n = l.nextNode();
        assertLexerState(cssInside, jsInside);
        assertTrue(NodeUtils.isTextNode(n));
        TextNode content = (TextNode) n;
        assertEquals(expectedText, content.getText());

        // Close tag: the flags are clear again.
        n = l.nextNode();
        assertLexerState(false, false);
        assertTrue(NodeUtils.isCloseTagNodeNamed(n, tagName));
    }

    /**
     * An anchor element is tokenized as open tag (with its attribute), text,
     * close tag, and then the lexer reports the end of input with null.
     */
    public void testNextNode() throws ParserException {
        l = makeLexer("<a href=\"foo\">blem</a>");

        n = l.nextNode();
        assertLexerState(false, false);
        assertTrue(NodeUtils.isNonEmptyOpenTagNodeNamed(n, "A"));
        TagNode anchor = (TagNode) n;
        assertEquals("foo", anchor.getAttribute("HREF"));

        n = l.nextNode();
        assertTrue(NodeUtils.isTextNode(n));
        TextNode label = (TextNode) n;
        assertEquals("blem", label.getText());

        n = l.nextNode();
        assertTrue(NodeUtils.isCloseTagNodeNamed(n, "A"));

        Node afterEnd = l.nextNode();
        assertNull(afterEnd);
    }

    /** {@code inJS()} is true only for the text inside a script element. */
    public void testInJS() throws ParserException {
        assertRawTextElement("<script>foo bar baz</script>", "SCRIPT",
                "foo bar baz", false, true);
    }

    /** {@code inCSS()} is true only for the text inside a style element. */
    public void testInCSS() throws ParserException {
        assertRawTextElement("<style>foo bar baz</style>", "STYLE",
                "foo bar baz", true, false);
    }

    /**
     * Script bodies containing HTML comment markers, {@code <} comparisons,
     * embedded markup or a CDATA section must each come back as one
     * unmodified text node.
     * <p>
     * To see how the lexer tokenizes any of these, wrap the snippet in
     * {@code <script>...</script>} and feed it to the commented-out
     * {@code dumpParse} helper at the bottom of this class.
     */
    public void testInJSComment() throws ParserException {
        String[] scriptBodies = {
            "//<!--\n foo bar baz\n //-->",
            "<!-- foo bar baz -->",
            "//<!-- foo bar baz -->",
            "<!-- foo bar baz //-->",
            "\n//<!-- foo bar baz\n //-->",
            "if(1 < 2) { foo(); } ",
            "if(1 <n) { foo(); } ",
            "document.write(\"<b>bold</b>\"); ",
            "document.write(\"<script>bold</script>\"); ",
            "<![CDATA[\n if(i<n) { foo() } // a comment \n ]]> ",
        };
        for (String body : scriptBodies) {
            assertJSContentWorks(body);
        }
    }

    /**
     * Wraps {@code js} in a script element and verifies that the lexer
     * yields open tag, a text node equal to {@code js}, and close tag.
     *
     * @param js raw script body
     * @throws ParserException if the lexer fails
     */
    private void assertJSContentWorks(String js) throws ParserException {
        String html = String.format("<script>%s</script>",js);
        assertRawTextElement(html, "SCRIPT", js, false, true);
    }

    // Debugging aid, intentionally left disabled. It prints every node the
    // lexer produces together with the CSS ("C") / JS ("J") state. Enabling
    // it also requires restoring the commented-out RemarkNode import.
    //
    // private void dumpParse(String html) throws ParserException {
    //     System.out.println("SOPARSE("+html+")");
    //     l = makeLexer(html);
    //     while(true) {
    //         n = l.nextNode();
    //         if(n == null) {
    //             break;
    //         }
    //         String state = String.format("%s%s",
    //                 l.inCSS() ? "C" : "", l.inJS() ? "J" : "");
    //         if(NodeUtils.isRemarkNode(n)) {
    //             System.out.format("---COMMENT(%s)(%s)\n", state, ((RemarkNode)n).getText());
    //         } else if(NodeUtils.isTextNode(n)) {
    //             System.out.format("---TEXT(%s)(%s)\n", state, ((TextNode)n).getText());
    //         } else {
    //             TagNode tn = (TagNode) n;
    //             if(tn.isEmptyXmlTag()) {
    //                 System.out.format("---EMPTY(%s)(%s)\n", state, tn.getTagName());
    //             } else if(tn.isEndTag()) {
    //                 System.out.format("---END(%s)(%s)\n", state, tn.getTagName());
    //             } else {
    //                 System.out.format("---OPEN(%s)(%s)\n", state, tn.getTagName());
    //             }
    //         }
    //     }
    //     System.out.println("EOPARSE");
    // }
}