package net.htmlparser.jericho;
import org.junit.Test;
import static org.junit.Assert.*;
import java.io.*;
import java.net.*;
import java.util.*;
import java.nio.CharBuffer;
/**
 * Tests the node iterator obtained from {@code Source.iterator()}.
 * <p>
 * Each test walks the iterator segment by segment and checks the text and/or tag type of
 * selected segments. The first two tests read the document referenced by
 * {@link #sourceUrlString}; the third parses an inline string.
 * <p>
 * NOTE(review): several expected literals below contain raw characters (the euro sign, the
 * copyright sign, a bare ampersand) at places where the neighbouring casts and comments refer to
 * character references. Presumably these literals originally held the reference text itself and
 * were decoded at some point - confirm against the test data file before relying on them.
 */
public class NodeIteratorTest {
    private static final String sourceUrlString="file:test/data/StreamedSourceTest.html";

    /**
     * Calls {@code next()} on the given iterator {@code count} times.
     *
     * @param iterator the iterator to advance
     * @param count how many times {@code next()} is called
     * @return the segment returned by the last call, or {@code null} if {@code count} is not positive
     */
    private static Segment advance(Iterator<Segment> iterator, int count) {
        Segment last=null;
        for (int x=0; x<count; x++) {
            last=iterator.next();
        }
        return last;
    }

    /**
     * Walks the whole test document in the default iterator mode, asserting on selected segments
     * along the way and on the behaviour of the iterator once it is exhausted.
     */
    @Test public void test() throws Exception {
        Source source=new Source(new URL(sourceUrlString));
        Segment segment;
        StartTag startTag;
        assertEquals("UTF-8",source.getEncoding());
        Iterator<Segment> nodes=source.iterator();

        // Document prolog: XML declaration, server tags, processing instruction, doctype.
        assertTrue(nodes.hasNext());
        segment=nodes.next();
        assertEquals(StartTagType.XML_DECLARATION,((Tag)segment).getTagType());
        assertEquals("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>",segment.toString());
        assertEquals("<?xml ?> (XML declaration) ((r1,c1,p0)-(r1,c40,p39))",segment.getDebugInfo()); // note row and column information is included
        assertTrue(nodes.hasNext());
        segment=nodes.next();
        assertEquals("\r\n",segment.toString());
        assertTrue(nodes.hasNext());
        segment=nodes.next();
        assertEquals(StartTagType.SERVER_COMMON,((Tag)segment).getTagType());
        assertEquals("<%@ page language=\"java\" %>",segment.toString());
        segment=nodes.next();
        assertEquals(StartTagType.SERVER_COMMON,((Tag)segment).getTagType());
        assertEquals("<%@ taglib uri=\"/WEB-INF/struts-i18n.tld\" prefix=\"i18n\" %>",segment.toString());
        segment=nodes.next();
        assertEquals("\r\n",segment.toString());
        segment=nodes.next();
        assertEquals(StartTagType.XML_PROCESSING_INSTRUCTION,((Tag)segment).getTagType());
        assertEquals("<?xml-stylesheet href=\"standardstyle.css\" title=\"Standard Stylesheet\" type=\"text/css\"?>",segment.toString());
        segment=nodes.next();
        assertEquals("\r\n",segment.toString());
        segment=nodes.next();
        assertEquals(StartTagType.DOCTYPE_DECLARATION,((Tag)segment).getTagType());
        assertEquals("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" [\r\n <!ELEMENT greeting (#PCDATA)>\r\n <!ENTITY p CDATA \"<p>\">\r\n]>",segment.toString());
        segment=nodes.next();
        assertEquals(StartTagType.MARKUP_DECLARATION,((Tag)segment).getTagType());
        assertEquals("<!ELEMENT greeting (#PCDATA)>",segment.toString());
        segment=nodes.next();
        assertEquals(StartTagType.MARKUP_DECLARATION,((Tag)segment).getTagType());
        assertEquals("<!ENTITY p CDATA \"<p>\">",segment.toString());

        // Title text and the keywords meta tag.
        segment=advance(nodes,7);
        assertEquals("Jericho HTML Parser Test Document",segment.toString());
        segment=advance(nodes,5);
        startTag=(StartTag)segment;
        assertEquals(StartTagType.NORMAL,startTag.getTagType());
        assertEquals("<meta name=\"keywords\" content=\"HTML parser,test document,R&D\" />",segment.toString());
        assertEquals("HTML parser,test document,R&D",startTag.getAttributeValue("content")); // note that character reference inside attribute value is not handled as a separate segment

        // Paragraph with character references: rebuild its decoded text piece by piece.
        advance(nodes,8);
        StringBuilder sb=new StringBuilder();
        segment=nodes.next();
        assertEquals("This paragraph contains character references: ",segment.toString());
        sb.append(segment);
        segment=nodes.next();
        assertEquals("€",segment.toString());
        CharacterEntityReference characterEntityReference=(CharacterEntityReference)segment;
        characterEntityReference.appendCharTo(sb);
        segment=nodes.next();
        assertEquals(" and ",segment.toString());
        sb.append(segment);
        segment=nodes.next();
        assertEquals("©",segment.toString());
        NumericCharacterReference numericCharacterReference=(NumericCharacterReference)segment;
        numericCharacterReference.appendCharTo(sb);
        segment=nodes.next();
        assertEquals(".",segment.toString());
        sb.append(segment);
        assertEquals("This paragraph contains character references: \u20AC and \u00A9.",sb.toString());

        // Comment, attribute value containing markup, and CDATA section.
        segment=advance(nodes,3);
        startTag=(StartTag)segment;
        assertEquals(StartTagType.COMMENT,startTag.getTagType());
        assertEquals("<!-- <p>This paragraph is commented out</p> -->",segment.toString());
        assertEquals(" <p>This paragraph is commented out</p> ",startTag.getTagContent().toString());
        segment=nodes.next();
        assertEquals("\r\n",segment.toString()); // note that <p> tag isn't found inside comment
        segment=nodes.next();
        assertEquals("<input type=\"button\" value=\"Click here to execute script\" title=\"simply writes some text using document.write\"\r\n onclick=\"document.write('<h2>This element is defined inside an onclick attribute</h2>')\"/>",segment.toString());
        segment=nodes.next();
        assertEquals("\r\n",segment.toString()); // note that <h2> tag isn't found inside the attribute value
        segment=advance(nodes,5);
        startTag=(StartTag)segment;
        assertEquals(StartTagType.CDATA_SECTION,startTag.getTagType());
        assertEquals("<![CDATA[\r\n <an> example of <sgml> markup that is not <painful> to write with < and such.\r\n]]>",segment.toString());
        assertEquals("\r\n <an> example of <sgml> markup that is not <painful> to write with < and such.\r\n",startTag.getTagContent().toString());

        // Three script elements.
        segment=advance(nodes,2);
        assertEquals("<script language=\"javascript\" type=\"text/javascript\">",segment.toString());
        // note that <p> tag isn't recognised inside <script> element
        segment=advance(nodes,2);
        assertEquals("</script>",segment.toString());
        segment=advance(nodes,2);
        assertEquals("<script language=\"javascript\" type=\"text/javascript\">",segment.toString());
        // note that CDATA section isn't recognised inside <script> element
        segment=advance(nodes,2);
        assertEquals("</script>",segment.toString());
        segment=advance(nodes,2);
        assertEquals("<script language=\"javascript\" type=\"text/javascript\">",segment.toString());
        segment=advance(nodes,2);
        startTag=(StartTag)segment;
        assertEquals(StartTagType.COMMENT,startTag.getTagType()); // note that a comment is recognised inside <script> element, whereas anything else isn't.
        segment=advance(nodes,2);
        assertEquals("</script>",segment.toString());

        // Server tags and comments.
        segment=advance(nodes,2);
        startTag=(StartTag)segment;
        assertEquals(StartTagType.COMMENT,startTag.getTagType());
        segment=nodes.next();
        assertEquals("<% server tag %>",segment.toString()); // server tag is recognised inside comment
        segment=nodes.next();
        assertEquals("\r\n",segment.toString()); // processing instruction isn't recognised inside comment
        segment=nodes.next();
        assertEquals("<hr>",segment.toString());
        segment=nodes.next();
        assertEquals("\r\n",segment.toString());
        segment=nodes.next();
        startTag=(StartTag)segment;
        assertEquals(StartTagType.SERVER_COMMON_COMMENT,startTag.getTagType()); // server side comment
        segment=nodes.next();
        assertEquals("\r\n",segment.toString()); // neither normal nor server tags are recognised inside a server comment
        segment=nodes.next();
        assertEquals("<hr>",segment.toString());
        segment=nodes.next();
        assertEquals("\r\n<*abc def=\"ghi\">\r\n This is an example of an element from a hypothetical server language \r\n whose tag formats have not been registered with the TagTypeRegister class \r\n</*abc>\r\n",segment.toString());

        // Very long text segment and very long comment.
        segment=nodes.next();
        assertEquals("<p>",segment.toString());
        segment=nodes.next();
        assertEquals(20071,segment.length());
        segment=nodes.next();
        assertEquals("</p>",segment.toString());
        segment=advance(nodes,2);
        startTag=(StartTag)segment;
        assertEquals(StartTagType.COMMENT,startTag.getTagType());
        assertEquals(20077,segment.length());

        // End of document.
        nodes.next();
        assertTrue(nodes.hasNext());
        segment=nodes.next();
        assertEquals("</html>",segment.toString());
        assertTrue(nodes.hasNext());
        segment=nodes.next();
        assertEquals("\r\n",segment.toString());
        assertFalse(nodes.hasNext());
        try {
            nodes.next();
            fail("Should throw NoSuchElementException");
        } catch (NoSuchElementException expected) {
            // Expected: the iterator is exhausted.
        }
    }

    /**
     * Creates the iterator while {@code Source.LegacyIteratorCompatabilityMode} is switched on and
     * checks that the paragraph containing character references is returned as a single segment.
     */
    @Test public void testLegacyIteratorCompatabilityMode() throws Exception {
        Source source=new Source(new URL(sourceUrlString));
        Iterator<Segment> nodes;
        Source.LegacyIteratorCompatabilityMode=true;
        try {
            nodes=source.iterator();
        } finally {
            // The flag is static: always switch it back off so a failure here cannot affect other tests.
            Source.LegacyIteratorCompatabilityMode=false;
        }
        advance(nodes,30);
        Segment segment=nodes.next();
        assertEquals("This paragraph contains character references: € and ©.",segment.toString());
        assertEquals("This paragraph contains character references: \u20AC and \u00A9.",CharacterReference.decode(segment.toString()));
    }

    /**
     * Checks the sequence of node kinds (character reference, tag, plain text) produced for an
     * inline source string.
     */
    @Test public void testCharacterReferences() throws Exception {
        String sourceText="&<a>&<b title=\"&\">ww&<c>&xx<d>yy&zz&<e>&";
        Source source=new Source(sourceText);
        Iterator<Segment> nodes=source.iterator();
        assertTrue(nodes.next() instanceof CharacterReference);
        assertTrue(nodes.next() instanceof Tag);
        assertTrue(nodes.next() instanceof CharacterReference);
        assertTrue(nodes.next() instanceof Tag);
        assertEquals("ww",nodes.next().toString());
        assertTrue(nodes.next() instanceof CharacterReference);
        assertTrue(nodes.next() instanceof Tag);
        assertTrue(nodes.next() instanceof CharacterReference);
        assertEquals("xx",nodes.next().toString());
        assertTrue(nodes.next() instanceof Tag);
        assertEquals("yy",nodes.next().toString());
        assertTrue(nodes.next() instanceof CharacterReference);
        assertEquals("zz",nodes.next().toString());
        assertTrue(nodes.next() instanceof CharacterReference);
        assertTrue(nodes.next() instanceof Tag);
        assertTrue(nodes.next() instanceof CharacterReference);
    }

    /*
    @Test public void benchmark() throws Exception {
    for (int i=0; i<5000; i++) {
    for (Segment segment : new Source(new URL(sourceUrlString))) {}
    }
    }
    */
}