package org.jsoup.nodes;
import org.jsoup.Jsoup;
import org.jsoup.TextUtil;
import org.jsoup.integration.ParseTest;
import org.jsoup.nodes.Document.OutputSettings;
import org.jsoup.nodes.Document.OutputSettings.Syntax;
import org.junit.Ignore;
import org.junit.Test;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.nio.charset.Charset;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
/**
Tests for Document.
@author Jonathan Hedley, jonathan@hedley.net */
public class DocumentTest {
private static final String charsetUtf8 = "UTF-8";
private static final String charsetIso8859 = "ISO-8859-1";
@Test public void setTextPreservesDocumentStructure() {
Document doc = Jsoup.parse("<p>Hello</p>");
doc.text("Replaced");
assertEquals("Replaced", doc.text());
assertEquals("Replaced", doc.body().text());
assertEquals(1, doc.select("head").size());
}
@Test public void testTitles() {
Document noTitle = Jsoup.parse("<p>Hello</p>");
Document withTitle = Jsoup.parse("<title>First</title><title>Ignore</title><p>Hello</p>");
assertEquals("", noTitle.title());
noTitle.title("Hello");
assertEquals("Hello", noTitle.title());
assertEquals("Hello", noTitle.select("title").first().text());
assertEquals("First", withTitle.title());
withTitle.title("Hello");
assertEquals("Hello", withTitle.title());
assertEquals("Hello", withTitle.select("title").first().text());
Document normaliseTitle = Jsoup.parse("<title> Hello\nthere \n now \n");
assertEquals("Hello there now", normaliseTitle.title());
}
@Test public void testOutputEncoding() {
Document doc = Jsoup.parse("<p title=π>π & < > </p>");
// default is utf-8
assertEquals("<p title=\"π\">π & < > </p>", doc.body().html());
assertEquals("UTF-8", doc.outputSettings().charset().name());
doc.outputSettings().charset("ascii");
assertEquals(Entities.EscapeMode.base, doc.outputSettings().escapeMode());
assertEquals("<p title=\"π\">π & < > </p>", doc.body().html());
doc.outputSettings().escapeMode(Entities.EscapeMode.extended);
assertEquals("<p title=\"π\">π & < > </p>", doc.body().html());
}
@Test public void testXhtmlReferences() {
Document doc = Jsoup.parse("< > & " ' ×");
doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
assertEquals("< > & \" ' ×", doc.body().html());
}
@Test public void testNormalisesStructure() {
Document doc = Jsoup.parse("<html><head><script>one</script><noscript><p>two</p></noscript></head><body><p>three</p></body><p>four</p></html>");
assertEquals("<html><head><script>one</script><noscript><p>two</noscript></head><body><p>three</p><p>four</p></body></html>", TextUtil.stripNewlines(doc.html()));
}
@Test public void testClone() {
Document doc = Jsoup.parse("<title>Hello</title> <p>One<p>Two");
Document clone = doc.clone();
assertEquals("<html><head><title>Hello</title> </head><body><p>One</p><p>Two</p></body></html>", TextUtil.stripNewlines(clone.html()));
clone.title("Hello there");
clone.select("p").first().text("One more").attr("id", "1");
assertEquals("<html><head><title>Hello there</title> </head><body><p id=\"1\">One more</p><p>Two</p></body></html>", TextUtil.stripNewlines(clone.html()));
assertEquals("<html><head><title>Hello</title> </head><body><p>One</p><p>Two</p></body></html>", TextUtil.stripNewlines(doc.html()));
}
@Test public void testClonesDeclarations() {
Document doc = Jsoup.parse("<!DOCTYPE html><html><head><title>Doctype test");
Document clone = doc.clone();
assertEquals(doc.html(), clone.html());
assertEquals("<!doctype html><html><head><title>Doctype test</title></head><body></body></html>",
TextUtil.stripNewlines(clone.html()));
}
@Test public void testLocation() throws IOException {
File in = new ParseTest().getFile("/htmltests/yahoo-jp.html");
Document doc = Jsoup.parse(in, "UTF-8", "http://www.yahoo.co.jp/index.html");
String location = doc.location();
String baseUri = doc.baseUri();
assertEquals("http://www.yahoo.co.jp/index.html",location);
assertEquals("http://www.yahoo.co.jp/_ylh=X3oDMTB0NWxnaGxsBF9TAzIwNzcyOTYyNjUEdGlkAzEyBHRtcGwDZ2Ex/",baseUri);
in = new ParseTest().getFile("/htmltests/nyt-article-1.html");
doc = Jsoup.parse(in, null, "http://www.nytimes.com/2010/07/26/business/global/26bp.html?hp");
location = doc.location();
baseUri = doc.baseUri();
assertEquals("http://www.nytimes.com/2010/07/26/business/global/26bp.html?hp",location);
assertEquals("http://www.nytimes.com/2010/07/26/business/global/26bp.html?hp",baseUri);
}
@Test public void testHtmlAndXmlSyntax() {
String h = "<!DOCTYPE html><body><img async checked='checked' src='&<>\"'><>&"<foo />bar";
Document doc = Jsoup.parse(h);
doc.outputSettings().syntax(Syntax.html);
assertEquals("<!doctype html>\n" +
"<html>\n" +
" <head></head>\n" +
" <body>\n" +
" <img async checked src=\"&<>"\"><>&\"\n" +
" <foo />bar\n" +
" </body>\n" +
"</html>", doc.html());
doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
assertEquals("<!DOCTYPE html>\n" +
"<html>\n" +
" <head></head>\n" +
" <body>\n" +
" <img async=\"\" checked=\"checked\" src=\"&<>"\" /><>&\"\n" +
" <foo />bar\n" +
" </body>\n" +
"</html>", doc.html());
}
@Test public void htmlParseDefaultsToHtmlOutputSyntax() {
Document doc = Jsoup.parse("x");
assertEquals(Syntax.html, doc.outputSettings().syntax());
}
@Test public void testHtmlAppendable() {
String htmlContent = "<html><head><title>Hello</title></head><body><p>One</p><p>Two</p></body></html>";
Document document = Jsoup.parse(htmlContent);
OutputSettings outputSettings = new OutputSettings();
outputSettings.prettyPrint(false);
document.outputSettings(outputSettings);
assertEquals(htmlContent, document.html(new StringWriter()).toString());
}
// Ignored since this test can take awhile to run.
@Ignore
@Test public void testOverflowClone() {
StringBuilder builder = new StringBuilder();
for (int i = 0; i < 100000; i++) {
builder.insert(0, "<i>");
builder.append("</i>");
}
Document doc = Jsoup.parse(builder.toString());
doc.clone();
}
@Test public void DocumentsWithSameContentAreEqual() throws Exception {
Document docA = Jsoup.parse("<div/>One");
Document docB = Jsoup.parse("<div/>One");
Document docC = Jsoup.parse("<div/>Two");
assertFalse(docA.equals(docB));
assertTrue(docA.equals(docA));
assertEquals(docA.hashCode(), docA.hashCode());
assertFalse(docA.hashCode() == docC.hashCode());
}
@Test public void DocumentsWithSameContentAreVerifialbe() throws Exception {
Document docA = Jsoup.parse("<div/>One");
Document docB = Jsoup.parse("<div/>One");
Document docC = Jsoup.parse("<div/>Two");
assertTrue(docA.hasSameValue(docB));
assertFalse(docA.hasSameValue(docC));
}
@Test
public void testMetaCharsetUpdateUtf8() {
final Document doc = createHtmlDocument("changeThis");
doc.updateMetaCharsetElement(true);
doc.charset(Charset.forName(charsetUtf8));
final String htmlCharsetUTF8 = "<html>\n" +
" <head>\n" +
" <meta charset=\"" + charsetUtf8 + "\">\n" +
" </head>\n" +
" <body></body>\n" +
"</html>";
assertEquals(htmlCharsetUTF8, doc.toString());
Element selectedElement = doc.select("meta[charset]").first();
assertEquals(charsetUtf8, doc.charset().name());
assertEquals(charsetUtf8, selectedElement.attr("charset"));
assertEquals(doc.charset(), doc.outputSettings().charset());
}
@Test
public void testMetaCharsetUpdateIso8859() {
final Document doc = createHtmlDocument("changeThis");
doc.updateMetaCharsetElement(true);
doc.charset(Charset.forName(charsetIso8859));
final String htmlCharsetISO = "<html>\n" +
" <head>\n" +
" <meta charset=\"" + charsetIso8859 + "\">\n" +
" </head>\n" +
" <body></body>\n" +
"</html>";
assertEquals(htmlCharsetISO, doc.toString());
Element selectedElement = doc.select("meta[charset]").first();
assertEquals(charsetIso8859, doc.charset().name());
assertEquals(charsetIso8859, selectedElement.attr("charset"));
assertEquals(doc.charset(), doc.outputSettings().charset());
}
@Test
public void testMetaCharsetUpdateNoCharset() {
final Document docNoCharset = Document.createShell("");
docNoCharset.updateMetaCharsetElement(true);
docNoCharset.charset(Charset.forName(charsetUtf8));
assertEquals(charsetUtf8, docNoCharset.select("meta[charset]").first().attr("charset"));
final String htmlCharsetUTF8 = "<html>\n" +
" <head>\n" +
" <meta charset=\"" + charsetUtf8 + "\">\n" +
" </head>\n" +
" <body></body>\n" +
"</html>";
assertEquals(htmlCharsetUTF8, docNoCharset.toString());
}
@Test
public void testMetaCharsetUpdateDisabled() {
final Document docDisabled = Document.createShell("");
final String htmlNoCharset = "<html>\n" +
" <head></head>\n" +
" <body></body>\n" +
"</html>";
assertEquals(htmlNoCharset, docDisabled.toString());
assertNull(docDisabled.select("meta[charset]").first());
}
@Test
public void testMetaCharsetUpdateDisabledNoChanges() {
final Document doc = createHtmlDocument("dontTouch");
final String htmlCharset = "<html>\n" +
" <head>\n" +
" <meta charset=\"dontTouch\">\n" +
" <meta name=\"charset\" content=\"dontTouch\">\n" +
" </head>\n" +
" <body></body>\n" +
"</html>";
assertEquals(htmlCharset, doc.toString());
Element selectedElement = doc.select("meta[charset]").first();
assertNotNull(selectedElement);
assertEquals("dontTouch", selectedElement.attr("charset"));
selectedElement = doc.select("meta[name=charset]").first();
assertNotNull(selectedElement);
assertEquals("dontTouch", selectedElement.attr("content"));
}
@Test
public void testMetaCharsetUpdateEnabledAfterCharsetChange() {
final Document doc = createHtmlDocument("dontTouch");
doc.charset(Charset.forName(charsetUtf8));
Element selectedElement = doc.select("meta[charset]").first();
assertEquals(charsetUtf8, selectedElement.attr("charset"));
assertTrue(doc.select("meta[name=charset]").isEmpty());
}
@Test
public void testMetaCharsetUpdateCleanup() {
final Document doc = createHtmlDocument("dontTouch");
doc.updateMetaCharsetElement(true);
doc.charset(Charset.forName(charsetUtf8));
final String htmlCharsetUTF8 = "<html>\n" +
" <head>\n" +
" <meta charset=\"" + charsetUtf8 + "\">\n" +
" </head>\n" +
" <body></body>\n" +
"</html>";
assertEquals(htmlCharsetUTF8, doc.toString());
}
@Test
public void testMetaCharsetUpdateXmlUtf8() {
final Document doc = createXmlDocument("1.0", "changeThis", true);
doc.updateMetaCharsetElement(true);
doc.charset(Charset.forName(charsetUtf8));
final String xmlCharsetUTF8 = "<?xml version=\"1.0\" encoding=\"" + charsetUtf8 + "\"?>\n" +
"<root>\n" +
" node\n" +
"</root>";
assertEquals(xmlCharsetUTF8, doc.toString());
XmlDeclaration selectedNode = (XmlDeclaration) doc.childNode(0);
assertEquals(charsetUtf8, doc.charset().name());
assertEquals(charsetUtf8, selectedNode.attr("encoding"));
assertEquals(doc.charset(), doc.outputSettings().charset());
}
@Test
public void testMetaCharsetUpdateXmlIso8859() {
final Document doc = createXmlDocument("1.0", "changeThis", true);
doc.updateMetaCharsetElement(true);
doc.charset(Charset.forName(charsetIso8859));
final String xmlCharsetISO = "<?xml version=\"1.0\" encoding=\"" + charsetIso8859 + "\"?>\n" +
"<root>\n" +
" node\n" +
"</root>";
assertEquals(xmlCharsetISO, doc.toString());
XmlDeclaration selectedNode = (XmlDeclaration) doc.childNode(0);
assertEquals(charsetIso8859, doc.charset().name());
assertEquals(charsetIso8859, selectedNode.attr("encoding"));
assertEquals(doc.charset(), doc.outputSettings().charset());
}
@Test
public void testMetaCharsetUpdateXmlNoCharset() {
final Document doc = createXmlDocument("1.0", "none", false);
doc.updateMetaCharsetElement(true);
doc.charset(Charset.forName(charsetUtf8));
final String xmlCharsetUTF8 = "<?xml version=\"1.0\" encoding=\"" + charsetUtf8 + "\"?>\n" +
"<root>\n" +
" node\n" +
"</root>";
assertEquals(xmlCharsetUTF8, doc.toString());
XmlDeclaration selectedNode = (XmlDeclaration) doc.childNode(0);
assertEquals(charsetUtf8, selectedNode.attr("encoding"));
}
@Test
public void testMetaCharsetUpdateXmlDisabled() {
final Document doc = createXmlDocument("none", "none", false);
final String xmlNoCharset = "<root>\n" +
" node\n" +
"</root>";
assertEquals(xmlNoCharset, doc.toString());
}
@Test
public void testMetaCharsetUpdateXmlDisabledNoChanges() {
final Document doc = createXmlDocument("dontTouch", "dontTouch", true);
final String xmlCharset = "<?xml version=\"dontTouch\" encoding=\"dontTouch\"?>\n" +
"<root>\n" +
" node\n" +
"</root>";
assertEquals(xmlCharset, doc.toString());
XmlDeclaration selectedNode = (XmlDeclaration) doc.childNode(0);
assertEquals("dontTouch", selectedNode.attr("encoding"));
assertEquals("dontTouch", selectedNode.attr("version"));
}
@Test
public void testMetaCharsetUpdatedDisabledPerDefault() {
final Document doc = createHtmlDocument("none");
assertFalse(doc.updateMetaCharsetElement());
}
private Document createHtmlDocument(String charset) {
final Document doc = Document.createShell("");
doc.head().appendElement("meta").attr("charset", charset);
doc.head().appendElement("meta").attr("name", "charset").attr("content", charset);
return doc;
}
private Document createXmlDocument(String version, String charset, boolean addDecl) {
final Document doc = new Document("");
doc.appendElement("root").text("node");
doc.outputSettings().syntax(Syntax.xml);
if( addDecl == true ) {
XmlDeclaration decl = new XmlDeclaration("xml", "", false);
decl.attr("version", version);
decl.attr("encoding", charset);
doc.prependChild(decl);
}
return doc;
}
@Test
public void testShiftJisRoundtrip() throws Exception {
String input =
"<html>"
+ "<head>"
+ "<meta http-equiv=\"content-type\" content=\"text/html; charset=Shift_JIS\" />"
+ "</head>"
+ "<body>"
+ "before after"
+ "</body>"
+ "</html>";
InputStream is = new ByteArrayInputStream(input.getBytes(Charset.forName("ASCII")));
Document doc = Jsoup.parse(is, null, "http://example.com");
doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
String output = new String(doc.html().getBytes(doc.outputSettings().charset()), doc.outputSettings().charset());
assertFalse("Should not have contained a '?'.", output.contains("?"));
assertTrue("Should have contained a ' ' or a ' '.",
output.contains(" ") || output.contains(" "));
}
}