package org.jsoup.helper;

import org.jsoup.Jsoup;
import org.jsoup.integration.ParseTest;
import org.jsoup.nodes.Element;
import org.junit.Test;
import org.w3c.dom.Document;
import org.w3c.dom.Node;

import java.io.File;
import java.io.IOException;

import static org.jsoup.TextUtil.LE;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;

/**
 * Tests for {@link W3CDom}: converting a jsoup document into an {@code org.w3c.dom.Document}
 * and serializing the result back to a string.
 */
public class W3CDomTest {

    /**
     * Parses a small, partly malformed HTML snippet, converts it to a W3C DOM and checks the
     * exact string produced by {@link W3CDom#asString}.
     */
    @Test
    public void simpleConversion() {
        String html = "<html><head><title>W3c</title></head><body><p class='one' id=12>Text</p><!-- comment --><invalid>What<script>alert('!')";
        org.jsoup.nodes.Document doc = Jsoup.parse(html);

        W3CDom w3c = new W3CDom();
        Document wDoc = w3c.fromJsoup(doc);
        String out = w3c.asString(wDoc);

        // The expected output contains a META content-type element that is not in the input;
        // presumably it is added during serialization -- verify against W3CDom.asString.
        assertEquals(
            "<html>" + LE +
                "<head>" + LE +
                "<META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">" + LE +
                "<title>W3c</title>" + LE +
                "</head>" + LE +
                "<body>" + LE +
                "<p class=\"one\" id=\"12\">Text</p>" + LE +
                "<!-- comment -->" + LE +
                "<invalid>What<script>alert('!')</script>" + LE +
                "</invalid>" + LE +
                "</body>" + LE +
                "</html>" + LE,
            out);
    }

    /**
     * Converts the {@code google-ipod.html} fixture and checks that the root element is a
     * namespace-less {@code html} element and that the serialized output mentions "ipod".
     *
     * @throws IOException if the fixture file cannot be read
     */
    @Test
    public void convertsGoogle() throws IOException {
        File in = ParseTest.getFile("/htmltests/google-ipod.html");
        org.jsoup.nodes.Document doc = Jsoup.parse(in, "UTF8");

        W3CDom w3c = new W3CDom();
        Document wDoc = w3c.fromJsoup(doc);

        Node htmlEl = wDoc.getChildNodes().item(0);
        assertNull(htmlEl.getNamespaceURI());
        assertEquals("html", htmlEl.getLocalName());
        assertEquals("html", htmlEl.getNodeName());

        String out = w3c.asString(wDoc);
        assertTrue(out.contains("ipod"));
    }

    /**
     * Converts the {@code namespaces.xhtml} fixture and checks namespace URI, local name and
     * qualified node name for the root element and for two prefixed elements.
     *
     * @throws IOException if the fixture file cannot be read
     */
    @Test
    public void namespacePreservation() throws IOException {
        File in = ParseTest.getFile("/htmltests/namespaces.xhtml");
        org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(in, "UTF-8");

        W3CDom jDom = new W3CDom();
        Document doc = jDom.fromJsoup(jsoupDoc);

        Node htmlEl = doc.getChildNodes().item(0);
        assertEquals("http://www.w3.org/1999/xhtml", htmlEl.getNamespaceURI());
        assertEquals("html", htmlEl.getLocalName());
        assertEquals("html", htmlEl.getNodeName());

        // The child indices are tied to the layout of the fixture file (presumably counting
        // whitespace text nodes between elements -- confirm against namespaces.xhtml).
        Node epubTitle = htmlEl.getChildNodes().item(2).getChildNodes().item(3);
        assertEquals("http://www.idpf.org/2007/ops", epubTitle.getNamespaceURI());
        assertEquals("title", epubTitle.getLocalName());
        assertEquals("epub:title", epubTitle.getNodeName());

        // Two sibling hops: one node sits between epub:title and x:section in the fixture.
        Node xSection = epubTitle.getNextSibling().getNextSibling();
        assertEquals("urn:test", xSection.getNamespaceURI());
        assertEquals("section", xSection.getLocalName());
        assertEquals("x:section", xSection.getNodeName());
    }

    /**
     * Parses a body tag carrying attribute keys that contain a double quote and runs the
     * document through {@link W3CDom#fromJsoup}. No assertion is made on the converted
     * document; the conversion is simply expected to complete without throwing.
     */
    @Test
    public void handlesInvalidAttributeNames() {
        String html = "<html><head></head><body style=\"color: red\" \" name\"></body></html>";
        org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(html);

        Element body = jsoupDoc.select("body").first();
        // Actually an attribute with key '"'. Correct per the HTML5 spec, but the W3C XML DOM
        // does not accept it as an attribute name.
        assertTrue(body.hasAttr("\""));
        assertTrue(body.hasAttr("name\""));

        // Result intentionally unused: the conversion itself is what is being exercised.
        new W3CDom().fromJsoup(jsoupDoc);
    }
}