package org.jsoup.nodes; import org.jsoup.Jsoup; import org.junit.Test; import static org.jsoup.nodes.Document.OutputSettings; import static org.jsoup.nodes.Entities.EscapeMode.*; import static org.junit.Assert.*; public class EntitiesTest { @Test public void escape() { String text = "Hello &<> Å å π 新 there ¾ © »"; String escapedAscii = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(base)); String escapedAsciiFull = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(extended)); String escapedAsciiXhtml = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(xhtml)); String escapedUtfFull = Entities.escape(text, new OutputSettings().charset("UTF-8").escapeMode(extended)); String escapedUtfMin = Entities.escape(text, new OutputSettings().charset("UTF-8").escapeMode(xhtml)); assertEquals("Hello &<> Å å π 新 there ¾ © »", escapedAscii); assertEquals("Hello &<> Å å π 新 there ¾ © »", escapedAsciiFull); assertEquals("Hello &<> Å å π 新 there ¾ © »", escapedAsciiXhtml); assertEquals("Hello &<> Å å π 新 there ¾ © »", escapedUtfFull); assertEquals("Hello &<> Å å π 新 there ¾ © »", escapedUtfMin); // odd that it's defined as aring in base but angst in full // round trip assertEquals(text, Entities.unescape(escapedAscii)); assertEquals(text, Entities.unescape(escapedAsciiFull)); assertEquals(text, Entities.unescape(escapedAsciiXhtml)); assertEquals(text, Entities.unescape(escapedUtfFull)); assertEquals(text, Entities.unescape(escapedUtfMin)); } @Test public void escapedSupplemtary() { String text = "\uD835\uDD59"; String escapedAscii = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(base)); assertEquals("𝕙", escapedAscii); String escapedAsciiFull = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(extended)); assertEquals("𝕙", escapedAsciiFull); String escapedUtf= Entities.escape(text, new OutputSettings().charset("UTF-8").escapeMode(extended)); assertEquals(text, escapedUtf); } @Test public void unescapeMultiChars() { String text = "≫ ⋙̸ ≫⃒ ≫̸ ≫ ≫"; // gg is not combo, but 8811 could conflict with NestedGreaterGreater or others String un = "≫ ⋙̸ ≫⃒ ≫̸ ≫ ≫"; assertEquals(un, Entities.unescape(text)); String escaped = Entities.escape(un, new OutputSettings().charset("ascii").escapeMode(extended)); assertEquals("≫ ⋙̸ ≫⃒ ≫̸ ≫ ≫", escaped); assertEquals(un, Entities.unescape(escaped)); } @Test public void xhtml() { String text = "& > < ""; assertEquals(38, xhtml.codepointForName("amp")); assertEquals(62, xhtml.codepointForName("gt")); assertEquals(60, xhtml.codepointForName("lt")); assertEquals(34, xhtml.codepointForName("quot")); assertEquals("amp", xhtml.nameForCodepoint(38)); assertEquals("gt", xhtml.nameForCodepoint(62)); assertEquals("lt", xhtml.nameForCodepoint(60)); assertEquals("quot", xhtml.nameForCodepoint(34)); } @Test public void getByName() { assertEquals("≫⃒", Entities.getByName("nGt")); assertEquals("fj", Entities.getByName("fjlig")); assertEquals("≫", Entities.getByName("gg")); assertEquals("©", Entities.getByName("copy")); } @Test public void escapeSupplementaryCharacter() { String text = new String(Character.toChars(135361)); String escapedAscii = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(base)); assertEquals("𡃁", escapedAscii); String escapedUtf = Entities.escape(text, new OutputSettings().charset("UTF-8").escapeMode(base)); assertEquals(text, escapedUtf); } @Test public void notMissingMultis() { String text = "⫽⃥"; String un = "\u2AFD\u20E5"; assertEquals(un, Entities.unescape(text)); } @Test public void notMissingSupplementals() { String text = "⨔ 𝔮"; String un = "⨔ \uD835\uDD2E"; // 𝔮 assertEquals(un, Entities.unescape(text)); } @Test public void unescape() { String text = "Hello Æ &<> ® Å &angst π π 新 there &! ¾ © ©"; assertEquals("Hello Æ &<> ® Å &angst π π 新 there &! ¾ © ©", Entities.unescape(text)); assertEquals("&0987654321; &unknown", Entities.unescape("&0987654321; &unknown")); } @Test public void strictUnescape() { // for attributes, enforce strict unescaping (must look like &#xxx; , not just &#xxx) String text = "Hello &= &"; assertEquals("Hello &= &", Entities.unescape(text, true)); assertEquals("Hello &= &", Entities.unescape(text)); assertEquals("Hello &= &", Entities.unescape(text, false)); } @Test public void caseSensitive() { String unescaped = "Ü ü & &"; assertEquals("Ü ü & &", Entities.escape(unescaped, new OutputSettings().charset("ascii").escapeMode(extended))); String escaped = "Ü ü & &"; assertEquals("Ü ü & &", Entities.unescape(escaped)); } @Test public void quoteReplacements() { String escaped = "\ $"; String unescaped = "\\ $"; assertEquals(unescaped, Entities.unescape(escaped)); } @Test public void letterDigitEntities() { String html = "<p>¹²³¼½¾</p>"; Document doc = Jsoup.parse(html); doc.outputSettings().charset("ascii"); Element p = doc.select("p").first(); assertEquals("¹²³¼½¾", p.html()); assertEquals("¹²³¼½¾", p.text()); doc.outputSettings().charset("UTF-8"); assertEquals("¹²³¼½¾", p.html()); } @Test public void noSpuriousDecodes() { String string = "http://www.foo.com?a=1&num_rooms=1&children=0&int=VA&b=2"; assertEquals(string, Entities.unescape(string)); } @Test public void escapesGtInXmlAttributesButNotInHtml() { // https://github.com/jhy/jsoup/issues/528 - < is OK in HTML attribute values, but not in XML String docHtml = "<a title='<p>One</p>'>One</a>"; Document doc = Jsoup.parse(docHtml); Element element = doc.select("a").first(); doc.outputSettings().escapeMode(base); assertEquals("<a title=\"<p>One</p>\">One</a>", element.outerHtml()); doc.outputSettings().escapeMode(xhtml); assertEquals("<a title=\"<p>One</p>\">One</a>", element.outerHtml()); } }