package org.jsoup.nodes;
import org.jsoup.Jsoup;
import org.junit.Test;
import static org.jsoup.nodes.Document.OutputSettings;
import static org.jsoup.nodes.Entities.EscapeMode.*;
import static org.junit.Assert.*;
public class EntitiesTest {
@Test public void escape() {
String text = "Hello &<> Å å π 新 there ¾ © »";
String escapedAscii = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(base));
String escapedAsciiFull = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(extended));
String escapedAsciiXhtml = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(xhtml));
String escapedUtfFull = Entities.escape(text, new OutputSettings().charset("UTF-8").escapeMode(extended));
String escapedUtfMin = Entities.escape(text, new OutputSettings().charset("UTF-8").escapeMode(xhtml));
assertEquals("Hello &<> Å å π 新 there ¾ © »", escapedAscii);
assertEquals("Hello &<> Å å π 新 there ¾ © »", escapedAsciiFull);
assertEquals("Hello &<> Å å π 新 there ¾ © »", escapedAsciiXhtml);
assertEquals("Hello &<> Å å π 新 there ¾ © »", escapedUtfFull);
assertEquals("Hello &<> Å å π 新 there ¾ © »", escapedUtfMin);
// odd that it's defined as aring in base but angst in full
// round trip
assertEquals(text, Entities.unescape(escapedAscii));
assertEquals(text, Entities.unescape(escapedAsciiFull));
assertEquals(text, Entities.unescape(escapedAsciiXhtml));
assertEquals(text, Entities.unescape(escapedUtfFull));
assertEquals(text, Entities.unescape(escapedUtfMin));
}
@Test public void escapedSupplemtary() {
String text = "\uD835\uDD59";
String escapedAscii = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(base));
assertEquals("𝕙", escapedAscii);
String escapedAsciiFull = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(extended));
assertEquals("𝕙", escapedAsciiFull);
String escapedUtf= Entities.escape(text, new OutputSettings().charset("UTF-8").escapeMode(extended));
assertEquals(text, escapedUtf);
}
@Test public void unescapeMultiChars() {
String text = "≫ ⋙̸ ≫⃒ ≫̸ ≫ ≫"; // gg is not combo, but 8811 could conflict with NestedGreaterGreater or others
String un = "≫ ⋙̸ ≫⃒ ≫̸ ≫ ≫";
assertEquals(un, Entities.unescape(text));
String escaped = Entities.escape(un, new OutputSettings().charset("ascii").escapeMode(extended));
assertEquals("≫ ⋙̸ ≫⃒ ≫̸ ≫ ≫", escaped);
assertEquals(un, Entities.unescape(escaped));
}
@Test public void xhtml() {
String text = "& > < "";
assertEquals(38, xhtml.codepointForName("amp"));
assertEquals(62, xhtml.codepointForName("gt"));
assertEquals(60, xhtml.codepointForName("lt"));
assertEquals(34, xhtml.codepointForName("quot"));
assertEquals("amp", xhtml.nameForCodepoint(38));
assertEquals("gt", xhtml.nameForCodepoint(62));
assertEquals("lt", xhtml.nameForCodepoint(60));
assertEquals("quot", xhtml.nameForCodepoint(34));
}
@Test public void getByName() {
assertEquals("≫⃒", Entities.getByName("nGt"));
assertEquals("fj", Entities.getByName("fjlig"));
assertEquals("≫", Entities.getByName("gg"));
assertEquals("©", Entities.getByName("copy"));
}
@Test public void escapeSupplementaryCharacter() {
String text = new String(Character.toChars(135361));
String escapedAscii = Entities.escape(text, new OutputSettings().charset("ascii").escapeMode(base));
assertEquals("𡃁", escapedAscii);
String escapedUtf = Entities.escape(text, new OutputSettings().charset("UTF-8").escapeMode(base));
assertEquals(text, escapedUtf);
}
@Test public void notMissingMultis() {
String text = "⫽⃥";
String un = "\u2AFD\u20E5";
assertEquals(un, Entities.unescape(text));
}
@Test public void notMissingSupplementals() {
String text = "⨔ 𝔮";
String un = "⨔ \uD835\uDD2E"; // 𝔮
assertEquals(un, Entities.unescape(text));
}
@Test public void unescape() {
String text = "Hello Æ &<> ® Å &angst π π 新 there &! ¾ © ©";
assertEquals("Hello Æ &<> ® Å &angst π π 新 there &! ¾ © ©", Entities.unescape(text));
assertEquals("&0987654321; &unknown", Entities.unescape("&0987654321; &unknown"));
}
@Test public void strictUnescape() { // for attributes, enforce strict unescaping (must look like xx; , not just xx)
String text = "Hello &= &";
assertEquals("Hello &= &", Entities.unescape(text, true));
assertEquals("Hello &= &", Entities.unescape(text));
assertEquals("Hello &= &", Entities.unescape(text, false));
}
@Test public void caseSensitive() {
String unescaped = "Ü ü & &";
assertEquals("Ü ü & &",
Entities.escape(unescaped, new OutputSettings().charset("ascii").escapeMode(extended)));
String escaped = "Ü ü & &";
assertEquals("Ü ü & &", Entities.unescape(escaped));
}
@Test public void quoteReplacements() {
String escaped = "\ $";
String unescaped = "\\ $";
assertEquals(unescaped, Entities.unescape(escaped));
}
@Test public void letterDigitEntities() {
String html = "<p>¹²³¼½¾</p>";
Document doc = Jsoup.parse(html);
doc.outputSettings().charset("ascii");
Element p = doc.select("p").first();
assertEquals("¹²³¼½¾", p.html());
assertEquals("¹²³¼½¾", p.text());
doc.outputSettings().charset("UTF-8");
assertEquals("¹²³¼½¾", p.html());
}
@Test public void noSpuriousDecodes() {
String string = "http://www.foo.com?a=1&num_rooms=1&children=0&int=VA&b=2";
assertEquals(string, Entities.unescape(string));
}
@Test public void escapesGtInXmlAttributesButNotInHtml() {
// https://github.com/jhy/jsoup/issues/528 - < is OK in HTML attribute values, but not in XML
String docHtml = "<a title='<p>One</p>'>One</a>";
Document doc = Jsoup.parse(docHtml);
Element element = doc.select("a").first();
doc.outputSettings().escapeMode(base);
assertEquals("<a title=\"<p>One</p>\">One</a>", element.outerHtml());
doc.outputSettings().escapeMode(xhtml);
assertEquals("<a title=\"<p>One</p>\">One</a>", element.outerHtml());
}
}