package net.htmlparser.jericho; import org.junit.Test; import static org.junit.Assert.*; public class CharacterReferenceTest { @Test public void testDecode() { assertEquals("b&b",CharacterReference.decode("b&b")); // decode character entity reference with codepoint <= U+00FF assertEquals("b&b",CharacterReference.decode("b&b")); // decode decimal numeric character reference with codepoint <= U+00FF assertEquals("b&b",CharacterReference.decode("b&b")); // decode hexadecimal numeric character reference with codepoint <= U+00FF assertEquals("x\u20acx",CharacterReference.decode("x€x")); // decode character entity reference with codepoint > U+00FF assertEquals("x\u20acx",CharacterReference.decode("x€x")); // decode decimal numeric character reference with codepoint > U+00FF assertEquals("x\u20acx",CharacterReference.decode("x€x")); // decode hexadecimal numeric character reference with codepoint > U+00FF } @Test public void testDecodeUnterminated() { assertEquals("b&b",CharacterReference.decode("b&b")); // DO NOT decode unterminated character entity reference followed by an alphabetic character assertEquals("b& b",CharacterReference.decode("b& b")); // decode unterminated character entity reference followed by an NON-alphabetic character assertEquals("b&b",CharacterReference.decode("b&b")); // decode unterminated decimal numeric character reference followed by an alphabetic character assertEquals("b&x",CharacterReference.decode("b&x",true)); // decode hexadecimal numeric character reference followed by an alphabetic character (only if insideAttributeValue=true with default configuration) // DEFAULT CONFIGURATION (Config.CurrentCompatibilityMode=Config.CompatibilityMode.IE): assertEquals("x& x",CharacterReference.decode("x& x",false)); // decode unterminated character entity reference with codepoint <= U+00FF with insideAttributeValue=false assertEquals("x& x",CharacterReference.decode("x& x",false)); // decode unterminated decimal numeric character reference with codepoint <= U+00FF with insideAttributeValue=false assertEquals("x& x",CharacterReference.decode("x& x",false)); // DO NOT decode unterminated hexadecimal numeric character reference with codepoint <= U+00FF with insideAttributeValue=false assertEquals("x&euro x",CharacterReference.decode("x&euro x",false)); // DO NOT decode unterminated character entity reference with codepoint > U+00FF with insideAttributeValue=false assertEquals("x\u20ac x",CharacterReference.decode("x€ x",false)); // decode unterminated decimal numeric character reference with codepoint > U+00FF with insideAttributeValue=false assertEquals("x€ x",CharacterReference.decode("x€ x",false)); // DO NOT decode unterminated hexadecimal numeric character reference with codepoint > U+00FF with insideAttributeValue=false assertEquals("x& x",CharacterReference.decode("x& x",true)); // decode unterminated character entity reference with codepoint <= U+00FF with insideAttributeValue=false assertEquals("x& x",CharacterReference.decode("x& x",true)); // decode unterminated decimal numeric character reference with codepoint <= U+00FF with insideAttributeValue=false assertEquals("x& x",CharacterReference.decode("x& x",true)); // decode unterminated hexadecimal numeric character reference with codepoint <= U+00FF with insideAttributeValue=false assertEquals("x&euro x",CharacterReference.decode("x&euro x",true)); // DO NOT decode unterminated character entity reference with codepoint > U+00FF with insideAttributeValue=false assertEquals("x\u20ac x",CharacterReference.decode("x€ x",true)); // decode unterminated decimal numeric character reference with codepoint > U+00FF with insideAttributeValue=false assertEquals("x\u20ac x",CharacterReference.decode("x€ x",true)); // decode unterminated hexadecimal numeric character reference with codepoint > U+00FF with insideAttributeValue=false } @Test public void testDecodeAttribute() { // demonstrates rules for decoding inside attribute value with default configuration (Config.CompatibilityMode.IE): // - unterminated &euro is not decoded as it has codepoint >= U+00FF // - unterminated < is not decoded as it is followed by an alphabetic character // - unterminated > is decoded as it is has codepoint < U+00FF and is not followed by an alphabetic character Source source=new Source("<a href=\"test?a=1&b=2&c=3&euro=4&d=<x>&e=5\">test</a>"); StartTag startTag=source.getFirstStartTag(HTMLElementName.A); String href=startTag.getAttributeValue("href"); assertEquals("test?a=1&b=2&c=3&euro=4&d=<x>&e=5",href); } }