package com.mcxiaoke.minicat.util;
import java.util.HashMap;
import java.util.Map;
/**
* @author mcxiaoke
* @version 1.0 2011.05.02
*/
public final class HtmlHelper {
private static Map<String, String> entityEscapeMap = new HashMap<String, String>();
private static Map<String, String> escapeEntityMap = new HashMap<String, String>();
static {
String[][] entities = {
{" ", " "/* no-break space = non-breaking space */,
"\u00A0"},
{"¡", "¡"/* inverted exclamation mark */, "\u00A1"},
{"¢", "¢"/* cent sign */, "\u00A2"},
{"£", "£"/* pound sign */, "\u00A3"},
{"¤", "¤"/* currency sign */, "\u00A4"},
{"¥", "¥"/* yen sign = yuan sign */, "\u00A5"},
{"¦", "¦"/* broken bar = broken vertical bar */,
"\u00A6"},
{"§", "§"/* section sign */, "\u00A7"},
{"¨", "¨"/* diaeresis = spacing diaeresis */,
"\u00A8"},
{"©", "©"/* copyright sign */, "\u00A9"},
{"ª", "ª"/* feminine ordinal indicator */, "\u00AA"},
{"«", "«"/*
* left-pointing double angle quotation mark
* = left pointing guillemet
*/, "\u00AB"},
{"¬", "¬"/* not sign = discretionary hyphen */,
"\u00AC"},
{"", ""/* soft hyphen = discretionary hyphen */,
"\u00AD"},
{"®", "®"/*
* registered sign = registered trade mark
* sign
*/, "\u00AE"},
{"¯", "¯"/*
* macron = spacing macron = overline = APL
* overbar
*/, "\u00AF"},
{"°", "°"/* degree sign */, "\u00B0"},
{"±", "±"/* plus-minus sign = plus-or-minus sign */,
"\u00B1"},
{"²", "²"/*
* superscript two = superscript digit two =
* squared
*/, "\u00B2"},
{"³", "³"/*
* superscript three = superscript digit
* three = cubed
*/, "\u00B3"},
{"´", "´"/* acute accent = spacing acute */,
"\u00B4"},
{"µ", "µ"/* micro sign */, "\u00B5"},
{"¶", "¶"/* pilcrow sign = paragraph sign */,
"\u00B6"},
{"·", "·"/*
* middle dot = Georgian comma = Greek
* middle dot
*/, "\u00B7"},
{"¸", "¸"/* cedilla = spacing cedilla */, "\u00B8"},
{"¹",
"¹"/* superscript one = superscript digit one */,
"\u00B9"},
{"º", "º"/* masculine ordinal indicator */, "\u00BA"},
{"»", "»"/*
* right-pointing double angle quotation
* mark = right pointing guillemet
*/, "\u00BB"},
{"¼", "¼"/*
* vulgar fraction one quarter = fraction
* one quarter
*/, "\u00BC"},
{"½", "½"/*
* vulgar fraction one half = fraction one
* half
*/, "\u00BD"},
{"¾", "¾"/*
* vulgar fraction three quarters = fraction
* three quarters
*/, "\u00BE"},
{"¿", "¿"/*
* inverted question mark = turned question
* mark
*/, "\u00BF"},
{"À", "À"/*
* latin capital letter A with grave = latin
* capital letter A grave
*/, "\u00C0"},
{"Á", "Á"/* latin capital letter A with acute */,
"\u00C1"},
{"Â",
"Â"/* latin capital letter A with circumflex */,
"\u00C2"},
{"Ã", "Ã"/* latin capital letter A with tilde */,
"\u00C3"},
{"Ä",
"Ä"/* latin capital letter A with diaeresis */,
"\u00C4"},
{"Å", "Å"/*
* latin capital letter A with ring above =
* latin capital letter A ring
*/, "\u00C5"},
{"Æ", "Æ"/*
* latin capital letter AE = latin capital
* ligature AE
*/, "\u00C6"},
{"Ç", "Ç"/* latin capital letter C with cedilla */,
"\u00C7"},
{"È", "È"/* latin capital letter E with grave */,
"\u00C8"},
{"É", "É"/* latin capital letter E with acute */,
"\u00C9"},
{"Ê",
"Ê"/* latin capital letter E with circumflex */,
"\u00CA"},
{"Ë",
"Ë"/* latin capital letter E with diaeresis */,
"\u00CB"},
{"Ì", "Ì"/* latin capital letter I with grave */,
"\u00CC"},
{"Í", "Í"/* latin capital letter I with acute */,
"\u00CD"},
{"Î",
"Î"/* latin capital letter I with circumflex */,
"\u00CE"},
{"Ï",
"Ï"/* latin capital letter I with diaeresis */,
"\u00CF"},
{"Ð", "Ð"/* latin capital letter ETH */, "\u00D0"},
{"Ñ", "Ñ"/* latin capital letter N with tilde */,
"\u00D1"},
{"Ò", "Ò"/* latin capital letter O with grave */,
"\u00D2"},
{"Ó", "Ó"/* latin capital letter O with acute */,
"\u00D3"},
{"Ô",
"Ô"/* latin capital letter O with circumflex */,
"\u00D4"},
{"Õ", "Õ"/* latin capital letter O with tilde */,
"\u00D5"},
{"Ö",
"Ö"/* latin capital letter O with diaeresis */,
"\u00D6"},
{"×", "×"/* multiplication sign */, "\u00D7"},
{"Ø", "Ø"/*
* latin capital letter O with stroke =
* latin capital letter O slash
*/, "\u00D8"},
{"Ù", "Ù"/* latin capital letter U with grave */,
"\u00D9"},
{"Ú", "Ú"/* latin capital letter U with acute */,
"\u00DA"},
{"Û",
"Û"/* latin capital letter U with circumflex */,
"\u00DB"},
{"Ü",
"Ü"/* latin capital letter U with diaeresis */,
"\u00DC"},
{"Ý", "Ý"/* latin capital letter Y with acute */,
"\u00DD"},
{"Þ", "Þ"/* latin capital letter THORN */, "\u00DE"},
{"ß", "ß"/* latin small letter sharp s = ess-zed */,
"\u00DF"},
{"à", "à"/*
* latin small letter a with grave = latin
* small letter a grave
*/, "\u00E0"},
{"á", "á"/* latin small letter a with acute */,
"\u00E1"},
{"â", "â"/* latin small letter a with circumflex */,
"\u00E2"},
{"ã", "ã"/* latin small letter a with tilde */,
"\u00E3"},
{"ä", "ä"/* latin small letter a with diaeresis */,
"\u00E4"},
{"å", "å"/*
* latin small letter a with ring above =
* latin small letter a ring
*/, "\u00E5"},
{"æ", "æ"/*
* latin small letter ae = latin small
* ligature ae
*/, "\u00E6"},
{"ç", "ç"/* latin small letter c with cedilla */,
"\u00E7"},
{"è", "è"/* latin small letter e with grave */,
"\u00E8"},
{"é", "é"/* latin small letter e with acute */,
"\u00E9"},
{"ê", "ê"/* latin small letter e with circumflex */,
"\u00EA"},
{"ë", "ë"/* latin small letter e with diaeresis */,
"\u00EB"},
{"ì", "ì"/* latin small letter i with grave */,
"\u00EC"},
{"í", "í"/* latin small letter i with acute */,
"\u00ED"},
{"î", "î"/* latin small letter i with circumflex */,
"\u00EE"},
{"ï", "ï"/* latin small letter i with diaeresis */,
"\u00EF"},
{"ð", "ð"/* latin small letter eth */, "\u00F0"},
{"ñ", "ñ"/* latin small letter n with tilde */,
"\u00F1"},
{"ò", "ò"/* latin small letter o with grave */,
"\u00F2"},
{"ó", "ó"/* latin small letter o with acute */,
"\u00F3"},
{"ô", "ô"/* latin small letter o with circumflex */,
"\u00F4"},
{"õ", "õ"/* latin small letter o with tilde */,
"\u00F5"},
{"ö", "ö"/* latin small letter o with diaeresis */,
"\u00F6"},
{"÷", "÷"/* division sign */, "\u00F7"},
{"ø", "ø"/*
* latin small letter o with stroke = latin
* small letter o slash
*/, "\u00F8"},
{"ù", "ù"/* latin small letter u with grave */,
"\u00F9"},
{"ú", "ú"/* latin small letter u with acute */,
"\u00FA"},
{"û", "û"/* latin small letter u with circumflex */,
"\u00FB"},
{"ü", "ü"/* latin small letter u with diaeresis */,
"\u00FC"},
{"ý", "ý"/* latin small letter y with acute */,
"\u00FD"},
{"þ", "þ"/* latin small letter thorn with */,
"\u00FE"},
{"ÿ", "ÿ"/* latin small letter y with diaeresis */,
"\u00FF"},
{"ƒ", "ƒ"/*
* latin small f with hook = function =
* florin
*/, "\u0192"}
/* Greek */
,
{"Α", "Α"/* greek capital letter alpha */, "\u0391"},
{"Β", "Β"/* greek capital letter beta */, "\u0392"},
{"Γ", "Γ"/* greek capital letter gamma */, "\u0393"},
{"Δ", "Δ"/* greek capital letter delta */, "\u0394"},
{"Ε", "Ε"/* greek capital letter epsilon */,
"\u0395"},
{"Ζ", "Ζ"/* greek capital letter zeta */, "\u0396"},
{"Η", "Η"/* greek capital letter eta */, "\u0397"},
{"Θ", "Θ"/* greek capital letter theta */, "\u0398"},
{"Ι", "Ι"/* greek capital letter iota */, "\u0399"},
{"Κ", "Κ"/* greek capital letter kappa */, "\u039A"},
{"Λ", "Λ"/* greek capital letter lambda */, "\u039B"},
{"Μ", "Μ"/* greek capital letter mu */, "\u039C"},
{"Ν", "Ν"/* greek capital letter nu */, "\u039D"},
{"Ξ", "Ξ"/* greek capital letter xi */, "\u039E"},
{"Ο", "Ο"/* greek capital letter omicron */,
"\u039F"},
{"Π", "Π"/* greek capital letter pi */, "\u03A0"},
{"Ρ", "Ρ"/* greek capital letter rho */, "\u03A1"}
/* there is no Sigmaf and no \u03A2 */
,
{"Σ", "Σ"/* greek capital letter sigma */, "\u03A3"},
{"Τ", "Τ"/* greek capital letter tau */, "\u03A4"},
{"Υ", "Υ"/* greek capital letter upsilon */,
"\u03A5"},
{"Φ", "Φ"/* greek capital letter phi */, "\u03A6"},
{"Χ", "Χ"/* greek capital letter chi */, "\u03A7"},
{"Ψ", "Ψ"/* greek capital letter psi */, "\u03A8"},
{"Ω", "Ω"/* greek capital letter omega */, "\u03A9"},
{"α", "α"/* greek small letter alpha */, "\u03B1"},
{"β", "β"/* greek small letter beta */, "\u03B2"},
{"γ", "γ"/* greek small letter gamma */, "\u03B3"},
{"δ", "δ"/* greek small letter delta */, "\u03B4"},
{"ε", "ε"/* greek small letter epsilon */,
"\u03B5"},
{"ζ", "ζ"/* greek small letter zeta */, "\u03B6"},
{"η", "η"/* greek small letter eta */, "\u03B7"},
{"θ", "θ"/* greek small letter theta */, "\u03B8"},
{"ι", "ι"/* greek small letter iota */, "\u03B9"},
{"κ", "κ"/* greek small letter kappa */, "\u03BA"},
{"λ", "λ"/* greek small letter lambda */, "\u03BB"},
{"μ", "μ"/* greek small letter mu */, "\u03BC"},
{"ν", "ν"/* greek small letter nu */, "\u03BD"},
{"ξ", "ξ"/* greek small letter xi */, "\u03BE"},
{"ο", "ο"/* greek small letter omicron */,
"\u03BF"},
{"π", "π"/* greek small letter pi */, "\u03C0"},
{"ρ", "ρ"/* greek small letter rho */, "\u03C1"},
{"ς", "ς"/* greek small letter final sigma */,
"\u03C2"},
{"σ", "σ"/* greek small letter sigma */, "\u03C3"},
{"τ", "τ"/* greek small letter tau */, "\u03C4"},
{"υ", "υ"/* greek small letter upsilon */,
"\u03C5"},
{"φ", "φ"/* greek small letter phi */, "\u03C6"},
{"χ", "χ"/* greek small letter chi */, "\u03C7"},
{"ψ", "ψ"/* greek small letter psi */, "\u03C8"},
{"ω", "ω"/* greek small letter omega */, "\u03C9"},
{"ϑ", "ϑ"/* greek small letter theta symbol */,
"\u03D1"},
{"ϒ", "ϒ"/* greek upsilon with hook symbol */,
"\u03D2"},
{"ϖ", "ϖ"/* greek pi symbol */, "\u03D6"}
/* General Punctuation */
,
{"•", "•"/* bullet = black small circle */, "\u2022"}
/* bullet is NOT the same as bullet operator ,"\u2219 */
,
{"…", "…"/*
* horizontal ellipsis = three dot
* leader
*/, "\u2026"},
{"′", "′"/* prime = minutes = feet */, "\u2032"},
{"″", "″"/* double prime = seconds = inches */,
"\u2033"},
{"‾", "‾"/* overline = spacing overscore */,
"\u203E"},
{"⁄", "⁄"/* fraction slash */, "\u2044"}
/* Letterlike Symbols */
,
{"℘", "℘"/*
* script capital P = power set =
* Weierstrass p
*/, "\u2118"},
{"ℑ",
"ℑ"/* blackletter capital I = imaginary part */,
"\u2111"},
{"ℜ",
"ℜ"/* blackletter capital R = real part symbol */,
"\u211C"},
{"™", "™"/* trade mark sign */, "\u2122"},
{"ℵ", "ℵ"/*
* alef symbol = first transfinite
* cardinal
*/, "\u2135"}
/* alef symbol is NOT the same as hebrew letter alef ,"\u05D0"} */
/* Arrows */
,
{"←", "←"/* leftwards arrow */, "\u2190"},
{"↑", "↑"/* upwards arrow */, "\u2191"},
{"→", "→"/* rightwards arrow */, "\u2192"},
{"↓", "↓"/* downwards arrow */, "\u2193"},
{"↔", "↔"/* left right arrow */, "\u2194"},
{"↵", "↵"/*
* downwards arrow with corner leftwards =
* carriage return
*/, "\u21B5"},
{"⇐", "⇐"/* leftwards double arrow */, "\u21D0"}
/*
* Unicode does not say that lArr is the same as the 'is implied
* by' arrow but also does not have any other character for that
* function. So ? lArr can be used for 'is implied by' as
* ISOtech suggests
*/
,
{"⇑", "⇑"/* upwards double arrow */, "\u21D1"},
{"⇒", "⇒"/* rightwards double arrow */, "\u21D2"}
/*
* Unicode does not say this is the 'implies' character but does
* not have another character with this function so ? rArr can
* be used for 'implies' as ISOtech suggests
*/
,
{"⇓", "⇓"/* downwards double arrow */, "\u21D3"},
{"⇔", "⇔"/* left right double arrow */, "\u21D4"}
/* Mathematical Operators */
,
{"∀", "∀"/* for all */, "\u2200"},
{"∂", "∂"/* partial differential */, "\u2202"},
{"∃", "∃"/* there exists */, "\u2203"},
{"∅", "∅"/* empty set = null set = diameter */,
"\u2205"},
{"∇", "∇"/* nabla = backward difference */, "\u2207"},
{"∈", "∈"/* element of */, "\u2208"},
{"∉", "∉"/* not an element of */, "\u2209"},
{"∋", "∋"/* contains as member */, "\u220B"}
/* should there be a more memorable name than 'ni'? */
,
{"∏", "∏"/* n-ary product = product sign */,
"\u220F"}
/* prod is NOT the same character as ,"\u03A0"} */
,
{"∑", "∑"/* n-ary sumation */, "\u2211"}
/* sum is NOT the same character as ,"\u03A3"} */
,
{"−", "−"/* minus sign */, "\u2212"},
{"∗", "∗"/* asterisk operator */, "\u2217"},
{"√", "√"/* square root = radical sign */, "\u221A"},
{"∝", "∝"/* proportional to */, "\u221D"},
{"∞", "∞"/* infinity */, "\u221E"},
{"∠", "∠"/* angle */, "\u2220"},
{"∧", "∧"/* logical and = wedge */, "\u2227"},
{"∨", "∨"/* logical or = vee */, "\u2228"},
{"∩", "∩"/* intersection = cap */, "\u2229"},
{"∪", "∪"/* union = cup */, "\u222A"},
{"∫", "∫"/* integral */, "\u222B"},
{"∴", "∴"/* therefore */, "\u2234"},
{
"∼",
"∼"/* tilde operator = varies with = similar to */,
"\u223C"}
/*
* tilde operator is NOT the same character as the tilde
* ,"\u007E"}
*/
,
{"≅", "≅"/* approximately equal to */, "\u2245"},
{"≈", "≈"/* almost equal to = asymptotic to */,
"\u2248"},
{"≠", "≠"/* not equal to */, "\u2260"},
{"≡", "≡"/* identical to */, "\u2261"},
{"≤", "≤"/* less-than or equal to */, "\u2264"},
{"≥", "≥"/* greater-than or equal to */, "\u2265"},
{"⊂", "⊂"/* subset of */, "\u2282"},
{"⊃", "⊃"/* superset of */, "\u2283"}
/* note that nsup 'not a superset of ,"\u2283"} */
,
{"⊆", "⊆"/* subset of or equal to */, "\u2286"},
{"⊇", "⊇"/* superset of or equal to */, "\u2287"},
{"⊕", "⊕"/* circled plus = direct sum */, "\u2295"},
{"⊗", "⊗"/* circled times = vector product */,
"\u2297"},
{"⊥",
"⊥"/* up tack = orthogonal to = perpendicular */,
"\u22A5"},
{"⋅", "⋅"/* dot operator */, "\u22C5"}
/*
* dot operator is NOT the same character as ,"\u00B7"} /*
* Miscellaneous Technical
*/
,
{"⌈", "⌈"/* left ceiling = apl upstile */, "\u2308"},
{"⌉", "⌉"/* right ceiling */, "\u2309"},
{"⌊", "⌊"/* left floor = apl downstile */,
"\u230A"},
{"⌋", "⌋"/* right floor */, "\u230B"},
{"〈", "〈"/* left-pointing angle bracket = bra */,
"\u2329"}
/* lang is NOT the same character as ,"\u003C"} */
,
{"〉", "〉"/* right-pointing angle bracket = ket */,
"\u232A"}
/* rang is NOT the same character as ,"\u003E"} */
/* Geometric Shapes */
,
{"◊", "◊"/* lozenge */, "\u25CA"}
/* Miscellaneous Symbols */
,
{"♠", "♠"/* black spade suit */, "\u2660"}
/* black here seems to mean filled as opposed to hollow */
,
{"♣", "♣"/* black club suit = shamrock */, "\u2663"},
{"♥", "♥"/* black heart suit = valentine */,
"\u2665"},
{"♦", "♦"/* black diamond suit */, "\u2666"},
{""", """ /* quotation mark = APL quote */, "\""},
{"&", "&" /* ampersand */, "\u0026"},
{"<", "<" /* less-than sign */, "\u003C"},
{">", ">" /* greater-than sign */, "\u003E"}
/* Latin Extended-A */
,
{"Œ", "Œ" /* latin capital ligature OE */, "\u0152"},
{"œ", "œ" /* latin small ligature oe */, "\u0153"}
/*
* ligature is a misnomer this is a separate character in some
* languages
*/
,
{"Š",
"Š" /* latin capital letter S with caron */,
"\u0160"},
{"š", "š" /* latin small letter s with caron */,
"\u0161"},
{"Ÿ",
"Ÿ" /* latin capital letter Y with diaeresis */,
"\u0178"}
/* Spacing Modifier Letters */
,
{"ˆ", "ˆ" /* modifier letter circumflex accent */,
"\u02C6"},
{"˜", "˜" /* small tilde */, "\u02DC"}
/* General Punctuation */
,
{" ", " "/* en space */, "\u2002"},
{" ", " "/* em space */, "\u2003"},
{" ", " "/* thin space */, "\u2009"},
{"", ""/* zero width non-joiner */, "\u200C"},
{"", ""/* zero width joiner */, "\u200D"},
{"", ""/* left-to-right mark */, "\u200E"},
{"", ""/* right-to-left mark */, "\u200F"},
{"–", "–"/* en dash */, "\u2013"},
{"—", "—"/* em dash */, "\u2014"},
{"‘", "‘"/* left single quotation mark */, "\u2018"},
{"’", "’"/* right single quotation mark */, "\u2019"},
{"‚", "‚"/* single low-9 quotation mark */, "\u201A"},
{"“", "“"/* left double quotation mark */, "\u201C"},
{"”", "”"/* right double quotation mark */, "\u201D"},
{"„", "„"/* double low-9 quotation mark */, "\u201E"},
{"†", "†"/* dagger */, "\u2020"},
{"‡", "‡"/* double dagger */, "\u2021"},
{"‰", "‰"/* per mille sign */, "\u2030"},
{"‹", "‹"/*
* single left-pointing angle quotation
* mark
*/, "\u2039"}
/* lsaquo is proposed but not yet ISO standardized */
, {"›", "›"/*
* single right-pointing angle quotation
* mark
*/, "\u203A"}
/* rsaquo is proposed but not yet ISO standardized */
, {"€", "€" /* euro sign */, "\u20AC"}};
for (String[] entity : entities) {
entityEscapeMap.put(entity[2], entity[0]);
escapeEntityMap.put(entity[0], entity[2]);
escapeEntityMap.put(entity[1], entity[2]);
}
}
public static String escape(String original) {
StringBuffer buf = new StringBuffer(original);
escape(buf);
return buf.toString();
}
public static void escape(StringBuffer original) {
int index = 0;
String escaped;
while (index < original.length()) {
escaped = entityEscapeMap.get(original.substring(index, index + 1));
if (null != escaped) {
original.replace(index, index + 1, escaped);
index += escaped.length();
} else {
index++;
}
}
}
public static String unescape(String original) {
String returnValue = null;
if (null != original) {
StringBuffer buf = new StringBuffer(original);
unescape(buf);
returnValue = buf.toString();
}
return returnValue;
}
public static void unescape(StringBuffer original) {
int index = 0;
int semicolonIndex;
String escaped;
String entity;
while (index < original.length()) {
index = original.indexOf("&", index);
if (-1 == index) {
break;
}
semicolonIndex = original.indexOf(";", index);
if (-1 != semicolonIndex) {
escaped = original.substring(index, semicolonIndex + 1);
entity = escapeEntityMap.get(escaped);
if (null != entity) {
original.replace(index, semicolonIndex + 1, entity);
}
index++;
} else {
break;
}
}
}
}