package com.mcxiaoke.minicat.util;

import java.util.HashMap;
import java.util.Map;

/**
 * Converts between plain text and text that uses HTML character references.
 * <p>
 * {@link #escape(String)} replaces every character that has an HTML 4 named
 * entity (including the markup-significant quote, ampersand, less-than and
 * greater-than signs) with its named reference. {@link #unescape(String)}
 * performs the reverse for named references from the same table and for
 * decimal and hexadecimal numeric character references.
 * <p>
 * The lookup tables are populated once in the static initializer and are only
 * read afterwards.
 *
 * @author mcxiaoke
 * @version 1.0 2011.05.02
 */
public final class HtmlHelper {

    /** Maps a single character (as a one-char string) to its named reference. */
    private static final Map<String, String> entityEscapeMap = new HashMap<String, String>();

    /** Maps a named or decimal numeric reference to the character it denotes. */
    private static final Map<String, String> escapeEntityMap = new HashMap<String, String>();

    static {
        // Each row is {entity name, character}. The full reference strings are
        // assembled below from the bare name and the code point, so the table
        // never contains a literal reference that a tool could decode by
        // accident, and the numeric form cannot drift from the character.
        String[][] entities = {
                /* Latin-1 */
                {"nbsp", "\u00A0"}, // no-break space
                {"iexcl", "\u00A1"}, // inverted exclamation mark
                {"cent", "\u00A2"}, // cent sign
                {"pound", "\u00A3"}, // pound sign
                {"curren", "\u00A4"}, // currency sign
                {"yen", "\u00A5"}, // yen sign = yuan sign
                {"brvbar", "\u00A6"}, // broken vertical bar
                {"sect", "\u00A7"}, // section sign
                {"uml", "\u00A8"}, // diaeresis
                {"copy", "\u00A9"}, // copyright sign
                {"ordf", "\u00AA"}, // feminine ordinal indicator
                {"laquo", "\u00AB"}, // left-pointing double angle quotation mark
                {"not", "\u00AC"}, // not sign
                {"shy", "\u00AD"}, // soft hyphen
                {"reg", "\u00AE"}, // registered sign
                {"macr", "\u00AF"}, // macron
                {"deg", "\u00B0"}, // degree sign
                {"plusmn", "\u00B1"}, // plus-minus sign
                {"sup2", "\u00B2"}, // superscript two
                {"sup3", "\u00B3"}, // superscript three
                {"acute", "\u00B4"}, // acute accent
                {"micro", "\u00B5"}, // micro sign
                {"para", "\u00B6"}, // pilcrow sign
                {"middot", "\u00B7"}, // middle dot
                {"cedil", "\u00B8"}, // cedilla
                {"sup1", "\u00B9"}, // superscript one
                {"ordm", "\u00BA"}, // masculine ordinal indicator
                {"raquo", "\u00BB"}, // right-pointing double angle quotation mark
                {"frac14", "\u00BC"}, // vulgar fraction one quarter
                {"frac12", "\u00BD"}, // vulgar fraction one half
                {"frac34", "\u00BE"}, // vulgar fraction three quarters
                {"iquest", "\u00BF"}, // inverted question mark
                {"Agrave", "\u00C0"}, // latin capital letter A with grave
                {"Aacute", "\u00C1"}, // latin capital letter A with acute
                {"Acirc", "\u00C2"}, // latin capital letter A with circumflex
                {"Atilde", "\u00C3"}, // latin capital letter A with tilde
                {"Auml", "\u00C4"}, // latin capital letter A with diaeresis
                {"Aring", "\u00C5"}, // latin capital letter A with ring above
                {"AElig", "\u00C6"}, // latin capital letter AE
                {"Ccedil", "\u00C7"}, // latin capital letter C with cedilla
                {"Egrave", "\u00C8"}, // latin capital letter E with grave
                {"Eacute", "\u00C9"}, // latin capital letter E with acute
                {"Ecirc", "\u00CA"}, // latin capital letter E with circumflex
                {"Euml", "\u00CB"}, // latin capital letter E with diaeresis
                {"Igrave", "\u00CC"}, // latin capital letter I with grave
                {"Iacute", "\u00CD"}, // latin capital letter I with acute
                {"Icirc", "\u00CE"}, // latin capital letter I with circumflex
                {"Iuml", "\u00CF"}, // latin capital letter I with diaeresis
                {"ETH", "\u00D0"}, // latin capital letter ETH
                {"Ntilde", "\u00D1"}, // latin capital letter N with tilde
                {"Ograve", "\u00D2"}, // latin capital letter O with grave
                {"Oacute", "\u00D3"}, // latin capital letter O with acute
                {"Ocirc", "\u00D4"}, // latin capital letter O with circumflex
                {"Otilde", "\u00D5"}, // latin capital letter O with tilde
                {"Ouml", "\u00D6"}, // latin capital letter O with diaeresis
                {"times", "\u00D7"}, // multiplication sign
                {"Oslash", "\u00D8"}, // latin capital letter O with stroke
                {"Ugrave", "\u00D9"}, // latin capital letter U with grave
                {"Uacute", "\u00DA"}, // latin capital letter U with acute
                {"Ucirc", "\u00DB"}, // latin capital letter U with circumflex
                {"Uuml", "\u00DC"}, // latin capital letter U with diaeresis
                {"Yacute", "\u00DD"}, // latin capital letter Y with acute
                {"THORN", "\u00DE"}, // latin capital letter THORN
                {"szlig", "\u00DF"}, // latin small letter sharp s
                {"agrave", "\u00E0"}, // latin small letter a with grave
                {"aacute", "\u00E1"}, // latin small letter a with acute
                {"acirc", "\u00E2"}, // latin small letter a with circumflex
                {"atilde", "\u00E3"}, // latin small letter a with tilde
                {"auml", "\u00E4"}, // latin small letter a with diaeresis
                {"aring", "\u00E5"}, // latin small letter a with ring above
                {"aelig", "\u00E6"}, // latin small letter ae
                {"ccedil", "\u00E7"}, // latin small letter c with cedilla
                {"egrave", "\u00E8"}, // latin small letter e with grave
                {"eacute", "\u00E9"}, // latin small letter e with acute
                {"ecirc", "\u00EA"}, // latin small letter e with circumflex
                {"euml", "\u00EB"}, // latin small letter e with diaeresis
                {"igrave", "\u00EC"}, // latin small letter i with grave
                {"iacute", "\u00ED"}, // latin small letter i with acute
                {"icirc", "\u00EE"}, // latin small letter i with circumflex
                {"iuml", "\u00EF"}, // latin small letter i with diaeresis
                {"eth", "\u00F0"}, // latin small letter eth
                {"ntilde", "\u00F1"}, // latin small letter n with tilde
                {"ograve", "\u00F2"}, // latin small letter o with grave
                {"oacute", "\u00F3"}, // latin small letter o with acute
                {"ocirc", "\u00F4"}, // latin small letter o with circumflex
                {"otilde", "\u00F5"}, // latin small letter o with tilde
                {"ouml", "\u00F6"}, // latin small letter o with diaeresis
                {"divide", "\u00F7"}, // division sign
                {"oslash", "\u00F8"}, // latin small letter o with stroke
                {"ugrave", "\u00F9"}, // latin small letter u with grave
                {"uacute", "\u00FA"}, // latin small letter u with acute
                {"ucirc", "\u00FB"}, // latin small letter u with circumflex
                {"uuml", "\u00FC"}, // latin small letter u with diaeresis
                {"yacute", "\u00FD"}, // latin small letter y with acute
                {"thorn", "\u00FE"}, // latin small letter thorn
                {"yuml", "\u00FF"}, // latin small letter y with diaeresis
                /* Latin Extended-B */
                {"fnof", "\u0192"}, // latin small f with hook = florin
                /* Greek (there is no capital final sigma) */
                {"Alpha", "\u0391"}, // greek capital letter alpha
                {"Beta", "\u0392"}, // greek capital letter beta
                {"Gamma", "\u0393"}, // greek capital letter gamma
                {"Delta", "\u0394"}, // greek capital letter delta
                {"Epsilon", "\u0395"}, // greek capital letter epsilon
                {"Zeta", "\u0396"}, // greek capital letter zeta
                {"Eta", "\u0397"}, // greek capital letter eta
                {"Theta", "\u0398"}, // greek capital letter theta
                {"Iota", "\u0399"}, // greek capital letter iota
                {"Kappa", "\u039A"}, // greek capital letter kappa
                {"Lambda", "\u039B"}, // greek capital letter lambda
                {"Mu", "\u039C"}, // greek capital letter mu
                {"Nu", "\u039D"}, // greek capital letter nu
                {"Xi", "\u039E"}, // greek capital letter xi
                {"Omicron", "\u039F"}, // greek capital letter omicron
                {"Pi", "\u03A0"}, // greek capital letter pi
                {"Rho", "\u03A1"}, // greek capital letter rho
                {"Sigma", "\u03A3"}, // greek capital letter sigma
                {"Tau", "\u03A4"}, // greek capital letter tau
                {"Upsilon", "\u03A5"}, // greek capital letter upsilon
                {"Phi", "\u03A6"}, // greek capital letter phi
                {"Chi", "\u03A7"}, // greek capital letter chi
                {"Psi", "\u03A8"}, // greek capital letter psi
                {"Omega", "\u03A9"}, // greek capital letter omega
                {"alpha", "\u03B1"}, // greek small letter alpha
                {"beta", "\u03B2"}, // greek small letter beta
                {"gamma", "\u03B3"}, // greek small letter gamma
                {"delta", "\u03B4"}, // greek small letter delta
                {"epsilon", "\u03B5"}, // greek small letter epsilon
                {"zeta", "\u03B6"}, // greek small letter zeta
                {"eta", "\u03B7"}, // greek small letter eta
                {"theta", "\u03B8"}, // greek small letter theta
                {"iota", "\u03B9"}, // greek small letter iota
                {"kappa", "\u03BA"}, // greek small letter kappa
                {"lambda", "\u03BB"}, // greek small letter lambda
                {"mu", "\u03BC"}, // greek small letter mu
                {"nu", "\u03BD"}, // greek small letter nu
                {"xi", "\u03BE"}, // greek small letter xi
                {"omicron", "\u03BF"}, // greek small letter omicron
                {"pi", "\u03C0"}, // greek small letter pi
                {"rho", "\u03C1"}, // greek small letter rho
                {"sigmaf", "\u03C2"}, // greek small letter final sigma
                {"sigma", "\u03C3"}, // greek small letter sigma
                {"tau", "\u03C4"}, // greek small letter tau
                {"upsilon", "\u03C5"}, // greek small letter upsilon
                {"phi", "\u03C6"}, // greek small letter phi
                {"chi", "\u03C7"}, // greek small letter chi
                {"psi", "\u03C8"}, // greek small letter psi
                {"omega", "\u03C9"}, // greek small letter omega
                {"thetasym", "\u03D1"}, // greek small letter theta symbol
                {"upsih", "\u03D2"}, // greek upsilon with hook symbol
                {"piv", "\u03D6"}, // greek pi symbol
                /* General Punctuation */
                {"bull", "\u2022"}, // bullet (not the bullet operator)
                {"hellip", "\u2026"}, // horizontal ellipsis
                {"prime", "\u2032"}, // prime = minutes = feet
                {"Prime", "\u2033"}, // double prime = seconds = inches
                {"oline", "\u203E"}, // overline
                {"frasl", "\u2044"}, // fraction slash
                /* Letterlike Symbols */
                {"weierp", "\u2118"}, // script capital P = Weierstrass p
                {"image", "\u2111"}, // blackletter capital I = imaginary part
                {"real", "\u211C"}, // blackletter capital R = real part
                {"trade", "\u2122"}, // trade mark sign
                {"alefsym", "\u2135"}, // alef symbol (not the hebrew letter alef)
                /* Arrows */
                {"larr", "\u2190"}, // leftwards arrow
                {"uarr", "\u2191"}, // upwards arrow
                {"rarr", "\u2192"}, // rightwards arrow
                {"darr", "\u2193"}, // downwards arrow
                {"harr", "\u2194"}, // left right arrow
                {"crarr", "\u21B5"}, // downwards arrow with corner leftwards
                {"lArr", "\u21D0"}, // leftwards double arrow
                {"uArr", "\u21D1"}, // upwards double arrow
                {"rArr", "\u21D2"}, // rightwards double arrow
                {"dArr", "\u21D3"}, // downwards double arrow
                {"hArr", "\u21D4"}, // left right double arrow
                /* Mathematical Operators */
                {"forall", "\u2200"}, // for all
                {"part", "\u2202"}, // partial differential
                {"exist", "\u2203"}, // there exists
                {"empty", "\u2205"}, // empty set
                {"nabla", "\u2207"}, // nabla
                {"isin", "\u2208"}, // element of
                {"notin", "\u2209"}, // not an element of
                {"ni", "\u220B"}, // contains as member
                {"prod", "\u220F"}, // n-ary product (not greek capital pi)
                {"sum", "\u2211"}, // n-ary summation (not greek capital sigma)
                {"minus", "\u2212"}, // minus sign
                {"lowast", "\u2217"}, // asterisk operator
                {"radic", "\u221A"}, // square root
                {"prop", "\u221D"}, // proportional to
                {"infin", "\u221E"}, // infinity
                {"ang", "\u2220"}, // angle
                {"and", "\u2227"}, // logical and
                {"or", "\u2228"}, // logical or
                {"cap", "\u2229"}, // intersection
                {"cup", "\u222A"}, // union
                {"int", "\u222B"}, // integral
                {"there4", "\u2234"}, // therefore
                {"sim", "\u223C"}, // tilde operator (not the ASCII tilde)
                {"cong", "\u2245"}, // approximately equal to
                {"asymp", "\u2248"}, // almost equal to
                {"ne", "\u2260"}, // not equal to
                {"equiv", "\u2261"}, // identical to
                {"le", "\u2264"}, // less-than or equal to
                {"ge", "\u2265"}, // greater-than or equal to
                {"sub", "\u2282"}, // subset of
                {"sup", "\u2283"}, // superset of
                {"sube", "\u2286"}, // subset of or equal to
                {"supe", "\u2287"}, // superset of or equal to
                {"oplus", "\u2295"}, // circled plus
                {"otimes", "\u2297"}, // circled times
                {"perp", "\u22A5"}, // up tack = perpendicular
                {"sdot", "\u22C5"}, // dot operator (not the middle dot)
                /* Miscellaneous Technical */
                {"lceil", "\u2308"}, // left ceiling
                {"rceil", "\u2309"}, // right ceiling
                {"lfloor", "\u230A"}, // left floor
                {"rfloor", "\u230B"}, // right floor
                {"lang", "\u2329"}, // left-pointing angle bracket (not less-than)
                {"rang", "\u232A"}, // right-pointing angle bracket (not greater-than)
                /* Geometric Shapes */
                {"loz", "\u25CA"}, // lozenge
                /* Miscellaneous Symbols */
                {"spades", "\u2660"}, // black spade suit
                {"clubs", "\u2663"}, // black club suit
                {"hearts", "\u2665"}, // black heart suit
                {"diams", "\u2666"}, // black diamond suit
                /* Markup-significant characters */
                {"quot", "\""}, // quotation mark
                {"amp", "&"}, // ampersand
                {"lt", "<"}, // less-than sign
                {"gt", ">"}, // greater-than sign
                /* Latin Extended-A */
                {"OElig", "\u0152"}, // latin capital ligature OE
                {"oelig", "\u0153"}, // latin small ligature oe
                {"Scaron", "\u0160"}, // latin capital letter S with caron
                {"scaron", "\u0161"}, // latin small letter s with caron
                {"Yuml", "\u0178"}, // latin capital letter Y with diaeresis
                /* Spacing Modifier Letters */
                {"circ", "\u02C6"}, // modifier letter circumflex accent
                {"tilde", "\u02DC"}, // small tilde
                /* General Punctuation */
                {"ensp", "\u2002"}, // en space
                {"emsp", "\u2003"}, // em space
                {"thinsp", "\u2009"}, // thin space
                {"zwnj", "\u200C"}, // zero width non-joiner
                {"zwj", "\u200D"}, // zero width joiner
                {"lrm", "\u200E"}, // left-to-right mark
                {"rlm", "\u200F"}, // right-to-left mark
                {"ndash", "\u2013"}, // en dash
                {"mdash", "\u2014"}, // em dash
                {"lsquo", "\u2018"}, // left single quotation mark
                {"rsquo", "\u2019"}, // right single quotation mark
                {"sbquo", "\u201A"}, // single low-9 quotation mark
                {"ldquo", "\u201C"}, // left double quotation mark
                {"rdquo", "\u201D"}, // right double quotation mark
                {"bdquo", "\u201E"}, // double low-9 quotation mark
                {"dagger", "\u2020"}, // dagger
                {"Dagger", "\u2021"}, // double dagger
                {"permil", "\u2030"}, // per mille sign
                {"lsaquo", "\u2039"}, // single left-pointing angle quotation mark
                {"rsaquo", "\u203A"}, // single right-pointing angle quotation mark
                {"euro", "\u20AC"} // euro sign
        };

        for (String[] entity : entities) {
            String character = entity[1];
            String namedReference = "&" + entity[0] + ";";
            // Every table character is in the BMP, so charAt(0) is its code point.
            String numericReference = "&#" + (int) character.charAt(0) + ";";

            entityEscapeMap.put(character, namedReference);
            escapeEntityMap.put(namedReference, character);
            escapeEntityMap.put(numericReference, character);
        }
    }

    /**
     * Returns a copy of the text in which every character that has a named
     * entity is replaced by that named reference.
     *
     * @param original text to escape; may be {@code null}
     * @return the escaped text, or {@code null} if {@code original} is
     *         {@code null} (mirrors {@link #unescape(String)})
     */
    public static String escape(String original) {
        if (null == original) {
            return null;
        }
        StringBuffer buf = new StringBuffer(original);
        escape(buf);
        return buf.toString();
    }

    /**
     * Escapes the buffer in place, replacing every character that has a named
     * entity by that named reference.
     *
     * @param original buffer to modify; must not be {@code null}
     */
    public static void escape(StringBuffer original) {
        int index = 0;
        while (index < original.length()) {
            String escaped = entityEscapeMap.get(String.valueOf(original.charAt(index)));
            if (null != escaped) {
                original.replace(index, index + 1, escaped);
                // Skip the inserted reference so its ampersand is not escaped again.
                index += escaped.length();
            } else {
                index++;
            }
        }
    }

    /**
     * Returns a copy of the text in which known named references and numeric
     * character references are replaced by the characters they denote.
     * Unrecognised references are left untouched.
     *
     * @param original text to unescape; may be {@code null}
     * @return the unescaped text, or {@code null} if {@code original} is
     *         {@code null}
     */
    public static String unescape(String original) {
        if (null == original) {
            return null;
        }
        StringBuffer buf = new StringBuffer(original);
        unescape(buf);
        return buf.toString();
    }

    /**
     * Unescapes the buffer in place. Each ampersand-to-semicolon span is looked
     * up in the entity table first and, failing that, decoded as a decimal or
     * hexadecimal numeric character reference. Spans that are neither are kept
     * as they are.
     *
     * @param original buffer to modify; must not be {@code null}
     */
    public static void unescape(StringBuffer original) {
        int index = 0;
        while (index < original.length()) {
            index = original.indexOf("&", index);
            if (-1 == index) {
                break;
            }
            int semicolonIndex = original.indexOf(";", index);
            if (-1 == semicolonIndex) {
                // No terminator after this ampersand means none after any later one.
                break;
            }
            String reference = original.substring(index, semicolonIndex + 1);
            String replacement = escapeEntityMap.get(reference);
            if (null == replacement) {
                replacement = decodeNumericReference(reference);
            }
            if (null != replacement) {
                original.replace(index, semicolonIndex + 1, replacement);
            }
            // Always step past this position: a decoded ampersand must not be
            // treated as the start of a new reference (no double unescaping).
            index++;
        }
    }

    /**
     * Decodes a decimal or hexadecimal numeric character reference.
     *
     * @param reference candidate text starting with an ampersand and ending
     *                  with a semicolon
     * @return the denoted character(s), or {@code null} if the text is not a
     *         well-formed numeric reference or denotes NUL, a surrogate or a
     *         value beyond the Unicode range
     */
    private static String decodeNumericReference(String reference) {
        int length = reference.length();
        // Shortest valid form is ampersand, hash, one digit, semicolon.
        if (length < 4 || reference.charAt(1) != '#') {
            return null;
        }
        int radix = 10;
        int digitsStart = 2;
        char marker = reference.charAt(2);
        if (marker == 'x' || marker == 'X') {
            radix = 16;
            digitsStart = 3;
        }
        int digitsEnd = length - 1;
        if (digitsStart >= digitsEnd) {
            return null;
        }
        int codePoint = 0;
        for (int i = digitsStart; i < digitsEnd; i++) {
            char c = reference.charAt(i);
            // Character.digit also accepts non-ASCII digits; references use ASCII only.
            int digit = c < 128 ? Character.digit(c, radix) : -1;
            if (digit < 0) {
                return null;
            }
            codePoint = codePoint * radix + digit;
            // Checked on every step, so the accumulator can never overflow.
            if (codePoint > Character.MAX_CODE_POINT) {
                return null;
            }
        }
        if (codePoint == 0 || (codePoint >= 0xD800 && codePoint <= 0xDFFF)) {
            return null;
        }
        return new String(Character.toChars(codePoint));
    }
}