/* * Copyright 2007 Yusuke Yamamoto * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package twitter4j.http; import java.util.HashMap; import java.util.Map; public final class HTMLEntity { private static Map<String, String> entityEscapeMap = new HashMap<String, String>(); private static Map<String, String> escapeEntityMap = new HashMap<String, String>(); static { final String[][] entities = { { " ", " "/* * no-break space = * non-breaking space */, "\u00A0" }, { "¡", "¡"/* * inverted * exclamation * mark */, "\u00A1" }, { "¢", "¢"/* cent sign */, "\u00A2" }, { "£", "£"/* * pound * sign */, "\u00A3" }, { "¤", "¤"/* currency sign */, "\u00A4" }, { "¥", "¥"/* * yen * sign * = * yuan * sign */, "\u00A5" }, { "¦", "¦"/* broken bar = broken vertical bar */, "\u00A6" }, { "§", "§"/* section sign */, "\u00A7" }, { "¨", "¨"/* * diaeresis * = * spacing * diaeresis */, "\u00A8" }, { "©", "©"/* copyright sign */, "\u00A9" }, { "ª", "ª"/* * feminine * ordinal * indicator */, "\u00AA" }, { "«", "«"/* * left-pointing double angle quotation mark * = left pointing guillemet */, "\u00AB" }, { "¬", "¬"/* * not * sign * = * discretionary * hyphen */, "\u00AC" }, { "­", "­"/* soft hyphen = discretionary hyphen */, "\u00AD" }, { "®", "®"/* * registered sign = registered trade mark * sign */, "\u00AE" }, { "¯", "¯"/* * macron * = * spacing * macron * = * overline * = APL * overbar */, "\u00AF" }, { "°", "°"/* degree sign */, "\u00B0" }, { "±", "±"/* * plus * - * minus * sign * = * plus * - * or * - * minus * sign */, "\u00B1" }, { "²", "²"/* * superscript two = superscript digit two = * squared */, "\u00B2" }, { "³", "³"/* * superscript * three * = * superscript * digit * three * = * cubed */, "\u00B3" }, { "´", "´"/* acute accent = spacing acute */, "\u00B4" }, { "µ", "µ"/* micro sign */, "\u00B5" }, { "¶", "¶"/* * pilcrow * sign * = * paragraph * sign */, "\u00B6" }, { "·", "·"/* * middle dot = Georgian comma = Greek * middle dot */, "\u00B7" }, { "¸", "¸"/* * cedilla * = * spacing * cedilla */, "\u00B8" }, { "¹", "¹"/* * superscript one = superscript digit one */, "\u00B9" }, { "º", "º"/* * masculine * ordinal * indicator */, "\u00BA" }, { "»", "»"/* * right-pointing double angle quotation * mark = right pointing guillemet */, "\u00BB" }, { "¼", "¼"/* * vulgar * fraction * one * quarter * = * fraction * one * quarter */, "\u00BC" }, { "½", "½"/* * vulgar fraction one half = fraction one * half */, "\u00BD" }, { "¾", "¾"/* * vulgar * fraction * three * quarters * = * fraction * three * quarters */, "\u00BE" }, { "¿", "¿"/* * inverted question mark = turned question * mark */, "\u00BF" }, { "À", "À"/* * latin * capital * letter * A * with * grave * = * latin * capital * letter * A * grave */, "\u00C0" }, { "Á", "Á"/* latin capital letter A with acute */, "\u00C1" }, { "Â", "Â"/* * latin capital letter A with circumflex */, "\u00C2" }, { "Ã", "Ã"/* * latin * capital * letter * A * with * tilde */, "\u00C3" }, { "Ä", "Ä"/* * latin capital letter A with diaeresis */, "\u00C4" }, { "Å", "Å"/* * latin * capital * letter * A * with * ring * above * = * latin * capital * letter * A * ring */, "\u00C5" }, { "Æ", "Æ"/* * latin capital letter AE = latin capital * ligature AE */, "\u00C6" }, { "Ç", "Ç"/* * latin * capital * letter * C * with * cedilla */, "\u00C7" }, { "È", "È"/* latin capital letter E with grave */, "\u00C8" }, { "É", "É"/* latin capital letter E with acute */, "\u00C9" }, { "Ê", "Ê"/* * latin capital letter E with circumflex */, "\u00CA" }, { "Ë", "Ë"/* * latin * capital * letter * E * with * diaeresis */, "\u00CB" }, { "Ì", "Ì"/* latin capital letter I with grave */, "\u00CC" }, { "Í", "Í"/* latin capital letter I with acute */, "\u00CD" }, { "Î", "Î"/* * latin capital letter I with circumflex */, "\u00CE" }, { "Ï", "Ï"/* * latin * capital * letter * I * with * diaeresis */, "\u00CF" }, { "Ð", "Ð"/* latin capital letter ETH */, "\u00D0" }, { "Ñ", "Ñ"/* * latin * capital * letter * N * with * tilde */, "\u00D1" }, { "Ò", "Ò"/* latin capital letter O with grave */, "\u00D2" }, { "Ó", "Ó"/* latin capital letter O with acute */, "\u00D3" }, { "Ô", "Ô"/* * latin capital letter O with circumflex */, "\u00D4" }, { "Õ", "Õ"/* * latin * capital * letter * O * with * tilde */, "\u00D5" }, { "Ö", "Ö"/* * latin capital letter O with diaeresis */, "\u00D6" }, { "×", "×"/* * multiplication * sign */, "\u00D7" }, { "Ø", "Ø"/* * latin capital letter O with stroke = * latin capital letter O slash */, "\u00D8" }, { "Ù", "Ù"/* * latin * capital * letter * U * with * grave */, "\u00D9" }, { "Ú", "Ú"/* latin capital letter U with acute */, "\u00DA" }, { "Û", "Û"/* * latin capital letter U with circumflex */, "\u00DB" }, { "Ü", "Ü"/* * latin * capital * letter * U * with * diaeresis */, "\u00DC" }, { "Ý", "Ý"/* latin capital letter Y with acute */, "\u00DD" }, { "Þ", "Þ"/* latin capital letter THORN */, "\u00DE" }, { "ß", "ß"/* * latin small letter sharp s = ess-zed */, "\u00DF" }, { "à", "à"/* * latin * small * letter * a * with * grave * = * latin * small * letter * a * grave */, "\u00E0" }, { "á", "á"/* * latin small letter a with acute */, "\u00E1" }, { "â", "â"/* * latin * small * letter * a * with * circumflex */, "\u00E2" }, { "ã", "ã"/* latin small letter a with tilde */, "\u00E3" }, { "ä", "ä"/* latin small letter a with diaeresis */, "\u00E4" }, { "å", "å"/* * latin small letter a with ring above = * latin small letter a ring */, "\u00E5" }, { "æ", "æ"/* * latin * small * letter * ae = * latin * small * ligature * ae */, "\u00E6" }, { "ç", "ç"/* latin small letter c with cedilla */, "\u00E7" }, { "è", "è"/* latin small letter e with grave */, "\u00E8" }, { "é", "é"/* latin small letter e with acute */, "\u00E9" }, { "ê", "ê"/* * latin small letter e with circumflex */, "\u00EA" }, { "ë", "ë"/* * latin * small * letter * e * with * diaeresis */, "\u00EB" }, { "ì", "ì"/* latin small letter i with grave */, "\u00EC" }, { "í", "í"/* latin small letter i with acute */, "\u00ED" }, { "î", "î"/* * latin small letter i with circumflex */, "\u00EE" }, { "ï", "ï"/* * latin * small * letter * i * with * diaeresis */, "\u00EF" }, { "ð", "ð"/* latin small letter eth */, "\u00F0" }, { "ñ", "ñ"/* * latin * small * letter * n * with * tilde */, "\u00F1" }, { "ò", "ò"/* latin small letter o with grave */, "\u00F2" }, { "ó", "ó"/* latin small letter o with acute */, "\u00F3" }, { "ô", "ô"/* * latin small letter o with circumflex */, "\u00F4" }, { "õ", "õ"/* * latin * small * letter * o * with * tilde */, "\u00F5" }, { "ö", "ö"/* latin small letter o with diaeresis */, "\u00F6" }, { "÷", "÷"/* division sign */, "\u00F7" }, { "ø", "ø"/* * latin * small * letter * o * with * stroke * = * latin * small * letter * o * slash */, "\u00F8" }, { "ù", "ù"/* latin small letter u with grave */, "\u00F9" }, { "ú", "ú"/* latin small letter u with acute */, "\u00FA" }, { "û", "û"/* * latin small letter u with circumflex */, "\u00FB" }, { "ü", "ü"/* * latin * small * letter * u * with * diaeresis */, "\u00FC" }, { "ý", "ý"/* latin small letter y with acute */, "\u00FD" }, { "þ", "þ"/* latin small letter thorn with */, "\u00FE" }, { "ÿ", "ÿ"/* latin small letter y with diaeresis */, "\u00FF" }, { "ƒ", "ƒ"/* * latin small f with hook = function = * florin */, "\u0192" } /* Greek */ , { "Α", "Α"/* greek capital letter alpha */, "\u0391" }, { "Β", "Β"/* greek capital letter beta */, "\u0392" }, { "Γ", "Γ"/* * greek * capital * letter * gamma */, "\u0393" }, { "Δ", "Δ"/* greek capital letter delta */, "\u0394" }, { "Ε", "Ε"/* greek capital letter epsilon */, "\u0395" }, { "Ζ", "Ζ"/* greek capital letter zeta */, "\u0396" }, { "Η", "Η"/* * greek * capital * letter * eta */, "\u0397" }, { "Θ", "Θ"/* greek capital letter theta */, "\u0398" }, { "Ι", "Ι"/* * greek * capital * letter * iota */, "\u0399" }, { "Κ", "Κ"/* greek capital letter kappa */, "\u039A" }, { "Λ", "Λ"/* greek capital letter lambda */, "\u039B" }, { "Μ", "Μ"/* * greek * capital * letter * mu */, "\u039C" }, { "Ν", "Ν"/* greek capital letter nu */, "\u039D" }, { "Ξ", "Ξ"/* * greek * capital * letter * xi */, "\u039E" }, { "Ο", "Ο"/* greek capital letter omicron */, "\u039F" }, { "Π", "Π"/* greek capital letter pi */, "\u03A0" }, { "Ρ", "Ρ"/* * greek * capital * letter * rho */, "\u03A1" } /* there is no Sigmaf and no \u03A2 */ , { "Σ", "Σ"/* greek capital letter sigma */, "\u03A3" }, { "Τ", "Τ"/* greek capital letter tau */, "\u03A4" }, { "Υ", "Υ"/* greek capital letter upsilon */, "\u03A5" }, { "Φ", "Φ"/* greek capital letter phi */, "\u03A6" }, { "Χ", "Χ"/* * greek * capital * letter * chi */, "\u03A7" }, { "Ψ", "Ψ"/* greek capital letter psi */, "\u03A8" }, { "Ω", "Ω"/* * greek * capital * letter * omega */, "\u03A9" }, { "α", "α"/* greek small letter alpha */, "\u03B1" }, { "β", "β"/* * greek * small * letter * beta */, "\u03B2" }, { "γ", "γ"/* greek small letter gamma */, "\u03B3" }, { "δ", "δ"/* * greek * small * letter * delta */, "\u03B4" }, { "ε", "ε"/* greek small letter epsilon */, "\u03B5" }, { "ζ", "ζ"/* greek small letter zeta */, "\u03B6" }, { "η", "η"/* * greek * small * letter * eta */, "\u03B7" }, { "θ", "θ"/* greek small letter theta */, "\u03B8" }, { "ι", "ι"/* * greek * small * letter * iota */, "\u03B9" }, { "κ", "κ"/* greek small letter kappa */, "\u03BA" }, { "λ", "λ"/* * greek * small * letter * lambda */, "\u03BB" }, { "μ", "μ"/* greek small letter mu */, "\u03BC" }, { "ν", "ν"/* * greek * small * letter * nu */, "\u03BD" }, { "ξ", "ξ"/* greek small letter xi */, "\u03BE" }, { "ο", "ο"/* * greek * small * letter * omicron */, "\u03BF" }, { "π", "π"/* greek small letter pi */, "\u03C0" }, { "ρ", "ρ"/* * greek * small * letter * rho */, "\u03C1" }, { "ς", "ς"/* greek small letter final sigma */, "\u03C2" }, { "σ", "σ"/* greek small letter sigma */, "\u03C3" }, { "τ", "τ"/* * greek * small * letter * tau */, "\u03C4" }, { "υ", "υ"/* greek small letter upsilon */, "\u03C5" }, { "φ", "φ"/* greek small letter phi */, "\u03C6" }, { "χ", "χ"/* * greek * small * letter * chi */, "\u03C7" }, { "ψ", "ψ"/* greek small letter psi */, "\u03C8" }, { "ω", "ω"/* * greek * small * letter * omega */, "\u03C9" }, { "ϑ", "ϑ"/* greek small letter theta symbol */, "\u03D1" }, { "ϒ", "ϒ"/* greek upsilon with hook symbol */, "\u03D2" }, { "ϖ", "ϖ"/* * greek * pi * symbol */, "\u03D6" } /* General Punctuation */ , { "•", "•"/* bullet = black small circle */, "\u2022" } /* bullet is NOT the same as bullet operator ,"\u2219 */ , { "…", "…"/* * horizontal ellipsis = three dot * leader */, "\u2026" }, { "′", "′"/* * prime * = * minutes * = * feet */, "\u2032" }, { "″", "″"/* double prime = seconds = inches */, "\u2033" }, { "‾", "‾"/* overline = spacing overscore */, "\u203E" }, { "⁄", "⁄"/* * fraction * slash */, "\u2044" } /* Letterlike Symbols */ , { "℘", "℘"/* * script capital P = power set = * Weierstrass p */, "\u2118" }, { "ℑ", "ℑ"/* * blackletter * capital * I * = * imaginary * part */, "\u2111" }, { "ℜ", "ℜ"/* * blackletter capital R = real part symbol */, "\u211C" }, { "™", "™"/* * trade * mark * sign */, "\u2122" }, { "ℵ", "ℵ"/* * alef symbol = first transfinite * cardinal */, "\u2135" } /* alef symbol is NOT the same as hebrew letter alef ,"\u05D0"} */ /* Arrows */ , { "←", "←"/* leftwards arrow */, "\u2190" }, { "↑", "↑"/* * upwards * arrow */, "\u2191" }, { "→", "→"/* rightwards arrow */, "\u2192" }, { "↓", "↓"/* * downwards * arrow */, "\u2193" }, { "↔", "↔"/* left right arrow */, "\u2194" }, { "↵", "↵"/* * downwards * arrow * with * corner * leftwards * = * carriage * return */, "\u21B5" }, { "⇐", "⇐"/* leftwards double arrow */, "\u21D0" } /* * Unicode does not say that lArr is the same as the 'is implied * by' arrow but also does not have any other character for that * function. So ? lArr can be used for 'is implied by' as * ISOtech suggests */ , { "⇑", "⇑"/* upwards double arrow */, "\u21D1" }, { "⇒", "⇒"/* * rightwards * double * arrow */, "\u21D2" } /* * Unicode does not say this is the 'implies' character but does * not have another character with this function so ? rArr can * be used for 'implies' as ISOtech suggests */ , { "⇓", "⇓"/* downwards double arrow */, "\u21D3" }, { "⇔", "⇔"/* * left * right * double * arrow */, "\u21D4" } /* Mathematical Operators */ , { "∀", "∀"/* for all */, "\u2200" }, { "∂", "∂"/* * partial * differential */, "\u2202" }, { "∃", "∃"/* there exists */, "\u2203" }, { "∅", "∅"/* * empty * set * = * null * set * = * diameter */, "\u2205" }, { "∇", "∇"/* nabla = backward difference */, "\u2207" }, { "∈", "∈"/* element of */, "\u2208" }, { "∉", "∉"/* * not * an * element * of */, "\u2209" }, { "∋", "∋"/* contains as member */, "\u220B" } /* should there be a more memorable name than 'ni'? */ , { "∏", "∏"/* n-ary product = product sign */, "\u220F" } /* prod is NOT the same character as ,"\u03A0"} */ , { "∑", "∑"/* n-ary sumation */, "\u2211" } /* sum is NOT the same character as ,"\u03A3"} */ , { "−", "−"/* minus sign */, "\u2212" }, { "∗", "∗"/* * asterisk * operator */, "\u2217" }, { "√", "√"/* square root = radical sign */, "\u221A" }, { "∝", "∝"/* proportional to */, "\u221D" }, { "∞", "∞"/* infinity */, "\u221E" }, { "∠", "∠"/* angle */, "\u2220" }, { "∧", "∧"/* * logical * and * = * wedge */, "\u2227" }, { "∨", "∨"/* logical or = vee */, "\u2228" }, { "∩", "∩"/* * intersection * = * cap */, "\u2229" }, { "∪", "∪"/* union = cup */, "\u222A" }, { "∫", "∫"/* integral */, "\u222B" }, { "∴", "∴"/* therefore */, "\u2234" }, { "∼", "∼"/* * tilde * operator * = * varies * with * = * similar * to */, "\u223C" } /* * tilde operator is NOT the same character as the tilde * ,"\u007E"} */ , { "≅", "≅"/* approximately equal to */, "\u2245" }, { "≈", "≈"/* almost equal to = asymptotic to */, "\u2248" }, { "≠", "≠"/* not equal to */, "\u2260" }, { "≡", "≡"/* * identical * to */, "\u2261" }, { "≤", "≤"/* less-than or equal to */, "\u2264" }, { "≥", "≥"/* * greater * - * than * or * equal * to */, "\u2265" }, { "⊂", "⊂"/* subset of */, "\u2282" }, { "⊃", "⊃"/* * superset * of */, "\u2283" } /* note that nsup 'not a superset of ,"\u2283"} */ , { "⊆", "⊆"/* subset of or equal to */, "\u2286" }, { "⊇", "⊇"/* * superset * of * or * equal * to */, "\u2287" }, { "⊕", "⊕"/* circled plus = direct sum */, "\u2295" }, { "⊗", "⊗"/* circled times = vector product */, "\u2297" }, { "⊥", "⊥"/* * up tack = orthogonal to = perpendicular */, "\u22A5" }, { "⋅", "⋅"/* * dot * operator */, "\u22C5" } /* * dot operator is NOT the same character as ,"\u00B7"} /* * Miscellaneous Technical */ , { "⌈", "⌈"/* left ceiling = apl upstile */, "\u2308" }, { "⌉", "⌉"/* right ceiling */, "\u2309" }, { "⌊", "⌊"/* * left * floor * = * apl * downstile */, "\u230A" }, { "⌋", "⌋"/* right floor */, "\u230B" }, { "⟨", "〈"/* * left * - * pointing * angle * bracket * = * bra */, "\u2329" } /* lang is NOT the same character as ,"\u003C"} */ , { "⟩", "〉"/* * right-pointing angle bracket = ket */, "\u232A" } /* rang is NOT the same character as ,"\u003E"} */ /* Geometric Shapes */ , { "◊", "◊"/* lozenge */, "\u25CA" } /* Miscellaneous Symbols */ , { "♠", "♠"/* black spade suit */, "\u2660" } /* black here seems to mean filled as opposed to hollow */ , { "♣", "♣"/* black club suit = shamrock */, "\u2663" }, { "♥", "♥"/* black heart suit = valentine */, "\u2665" }, { "♦", "♦"/* black diamond suit */, "\u2666" }, { """, """ /* * quotation * mark * = * APL * quote */, "\"" }, { "&", "&" /* ampersand */, "\u0026" }, { "<", "<" /* * less * - * than * sign */, "\u003C" }, { ">", ">" /* greater-than sign */, "\u003E" } /* Latin Extended-A */ , { "Œ", "Œ" /* latin capital ligature OE */, "\u0152" }, { "œ", "œ" /* latin small ligature oe */, "\u0153" } /* * ligature is a misnomer this is a separate character in some * languages */ , { "Š", "Š" /* * latin capital letter S with caron */, "\u0160" }, { "š", "š" /* * latin * small * letter * s * with * caron */, "\u0161" }, { "Ÿ", "Ÿ" /* * latin capital letter Y with diaeresis */, "\u0178" } /* Spacing Modifier Letters */ , { "ˆ", "ˆ" /* modifier letter circumflex accent */, "\u02C6" }, { "˜", "˜" /* small tilde */, "\u02DC" } /* General Punctuation */ , { " ", " "/* en space */, "\u2002" }, { " ", " "/* * em * space */, "\u2003" }, { " ", " "/* thin space */, "\u2009" }, { "‌", "‌"/* * zero * width * non * - * joiner */, "\u200C" }, { "‍", "‍"/* zero width joiner */, "\u200D" }, { "‎", "‎"/* * left * - * to * - * right * mark */, "\u200E" }, { "‏", "‏"/* right-to-left mark */, "\u200F" }, { "–", "–"/* * en * dash */, "\u2013" }, { "—", "—"/* em dash */, "\u2014" }, { "‘", "‘"/* * left * single * quotation * mark */, "\u2018" }, { "’", "’"/* right single quotation mark */, "\u2019" }, { "‚", "‚"/* single low-9 quotation mark */, "\u201A" }, { "“", "“"/* left double quotation mark */, "\u201C" }, { "”", "”"/* right double quotation mark */, "\u201D" }, { "„", "„"/* double low-9 quotation mark */, "\u201E" }, { "†", "†"/* dagger */, "\u2020" }, { "‡", "‡"/* * double * dagger */, "\u2021" }, { "‰", "‰"/* per mille sign */, "\u2030" }, { "‹", "‹"/* * single * left * - * pointing * angle * quotation * mark */, "\u2039" } /* lsaquo is proposed but not yet ISO standardized */ , { "›", "›"/* * single right-pointing angle quotation * mark */, "\u203A" } /* rsaquo is proposed but not yet ISO standardized */ , { "€", "€" /* euro sign */, "\u20AC" } }; for (final String[] entity : entities) { entityEscapeMap.put(entity[2], entity[0]); escapeEntityMap.put(entity[0], entity[2]); escapeEntityMap.put(entity[1], entity[2]); } } public static String escape(final String original) { final StringBuffer buf = new StringBuffer(original); escape(buf); return buf.toString(); } public static void escape(final StringBuffer original) { int index = 0; String escaped; while (index < original.length()) { escaped = entityEscapeMap.get(original.substring(index, index + 1)); if (escaped != null) { original.replace(index, index + 1, escaped); index += escaped.length(); } else { index++; } } } public static String unescape(final String original) { String returnValue = null; if (original != null) { final StringBuffer buf = new StringBuffer(original); unescape(buf); returnValue = buf.toString(); } return returnValue; } public static void unescape(final StringBuffer original) { int index = 0; int semicolonIndex; String escaped; String entity; while (index < original.length()) { index = original.indexOf("&", index); if (-1 == index) { break; } semicolonIndex = original.indexOf(";", index); if (-1 != semicolonIndex) { escaped = original.substring(index, semicolonIndex + 1); entity = escapeEntityMap.get(escaped); if (entity != null) { original.replace(index, semicolonIndex + 1, entity); } index++; } else { break; } } } }