/**
* HtmlManipulator.java Copyright 2007 - 2008 Zach Scrivena
* zachscrivena@gmail.com http://zs.freeshell.org/
* <p>
* TERMS AND CONDITIONS: This program is free software: you can redistribute it
* and/or modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 3 of the License,
* or (at your option) any later version.
* <p>
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
* <p>
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*/
package gov.nasa.jpl.mbee.mdk.util;
import java.util.HashMap;
import java.util.Map;
/**
* Perform HTML-related operations.
*/
public final class HtmlManipulator {
/**
* Table of HTML entities obtained from
* http://www.w3.org/TR/html401/sgml/entities.html - Formatted as a series
* of space-delimited triplets (entity_name,entity_value,Unicode_value),
* e.g. (nbsp,#160,00A0). - entity_name and entity_value have been stripped
* of the surrounding "&" and ";", e.g. "nbsp" instead of " ", "#160"
* instead of " ".
*/
private static final String RAW_HTML_ENTITY_TABLE =
"nbsp #160 00A0 iexcl #161 00A1 cent #162 00A2 pound #163 00A3 curren #164 00A4 yen #165 00A5 "
+ "brvbar #166 00A6 sect #167 00A7 uml #168 00A8 copy #169 00A9 ordf #170 00AA laquo #171 00AB "
+ "not #172 00AC shy #173 00AD reg #174 00AE macr #175 00AF deg #176 00B0 plusmn #177 00B1 "
+ "sup2 #178 00B2 sup3 #179 00B3 acute #180 00B4 micro #181 00B5 para #182 00B6 middot #183 00B7 "
+ "cedil #184 00B8 sup1 #185 00B9 ordm #186 00BA raquo #187 00BB frac14 #188 00BC frac12 #189 00BD "
+ "frac34 #190 00BE iquest #191 00BF Agrave #192 00C0 Aacute #193 00C1 Acirc #194 00C2 Atilde #195 00C3 "
+ "Auml #196 00C4 Aring #197 00C5 AElig #198 00C6 Ccedil #199 00C7 Egrave #200 00C8 Eacute #201 00C9 "
+ "Ecirc #202 00CA Euml #203 00CB Igrave #204 00CC Iacute #205 00CD Icirc #206 00CE Iuml #207 00CF "
+ "ETH #208 00D0 Ntilde #209 00D1 Ograve #210 00D2 Oacute #211 00D3 Ocirc #212 00D4 Otilde #213 00D5 "
+ "Ouml #214 00D6 times #215 00D7 Oslash #216 00D8 Ugrave #217 00D9 Uacute #218 00DA Ucirc #219 00DB "
+ "Uuml #220 00DC Yacute #221 00DD THORN #222 00DE szlig #223 00DF agrave #224 00E0 aacute #225 00E1 "
+ "acirc #226 00E2 atilde #227 00E3 auml #228 00E4 aring #229 00E5 aelig #230 00E6 ccedil #231 00E7 "
+ "egrave #232 00E8 eacute #233 00E9 ecirc #234 00EA euml #235 00EB igrave #236 00EC iacute #237 00ED "
+ "icirc #238 00EE iuml #239 00EF eth #240 00F0 ntilde #241 00F1 ograve #242 00F2 oacute #243 00F3 "
+ "ocirc #244 00F4 otilde #245 00F5 ouml #246 00F6 divide #247 00F7 oslash #248 00F8 ugrave #249 00F9 "
+ "uacute #250 00FA ucirc #251 00FB uuml #252 00FC yacute #253 00FD thorn #254 00FE yuml #255 00FF "
+ "fnof #402 0192 Alpha #913 0391 Beta #914 0392 Gamma #915 0393 Delta #916 0394 Epsilon #917 0395 "
+ "Zeta #918 0396 Eta #919 0397 Theta #920 0398 Iota #921 0399 Kappa #922 039A Lambda #923 039B "
+ "Mu #924 039C Nu #925 039D Xi #926 039E Omicron #927 039F Pi #928 03A0 Rho #929 03A1 "
+ "Sigma #931 03A3 Tau #932 03A4 Upsilon #933 03A5 Phi #934 03A6 Chi #935 03A7 Psi #936 03A8 "
+ "Omega #937 03A9 alpha #945 03B1 beta #946 03B2 gamma #947 03B3 delta #948 03B4 epsilon #949 03B5 "
+ "zeta #950 03B6 eta #951 03B7 theta #952 03B8 iota #953 03B9 kappa #954 03BA lambda #955 03BB "
+ "mu #956 03BC nu #957 03BD xi #958 03BE omicron #959 03BF pi #960 03C0 rho #961 03C1 "
+ "sigmaf #962 03C2 sigma #963 03C3 tau #964 03C4 upsilon #965 03C5 phi #966 03C6 chi #967 03C7 "
+ "psi #968 03C8 omega #969 03C9 thetasym #977 03D1 upsih #978 03D2 piv #982 03D6 bull #8226 2022 "
+ "hellip #8230 2026 prime #8242 2032 Prime #8243 2033 oline #8254 203E frasl #8260 2044 weierp #8472 2118 "
+ "image #8465 2111 real #8476 211C trade #8482 2122 alefsym #8501 2135 larr #8592 2190 uarr #8593 2191 "
+ "rarr #8594 2192 darr #8595 2193 harr #8596 2194 crarr #8629 21B5 lArr #8656 21D0 uArr #8657 21D1 "
+ "rArr #8658 21D2 dArr #8659 21D3 hArr #8660 21D4 forall #8704 2200 part #8706 2202 exist #8707 2203 "
+ "empty #8709 2205 nabla #8711 2207 isin #8712 2208 notin #8713 2209 ni #8715 220B prod #8719 220F "
+ "sum #8721 2211 minus #8722 2212 lowast #8727 2217 radic #8730 221A prop #8733 221D infin #8734 221E "
+ "ang #8736 2220 and #8743 2227 or #8744 2228 cap #8745 2229 cup #8746 222A int #8747 222B "
+ "there4 #8756 2234 sim #8764 223C cong #8773 2245 asymp #8776 2248 ne #8800 2260 equiv #8801 2261 "
+ "le #8804 2264 ge #8805 2265 sub #8834 2282 sup #8835 2283 nsub #8836 2284 sube #8838 2286 "
+ "supe #8839 2287 oplus #8853 2295 otimes #8855 2297 perp #8869 22A5 sdot #8901 22C5 lceil #8968 2308 "
+ "rceil #8969 2309 lfloor #8970 230A rfloor #8971 230B lang #9001 2329 rang #9002 232A loz #9674 25CA "
+ "spades #9824 2660 clubs #9827 2663 hearts #9829 2665 diams #9830 2666 "
+ "quot #34 0022 amp #38 0026 lt #60 003C gt #62 003E OElig #338 0152 oelig #339 0153 "
+ "Scaron #352 0160 Yuml #376 0178 circ #710 02C6 tilde #732 02DC ensp #8194 2002 emsp #8195 2003 "
+ "thinsp #8201 2009 zwnj #8204 200C zwj #8205 200D lrm #8206 200E rlm #8207 200F ndash #8211 2013 "
+ "mdash #8212 2014 lsquo #8216 2018 rsquo #8217 2019 sbquo #8218 201A ldquo #8220 201C rdquo #8221 201D "
+ "bdquo #8222 201E dagger #8224 2020 Dagger #8225 2021 permil #8240 2030 lsaquo #8249 2039 rsaquo #8250 203A "
+ "euro #8364 20AC";
/**
* value given by RAW_HTML_ENTITY_TABLE.hashCode(), used to guard against
* accidental modification
*/
private static final int RAW_HTML_ENTITY_TABLE_HASHCODE = -301953893;
/**
* mapping: HTML entity ---> Unicode character
*/
private static final Map<String, Character> HTML_ENTITY_TO_UNICODE_MAP = new HashMap<String, Character>();
private static final Map<String, String> HTML_ENTITY_TO_NUMERIC_MAP = new HashMap<String, String>();
/**
* mapping: Unicode character ---> HTML entity
*/
private static final Map<Character, String> UNICODE_TO_HTML_ENTITY_MAP = new HashMap<Character, String>();
/**
* Static initialization block. Populates HTML_ENTITY_TO_UNICODE_MAP and
* UNICODE_TO_HTML_ENTITY_MAP.
*/
static {
/* check hash code of RAW_HTML_ENTITY_TABLE */
if (RAW_HTML_ENTITY_TABLE.hashCode() != RAW_HTML_ENTITY_TABLE_HASHCODE) {
throw new RuntimeException("(INTERNAL) Malformed HtmlManipulator.RAW_HTML_ENTITY_TABLE.");
}
/* populate HTML entity <---> Unicode character maps */
final String[] elements = RAW_HTML_ENTITY_TABLE.split("[\\s]++");
for (int i = 0; i < elements.length; i += 3) {
final char unicode = (char) Integer.parseInt(elements[i + 2], 16);
HTML_ENTITY_TO_UNICODE_MAP.put(elements[i], unicode);
HTML_ENTITY_TO_UNICODE_MAP.put(elements[i + 1], unicode);
HTML_ENTITY_TO_NUMERIC_MAP.put(elements[i], elements[i + 1]);
UNICODE_TO_HTML_ENTITY_MAP.put(unicode, elements[i]);
}
}
/**
* Private constructor that should never be called.
*/
private HtmlManipulator() {
}
/**
* Replace HTML entities in a given string with their numeric
* representations.
*
* @param s input string
* @return string with HTML entities replaced
*/
public static String replaceHtmlEntities(final String s) {
final StringBuilder t = new StringBuilder();
for (int i = 0, n = s.length(); i < n; i++) {
final char c = s.charAt(i);
if (c == '&') {
/* candidate HTML entity */
final int j = s.indexOf(';', i);
if (j >= 0) {
//final Character unicode = HTML_ENTITY_TO_UNICODE_MAP.get(s.substring(i + 1, j));
final String numeric = HTML_ENTITY_TO_NUMERIC_MAP.get(s.substring(i + 1, j));
if (numeric != null) {
/* insert Unicode representation */
t.append("&" + numeric + ";");
i = j; /* advance index */
continue;
}
}
}
/* treat as a literal character */
t.append(c);
}
return t.toString();
}
/**
* Quote a specified string as HTML, by replacing all special characters
* with their equivalent HTML entities.
*
* @param s input string
* @return string with special characters replaced
*/
public static String quoteHtml(final String s) {
final StringBuilder t = new StringBuilder();
for (char c : s.toCharArray()) {
final String entity = UNICODE_TO_HTML_ENTITY_MAP.get(c);
if (entity == null) {
t.append(c);
}
else {
t.append('&');
t.append(entity);
t.append(';');
}
}
return t.toString();
}
}