/* * Copyright 2004-2011 H2 Group. Multiple-Licensed under the H2 License, * Version 1.0, and under the Eclipse Public License, Version 1.0 * (http://h2database.com/html/license.html). * Initial Developer: H2 Group */ package org.h2.build.indexer; import java.util.HashMap; /** * This class replaces HTML entities in text (for example ü) to the correct * character and vice versa. */ public class HtmlConverter { private static HashMap<String, Character> charMap = new HashMap<String, Character>(); private static HashMap<Character, String> codeMap = new HashMap<Character, String>(); private static final String[] CHARS = { "quot:34", "amp:38", "lt:60", "gt:62", "nbsp:160", "iexcl:161", "cent:162", "pound:163", "curren:164", "yen:165", "brvbar:166", "sect:167", "uml:168", "copy:169", "ordf:170", "laquo:171", "not:172", "shy:173", "reg:174", "macr:175", "deg:176", "plusmn:177", "sup2:178", "sup3:179", "acute:180", "micro:181", "para:182", "middot:183", "cedil:184", "sup1:185", "ordm:186", "raquo:187", "frac14:188", "frac12:189", "frac34:190", "iquest:191", "Agrave:192", "Aacute:193", "Acirc:194", "Atilde:195", "Auml:196", "Aring:197", "AElig:198", "Ccedil:199", "Egrave:200", "Eacute:201", "Ecirc:202", "Euml:203", "Igrave:204", "Iacute:205", "Icirc:206", "Iuml:207", "ETH:208", "Ntilde:209", "Ograve:210", "Oacute:211", "Ocirc:212", "Otilde:213", "Ouml:214", "times:215", "Oslash:216", "Ugrave:217", "Uacute:218", "Ucirc:219", "Uuml:220", "Yacute:221", "THORN:222", "szlig:223", "agrave:224", "aacute:225", "acirc:226", "atilde:227", "auml:228", "aring:229", "aelig:230", "ccedil:231", "egrave:232", "eacute:233", "ecirc:234", "euml:235", "igrave:236", "iacute:237", "icirc:238", "iuml:239", "eth:240", "ntilde:241", "ograve:242", "oacute:243", "ocirc:244", "otilde:245", "ouml:246", "divide:247", "oslash:248", "ugrave:249", "uacute:250", "ucirc:251", "uuml:252", "yacute:253", "thorn:254", "yuml:255", "Alpha:913", "alpha:945", "Beta:914", "beta:946", "Gamma:915", "gamma:947", "Delta:916", "delta:948", "Epsilon:917", "epsilon:949", "Zeta:918", "zeta:950", "Eta:919", "eta:951", "Theta:920", "theta:952", "Iota:921", "iota:953", "Kappa:922", "kappa:954", "Lambda:923", "lambda:955", "Mu:924", "mu:956", "Nu:925", "nu:957", "Xi:926", "xi:958", "Omicron:927", "omicron:959", "Pi:928", "pi:960", "Rho:929", "rho:961", "Sigma:931", "sigmaf:962", "sigma:963", "Tau:932", "tau:964", "Upsilon:933", "upsilon:965", "Phi:934", "phi:966", "Chi:935", "chi:967", "Psi:936", "psi:968", "Omega:937", "omega:969", "thetasym:977", "upsih:978", "piv:982", "forall:8704", "part:8706", "exist:8707", "empty:8709", "nabla:8711", "isin:8712", "notin:8713", "ni:8715", "prod:8719", "sum:8721", "minus:8722", "lowast:8727", "radic:8730", "prop:8733", "infin:8734", "ang:8736", "and:8743", "or:8744", "cap:8745", "cup:8746", "int:8747", "there4:8756", "sim:8764", "cong:8773", "asymp:8776", "ne:8800", "equiv:8801", "le:8804", "ge:8805", "sub:8834", "sup:8835", "nsub:8836", "sube:8838", "supe:8839", "oplus:8853", "otimes:8855", "perp:8869", "sdot:8901", "loz:9674", "lceil:8968", "rceil:8969", "lfloor:8970", "rfloor:8971", "lang:9001", "rang:9002", "larr:8592", "uarr:8593", "rarr:8594", "darr:8595", "harr:8596", "crarr:8629", "lArr:8656", "uArr:8657", "rArr:8658", "dArr:8659", "hArr:8660", "bull:8226", "prime:8242", "oline:8254", "frasl:8260", "weierp:8472", "image:8465", "real:8476", "trade:8482", "euro:8364", "alefsym:8501", "spades:9824", "clubs:9827", "hearts:9829", "diams:9830", "ensp:8194", "emsp:8195", "thinsp:8201", "zwnj:8204", "zwj:8205", "lrm:8206", "rlm:8207", "ndash:8211", "mdash:8212", "lsquo:8216", "rsquo:8217", "sbquo:8218", "ldquo:8220", "rdquo:8221", "bdquo:8222", "dagger:8224", "Dagger:8225", "hellip:8230", "permil:8240", "lsaquo:8249", "rsaquo:8250" }; private HtmlConverter() { // utility class } static { for (String token : CHARS) { int idx = token.indexOf(':'); String key = token.substring(0, idx); int ch = Integer.parseInt(token.substring(idx + 1)); Character character = Character.valueOf((char) ch); charMap.put(key, character); codeMap.put(character, key); } } /** * Convert a string to HTML by encoding all required characters. * * @param s the string * @return the HTML text */ public static String convertStringToHtml(String s) { if (s == null) { return null; } if (s.length() == 0) { return s; } StringBuilder buff = new StringBuilder(); for (int i = 0; i < s.length(); i++) { char ch = s.charAt(i); String token = codeMap.get(ch); if (token == null) { if (ch < 128) { buff.append(ch); } else { buff.append('&'); buff.append('#'); buff.append((int) ch); buff.append(';'); } } else { buff.append('&'); buff.append(token); buff.append(';'); } } return buff.toString(); } /** * Convert a HTML encoded text to a string. * * @param html the HTML text * @return the string */ public static String convertHtmlToString(String html) { if (html == null) { return null; } if (html.length() == 0) { return html; } if (html.indexOf('&') < 0) { return html; } StringBuilder buff = new StringBuilder(); for (int i = 0; i < html.length(); i++) { char ch = html.charAt(i); if (ch != '&') { buff.append(ch); continue; } int idx = html.indexOf(';', i + 1); if (idx < 0) { buff.append("???"); continue; } String key = html.substring(i + 1, idx); Character repl; if (key.startsWith("#")) { if (key.startsWith("#x")) { try { int code = Integer.parseInt(key.substring(2), 16); if (code < 0 || code > 0xffff) { repl = null; } else { repl = Character.valueOf((char) code); } } catch (NumberFormatException e) { repl = null; } } else { try { int code = Integer.parseInt(key.substring(1)); if (code < 0 || code > 0xffff) { repl = null; } else { repl = Character.valueOf((char) code); } } catch (NumberFormatException e) { repl = null; } } } else { repl = charMap.get(key); } if (repl == null) { buff.append("???" + key + "???"); continue; } buff.append(repl.charValue()); i = idx; } return buff.toString(); } }