package com.smartandroid.sa.tag.nodes; import java.io.IOException; import java.io.InputStream; import java.nio.charset.CharsetEncoder; import java.util.HashMap; import java.util.Map; import java.util.MissingResourceException; import java.util.Properties; import com.smartandroid.sa.tag.helper.StringUtil; import com.smartandroid.sa.tag.parser.Parser; /** * HTML entities, and escape routines. Source: <a href= * "http://www.w3.org/TR/html5/named-character-references.html#named-character-references" * >W3C HTML named character references</a>. */ public class Entities { public enum EscapeMode { /** * Restricted entities suitable for XHTML output: lt, gt, amp, and quot * only. */ xhtml(xhtmlByVal), /** Default HTML output entities. */ base(baseByVal), /** Complete HTML entities. */ extended(fullByVal); private Map<Character, String> map; EscapeMode(Map<Character, String> map) { this.map = map; } public Map<Character, String> getMap() { return map; } } private static final Map<String, Character> full; private static final Map<Character, String> xhtmlByVal; private static final Map<String, Character> base; private static final Map<Character, String> baseByVal; private static final Map<Character, String> fullByVal; private Entities() { } /** * Check if the input is a known named entity * * @param name * the possible entity name (e.g. "lt" or "amp") * @return true if a known named entity */ public static boolean isNamedEntity(String name) { return full.containsKey(name); } /** * Check if the input is a known named entity in the base entity set. * * @param name * the possible entity name (e.g. "lt" or "amp") * @return true if a known named entity in the base set * @see #isNamedEntity(String) */ public static boolean isBaseNamedEntity(String name) { return base.containsKey(name); } /** * Get the Character value of the named entity * @param name named entity (e.g. "lt" or "amp") * @return the Character value of the named entity (e.g. '<' or '&') */ public static Character getCharacterByName(String name) { return full.get(name); } static String escape(String string, Document.OutputSettings out) { StringBuilder accum = new StringBuilder(string.length() * 2); escape(accum, string, out, false, false, false); return accum.toString(); } // this method is ugly, and does a lot. but other breakups cause rescanning // and stringbuilder generations static void escape(StringBuilder accum, String string, Document.OutputSettings out, boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite) { boolean lastWasWhite = false; boolean reachedNonWhite = false; EscapeMode escapeMode = out.escapeMode(); CharsetEncoder encoder = out.encoder(); Map<Character, String> map = escapeMode.getMap(); final int length = string.length(); int codePoint; for (int offset = 0; offset < length; offset += Character .charCount(codePoint)) { codePoint = string.codePointAt(offset); if (normaliseWhite) { if (StringUtil.isWhitespace(codePoint)) { if ((stripLeadingWhite && !reachedNonWhite) || lastWasWhite) continue; accum.append(' '); lastWasWhite = true; continue; } else { lastWasWhite = false; reachedNonWhite = true; } } // surrogate pairs, split implementation for efficiency on single // char common case (saves creating strings, char[]): if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) { final char c = (char) codePoint; // html specific and required escapes: switch (c) { case '&': accum.append("&"); break; case 0xA0: if (escapeMode != EscapeMode.xhtml) accum.append(" "); else accum.append(c); break; case '<': if (!inAttribute) accum.append("<"); else accum.append(c); break; case '>': if (!inAttribute) accum.append(">"); else accum.append(c); break; case '"': if (inAttribute) accum.append("""); else accum.append(c); break; default: if (encoder.canEncode(c)) accum.append(c); else if (map.containsKey(c)) accum.append('&').append(map.get(c)).append(';'); else accum.append("&#x") .append(Integer.toHexString(codePoint)) .append(';'); } } else { final String c = new String(Character.toChars(codePoint)); if (encoder.canEncode(c)) accum.append(c); else accum.append("&#x").append(Integer.toHexString(codePoint)) .append(';'); } } } static String unescape(String string) { return unescape(string, false); } /** * Unescape the input string. * * @param string * @param strict * if "strict" (that is, requires trailing ';' char, otherwise * that's optional) * @return */ static String unescape(String string, boolean strict) { return Parser.unescapeEntities(string, strict); } // xhtml has restricted entities private static final Object[][] xhtmlArray = { { "quot", 0x00022 }, { "amp", 0x00026 }, { "lt", 0x0003C }, { "gt", 0x0003E } }; static { xhtmlByVal = new HashMap<Character, String>(); base = loadEntities("entities-base.properties"); // most common / // default baseByVal = toCharacterKey(base); full = loadEntities("entities-full.properties"); // extended and // overblown. fullByVal = toCharacterKey(full); for (Object[] entity : xhtmlArray) { Character c = Character.valueOf((char) ((Integer) entity[1]) .intValue()); xhtmlByVal.put(c, ((String) entity[0])); } } private static Map<String, Character> loadEntities(String filename) { Properties properties = new Properties(); Map<String, Character> entities = new HashMap<String, Character>(); try { InputStream in = Entities.class.getResourceAsStream(filename); properties.load(in); in.close(); } catch (IOException e) { throw new MissingResourceException( "Error loading entities resource: " + e.getMessage(), "Entities", filename); } for (Map.Entry entry : properties.entrySet()) { Character val = Character.valueOf((char) Integer.parseInt( (String) entry.getValue(), 16)); String name = (String) entry.getKey(); entities.put(name, val); } return entities; } private static Map<Character, String> toCharacterKey( Map<String, Character> inMap) { Map<Character, String> outMap = new HashMap<Character, String>(); for (Map.Entry<String, Character> entry : inMap.entrySet()) { Character character = entry.getValue(); String name = entry.getKey(); if (outMap.containsKey(character)) { // dupe, prefer the lower case version if (name.toLowerCase().equals(name)) outMap.put(character, name); } else { outMap.put(character, name); } } return outMap; } }