package org.rr.commons.utils;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Converts between plain text and text that contains HTML / XML character entities.
 * <p>
 * {@link #decodeEntities()} replaces named entities (for example <code>&amp;amp;euro;</code>) and numeric
 * entities (decimal <code>&amp;amp;#8364;</code> or hexadecimal <code>&amp;amp;#x20AC;</code>) with the
 * characters they stand for. {@link #encodeEntities()} does the opposite for every character that the
 * configured encode condition does not allow to appear literally.
 * <p>
 * Instances are mutable (text, encode condition and both flags can be changed through setters) and
 * perform no synchronization.
 */
public class HTMLEntityConverter implements Serializable {

    private static final long serialVersionUID = 5138995692645522618L;

    /**
     * The target/encoded string contains only seven bit ascii characters. All other characters get entity encoded.
     */
    public static final int ENCODE_SEVEN_BIT_ASCII = 0;

    /**
     * The target/encoded string contains only eight bit ascii characters. All other characters get entity encoded.
     */
    public static final int ENCODE_EIGHT_BIT_ASCII = 1;

    /**
     * The target/encoded string contains only seven bit ascii characters. Some special characters, needed for
     * the xml structure, and ISO control characters are encoded as well.
     */
    public static final int ENCODE_SEVEN_BIT_XML = 2;

    private static final Logger LOGGER = Logger.getLogger(HTMLEntityConverter.class.getName());

    /**
     * All characters of the seven bit ascii range which should not appear literally in a xml file:
     * equals sign, angle brackets, double quote, apostrophe (#39), grave accent (#96), caret (#94) and
     * ampersand (#38).
     */
    private static final String INVALID_ASCII7_XML_CHARACTERS = "=<>\"'`^&";

    /**
     * Contains the most common HTML4 entities (including the leading ampersand and the trailing semicolon)
     * as key with the referring unicode character as value.
     */
    private static final Map<String, Character> htmlEntities = new HashMap<String, Character>();

    /**
     * Reverse view of {@link #htmlEntities}: the unicode character is the key and the entity is the value.
     * If several entities refer to the same character, the one registered first wins.
     */
    private static final Map<Character, String> reverseHtmlEntities = new HashMap<Character, String>();

    static {
        // HTML symbols
        register("&quot;", '"'); // quotation mark
        register("&amp;", '&'); // ampersand
        register("&lt;", '<'); // less-than sign
        register("&gt;", '>'); // greater-than sign

        // Spacing modifier letters. HTML 4.01 defines these as U+02C6 / U+02DC,
        // not as the ascii caret and tilde.
        register("&circ;", '\u02c6'); // modifier letter circumflex accent
        register("&tilde;", '\u02dc'); // small tilde

        // XML predefined entity (problematic character for xml)
        register("&apos;", '\'');

        // ISO 8859-1 symbols
        register("&nbsp;", '\u00a0'); // no-break space
        register("&iexcl;", '\u00a1'); // inverted exclamation mark
        register("&cent;", '\u00a2'); // cent sign
        register("&pound;", '\u00a3'); // pound sign
        register("&curren;", '\u00a4'); // currency sign
        register("&yen;", '\u00a5'); // yen sign
        register("&brvbar;", '\u00a6'); // broken bar
        register("&sect;", '\u00a7'); // section sign
        register("&uml;", '\u00a8'); // diaeresis
        register("&copy;", '\u00a9'); // copyright sign
        register("&ordf;", '\u00aa'); // feminine ordinal indicator
        register("&laquo;", '\u00ab'); // left-pointing double angle quotation mark
        register("&not;", '\u00ac'); // not sign
        register("&shy;", '\u00ad'); // soft hyphen
        register("&reg;", '\u00ae'); // registered sign
        register("&macr;", '\u00af'); // macron
        register("&deg;", '\u00b0'); // degree sign
        register("&plusmn;", '\u00b1'); // plus-minus sign
        register("&sup2;", '\u00b2'); // superscript two
        register("&sup3;", '\u00b3'); // superscript three
        register("&acute;", '\u00b4'); // acute accent
        register("&micro;", '\u00b5'); // micro sign
        register("&para;", '\u00b6'); // pilcrow sign
        register("&middot;", '\u00b7'); // middle dot
        register("&cedil;", '\u00b8'); // cedilla
        register("&sup1;", '\u00b9'); // superscript one
        register("&ordm;", '\u00ba'); // masculine ordinal indicator
        register("&raquo;", '\u00bb'); // right-pointing double angle quotation mark
        register("&frac14;", '\u00bc'); // one quarter
        register("&frac12;", '\u00bd'); // one half
        register("&frac34;", '\u00be'); // three quarters
        register("&iquest;", '\u00bf'); // inverted question mark
        register("&Agrave;", '\u00c0'); // A with grave
        register("&Aacute;", '\u00c1'); // A with acute
        register("&Acirc;", '\u00c2'); // A with circumflex
        register("&Atilde;", '\u00c3'); // A with tilde
        register("&Auml;", '\u00c4'); // A with diaeresis
        register("&Aring;", '\u00c5'); // A with ring above
        register("&AElig;", '\u00c6'); // AE ligature
        register("&Ccedil;", '\u00c7'); // C with cedilla
        register("&Egrave;", '\u00c8'); // E with grave
        register("&Eacute;", '\u00c9'); // E with acute
        register("&Ecirc;", '\u00ca'); // E with circumflex
        register("&Euml;", '\u00cb'); // E with diaeresis
        register("&Igrave;", '\u00cc'); // I with grave
        register("&Iacute;", '\u00cd'); // I with acute
        register("&Icirc;", '\u00ce'); // I with circumflex
        register("&Iuml;", '\u00cf'); // I with diaeresis
        register("&ETH;", '\u00d0'); // capital eth (icelandic)
        register("&Ntilde;", '\u00d1'); // N with tilde
        register("&Ograve;", '\u00d2'); // O with grave
        register("&Oacute;", '\u00d3'); // O with acute
        register("&Ocirc;", '\u00d4'); // O with circumflex
        register("&Otilde;", '\u00d5'); // O with tilde
        register("&Ouml;", '\u00d6'); // O with diaeresis
        register("&times;", '\u00d7'); // multiplication sign
        register("&Oslash;", '\u00d8'); // O with stroke
        register("&Ugrave;", '\u00d9'); // U with grave
        register("&Uacute;", '\u00da'); // U with acute
        register("&Ucirc;", '\u00db'); // U with circumflex
        register("&Uuml;", '\u00dc'); // U with diaeresis
        register("&THORN;", '\u00de'); // capital thorn (icelandic)
        register("&szlig;", '\u00df'); // sharp s
        register("&agrave;", '\u00e0'); // a with grave
        register("&aacute;", '\u00e1'); // a with acute
        register("&acirc;", '\u00e2'); // a with circumflex
        register("&atilde;", '\u00e3'); // a with tilde
        register("&auml;", '\u00e4'); // a with diaeresis
        register("&aring;", '\u00e5'); // a with ring above
        register("&aelig;", '\u00e6'); // ae ligature
        register("&ccedil;", '\u00e7'); // c with cedilla
        register("&egrave;", '\u00e8'); // e with grave
        register("&eacute;", '\u00e9'); // e with acute
        register("&ecirc;", '\u00ea'); // e with circumflex
        register("&euml;", '\u00eb'); // e with diaeresis
        register("&igrave;", '\u00ec'); // i with grave
        register("&iacute;", '\u00ed'); // i with acute
        register("&icirc;", '\u00ee'); // i with circumflex
        register("&iuml;", '\u00ef'); // i with diaeresis
        register("&eth;", '\u00f0'); // small eth (icelandic)
        register("&ntilde;", '\u00f1'); // n with tilde
        register("&ograve;", '\u00f2'); // o with grave
        register("&oacute;", '\u00f3'); // o with acute
        register("&ocirc;", '\u00f4'); // o with circumflex
        register("&otilde;", '\u00f5'); // o with tilde
        register("&ouml;", '\u00f6'); // o with diaeresis
        register("&divide;", '\u00f7'); // division sign
        register("&oslash;", '\u00f8'); // o with stroke
        register("&ugrave;", '\u00f9'); // u with grave
        register("&uacute;", '\u00fa'); // u with acute
        register("&ucirc;", '\u00fb'); // u with circumflex
        register("&uuml;", '\u00fc'); // u with diaeresis
        register("&yacute;", '\u00fd'); // y with acute
        register("&Yacute;", '\u00dd'); // Y with acute
        register("&thorn;", '\u00fe'); // small thorn (icelandic)
        register("&yuml;", '\u00ff'); // y with diaeresis
        register("&Yuml;", '\u0178'); // Y with diaeresis

        // Greek symbols
        register("&Alpha;", '\u0391');
        register("&alpha;", '\u03B1');
        register("&Beta;", '\u0392');
        register("&beta;", '\u03B2');
        register("&Gamma;", '\u0393');
        register("&gamma;", '\u03B3');
        register("&Delta;", '\u0394');
        register("&delta;", '\u03B4');
        register("&Epsilon;", '\u0395');
        register("&epsilon;", '\u03B5');
        register("&Zeta;", '\u0396');
        register("&zeta;", '\u03B6');
        register("&Eta;", '\u0397');
        register("&eta;", '\u03B7');
        register("&Theta;", '\u0398');
        register("&theta;", '\u03B8');
        register("&Iota;", '\u0399');
        register("&iota;", '\u03B9');
        register("&Kappa;", '\u039A');
        register("&kappa;", '\u03BA');
        register("&Lambda;", '\u039B');
        register("&lambda;", '\u03BB');
        register("&Mu;", '\u039C');
        register("&mu;", '\u03BC');
        register("&Nu;", '\u039D');
        register("&nu;", '\u03BD');
        register("&Xi;", '\u039E');
        register("&xi;", '\u03BE');
        register("&Omicron;", '\u039F');
        register("&omicron;", '\u03BF');
        register("&Pi;", '\u03A0');
        register("&pi;", '\u03C0');
        register("&Rho;", '\u03A1');
        register("&rho;", '\u03C1');
        register("&sigmaf;", '\u03C2'); // final sigma
        register("&Sigma;", '\u03A3');
        register("&sigma;", '\u03C3');
        register("&Tau;", '\u03A4');
        register("&tau;", '\u03C4');
        register("&Upsilon;", '\u03A5');
        register("&upsilon;", '\u03C5');
        register("&Phi;", '\u03A6');
        register("&phi;", '\u03C6');
        register("&Chi;", '\u03A7');
        register("&chi;", '\u03C7');
        register("&Psi;", '\u03A8');
        register("&psi;", '\u03C8');
        register("&Omega;", '\u03A9');
        register("&omega;", '\u03C9');
        register("&thetasym;", '\u03D1'); // theta symbol
        register("&upsih;", '\u03D2'); // upsilon with hook symbol
        register("&piv;", '\u03D6'); // greek pi symbol

        // Mathematical symbols
        register("&forall;", '\u2200'); // for all
        register("&part;", '\u2202'); // partial differential
        register("&exist;", '\u2203'); // there exists
        register("&empty;", '\u2205'); // empty set = null set = diameter
        register("&nabla;", '\u2207'); // nabla = backward difference
        register("&isin;", '\u2208'); // element of
        register("&notin;", '\u2209'); // not an element of
        register("&ni;", '\u220B'); // contains as member
        register("&prod;", '\u220F'); // n-ary product = product sign
        register("&sum;", '\u2211'); // n-ary summation
        register("&minus;", '\u2212'); // minus sign
        register("&lowast;", '\u2217'); // asterisk operator
        register("&radic;", '\u221A'); // square root = radical sign
        register("&prop;", '\u221D'); // proportional to
        register("&infin;", '\u221E'); // infinity
        register("&ang;", '\u2220'); // angle
        register("&and;", '\u2227'); // logical and
        register("&or;", '\u2228'); // logical or
        register("&cap;", '\u2229'); // intersection
        register("&cup;", '\u222A'); // union
        register("&int;", '\u222B'); // integral
        register("&there4;", '\u2234'); // therefore
        register("&sim;", '\u223C'); // tilde operator = varies with = similar to
        register("&cong;", '\u2245'); // approximately equal to
        register("&asymp;", '\u2248'); // almost equal to
        register("&ne;", '\u2260'); // not equal to
        register("&equiv;", '\u2261'); // identical to
        register("&le;", '\u2264'); // less-than or equal to
        register("&ge;", '\u2265'); // greater-than or equal to
        register("&sub;", '\u2282'); // subset of
        register("&sup;", '\u2283'); // superset of
        register("&nsub;", '\u2284'); // not a subset of
        register("&sube;", '\u2286'); // subset of or equal to
        register("&supe;", '\u2287'); // superset of or equal to
        register("&oplus;", '\u2295'); // circled plus = direct sum
        register("&otimes;", '\u2297'); // circled times = vector product
        register("&perp;", '\u22A5'); // up tack = orthogonal to = perpendicular
        register("&sdot;", '\u22C5'); // dot operator
        register("&loz;", '\u25CA'); // lozenge
        register("&fnof;", '\u0192'); // latin small f with hook = function = florin

        // General punctuation
        register("&bull;", '\u2022'); // bullet = black small circle
        register("&hellip;", '\u2026'); // horizontal ellipsis = three dot leader
        register("&prime;", '\u2032'); // prime = minutes = feet
        register("&Prime;", '\u2033'); // double prime = seconds = inches
        register("&oline;", '\u203E'); // overline = spacing overscore
        register("&frasl;", '\u2044'); // fraction slash

        // Letterlike symbols
        register("&weierp;", '\u2118'); // script capital P = power set = Weierstrass p
        register("&image;", '\u2111'); // blackletter capital I = imaginary part
        register("&real;", '\u211C'); // blackletter capital R = real part symbol
        register("&trade;", '\u2122'); // trade mark sign
        register("&alefsym;", '\u2135'); // alef symbol = first transfinite cardinal
        register("&euro;", '\u20ac'); // euro currency

        // Arrow symbols
        register("&larr;", '\u2190');
        register("&uarr;", '\u2191');
        register("&rarr;", '\u2192');
        register("&darr;", '\u2193');
        register("&harr;", '\u2194');
        register("&crarr;", '\u21b5');
        register("&lArr;", '\u21d0');
        register("&uArr;", '\u21d1');
        register("&rArr;", '\u21d2');
        register("&dArr;", '\u21d3');
        register("&hArr;", '\u21d4');

        // Miscellaneous symbols
        register("&spades;", '\u2660'); // black spade suit
        register("&clubs;", '\u2663'); // black club suit = shamrock
        register("&hearts;", '\u2665'); // black heart suit = valentine
        register("&diams;", '\u2666'); // black diamond suit

        // Miscellaneous technical
        register("&lceil;", '\u2308'); // left ceiling = apl upstile
        register("&rceil;", '\u2309'); // right ceiling
        register("&lfloor;", '\u230A'); // left floor = apl downstile
        register("&rfloor;", '\u230B'); // right floor
        register("&lang;", '\u2329'); // left-pointing angle bracket = bra
        register("&rang;", '\u232A'); // right-pointing angle bracket = ket

        // Named characters, latin extended
        register("&OElig;", '\u0152'); // capital OE ligature
        // Misspelled name emitted by earlier versions of this class; kept so such text still decodes.
        // Registered after the correct name, so encoding always produces "&OElig;".
        register("&Oelig;", '\u0152');
        register("&oelig;", '\u0153'); // small oe ligature
        register("&Scaron;", '\u0160'); // S with caron
        register("&scaron;", '\u0161'); // s with caron

        // Named characters for punctuation
        register("&lsquo;", '\u2018'); // left single quotation mark (decimal 8216)
        register("&rsquo;", '\u2019'); // right single quotation mark (decimal 8217)
        register("&ldquo;", '\u201C'); // left double quotation mark (decimal 8220)
        register("&rdquo;", '\u201D'); // right double quotation mark (decimal 8221)
        register("&ensp;", '\u2002'); // en space
        register("&emsp;", '\u2003'); // em space
        register("&thinsp;", '\u2009'); // thin space
        register("&zwnj;", '\u200C'); // zero width non-joiner
        register("&zwj;", '\u200D'); // zero width joiner
        register("&lrm;", '\u200E'); // left-to-right mark
        register("&rlm;", '\u200F'); // right-to-left mark
        register("&ndash;", '\u2013'); // en dash
        register("&mdash;", '\u2014'); // em dash
        register("&sbquo;", '\u201A'); // single low-9 quotation mark
        register("&bdquo;", '\u201E'); // double low-9 quotation mark
        register("&dagger;", '\u2020'); // dagger
        register("&Dagger;", '\u2021'); // double dagger
        register("&permil;", '\u2030'); // per mille sign
        register("&lsaquo;", '\u2039'); // single left-pointing angle quotation mark
        register("&rsaquo;", '\u203A'); // single right-pointing angle quotation mark
    }

    private String text;

    private int encodeCondition;

    private boolean useNamedEntities = false;

    private boolean reencodeEntities = false;

    /**
     * This constructor provides the minimum of parameters needed.
     *
     * @param text The text to be encoded / decoded.
     * @param encodeCondition The encode condition, one of
     *        <code>ENCODE_SEVEN_BIT_ASCII, ENCODE_EIGHT_BIT_ASCII, ENCODE_SEVEN_BIT_XML</code>.
     */
    public HTMLEntityConverter(String text, int encodeCondition) {
        this.text = text;
        this.encodeCondition = encodeCondition;
    }

    /**
     * Adds one named entity to both lookup tables. A warning is logged (but the entry is still stored)
     * if the entity does not start with an ampersand or does not end with a semicolon.
     *
     * @param entity The complete entity, for example <code>&amp;amp;euro;</code>.
     * @param character The character the entity stands for.
     */
    private static void register(final String entity, final char character) {
        if (!entity.startsWith("&") || !entity.endsWith(";")) {
            LOGGER.log(Level.WARNING, "Bad Key in init htmlEntities '" + (entity) + "'");
        }
        final Character boxed = Character.valueOf(character);
        htmlEntities.put(entity, boxed);
        if (!reverseHtmlEntities.containsKey(boxed)) {
            // first registration wins, later ones are decode-only aliases
            reverseHtmlEntities.put(boxed, entity);
        }
    }

    /**
     * Tells if the given character is an ascii digit of the given radix (10 or 16).
     */
    private static boolean isAsciiDigit(final char c, final int radix) {
        return c < 128 && Character.digit(c, radix) >= 0;
    }

    /**
     * Tells if the given character is an ascii letter or an ascii digit.
     */
    private static boolean isAsciiLetterOrDigit(final char c) {
        return c < 128 && Character.isLetterOrDigit(c);
    }

    /**
     * Scans a syntactically well formed entity reference which starts at the ampersand at <code>start</code>.
     * Accepted shapes are a named reference (ampersand, ascii letters / digits, semicolon), a decimal
     * reference (ampersand, hash, decimal digits, semicolon) and a hexadecimal reference (ampersand, hash,
     * <code>x</code> or <code>X</code>, hex digits, semicolon). The scan stops at the first character which
     * does not fit, so it never runs further than the reference itself.
     *
     * @param text The text to be scanned.
     * @param start The index of the ampersand.
     * @return The index of the terminating semicolon or <code>-1</code> if there is no well formed reference.
     */
    private static int findEntityEnd(final String text, final int start) {
        final int length = text.length();
        int pos = start + 1;
        final int bodyStart;
        if (pos < length && text.charAt(pos) == '#') {
            pos++;
            int radix = 10;
            if (pos < length && (text.charAt(pos) == 'x' || text.charAt(pos) == 'X')) {
                radix = 16;
                pos++;
            }
            bodyStart = pos;
            while (pos < length && isAsciiDigit(text.charAt(pos), radix)) {
                pos++;
            }
        } else {
            bodyStart = pos;
            while (pos < length && isAsciiLetterOrDigit(text.charAt(pos))) {
                pos++;
            }
        }
        final boolean terminated = pos > bodyStart && pos < length && text.charAt(pos) == ';';
        return terminated ? pos : -1;
    }

    /**
     * Decodes a well formed numeric entity (as accepted by {@link #findEntityEnd(String, int)}).
     * Code points above U+FFFF are returned as a surrogate pair.
     *
     * @param entity The complete numeric entity including ampersand, hash and semicolon.
     * @return The decoded character(s) or <code>null</code> if the number is not a valid unicode code point.
     */
    private static String decodeNumericEntity(final String entity) {
        final boolean hex = entity.charAt(2) == 'x' || entity.charAt(2) == 'X';
        final String digits = entity.substring(hex ? 3 : 2, entity.length() - 1);
        try {
            final int codePoint = Integer.parseInt(digits, hex ? 16 : 10);
            if (!Character.isValidCodePoint(codePoint)) {
                return null;
            }
            return new String(Character.toChars(codePoint));
        } catch (NumberFormatException e) {
            // only digits get here, so this is a value which does not fit into an int: not decodable
            return null;
        }
    }

    /**
     * Resolves a well formed entity reference to the text it stands for.
     *
     * @param entity The complete entity including ampersand and semicolon.
     * @return The replacement or <code>null</code> if the entity is unknown or out of range.
     */
    private static String resolveEntity(final String entity) {
        if (entity.charAt(1) == '#') {
            return decodeNumericEntity(entity);
        }
        final Character named = htmlEntities.get(entity);
        return named == null ? null : named.toString();
    }

    /**
     * Decodes all known named entities and all numeric entities (decimal and hexadecimal) in the text of
     * this instance and replaces them with the characters they stand for. Everything which is not a
     * decodable entity (unknown names, malformed or out of range numbers, a lone ampersand) is copied
     * unchanged, so one bad entity does not prevent the others from being decoded. Does not throw.
     *
     * @return The processed <code>String</code>. If the text is <code>null</code>, <code>null</code> is
     *         returned.
     */
    public String decodeEntities() {
        if (text == null) {
            return null;
        }
        final StringBuilder result = new StringBuilder(text.length());
        int pos = 0;
        while (pos < text.length()) {
            final char current = text.charAt(pos);
            if (current == '&') {
                final int end = findEntityEnd(text, pos);
                if (end != -1) {
                    final String replacement = resolveEntity(text.substring(pos, end + 1));
                    if (replacement != null) {
                        result.append(replacement);
                        pos = end + 1; // skip the whole entity
                        continue;
                    }
                }
            }
            // not part of a decodable entity, just take it over
            result.append(current);
            pos++;
        }
        return result.toString();
    }

    /**
     * Exchanges all chars of the text which are not allowed by the encode condition of this instance
     * (not a member of seven or eight bit ascii or problematic for xml processing) with an entity.
     * <ul>
     * <li>Every whitespace / space character (including line breaks, tabs and the no-break space) is
     * written as one plain space.</li>
     * <li>An ampersand which starts a numeric entity (decimal or hexadecimal) or a known named entity is
     * taken over together with that entity, unless {@link #isReencodeEntities()} is <code>true</code>.</li>
     * <li>A surrogate pair is written as one numeric entity carrying the full code point.</li>
     * </ul>
     *
     * @param useNamedEntities Tells if a named entity should be preferred where one is known. For example
     *        the euro sign becomes <code>&amp;amp;euro;</code> instead of <code>&amp;amp;#8364;</code>.
     * @return The encoded text. If the text is <code>null</code>, <code>null</code> is returned.
     */
    public String encodeEntities(final boolean useNamedEntities) {
        if (text == null) {
            return null;
        }
        final StringBuilder result = new StringBuilder(text.length() + 16);
        int pos = 0;
        while (pos < text.length()) {
            final char current = text.charAt(pos);
            if (Character.isWhitespace(current) || Character.isSpaceChar(current)) {
                result.append(' ');
                pos++;
                continue;
            }
            if (!shouldEncodeCharacter(current, encodeCondition)) {
                result.append(current);
                pos++;
                continue;
            }
            if (current == '&' && !isReencodeEntities()) {
                // skip already encoded entities
                final String existing = isEntity(text, pos);
                if (existing != null) {
                    result.append(existing);
                    pos += existing.length();
                    continue;
                }
            }

            int codePoint = current;
            int consumed = 1;
            if (Character.isHighSurrogate(current) && pos + 1 < text.length()
                    && Character.isLowSurrogate(text.charAt(pos + 1))) {
                // a supplementary character must be written as one entity, not as two surrogate entities
                codePoint = Character.toCodePoint(current, text.charAt(pos + 1));
                consumed = 2;
            }

            String entity = null;
            if (useNamedEntities && consumed == 1) {
                // take a look if there is a named entity available for the current character
                entity = reverseHtmlEntities.get(Character.valueOf(current));
            }
            if (entity == null) {
                entity = "&#" + codePoint + ";";
            }
            result.append(entity);
            pos += consumed;
        }
        return result.toString();
    }

    /**
     * Determines if there is an entity in the given <code>text</code> at the given <code>idx</code>
     * (position). Numeric entities (decimal and hexadecimal) are accepted as soon as they are well formed,
     * named entities must be known.
     *
     * @param text The text to be tested.
     * @param idx The index where the entity should start.
     * @return The identified entity or <code>null</code> if there is no entity at this position.
     */
    private static String isEntity(String text, int idx) {
        if (text.charAt(idx) != '&') {
            return null;
        }
        final int end = findEntityEnd(text, idx);
        if (end == -1) {
            return null;
        }
        final String candidate = text.substring(idx, end + 1);
        if (candidate.charAt(1) == '#' || htmlEntities.containsKey(candidate)) {
            return candidate;
        }
        return null;
    }

    /**
     * Same as {@link #encodeEntities(boolean)}, using the value of {@link #isUseNamedEntities()}.
     *
     * @return The encoded text. If the text is <code>null</code>, <code>null</code> is returned.
     */
    public String encodeEntities() {
        return encodeEntities(this.isUseNamedEntities());
    }

    /**
     * Tells if the given character should be encoded or not.
     *
     * @param c The character.
     * @param encodeCondition One of {@link #ENCODE_SEVEN_BIT_ASCII}, {@link #ENCODE_EIGHT_BIT_ASCII} or
     *        {@link #ENCODE_SEVEN_BIT_XML}. For any other value nothing gets encoded.
     * @return <code>true</code> if the char should be encoded and <code>false</code> otherwise.
     */
    private static boolean shouldEncodeCharacter(final char c, final int encodeCondition) {
        switch (encodeCondition) {
        case ENCODE_SEVEN_BIT_ASCII:
            return c > 127;
        case ENCODE_EIGHT_BIT_ASCII:
            return c > 255;
        case ENCODE_SEVEN_BIT_XML:
            return c > 127 || INVALID_ASCII7_XML_CHARACTERS.indexOf(c) >= 0 || Character.isISOControl(c);
        default:
            return false;
        }
    }

    /**
     * @see #setEncodeCondition(int)
     */
    public int getEncodeCondition() {
        return encodeCondition;
    }

    /**
     * Sets the encode condition.
     *
     * @param encodeCondition <code>ENCODE_SEVEN_BIT_ASCII, ENCODE_EIGHT_BIT_ASCII, ENCODE_SEVEN_BIT_XML</code>
     */
    public void setEncodeCondition(int encodeCondition) {
        this.encodeCondition = encodeCondition;
    }

    /**
     * @see #setReencodeEntities(boolean)
     */
    public boolean isReencodeEntities() {
        return reencodeEntities;
    }

    /**
     * If there are already entities in the given text, reencode them or do not touch them.
     *
     * @param reencodeEntities <code>true</code> reencode them and <code>false</code> do not touch them.
     */
    public void setReencodeEntities(boolean reencodeEntities) {
        this.reencodeEntities = reencodeEntities;
    }

    /**
     * @see #setText(java.lang.String)
     */
    public String getText() {
        return text;
    }

    /**
     * Sets the text to be encoded / decoded.
     *
     * @param text The text to be encoded / decoded.
     */
    public void setText(String text) {
        this.text = text;
    }

    /**
     * @see #setUseNamedEntities(boolean)
     */
    public boolean isUseNamedEntities() {
        return useNamedEntities;
    }

    /**
     * Use named entities for the encoding process or not.
     *
     * @param useNamedEntities <code>true</code> for using named entities where one is known and
     *        <code>false</code> for always using the numeric ones.
     */
    public void setUseNamedEntities(boolean useNamedEntities) {
        this.useNamedEntities = useNamedEntities;
    }
}