// HTMLEntityConverter.java example
//
// Explorer
// jeboorker-master
package org.rr.commons.utils;

import static org.rr.commons.utils.StringUtil.EMPTY;

import java.io.Serializable;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

public class HTMLEntityConverter implements Serializable {

	private static final long serialVersionUID = 5138995692645522618L;
	
	// Input text processed by decodeEntities() and encodeEntities(boolean); both return null when it is null.
	private String text;
    // Encode condition supplied to the constructor; encodeEntities(boolean) passes it to shouldEncodeCharacer.
    private int encodeCondition;
    // Defaults to false. NOTE(review): encodeEntities(boolean) takes a parameter of the same name that shadows
    // this field; where the field itself is read is not visible in this part of the file.
    private boolean useNamedEntities = false;
    // Defaults to false. NOTE(review): presumably backs isReencodeEntities(), which encodeEntities(boolean)
    // consults before copying an already present entity - confirm against the accessor.
    private boolean reencodeEntities = false;

    /**
     * Creates a converter that works on the given text with the given encode condition.
     *
     * @param text
     *            The text to be encoded / decoded. <code>null</code> is accepted; the encode and decode methods
     *            then return <code>null</code>.
     * @param encodeCondition
     *            One of {@link #ENCODE_SEVEN_BIT_ASCII}, {@link #ENCODE_EIGHT_BIT_ASCII} or
     *            {@link #ENCODE_SEVEN_BIT_XML}.
     */
    public HTMLEntityConverter(String text, int encodeCondition) {
        this.encodeCondition = encodeCondition;
        this.text = text;
    }

    /**
     * Encode condition: the target/encoded string contains only seven bit ASCII characters. All other characters get entity encoded.
     */
    public static final int ENCODE_SEVEN_BIT_ASCII = 0;
    
    /**
     * Encode condition: the target/encoded string contains only eight bit ASCII characters. All other characters get entity encoded.
     */
    public static final int ENCODE_EIGHT_BIT_ASCII = 1;
    
    /**
     * Encode condition: the target/encoded string contains only seven bit ASCII characters. In addition, some special characters
     * that are needed for the XML structure get encoded as well.
     */
    public static final int ENCODE_SEVEN_BIT_XML = 2;
    
    /**
     * Contains all characters in the ascii seven area which should not appear in a xml file
     */
    private static final LinkedList<Character> invalidAscii7XMLCharacters = new LinkedList<Character>() {

		private static final long serialVersionUID = 8718029412734859537L;

		{
            add(Character.valueOf('='));
            add(Character.valueOf('<'));
            add(Character.valueOf('>'));
            add(Character.valueOf('\"'));
            add(Character.valueOf('\'')); // #39
            add(Character.valueOf('`')); // #96
            add(Character.valueOf('^')); // #94
            add(Character.valueOf('&')); // #38
        }
    };
    /**
     * Contains most common HTML4 entities as key with the referring unicode character
     */
    private static final HashMap<String, Character> htmlEntities = new HashMap<String, Character>() {

        private static final long serialVersionUID = -3493475860475518475L;

        @Override
        public Character put(String key, Character value) {
            if (!key.startsWith("&") || !key.endsWith(";")) {
                Logger.getLogger(this.getClass().getName()).log(Level.WARNING, "Bad Key in init htmlEntities '" + (key) + "'");
            }
            return super.put(key, value);
        }

        {
            // HTML Symbols
            put(""", Character.valueOf('\"')); // Anführungszeichen oben
            put("&", Character.valueOf('\u0026')); // Ampersand-Zeichen, kaufmännisches Und
            put("<", Character.valueOf('\u003c')); // öffnende spitze Klammer
            put(">", Character.valueOf('\u003e')); // schliessende spitze Klammer

            // Diakretic Smybols
            put("ˆ", Character.valueOf('\u005e')); // Zirkumflex
            put("˜", Character.valueOf('\u007e')); // Tilde

            // ISO 8859-1 Symbols
            put("'", Character.valueOf('\'')); // ''' xml problematic character
            put(" ", Character.valueOf('\u00a0')); // Erzwungenes Leerzeichen
            put("¡", Character.valueOf('\u00a1')); // umgekehrtes Ausrufezeichen
            put("¢", Character.valueOf('\u00a2')); // Cent-Zeichen
            put("£", Character.valueOf('\u00a3')); // Pfund-Zeichen
            put("¤", Character.valueOf('\u00a4')); // Währungszeichen
            put("¥", Character.valueOf('\u00a5')); // Yen-Zeichen
            put("¦", Character.valueOf('\u00a6')); // durchbrochener Strich
            put("§", Character.valueOf('\u00a7')); // Paragraph-Zeichen
            put("¨", Character.valueOf('\u00a8')); // Pünktchen oben
            put("©", Character.valueOf('\u00a9')); // Copyright-Zeichen
            put("ª", Character.valueOf('\u00aa')); // Ordinal-Zeichen weiblich
            put("«", Character.valueOf('\u00ab')); // angewinkelte Anführungszeichen links
            put("¬", Character.valueOf('\u00ac')); // Verneinungs-Zeichen
            put("", Character.valueOf('\u00ad')); // bedingter Trennstrich
            put("®", Character.valueOf('\u00ae')); // Registriermarke-Zeichen
            put("¯", Character.valueOf('\u00af')); // Überstrich
            put("°", Character.valueOf('\u00b0')); // Grad-Zeichen
            put("±", Character.valueOf('\u00b1')); // Plusminus-Zeichen
            put("²", Character.valueOf('\u00b2')); // Hoch-2-Zeichen
            put("³", Character.valueOf('\u00b3')); // Hoch-3-Zeichen
            put("´", Character.valueOf('\u00b4')); // Akut-Zeichen
            put("µ", Character.valueOf('\u00b5')); // Mikro-Zeichen
            put("¶", Character.valueOf('\u00b6')); // Absatz-Zeichen
            put("·", Character.valueOf('\u00b7')); // Mittelpunkt
            put("¸", Character.valueOf('\u00b8')); // Häkchen unten
            put("¹", Character.valueOf('\u00b9')); // Hoch-1-Zeichen
            put("º", Character.valueOf('\u00ba')); // Ordinal-Zeichen männlich
            put("»", Character.valueOf('\u00bb')); // angewinkelte Anführungszeichen rechts
            put("¼", Character.valueOf('\u00bc')); // ein Viertel
            put("½", Character.valueOf('\u00bd')); // ein Halb
            put("¾", Character.valueOf('\u00be')); // drei Viertel
            put("¿", Character.valueOf('\u00bf')); // umgekehrtes Fragezeichen
            put("À", Character.valueOf('\u00c0')); // A mit accent grave (Gravis)
            put("Á", Character.valueOf('\u00c1')); // A mit accent aigu (Akut)
            put("Â", Character.valueOf('\u00c2')); // A mit Zirkumflex
            put("Ã", Character.valueOf('\u00c3')); // A mit Tilde
            put("Ä", Character.valueOf('\u00c4')); // A Umlaut
            put("Å", Character.valueOf('\u00c5')); // A mit Ring
            put("Æ", Character.valueOf('\u00c6')); // A mit legiertem E
            put("Ç", Character.valueOf('\u00c7')); // C mit Häkchen
            put("È", Character.valueOf('\u00c8')); // E mit accent grave (Gravis)
            put("É", Character.valueOf('\u00c9')); // E mit accent aigu (Akut)
            put("Ê", Character.valueOf('\u00ca')); // E mit Zirkumflex
            put("Ë", Character.valueOf('\u00cb')); // E Umlaut
            put("Ì", Character.valueOf('\u00cc')); // I mit accent grave (Gravis)
            put("Í", Character.valueOf('\u00cd')); // I mit accent aigu (Akut)
            put("Î", Character.valueOf('\u00ce')); // I mit Zirkumflex
            put("Ï", Character.valueOf('\u00cf')); // I Umlaut
            put("Ð", Character.valueOf('\u00d0')); // grosses Eth (isländisch)
            put("Ñ", Character.valueOf('\u00d1')); // N mit Tilde
            put("Ò", Character.valueOf('\u00d2')); // O mit accent grave (Gravis)
            put("Ó", Character.valueOf('\u00d3')); // O mit accent aigu (Akut)
            put("Ô", Character.valueOf('\u00d4')); // O mit Zirkumflex
            put("Õ", Character.valueOf('\u00d5')); // O mit Tilde
            put("Ö", Character.valueOf('\u00d6')); // O Umlaut
            put("×", Character.valueOf('\u00d7')); // Mal-Zeichen
            put("Ø", Character.valueOf('\u00d8')); // O mit Schrägstrich
            put("Ù", Character.valueOf('\u00d9')); // U mit accent grave (Gravis)
            put("Ú", Character.valueOf('\u00da')); // U mit accent aigu (Akut)
            put("Û", Character.valueOf('\u00db')); // U mit Zirkumflex
            put("Ü", Character.valueOf('\u00dc')); // U Umlaut
            put("Þ", Character.valueOf('\u00de')); // grosses Thorn (isländisch)
            put("ß", Character.valueOf('\u00df')); // scharfes S
            put("à", Character.valueOf('\u00e0')); // a mit accent grave (Gravis)
            put("á", Character.valueOf('\u00e1')); // a mit accent aigu (Akut)
            put("â", Character.valueOf('\u00e2')); // a mit Zirkumflex
            put("ã", Character.valueOf('\u00e3')); // a mit Tilde
            put("ä", Character.valueOf('\u00e4')); // a Umlaut
            put("å", Character.valueOf('\u00e5')); // a mit Ring
            put("æ", Character.valueOf('\u00e6')); // a mit legiertem e
            put("ç", Character.valueOf('\u00e7')); // c mit Häkchen
            put("è", Character.valueOf('\u00e8')); // e mit accent grave (Gravis)
            put("é", Character.valueOf('\u00e9')); // e mit accent aigu (Akut)
            put("ê", Character.valueOf('\u00ea')); // e mit Zirkumflex
            put("ë", Character.valueOf('\u00eb')); // e Umlaut
            put("ì", Character.valueOf('\u00ec')); // i mit accent grave (Gravis)
            put("í", Character.valueOf('\u00ed')); // i mit accent aigu (Akut)
            put("î", Character.valueOf('\u00ee')); // i mit Zirkumflex
            put("ï", Character.valueOf('\u00ef')); // i Umlaut
            put("ð", Character.valueOf('\u00f0')); // kleines Eth (isländisch)
            put("ñ", Character.valueOf('\u00f1')); // n mit Tilde
            put("ò", Character.valueOf('\u00f2')); // o mit accent grave (Gravis)
            put("ó", Character.valueOf('\u00f3')); // o mit accent aigu (Akut)
            put("ô", Character.valueOf('\u00f4')); // o mit Zirkumflex
            put("õ", Character.valueOf('\u00f5')); // o mit Tilde
            put("ö", Character.valueOf('\u00f6')); // o Umlaut
            put("÷", Character.valueOf('\u00f7')); // Divisions-Zeichen
            put("ø", Character.valueOf('\u00f8')); // o mit Schrägstrich
            put("ù", Character.valueOf('\u00f9')); // u mit accent grave (Gravis)
            put("ú", Character.valueOf('\u00fa')); // u mit accent aigu (Akut)
            put("û", Character.valueOf('\u00fb')); // u mit Zirkumflex
            put("ü", Character.valueOf('\u00fc')); // u Umlaut
            put("ý", Character.valueOf('\u00fd')); // y mit accent aigu (Akut)
            put("Ý", Character.valueOf('\u00dd')); // Y mit accent aigu (Akut)
            put("þ", Character.valueOf('\u00fe')); // kleines Thorn (isländisch)
            put("ÿ", Character.valueOf('\u00ff')); // y Umlaut
            put("Ÿ", Character.valueOf('\u0178')); // Y Umlaut

            // Greek Symbols
            put("Α", Character.valueOf('\u0391')); // Alpha gross
            put("α", Character.valueOf('\u03B1')); // alpha klein
            put("Β", Character.valueOf('\u0392')); // Beta gross
            put("β", Character.valueOf('\u03B2')); // Beta klein
            put("Γ", Character.valueOf('\u0393')); // Gamma gross
            put("γ", Character.valueOf('\u03B3')); // Gamme klein
            put("Δ", Character.valueOf('\u0394')); // Delta gross
            put("δ", Character.valueOf('\u03B4')); // Delta klein
            put("Ε", Character.valueOf('\u0395')); // Epsilon gross
            put("ε", Character.valueOf('\u03B5')); // Epsilon klein
            put("Ζ", Character.valueOf('\u0396')); // Epsilon gross
            put("ζ", Character.valueOf('\u03B6')); // Epsilon klein
            put("Η", Character.valueOf('\u0397')); // Eta gross
            put("η", Character.valueOf('\u03B7')); // Eta klein
            put("Θ", Character.valueOf('\u0398')); // Theta gross
            put("θ", Character.valueOf('\u03B8')); // Theta klein
            put("Ι", Character.valueOf('\u0399')); // Iota gross
            put("ι", Character.valueOf('\u03B9')); // Iota klein
            put("Κ", Character.valueOf('\u039A')); // Kappa gross
            put("κ", Character.valueOf('\u03BA')); // Kappa klein
            put("Λ", Character.valueOf('\u039B')); // Lambda gross
            put("λ", Character.valueOf('\u03BB')); // Lambda klein
            put("Μ", Character.valueOf('\u039C')); // Mu gross
            put("μ", Character.valueOf('\u03BC')); // Mu klein
            put("Ν", Character.valueOf('\u039D')); // Nu gross
            put("ν", Character.valueOf('\u03BD')); // Nu klein
            put("Ξ", Character.valueOf('\u039E')); // Xi gross
            put("ξ", Character.valueOf('\u03BE')); // Xi klein
            put("Ο", Character.valueOf('\u039F')); // Omicron gross
            put("ο", Character.valueOf('\u03BF')); // Omicron klein
            put("Π", Character.valueOf('\u03A0')); // Pi gross
            put("π", Character.valueOf('\u03C0')); // Pi klein
            put("Ρ", Character.valueOf('\u03A1')); // Rho gross
            put("ρ", Character.valueOf('\u03C1')); // Rho klein
            put("ς", Character.valueOf('\u03C2')); // Schluss-Sigma
            put("Σ", Character.valueOf('\u03A3')); // Sigma gross
            put("σ", Character.valueOf('\u03C3')); // Sigma klein
            put("Τ", Character.valueOf('\u03A4')); // Tau gross
            put("τ", Character.valueOf('\u03C4')); // Tau klein
            put("Υ", Character.valueOf('\u03A5')); // Upsilon gross
            put("υ", Character.valueOf('\u03C5')); // Upsilon klein
            put("Φ", Character.valueOf('\u03A6')); // Phi gross
            put("φ", Character.valueOf('\u03C6')); // Phi klein
            put("Χ", Character.valueOf('\u03A7')); // Chi gross
            put("χ", Character.valueOf('\u03C7')); // Chi klein
            put("Ψ", Character.valueOf('\u03A8')); // Psi gross
            put("ψ", Character.valueOf('\u03C8')); // Psi klein
            put("Ω", Character.valueOf('\u03A9')); // Omega gross
            put("ω", Character.valueOf('\u03C9')); // Omega klein
            put("ϑ", Character.valueOf('\u03D1'));// theta Symbol
            put("ϒ", Character.valueOf('\u03D2')); // ypsilon mit Haken
            put("ϖ", Character.valueOf('\u03D6')); // greek pi symbol

            // Mathematical Symbols
            put("∀", Character.valueOf('\u2200')); // for all
            put("∂", Character.valueOf('\u2202')); // partial differential
            put("∃", Character.valueOf('\u2203')); // there exists
            put("∅", Character.valueOf('\u2205')); // empty set = null set = diameter
            put("∇", Character.valueOf('\u2207')); // nabla = backward difference
            put("∈", Character.valueOf('\u2208')); // element of
            put("∉", Character.valueOf('\u2209')); // not an element of
            put("∋", Character.valueOf('\u220B')); // contains as member
            put("∏", Character.valueOf('\u220F')); // n-ary product = product sign
            put("∑", Character.valueOf('\u2211')); // n-ary sumation
            put("−", Character.valueOf('\u2212')); // minus sign
            put("∗", Character.valueOf('\u2217')); // asterisk operator
            put("√", Character.valueOf('\u221A')); // square root = radical sign
            put("∝", Character.valueOf('\u221D')); // proportional to
            put("∞", Character.valueOf('\u221E')); // infinity
            put("∠", Character.valueOf('\u2220')); // angle
            put("∧", Character.valueOf('\u2227')); // logical and
            put("∨", Character.valueOf('\u2228')); // logical or
            put("∩", Character.valueOf('\u2229')); // intersection
            put("∪", Character.valueOf('\u222A')); // union
            put("∫", Character.valueOf('\u222B')); // integral
            put("∴", Character.valueOf('\u2234')); // therefore
            put("∼", Character.valueOf('\u223C')); // tilde operator = varies with = similar to
            put("≅", Character.valueOf('\u2245')); // approximately equal to
            put("≈", Character.valueOf('\u2248')); // almost equal to
            put("≠", Character.valueOf('\u2260')); // not equal to
            put("≡", Character.valueOf('\u2261')); // identical to
            put("≤", Character.valueOf('\u2264')); // less-than or equal to
            put("≥", Character.valueOf('\u2265')); // greater-than or equal to
            put("⊂", Character.valueOf('\u2282')); // subset of
            put("⊃", Character.valueOf('\u2283')); // superset of
            put("⊄", Character.valueOf('\u2284')); // not a subset of
            put("⊆", Character.valueOf('\u2286')); // subset of or equal to
            put("⊇", Character.valueOf('\u2287')); // superset of or equal to
            put("⊕", Character.valueOf('\u2295')); // circled plus = direct sum
            put("⊗", Character.valueOf('\u2297')); // circled times = vector product
            put("⊥", Character.valueOf('\u22A5')); // up tack = orthogonal to = perpendicular
            put("⋅", Character.valueOf('\u22C5')); // dot operator
            put("◊", Character.valueOf('\u25CA')); // lozenge
            put("ƒ", Character.valueOf('\u0192')); // latin small f with hook = function = florin

            // General Punctuation
            put("•", Character.valueOf('\u2022')); // bullet = black small circle
            put("…", Character.valueOf('\u2026')); // horizontal ellipsis = three dot leader
            put("′", Character.valueOf('\u2032')); // prime = minutes = feet
            put("″", Character.valueOf('\u2033')); // double prime = seconds = inches
            put("‾", Character.valueOf('\u203E')); // overline = spacing overscore
            put("⁄", Character.valueOf('\u2044')); // fraction slash

            // Letterlike Symbols
            put("℘", Character.valueOf('\u2118')); // script capital P = power set = Weierstrass p
            put("ℑ", Character.valueOf('\u2111')); // blackletter capital I = imaginary part
            put("ℜ", Character.valueOf('\u211C')); // blackletter capital R = real part symbol
            put("™", Character.valueOf('\u2122')); // trade mark sign
            put("ℵ", Character.valueOf('\u2135')); // alef symbol = first transfinite cardinal
            put("€", Character.valueOf('\u20ac')); // euro currency

            // Arrow Symbols
            put("←", Character.valueOf('\u2190'));
            put("↑", Character.valueOf('\u2191'));
            put("→", Character.valueOf('\u2192'));
            put("↓", Character.valueOf('\u2193'));
            put("↔", Character.valueOf('\u2194'));
            put("↵", Character.valueOf('\u21b5'));
            put("⇐", Character.valueOf('\u21d0'));
            put("⇑", Character.valueOf('\u21d1'));
            put("⇒", Character.valueOf('\u21d2'));
            put("⇓", Character.valueOf('\u21d3'));
            put("⇔", Character.valueOf('\u21d4'));

            // Miscellaneous Symbols
            put("♠", Character.valueOf('\u2660')); // black spade suit
            put("♣", Character.valueOf('\u2663')); // black club suit = shamrock
            put("♥", Character.valueOf('\u2665')); // black heart suit = valentine
            put("♦", Character.valueOf('\u2666')); // black diamond suit

            // Miscellaneous Technical
            put("⌈", Character.valueOf('\u2308')); // left ceiling = apl upstile
            put("⌉", Character.valueOf('\u2309')); // right ceiling
            put("⌊", Character.valueOf('\u230A')); // left floor = apl downstile
            put("⌋", Character.valueOf('\u230B')); // right floor
            put("⟨", Character.valueOf('\u2329')); // left-pointing angle bracket = bra
            put("⟩", Character.valueOf('\u232A')); // right-pointing angle bracket = ket

            // Benannte Zeichen lateinisch erweitert
            put("&Oelig;", Character.valueOf('\u0152')); // OE-Ligatur
            put("œ", Character.valueOf('\u0153')); // OE-Ligatur klein
            put("Š", Character.valueOf('\u0160')); // S mit Hatschek (Caron)
            put("š", Character.valueOf('\u0161')); // s mit Hatschek (Caron)

            // Benannte Zeichen für Interpunktion
            put("‘", Character.valueOf('\u8216'));
            put("’", Character.valueOf('\u8217'));
            put("“", Character.valueOf('\u8220'));
            put("”", Character.valueOf('\u8221'));
            put(" ", Character.valueOf('\u2002'));
            put(" ", Character.valueOf('\u2003'));
            put(" ", Character.valueOf('\u2009'));
            put("‌", Character.valueOf('\u200C'));
            put("‍", Character.valueOf('\u200D'));
            put("‎", Character.valueOf('\u200E'));
            put("‏", Character.valueOf('\u200F'));
            put("–", Character.valueOf('\u2013'));
            put("—", Character.valueOf('\u2014'));
            put("‚", Character.valueOf('\u201A'));
            put("„", Character.valueOf('\u201E'));
            put("†", Character.valueOf('\u2020'));
            put("‡", Character.valueOf('\u2021'));
            put("‰", Character.valueOf('\u2030'));
            put("‹", Character.valueOf('\u2039'));
            put("›", Character.valueOf('\u203A'));

        }
    };
    /**
     * The inverse of the {@link #htmlEntities} map: the unicode character is the key and the HTML entity is the value.
     */
    private static final HashMap<Character, String> reverseHtmlEntities = new HashMap<Character, String>() {

        private static final long serialVersionUID = 94497908219310662L;

        {
            // Register every known entity under the character it stands for.
            for (Map.Entry<String, Character> known : htmlEntities.entrySet()) {
                final Character unicodeChar = known.getValue();
                final String entityName = known.getKey();
                put(unicodeChar, entityName);
            }
        }
    };

    /**
     * Determines the character for a given named HTML entity by looking it up in {@link #htmlEntities}. <br>
     * <br>
     * The entity can be something like {@code "&nbsp;"}. A missing leading '&amp;' or trailing ';' is tolerated;
     * surrounding whitespace is trimmed and the missing characters are added before the lookup. The lookup is
     * case sensitive.
     *
     * @param entity
     *            The entity to be converted into the associated character.
     * @return The associated character, or <code>null</code> if <code>entity</code> is <code>null</code>, empty or
     *         not a known entity.
     */
    private static Character getHTMLEntityCharacter(final String entity) {
        if (entity == null || entity.length() == 0) {
            return null;
        }

        // Bring the name into the "&name;" form that is used for the keys of the lookup table.
        String normalized = entity.trim();
        if (!normalized.startsWith("&")) {
            normalized = "&" + normalized;
        }
        if (!normalized.endsWith(";")) {
            normalized = normalized + ";";
        }

        return htmlEntities.get(normalized);
    }

    /**
     * Decodes a numeric character reference in decimal form like {@code "&#64;"} or in hexadecimal form like
     * {@code "&#x40;"} (the hex marker may be written as 'x' or 'X').
     * <p>
     * The referenced code point is converted directly, so code points above U+FFFF result in a two character
     * (surrogate pair) String instead of being cut down to a wrong character.
     *
     * @param str
     *            The complete reference to decode, including the leading {@code "&#"} and the trailing ';'.
     * @return The String holding the character the reference stands for.
     * @throws IllegalArgumentException
     *             if the reference could not be decoded, for example because it is <code>null</code>, has no
     *             digits, contains something else than digits or names a value outside the unicode range.
     */
    private static String decodeNumericEntity(final String str) {
        try {
            final char marker = str.charAt(2);
            final boolean isHex = marker == 'x' || marker == 'X';

            // Cut the "&#" or "&#x" prefix and the trailing ';' away.
            final String digits = str.substring(isHex ? 3 : 2, str.length() - 1);
            if (digits.length() == 0 || digits.charAt(0) == '+' || digits.charAt(0) == '-') {
                // Integer.parseInt would accept a sign, a character reference must not have one.
                throw new IllegalArgumentException("not a number: '" + digits + "'");
            }

            final int codePoint = Integer.parseInt(digits, isHex ? 16 : 10);
            if (!Character.isValidCodePoint(codePoint)) {
                throw new IllegalArgumentException("code point out of range: " + codePoint);
            }
            return new String(Character.toChars(codePoint));
        } catch (RuntimeException e) {
            // One exception type and message for every kind of malformed input (null, too short, bad digits).
            throw new IllegalArgumentException("IllegalUnicodeSequence " + str, e);
        }
    }

    /**
     * Builds a String that consists of one character repeated a number of times.
     *
     * @param size
     *            Number of recurrences. For values of zero or less the empty String is returned.
     * @param repeat
     *            The character that should be repeated.
     * @return A String with the specified number of repeating characters.
     */
    private static final String string(final int size, final char repeat) {
        if (size > 0) {
            final char[] filled = new char[size];
            for (int pos = filled.length - 1; pos >= 0; pos--) {
                filled[pos] = repeat;
            }
            return new String(filled);
        }
        return EMPTY;
    }

    /**
     * Decodes a hexadecimal number given as String into the character with that code.
     * <p>
     * All digits of the given String are evaluated, so leading zeros are allowed. An empty String results in
     * the character with the code 0. A value that does not fit into a single <code>char</code> (above FFFF) is
     * rejected instead of being cut down to its first four digits.
     *
     * @param s
     *            The hexadecimal digits (0-9, a-f, A-F) to be decoded, for example <code>"40"</code> or
     *            <code>"20AC"</code>.
     * @return The decoded unicode character.
     * @throws IllegalArgumentException
     *             if the String contains a character that is not a hexadecimal digit or if the value is larger
     *             than FFFF.
     */
    private static char decodeNumericUnicodeSequence(final String s) {
        int value = 0;
        for (int i = 0; i < s.length(); i++) {
            final char aChar = s.charAt(i);
            final int digit;
            if (aChar >= '0' && aChar <= '9') {
                digit = aChar - '0';
            } else if (aChar >= 'a' && aChar <= 'f') {
                digit = 10 + aChar - 'a';
            } else if (aChar >= 'A' && aChar <= 'F') {
                digit = 10 + aChar - 'A';
            } else {
                throw new IllegalArgumentException("IllegalUnicodeSequence " + s);
            }

            value = (value << 4) + digit;
            // Checked after every digit, which also keeps the int from overflowing on very long input.
            if (value > Character.MAX_VALUE) {
                throw new IllegalArgumentException("IllegalUnicodeSequence " + s);
            }
        }
        return (char) value;
    }

    /**
     * Creates the upper case hexadecimal representation of a number.
     *
     * A number that is not a whole number is rounded to the nearest whole number first. The rounded number is
     * then converted to an <code>int</code> before it is formatted.
     *
     * @param value
     *            The number to be used for calculation.
     * @return The hex <code>String</code> from the given number.
     */
    private static final String hex(double value) {
        final double wholeNumber = round(value, 0);
        final String lowerCaseHex = Integer.toHexString((int) wholeNumber);
        return lowerCaseHex.toUpperCase();
    }

    /**
     * Rounds a double value to the closest number with the specified number of decimal places.
     *
     * @param value
     *            The number to be rounded.
     * @param places
     *            Specifies how many places to the right of the decimal point are kept.
     * @return The rounded value.
     */
    private static final double round(double value, final int places) {
        // Power of ten that moves the wanted decimal places in front of the decimal point.
        final long scale = (long) Math.pow(10, places);
        final long scaledAndRounded = Math.round(value * scale);
        // Move the decimal point back to where it was.
        return (double) scaledAndRounded / scale;
    }

    /**
     * Decodes all HTML entities like {@code &nbsp;}, {@code &euro;}, {@code &#8364;} or {@code &gt;} in the text of
     * this converter and replaces them with the character they stand for. This method does not throw any kind of
     * Exception.
     * <p>
     * Anything that starts with '&amp;' but is neither a known named entity nor a decodable numeric reference
     * stays in the result unchanged. A malformed numeric reference like {@code &#;} therefore only affects
     * itself and no longer discards the decoding of the rest of the text. Numeric references are only decoded up
     * to a total length of 8 characters; longer ones stay unchanged.
     *
     * @return The processed <code>String</code>. If the text is <code>null</code>, <code>null</code> will be returned.
     */
    public String decodeEntities() {
        if (text == null) {
            return null;
        }

        try {
            final StringBuilder resultBuf = new StringBuilder(text.length());
            // Position of the next ';' at or behind the current index, -1 if there is none left.
            int nextSemicolon = text.indexOf(';');

            for (int i = 0; i < text.length(); i++) {
                final char current = text.charAt(i);

                if (current == '&' && nextSemicolon != -1) {
                    // this can possibly be a HTML entity!
                    if (nextSemicolon < i) {
                        nextSemicolon = text.indexOf(';', i);
                    }
                    if (nextSemicolon != -1) {
                        final String candidate = text.substring(i, nextSemicolon + 1);

                        // if a named entity is found, it is exactly one character
                        final Character named = getHTMLEntityCharacter(candidate);
                        if (named != null) {
                            resultBuf.append(named);
                            i = nextSemicolon; // skip the rest of the entity
                            continue;
                        }

                        // not in the list, test for a numeric entity
                        if (candidate.charAt(1) == '#' && candidate.length() <= 8) {
                            try {
                                resultBuf.append(decodeNumericEntity(candidate));
                                i = nextSemicolon; // skip the rest of the entity
                                continue;
                            } catch (RuntimeException ignored) {
                                // Not a valid numeric reference. Nothing was appended, so fall through
                                // and keep the '&' and the following characters as they are.
                            }
                        }
                    }
                }

                // no entity at this position, take the character as it is
                resultBuf.append(current);
            }

            return resultBuf.toString();
        } catch (Exception e) {
            Logger.getLogger(HTMLEntityConverter.class.getName()).log(Level.WARNING,
                    "converting html entity has failed for string " + text + ". The unconverted text is used instead.", e);
        }
        return text;
    }

    /**
     * Replaces all characters of the text of this converter which have to be encoded under the encode condition
     * of this converter (see {@link #ENCODE_SEVEN_BIT_ASCII}, {@link #ENCODE_EIGHT_BIT_ASCII} and
     * {@link #ENCODE_SEVEN_BIT_XML}) with HTML entities. <BR>
     * By default the replacement is a numeric entity, for example {@code &#8364;} for the euro character.
     * <p>
     * Every character that is a whitespace or a space character (this includes line breaks) is replaced with a
     * single blank. An '&amp;' that starts an entity which is already present in the text is copied unchanged
     * unless re-encoding of entities is enabled.
     * <p>
     * NOTE(review): the text is processed one <code>char</code> at a time, so a character outside the basic
     * multilingual plane would be written as two numeric entities, one per surrogate - confirm whether this is
     * intended.
     *
     * @param useNamedEntities
     *            Tells if the encoding should be done with named entities where one is known. For example
     *            {@code &euro;} will be used instead of {@code &#8364;}.
     * @return The encoded text. If the text is <code>null</code>, <code>null</code> will be returned.
     */
    public String encodeEntities(final boolean useNamedEntities) {
        if (text == null) {
            return null;
        }

        final StringBuilder encoded = new StringBuilder();
        for (int pos = 0; pos < text.length(); pos++) {
            final char current = text.charAt(pos);

            if (Character.isWhitespace(current) || Character.isSpaceChar(current)) {
                encoded.append(" ");
                continue;
            }

            if (!shouldEncodeCharacer(current, encodeCondition)) {
                // allowed as it is
                encoded.append(current);
                continue;
            }

            // do not encode the '&' of an entity that is already there
            if (current == '&' && !isReencodeEntities()) {
                final String existing = isEntity(text, pos);
                if (existing != null) {
                    encoded.append(existing);
                    pos += existing.length() - 1;
                    continue;
                }
            }

            // prefer the named entity if requested and available, the numeric one otherwise
            String named = null;
            if (useNamedEntities) {
                named = reverseHtmlEntities.get(Character.valueOf(current));
            }
            if (named != null) {
                encoded.append(named);
            } else {
                encoded.append("&#" + (int) current + ";");
            }
        }

        return encoded.toString();
    }

    /**
     * Looks for an entity that starts exactly at position <code>idx</code> of the given <code>text</code>.
     * <p>
     * Two forms are recognized: a numeric entity consisting of <code>&amp;#</code>, at least one digit and a
     * terminating semicolon, and any entity name contained in the table of known named entities. Only
     * decimal digits are accepted for the numeric form.
     *
     * @param text The text to be tested.
     * @param idx The index where the entity is expected to start.
     * @return The entity found at <code>idx</code> or <code>null</code> if there is no entity at this
     *   position or it could not be identified.
     */
    private static String isEntity(String text, int idx) {
        // An entity needs the ampersand plus at least two more characters.
        if (text.charAt(idx) != '&' || text.length() < idx + 3) {
            return null;
        }

        if (text.charAt(idx + 1) != '#') {
            // Named entity: test every known entity name against the text at idx.
            for (final String candidate : htmlEntities.keySet()) {
                if (text.startsWith(candidate, idx)) {
                    return candidate;
                }
            }
            return null;
        }

        // Numeric entity: consume the run of digits following "&#".
        final int firstDigit = idx + 2;
        int pos = firstDigit;
        while (pos < text.length() && Character.isDigit(text.charAt(pos))) {
            pos++;
        }

        // Valid only with at least one digit and a semicolon directly behind the digits.
        final boolean hasDigits = pos > firstDigit;
        if (hasDigits && pos < text.length() && text.charAt(pos) == ';') {
            return text.substring(idx, pos + 1);
        }
        return null;
    }

    /**
     * Encodes the text of this converter using the named entity setting configured on this instance.
     * This is a shortcut for {@link #encodeEntities(boolean)} called with the value of
     * {@link #isUseNamedEntities()}.
     *
     * @return The encoded text. If the text of this converter is <code>null</code>, <code>null</code> will be returned.
     */
    public String encodeEntities() {
        final boolean named = this.isUseNamedEntities();
        return encodeEntities(named);
    }

    /**
     * Decides whether a character has to be replaced by an entity under the given encode condition.
     * <ul>
     * <li>{@link #ENCODE_SEVEN_BIT_ASCII}: every character above 127 is encoded.</li>
     * <li>{@link #ENCODE_EIGHT_BIT_ASCII}: every character above 255 is encoded.</li>
     * <li>{@link #ENCODE_SEVEN_BIT_XML}: every character above 127, every character listed as invalid
     * for xml and every ISO control character is encoded.</li>
     * </ul>
     * Any other encode condition never requests an encoding.
     *
     * @param c
     *            The character to be tested.
     * @param encodeCondition
     *            One of {@link #ENCODE_SEVEN_BIT_ASCII}, {@link #ENCODE_EIGHT_BIT_ASCII} or {@link #ENCODE_SEVEN_BIT_XML}.
     * @return <code>true</code> if the char should be encoded and <code>false</code> otherwise.
     */
    private static boolean shouldEncodeCharacer(final char c, final int encodeCondition) {
        if (encodeCondition == ENCODE_SEVEN_BIT_ASCII) {
            return c > 127;
        }
        if (encodeCondition == ENCODE_EIGHT_BIT_ASCII) {
            return c > 255;
        }
        if (encodeCondition == ENCODE_SEVEN_BIT_XML) {
            return c > 127
                    || invalidAscii7XMLCharacters.contains(Character.valueOf(c))
                    || Character.isISOControl(c);
        }
        // Unknown condition: leave the character untouched.
        return false;
    }

    /**
     * Returns the encode condition this converter currently works with.
     *
     * @return The current encode condition.
     * @see #setEncodeCondition(int)
     */
    public int getEncodeCondition() {
        return this.encodeCondition;
    }

    /**
     * Changes the encode condition used by subsequent encode calls.
     *
     * @param encodeCondition One of {@link #ENCODE_SEVEN_BIT_ASCII}, {@link #ENCODE_EIGHT_BIT_ASCII}
     *   or {@link #ENCODE_SEVEN_BIT_XML}.
     */
    public void setEncodeCondition(final int encodeCondition) {
        this.encodeCondition = encodeCondition;
    }

    /**
     * Tells whether entities that are already present in the text get encoded again.
     *
     * @return <code>true</code> if existing entities are reencoded and <code>false</code> if they are left untouched.
     * @see #setReencodeEntities(boolean)
     */
    public boolean isReencodeEntities() {
        return this.reencodeEntities;
    }

    /**
     * Defines how entities that are already present in the text are treated while encoding.
     *
     * @param reencodeEntities <code>true</code> to reencode them and <code>false</code> to leave them untouched.
     */
    public void setReencodeEntities(final boolean reencodeEntities) {
        this.reencodeEntities = reencodeEntities;
    }

    /**
     * Returns the text this converter works on.
     *
     * @return The text to be encoded / decoded, possibly <code>null</code>.
     * @see #setText(java.lang.String)
     */
    public String getText() {
        return this.text;
    }

    /**
     * Replaces the text this converter works on.
     *
     * @param text The text to be encoded / decoded.
     */
    public void setText(final String text) {
        this.text = text;
    }

    /**
     * Tells whether named entities are preferred over numeric ones while encoding.
     *
     * @return <code>true</code> if named entities are used and <code>false</code> otherwise.
     * @see #setUseNamedEntities(boolean)
     */
    public boolean isUseNamedEntities() {
        return this.useNamedEntities;
    }

    /**
     * Chooses between named and numeric entities for the encoding process.
     *
     * @param useNamedEntities <code>true</code> for using a named entity (for example {@code &euro;})
     *   where the entity table knows one, and <code>false</code> for always using the numeric form
     *   (for example {@code &#8364;}).
     */
    public void setUseNamedEntities(final boolean useNamedEntities) {
        this.useNamedEntities = useNamedEntities;
    }
}