package org.rr.commons.utils;
import static org.rr.commons.utils.StringUtil.EMPTY;
import java.io.Serializable;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Converts between plain text and text in which characters are written as HTML character references.
 * <p>
 * An instance holds the text to work on together with the encoding options. {@link #decodeEntities()} replaces named
 * references (most of the HTML 4 set) and numeric references (decimal and hexadecimal) with the characters they stand
 * for. {@link #encodeEntities()} replaces every character selected by the encode condition with a numeric or, if
 * requested, a named reference.
 * <p>
 * Instances are mutable and perform no synchronization.
 */
public class HTMLEntityConverter implements Serializable {

	private static final long serialVersionUID = 5138995692645522618L;

	/**
	 * The target/encoded string contains only seven bit ascii characters. All other characters get entity encoded.
	 */
	public static final int ENCODE_SEVEN_BIT_ASCII = 0;

	/**
	 * The target/encoded string contains only eight bit ascii characters. All other characters get entity encoded.
	 */
	public static final int ENCODE_EIGHT_BIT_ASCII = 1;

	/**
	 * The target/encoded string contains only seven bit ascii characters. Some special characters, needed for the xml
	 * structure, and ISO control characters are encoded as well.
	 */
	public static final int ENCODE_SEVEN_BIT_XML = 2;

	private static final Logger LOGGER = Logger.getLogger(HTMLEntityConverter.class.getName());

	/**
	 * All characters of the seven bit ascii range which should not appear unencoded in a xml file.
	 */
	private static final String XML_UNSAFE_ASCII = "=<>\"'`^&";

	/**
	 * Longest numeric reference that gets decoded. The largest code point, 1114111 (hex 10FFFF), needs ten characters
	 * including the leading ampersand and hash and the trailing semicolon.
	 */
	private static final int MAX_NUMERIC_ENTITY_LENGTH = 10;

	/**
	 * Named entity (including the leading ampersand and the trailing semicolon) to the character it stands for.
	 */
	private static final Map<String, Character> ENTITY_TO_CHARACTER = new HashMap<String, Character>();

	/**
	 * Character to the named entity used for encoding it. The inverse of {@link #ENTITY_TO_CHARACTER} without the
	 * decode-only aliases.
	 */
	private static final Map<Character, String> CHARACTER_TO_ENTITY = new HashMap<Character, String>();

	private String text;

	private int encodeCondition;

	private boolean useNamedEntities = false;

	private boolean reencodeEntities = false;

	/**
	 * This constructor provides the minimum of parameters needed.
	 *
	 * @param text
	 *            The text to be encoded / decoded. May be <code>null</code>.
	 * @param encodeCondition
	 *            The encode condition, one of {@link #ENCODE_SEVEN_BIT_ASCII}, {@link #ENCODE_EIGHT_BIT_ASCII} or
	 *            {@link #ENCODE_SEVEN_BIT_XML}.
	 */
	public HTMLEntityConverter(String text, int encodeCondition) {
		this.text = text;
		this.encodeCondition = encodeCondition;
	}

	/**
	 * Decodes all named and numeric HTML character references in the text of this instance.
	 * <p>
	 * Anything that looks like a reference but is unknown or malformed is copied to the result unchanged; the rest of
	 * the text is still decoded. This method does not throw for malformed input.
	 *
	 * @return The decoded text or <code>null</code> if the text of this instance is <code>null</code>.
	 */
	public String decodeEntities() {
		if (text == null) {
			return null;
		}

		final StringBuilder result = new StringBuilder(text.length());
		int pos = 0;
		while (pos < text.length()) {
			final char current = text.charAt(pos);
			if (current == '&') {
				// this is possibly the start of a reference, it ends with the next semicolon.
				final int end = text.indexOf(';', pos);
				if (end != -1) {
					final String decoded = decodeEntity(text.substring(pos, end + 1));
					if (decoded != null) {
						result.append(decoded);
						pos = end + 1; // skip the consumed reference
						continue;
					}
				}
			}
			// a plain character or an ampersand which does not start a decodable reference.
			result.append(current);
			pos++;
		}
		return result.toString();
	}

	/**
	 * Encodes the text of this instance using the value of {@link #isUseNamedEntities()}.
	 *
	 * @return The encoded text or <code>null</code> if the text of this instance is <code>null</code>.
	 * @see #encodeEntities(boolean)
	 */
	public String encodeEntities() {
		return encodeEntities(this.isUseNamedEntities());
	}

	/**
	 * Replaces all characters of the text which are selected by the current encode condition with a character
	 * reference. Without named entities, the euro sign for example is written as the decimal reference 8364.
	 * <p>
	 * Every character that is whitespace or a space character (line breaks, tabs and the no-break space included) is
	 * written as a single blank. Unless {@link #isReencodeEntities()} is <code>true</code>, references which are
	 * already part of the text are copied unchanged. Characters outside the basic multilingual plane are written as
	 * one numeric reference for the whole code point.
	 *
	 * @param useNamedEntities
	 *            <code>true</code> to prefer a named entity where one is known for the character, <code>false</code>
	 *            to always write decimal numeric references.
	 * @return The encoded text or <code>null</code> if the text of this instance is <code>null</code>.
	 */
	public String encodeEntities(final boolean useNamedEntities) {
		if (text == null) {
			return null;
		}

		final StringBuilder result = new StringBuilder(text.length() + 16);
		for (int i = 0; i < text.length(); i++) {
			final char textChar = text.charAt(i);
			if (Character.isWhitespace(textChar) || Character.isSpaceChar(textChar)) {
				result.append(' ');
				continue;
			}
			if (!shouldEncodeCharacer(textChar, encodeCondition)) {
				result.append(textChar);
				continue;
			}

			// skip already encoded entities.
			if (textChar == '&' && !isReencodeEntities()) {
				final String existing = isEntity(text, i);
				if (existing != null) {
					result.append(existing);
					i += existing.length() - 1;
					continue;
				}
			}

			// a surrogate pair must be written as ONE reference, two separate ones would be invalid.
			final int codePoint = text.codePointAt(i);
			if (Character.isSupplementaryCodePoint(codePoint)) {
				result.append("&#").append(codePoint).append(';');
				i++; // the low surrogate is consumed as well
				continue;
			}

			String entity = null;
			if (useNamedEntities) {
				// take a look if there is a named entity available for the current character.
				entity = CHARACTER_TO_ENTITY.get(Character.valueOf(textChar));
			}
			if (entity == null) {
				entity = "&#" + (int) textChar + ";";
			}
			result.append(entity);
		}
		return result.toString();
	}

	/**
	 * @see #setEncodeCondition(int)
	 */
	public int getEncodeCondition() {
		return encodeCondition;
	}

	/**
	 * Sets the encode condition.
	 *
	 * @param encodeCondition
	 *            One of {@link #ENCODE_SEVEN_BIT_ASCII}, {@link #ENCODE_EIGHT_BIT_ASCII} or
	 *            {@link #ENCODE_SEVEN_BIT_XML}. With any other value nothing gets encoded.
	 */
	public void setEncodeCondition(int encodeCondition) {
		this.encodeCondition = encodeCondition;
	}

	/**
	 * @see #setReencodeEntities(boolean)
	 */
	public boolean isReencodeEntities() {
		return reencodeEntities;
	}

	/**
	 * If there are already entities in the given text, reencode them or do not touch them.
	 *
	 * @param reencodeEntities
	 *            <code>true</code> reencode them and <code>false</code> do not touch them.
	 */
	public void setReencodeEntities(boolean reencodeEntities) {
		this.reencodeEntities = reencodeEntities;
	}

	/**
	 * @see #setText(java.lang.String)
	 */
	public String getText() {
		return text;
	}

	/**
	 * Sets the text to be encoded / decoded.
	 *
	 * @param text
	 *            The text to be encoded / decoded.
	 */
	public void setText(String text) {
		this.text = text;
	}

	/**
	 * @see #setUseNamedEntities(boolean)
	 */
	public boolean isUseNamedEntities() {
		return useNamedEntities;
	}

	/**
	 * Use named entities for the encoding process or not.
	 *
	 * @param useNamedEntities
	 *            <code>true</code> for using named entities and <code>false</code> for using the numeric ones.
	 */
	public void setUseNamedEntities(boolean useNamedEntities) {
		this.useNamedEntities = useNamedEntities;
	}

	/**
	 * Decodes one reference candidate as found by {@link #decodeEntities()}.
	 *
	 * @param candidate
	 *            The text from an ampersand up to and including the next semicolon.
	 * @return The decoded character(s) or <code>null</code> if the candidate is neither a known named entity nor a
	 *         valid numeric reference.
	 */
	private static String decodeEntity(final String candidate) {
		final Character named = getHTMLEntityCharacter(candidate);
		if (named != null) {
			return named.toString();
		}

		// not in the list, test for a numeric reference. The candidate has at least two characters ("&;").
		if (candidate.charAt(1) == '#' && candidate.length() <= MAX_NUMERIC_ENTITY_LENGTH) {
			try {
				return decodeNumericEntity(candidate);
			} catch (IllegalArgumentException e) {
				// malformed reference: the caller keeps it as literal text instead of giving up on the whole text.
				LOGGER.log(Level.FINE, "Leaving malformed numeric entity '" + candidate + "' undecoded.", e);
			}
		}
		return null;
	}

	/**
	 * Determines the character for a given named HTML entity.
	 * <p>
	 * The entity is trimmed first. The leading ampersand and the trailing semicolon may be omitted; they are added
	 * before the lookup if missing.
	 *
	 * @param entity
	 *            The entity to be converted into the associated character.
	 * @return The associated character or, if no associated character is found, <code>null</code>.
	 */
	private static Character getHTMLEntityCharacter(final String entity) {
		if (entity == null || entity.length() == 0) {
			return null;
		}

		String normalized = entity.trim();
		if (!normalized.startsWith("&")) {
			normalized = "&" + normalized;
		}
		if (!normalized.endsWith(";")) {
			normalized = normalized + ";";
		}
		return ENTITY_TO_CHARACTER.get(normalized);
	}

	/**
	 * Decodes a decimal or hexadecimal numeric character reference.
	 * <p>
	 * The hexadecimal marker is accepted in lower and upper case. Code points above the basic multilingual plane are
	 * returned as a surrogate pair.
	 *
	 * @param str
	 *            The complete reference, starting with the ampersand and the hash and ending with the semicolon.
	 * @return The character(s) for the referenced code point.
	 * @throws IllegalArgumentException
	 *             if the reference could not be decoded.
	 */
	private static String decodeNumericEntity(final String str) {
		try {
			final char radixMarker = str.charAt(2);
			final boolean hex = radixMarker == 'x' || radixMarker == 'X';
			// cut the prefix and the trailing semicolon away.
			final String digits = str.substring(hex ? 3 : 2, str.length() - 1);
			final int codePoint = Integer.parseInt(digits, hex ? 16 : 10);
			// toChars rejects everything which is not a valid unicode code point.
			return new String(Character.toChars(codePoint));
		} catch (RuntimeException e) {
			throw new IllegalArgumentException("IllegalUnicodeSequence " + str, e);
		}
	}

	/**
	 * Determines if there is a character reference in the given <code>text</code> at the given <code>idx</code>
	 * (position). Decimal references, hexadecimal references and the known named entities are recognised.
	 *
	 * @param text
	 *            The text to be tested.
	 * @param idx
	 *            The index where the reference should start.
	 * @return The identified reference or <code>null</code> if there is no reference at the given position.
	 */
	private static String isEntity(final String text, final int idx) {
		// the shortest possible reference has at least three characters.
		if (text.charAt(idx) != '&' || text.length() < idx + 3) {
			return null;
		}

		if (text.charAt(idx + 1) == '#') {
			final int end = findNumericEntityEnd(text, idx + 2);
			return end == -1 ? null : text.substring(idx, end + 1);
		}

		// every known named entity ends with its first semicolon, so one lookup is sufficient.
		final int end = text.indexOf(';', idx);
		if (end == -1) {
			return null;
		}
		final String candidate = text.substring(idx, end + 1);
		return ENTITY_TO_CHARACTER.containsKey(candidate) ? candidate : null;
	}

	/**
	 * Finds the terminating semicolon of a numeric reference.
	 *
	 * @param text
	 *            The text containing the reference.
	 * @param from
	 *            The index of the first character behind the ampersand and the hash.
	 * @return The index of the semicolon or <code>-1</code> if the characters at <code>from</code> are not an optional
	 *         hexadecimal marker followed by at least one digit and a semicolon.
	 */
	private static int findNumericEntityEnd(final String text, final int from) {
		int pos = from;
		boolean hex = false;
		if (pos < text.length() && (text.charAt(pos) == 'x' || text.charAt(pos) == 'X')) {
			hex = true;
			pos++;
		}

		final int firstDigit = pos;
		while (pos < text.length() && isAsciiDigit(text.charAt(pos), hex)) {
			pos++;
		}

		final boolean hasDigits = pos > firstDigit;
		if (hasDigits && pos < text.length() && text.charAt(pos) == ';') {
			return pos;
		}
		return -1;
	}

	/**
	 * Tells if the given character is an ascii decimal digit or, if <code>hex</code> is <code>true</code>, an ascii
	 * hexadecimal digit.
	 */
	private static boolean isAsciiDigit(final char c, final boolean hex) {
		if (c >= '0' && c <= '9') {
			return true;
		}
		return hex && ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'));
	}

	/**
	 * Tells if the given character should be encoded or not.
	 *
	 * @param c
	 *            The character.
	 * @param encodeCondition
	 *            One of {@link #ENCODE_SEVEN_BIT_ASCII}, {@link #ENCODE_EIGHT_BIT_ASCII} or
	 *            {@link #ENCODE_SEVEN_BIT_XML}.
	 * @return <code>true</code> if the char should be encoded and <code>false</code> otherwise. An unknown encode
	 *         condition always results in <code>false</code>.
	 */
	private static boolean shouldEncodeCharacer(final char c, final int encodeCondition) {
		switch (encodeCondition) {
		case ENCODE_SEVEN_BIT_ASCII:
			return c > 127;
		case ENCODE_EIGHT_BIT_ASCII:
			return c > 255;
		case ENCODE_SEVEN_BIT_XML:
			return c > 127 || XML_UNSAFE_ASCII.indexOf(c) != -1 || Character.isISOControl(c);
		default:
			return false;
		}
	}

	/**
	 * Registers a named entity for decoding and encoding.
	 *
	 * @param name
	 *            The entity name without the ampersand and the semicolon.
	 * @param character
	 *            The character the entity stands for.
	 */
	private static void define(final String name, final char character) {
		final String entity = "&" + name + ";";
		final Character boxed = Character.valueOf(character);
		ENTITY_TO_CHARACTER.put(entity, boxed);
		CHARACTER_TO_ENTITY.put(boxed, entity);
	}

	/**
	 * Registers an entity name which is understood while decoding but never written while encoding.
	 *
	 * @param name
	 *            The entity name without the ampersand and the semicolon.
	 * @param character
	 *            The character the entity stands for.
	 */
	private static void defineDecodeOnly(final String name, final char character) {
		ENTITY_TO_CHARACTER.put("&" + name + ";", Character.valueOf(character));
	}

	static {
		// Markup significant characters
		define("quot", '"'); // quotation mark
		define("amp", '&'); // ampersand
		define("lt", '<'); // less-than sign
		define("gt", '>'); // greater-than sign
		define("apos", '\''); // apostrophe, problematic in xml

		// Diacritic symbols
		// NOTE(review): HTML 4.01 defines circ as U+02C6 and tilde as U+02DC. The ascii mappings are kept because
		// text encoded by earlier versions relies on them - confirm whether they should be corrected.
		define("circ", '\u005e'); // circumflex
		define("tilde", '\u007e'); // tilde

		// ISO 8859-1 symbols
		define("nbsp", '\u00a0'); // no-break space
		define("iexcl", '\u00a1'); // inverted exclamation mark
		define("cent", '\u00a2'); // cent sign
		define("pound", '\u00a3'); // pound sign
		define("curren", '\u00a4'); // currency sign
		define("yen", '\u00a5'); // yen sign
		define("brvbar", '\u00a6'); // broken bar
		define("sect", '\u00a7'); // section sign
		define("uml", '\u00a8'); // diaeresis
		define("copy", '\u00a9'); // copyright sign
		define("ordf", '\u00aa'); // feminine ordinal indicator
		define("laquo", '\u00ab'); // left-pointing double angle quotation mark
		define("not", '\u00ac'); // not sign
		define("shy", '\u00ad'); // soft hyphen
		define("reg", '\u00ae'); // registered sign
		define("macr", '\u00af'); // macron
		define("deg", '\u00b0'); // degree sign
		define("plusmn", '\u00b1'); // plus-minus sign
		define("sup2", '\u00b2'); // superscript two
		define("sup3", '\u00b3'); // superscript three
		define("acute", '\u00b4'); // acute accent
		define("micro", '\u00b5'); // micro sign
		define("para", '\u00b6'); // pilcrow sign
		define("middot", '\u00b7'); // middle dot
		define("cedil", '\u00b8'); // cedilla
		define("sup1", '\u00b9'); // superscript one
		define("ordm", '\u00ba'); // masculine ordinal indicator
		define("raquo", '\u00bb'); // right-pointing double angle quotation mark
		define("frac14", '\u00bc'); // one quarter
		define("frac12", '\u00bd'); // one half
		define("frac34", '\u00be'); // three quarters
		define("iquest", '\u00bf'); // inverted question mark
		define("Agrave", '\u00c0'); // A with grave
		define("Aacute", '\u00c1'); // A with acute
		define("Acirc", '\u00c2'); // A with circumflex
		define("Atilde", '\u00c3'); // A with tilde
		define("Auml", '\u00c4'); // A with diaeresis
		define("Aring", '\u00c5'); // A with ring above
		define("AElig", '\u00c6'); // ligature AE
		define("Ccedil", '\u00c7'); // C with cedilla
		define("Egrave", '\u00c8'); // E with grave
		define("Eacute", '\u00c9'); // E with acute
		define("Ecirc", '\u00ca'); // E with circumflex
		define("Euml", '\u00cb'); // E with diaeresis
		define("Igrave", '\u00cc'); // I with grave
		define("Iacute", '\u00cd'); // I with acute
		define("Icirc", '\u00ce'); // I with circumflex
		define("Iuml", '\u00cf'); // I with diaeresis
		define("ETH", '\u00d0'); // capital eth (icelandic)
		define("Ntilde", '\u00d1'); // N with tilde
		define("Ograve", '\u00d2'); // O with grave
		define("Oacute", '\u00d3'); // O with acute
		define("Ocirc", '\u00d4'); // O with circumflex
		define("Otilde", '\u00d5'); // O with tilde
		define("Ouml", '\u00d6'); // O with diaeresis
		define("times", '\u00d7'); // multiplication sign
		define("Oslash", '\u00d8'); // O with stroke
		define("Ugrave", '\u00d9'); // U with grave
		define("Uacute", '\u00da'); // U with acute
		define("Ucirc", '\u00db'); // U with circumflex
		define("Uuml", '\u00dc'); // U with diaeresis
		define("Yacute", '\u00dd'); // Y with acute
		define("THORN", '\u00de'); // capital thorn (icelandic)
		define("szlig", '\u00df'); // sharp s
		define("agrave", '\u00e0'); // a with grave
		define("aacute", '\u00e1'); // a with acute
		define("acirc", '\u00e2'); // a with circumflex
		define("atilde", '\u00e3'); // a with tilde
		define("auml", '\u00e4'); // a with diaeresis
		define("aring", '\u00e5'); // a with ring above
		define("aelig", '\u00e6'); // ligature ae
		define("ccedil", '\u00e7'); // c with cedilla
		define("egrave", '\u00e8'); // e with grave
		define("eacute", '\u00e9'); // e with acute
		define("ecirc", '\u00ea'); // e with circumflex
		define("euml", '\u00eb'); // e with diaeresis
		define("igrave", '\u00ec'); // i with grave
		define("iacute", '\u00ed'); // i with acute
		define("icirc", '\u00ee'); // i with circumflex
		define("iuml", '\u00ef'); // i with diaeresis
		define("eth", '\u00f0'); // small eth (icelandic)
		define("ntilde", '\u00f1'); // n with tilde
		define("ograve", '\u00f2'); // o with grave
		define("oacute", '\u00f3'); // o with acute
		define("ocirc", '\u00f4'); // o with circumflex
		define("otilde", '\u00f5'); // o with tilde
		define("ouml", '\u00f6'); // o with diaeresis
		define("divide", '\u00f7'); // division sign
		define("oslash", '\u00f8'); // o with stroke
		define("ugrave", '\u00f9'); // u with grave
		define("uacute", '\u00fa'); // u with acute
		define("ucirc", '\u00fb'); // u with circumflex
		define("uuml", '\u00fc'); // u with diaeresis
		define("yacute", '\u00fd'); // y with acute
		define("thorn", '\u00fe'); // small thorn (icelandic)
		define("yuml", '\u00ff'); // y with diaeresis
		define("Yuml", '\u0178'); // Y with diaeresis

		// Greek symbols
		define("Alpha", '\u0391'); // capital alpha
		define("alpha", '\u03B1'); // small alpha
		define("Beta", '\u0392'); // capital beta
		define("beta", '\u03B2'); // small beta
		define("Gamma", '\u0393'); // capital gamma
		define("gamma", '\u03B3'); // small gamma
		define("Delta", '\u0394'); // capital delta
		define("delta", '\u03B4'); // small delta
		define("Epsilon", '\u0395'); // capital epsilon
		define("epsilon", '\u03B5'); // small epsilon
		define("Zeta", '\u0396'); // capital zeta
		define("zeta", '\u03B6'); // small zeta
		define("Eta", '\u0397'); // capital eta
		define("eta", '\u03B7'); // small eta
		define("Theta", '\u0398'); // capital theta
		define("theta", '\u03B8'); // small theta
		define("Iota", '\u0399'); // capital iota
		define("iota", '\u03B9'); // small iota
		define("Kappa", '\u039A'); // capital kappa
		define("kappa", '\u03BA'); // small kappa
		define("Lambda", '\u039B'); // capital lambda
		define("lambda", '\u03BB'); // small lambda
		define("Mu", '\u039C'); // capital mu
		define("mu", '\u03BC'); // small mu
		define("Nu", '\u039D'); // capital nu
		define("nu", '\u03BD'); // small nu
		define("Xi", '\u039E'); // capital xi
		define("xi", '\u03BE'); // small xi
		define("Omicron", '\u039F'); // capital omicron
		define("omicron", '\u03BF'); // small omicron
		define("Pi", '\u03A0'); // capital pi
		define("pi", '\u03C0'); // small pi
		define("Rho", '\u03A1'); // capital rho
		define("rho", '\u03C1'); // small rho
		define("sigmaf", '\u03C2'); // small final sigma
		define("Sigma", '\u03A3'); // capital sigma
		define("sigma", '\u03C3'); // small sigma
		define("Tau", '\u03A4'); // capital tau
		define("tau", '\u03C4'); // small tau
		define("Upsilon", '\u03A5'); // capital upsilon
		define("upsilon", '\u03C5'); // small upsilon
		define("Phi", '\u03A6'); // capital phi
		define("phi", '\u03C6'); // small phi
		define("Chi", '\u03A7'); // capital chi
		define("chi", '\u03C7'); // small chi
		define("Psi", '\u03A8'); // capital psi
		define("psi", '\u03C8'); // small psi
		define("Omega", '\u03A9'); // capital omega
		define("omega", '\u03C9'); // small omega
		define("thetasym", '\u03D1'); // theta symbol
		define("upsih", '\u03D2'); // upsilon with hook symbol
		define("piv", '\u03D6'); // greek pi symbol

		// Mathematical symbols
		define("forall", '\u2200'); // for all
		define("part", '\u2202'); // partial differential
		define("exist", '\u2203'); // there exists
		define("empty", '\u2205'); // empty set = null set = diameter
		define("nabla", '\u2207'); // nabla = backward difference
		define("isin", '\u2208'); // element of
		define("notin", '\u2209'); // not an element of
		define("ni", '\u220B'); // contains as member
		define("prod", '\u220F'); // n-ary product = product sign
		define("sum", '\u2211'); // n-ary summation
		define("minus", '\u2212'); // minus sign
		define("lowast", '\u2217'); // asterisk operator
		define("radic", '\u221A'); // square root = radical sign
		define("prop", '\u221D'); // proportional to
		define("infin", '\u221E'); // infinity
		define("ang", '\u2220'); // angle
		define("and", '\u2227'); // logical and
		define("or", '\u2228'); // logical or
		define("cap", '\u2229'); // intersection
		define("cup", '\u222A'); // union
		define("int", '\u222B'); // integral
		define("there4", '\u2234'); // therefore
		define("sim", '\u223C'); // tilde operator = varies with = similar to
		define("cong", '\u2245'); // approximately equal to
		define("asymp", '\u2248'); // almost equal to
		define("ne", '\u2260'); // not equal to
		define("equiv", '\u2261'); // identical to
		define("le", '\u2264'); // less-than or equal to
		define("ge", '\u2265'); // greater-than or equal to
		define("sub", '\u2282'); // subset of
		define("sup", '\u2283'); // superset of
		define("nsub", '\u2284'); // not a subset of
		define("sube", '\u2286'); // subset of or equal to
		define("supe", '\u2287'); // superset of or equal to
		define("oplus", '\u2295'); // circled plus = direct sum
		define("otimes", '\u2297'); // circled times = vector product
		define("perp", '\u22A5'); // up tack = orthogonal to = perpendicular
		define("sdot", '\u22C5'); // dot operator
		define("loz", '\u25CA'); // lozenge
		define("fnof", '\u0192'); // latin small f with hook = function = florin

		// General punctuation
		define("bull", '\u2022'); // bullet = black small circle
		define("hellip", '\u2026'); // horizontal ellipsis = three dot leader
		define("prime", '\u2032'); // prime = minutes = feet
		define("Prime", '\u2033'); // double prime = seconds = inches
		define("oline", '\u203E'); // overline = spacing overscore
		define("frasl", '\u2044'); // fraction slash

		// Letterlike symbols
		define("weierp", '\u2118'); // script capital P = power set = Weierstrass p
		define("image", '\u2111'); // blackletter capital I = imaginary part
		define("real", '\u211C'); // blackletter capital R = real part symbol
		define("trade", '\u2122'); // trade mark sign
		define("alefsym", '\u2135'); // alef symbol = first transfinite cardinal
		define("euro", '\u20ac'); // euro currency

		// Arrow symbols
		define("larr", '\u2190'); // leftwards arrow
		define("uarr", '\u2191'); // upwards arrow
		define("rarr", '\u2192'); // rightwards arrow
		define("darr", '\u2193'); // downwards arrow
		define("harr", '\u2194'); // left right arrow
		define("crarr", '\u21b5'); // downwards arrow with corner leftwards
		define("lArr", '\u21d0'); // leftwards double arrow
		define("uArr", '\u21d1'); // upwards double arrow
		define("rArr", '\u21d2'); // rightwards double arrow
		define("dArr", '\u21d3'); // downwards double arrow
		define("hArr", '\u21d4'); // left right double arrow

		// Miscellaneous symbols
		define("spades", '\u2660'); // black spade suit
		define("clubs", '\u2663'); // black club suit = shamrock
		define("hearts", '\u2665'); // black heart suit = valentine
		define("diams", '\u2666'); // black diamond suit

		// Miscellaneous technical
		define("lceil", '\u2308'); // left ceiling = apl upstile
		define("rceil", '\u2309'); // right ceiling
		define("lfloor", '\u230A'); // left floor = apl downstile
		define("rfloor", '\u230B'); // right floor
		define("lang", '\u2329'); // left-pointing angle bracket = bra
		define("rang", '\u232A'); // right-pointing angle bracket = ket

		// Named characters, latin extended
		define("OElig", '\u0152'); // capital ligature OE
		// earlier versions misspelled the entity above; keep the old spelling readable.
		defineDecodeOnly("Oelig", '\u0152');
		define("oelig", '\u0153'); // small ligature oe
		define("Scaron", '\u0160'); // S with caron
		define("scaron", '\u0161'); // s with caron

		// Named characters for punctuation
		define("lsquo", '\u2018'); // left single quotation mark
		define("rsquo", '\u2019'); // right single quotation mark
		define("ldquo", '\u201C'); // left double quotation mark
		define("rdquo", '\u201D'); // right double quotation mark
		define("ensp", '\u2002'); // en space
		define("emsp", '\u2003'); // em space
		define("thinsp", '\u2009'); // thin space
		define("zwnj", '\u200C'); // zero width non-joiner
		define("zwj", '\u200D'); // zero width joiner
		define("lrm", '\u200E'); // left-to-right mark
		define("rlm", '\u200F'); // right-to-left mark
		define("ndash", '\u2013'); // en dash
		define("mdash", '\u2014'); // em dash
		define("sbquo", '\u201A'); // single low-9 quotation mark
		define("bdquo", '\u201E'); // double low-9 quotation mark
		define("dagger", '\u2020'); // dagger
		define("Dagger", '\u2021'); // double dagger
		define("permil", '\u2030'); // per mille sign
		define("lsaquo", '\u2039'); // single left-pointing angle quotation mark
		define("rsaquo", '\u203A'); // single right-pointing angle quotation mark
	}
}