/* This code is part of Freenet. It is distributed under the GNU General
* Public License, version 2 (or at your option any later version). See
* http://www.gnu.org/ for further details of the GPL. */
package freenet.support;
//import java.util.HashMap;
import java.util.Map;
/**
* Description: Utility for converting character references e.g.: < >
* " å И 水
*
* @author Yves Lempereur (avian)
*/
public class HTMLDecoder {
static Map<String, Character> charTable = HTMLEntities.decodeMap;
public static String decode(String s) {
String t;
Character ch;
int tmpPos, i;
int maxPos = s.length();
StringBuilder sb = new StringBuilder(maxPos);
int curPos = 0;
while (curPos < maxPos) {
char c = s.charAt(curPos++);
if (c == '&') {
tmpPos = curPos;
if (tmpPos < maxPos) {
char d = s.charAt(tmpPos++);
if (d == '#') { // REDFLAG: FIXME: We might want to prevent control characters from beeing created here...
if (tmpPos < maxPos) {
d = s.charAt(tmpPos++);
if ((d == 'x') || (d == 'X')) {
if (tmpPos < maxPos) {
d = s.charAt(tmpPos++);
if (isHexDigit(d)) {
while (tmpPos < maxPos) {
d = s.charAt(tmpPos++);
if (!isHexDigit(d)) {
if (d == ';') {
t =
s.substring(
curPos + 2,
tmpPos - 1);
try {
i =
Integer.parseInt(
t,
16);
if ((i >= 0)
&& (i < 65536)) {
c = (char) i;
curPos = tmpPos;
}
} catch (NumberFormatException e) {
}
}
break;
}
}
}
}
} else if (isDigit(d)) {
while (tmpPos < maxPos) {
d = s.charAt(tmpPos++);
if (!isDigit(d)) {
if (d == ';') {
t =
s.substring(
curPos + 1,
tmpPos - 1);
try {
i = Integer.parseInt(t);
if ((i >= 0) && (i < 65536)) {
c = (char) i;
curPos = tmpPos;
}
} catch (NumberFormatException e) {
}
}
break;
}
}
}
}
} else if (isLetter(d)) {
while (tmpPos < maxPos) {
d = s.charAt(tmpPos++);
if (!isLetterOrDigit(d)) {
if (d == ';') {
t = s.substring(curPos, tmpPos - 1);
ch = charTable.get(t);
if (ch != null) {
c = ch.charValue();
curPos = tmpPos;
}
}
break;
}
}
}
}
}
sb.append(c);
}
return sb.toString();
}
private static boolean isLetterOrDigit(char c) {
return isLetter(c) || isDigit(c);
}
private static boolean isHexDigit(char c) {
return isHexLetter(c) || isDigit(c);
}
private static boolean isLetter(char c) {
return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z'));
}
private static boolean isHexLetter(char c) {
return ((c >= 'a') && (c <= 'f')) || ((c >= 'A') && (c <= 'F'));
}
private static boolean isDigit(char c) {
return (c >= '0') && (c <= '9');
}
public static String compact(String s) {
int maxPos = s.length();
StringBuilder sb = new StringBuilder(maxPos);
int curPos = 0;
while (curPos < maxPos) {
char c = s.charAt(curPos++);
if (isWhitespace(c)) {
while ((curPos < maxPos) && isWhitespace(s.charAt(curPos))) {
curPos++;
}
c = '\u0020';
}
sb.append(c);
}
return sb.toString();
}
// HTML is very particular about what constitutes white space.
public static boolean isWhitespace(char ch) {
return
//space
(ch == '\u0020')
//Mac newline
|| (ch == '\r')
//Unix newline
|| (ch == '\n')
//tab
|| (ch == '\u0009')
//Control
|| (ch == '\u000c')
//zero width space
|| (ch == '\u200b');
}
}