package net.nightwhistler.htmlspanner;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Helpers for turning HTML character references (entities) found in text
 * nodes into the Unicode characters they stand for.
 */
public class TextUtil {

    /**
     * Matches a named reference ({@code &name;}), a decimal or hexadecimal
     * numeric reference ({@code &#NN;} / {@code &#xHH;}), or a run of
     * whitespace.
     */
    private static final Pattern SPECIAL_CHAR_WHITESPACE = Pattern
            .compile("(&[a-z]*;|&#x?([a-f]|[A-F]|[0-9])*;|[\\s\n]+)");

    /** Same as above, but leaves whitespace alone. */
    private static final Pattern SPECIAL_CHAR_NO_WHITESPACE = Pattern
            .compile("(&[a-z]*;|&#x?([a-f]|[A-F]|[0-9])*;)");

    /** Named entities that are understood, keyed by their full source form. */
    private static final Map<String, String> REPLACEMENTS = new HashMap<String, String>();

    static {
        // Values are written as Unicode escapes so the file does not depend
        // on the encoding the compiler is run with.
        REPLACEMENTS.put("&nbsp;", "\u00A0");
        REPLACEMENTS.put("&amp;", "&");
        REPLACEMENTS.put("&quot;", "\"");
        REPLACEMENTS.put("&cent;", "\u00A2");
        REPLACEMENTS.put("&lt;", "<");
        REPLACEMENTS.put("&gt;", ">");
        REPLACEMENTS.put("&sect;", "\u00A7");

        REPLACEMENTS.put("&ldquo;", "\u201C");
        REPLACEMENTS.put("&rdquo;", "\u201D");
        REPLACEMENTS.put("&lsquo;", "\u2018");
        REPLACEMENTS.put("&rsquo;", "\u2019");

        REPLACEMENTS.put("&ndash;", "\u2013");
        REPLACEMENTS.put("&mdash;", "\u2014");
        REPLACEMENTS.put("&horbar;", "\u2015");
    }

    /**
     * Replaces all HTML entities (such as {@code &lt;} or {@code &amp;}) with
     * their Unicode characters.
     * <p>
     * Named entities that are not in the internal table, and numeric
     * references that cannot be parsed or are not valid code points, are
     * removed from the output.
     *
     * @param aText
     *            text to replace entities in; must not be {@code null}
     * @param preserveFormatting
     *            when {@code true} whitespace is left untouched; when
     *            {@code false} every run of whitespace is collapsed into a
     *            single space
     * @return the text with entities replaced.
     */
    public static String replaceHtmlEntities(String aText,
            boolean preserveFormatting) {

        Pattern pattern = preserveFormatting
                ? SPECIAL_CHAR_NO_WHITESPACE
                : SPECIAL_CHAR_WHITESPACE;
        Matcher matcher = pattern.matcher(aText);

        // StringBuffer (not StringBuilder) because the StringBuilder overload
        // of appendReplacement only exists on Java 9+.
        StringBuffer result = new StringBuffer();

        while (matcher.find()) {
            String replacement = getReplacement(matcher.group(0));
            // The replacement is literal text. Without quoting, a decoded '$'
            // or '\' is interpreted as group-reference / escape syntax and
            // makes appendReplacement throw.
            matcher.appendReplacement(result,
                    Matcher.quoteReplacement(replacement));
        }
        matcher.appendTail(result);

        return result.toString();
    }

    /**
     * Computes the literal text that should replace a single regex match.
     *
     * @param match
     *            the matched text: an entity or a run of whitespace
     * @return the replacement text, possibly empty
     */
    private static String getReplacement(String match) {
        String token = match.trim();

        // Only a whitespace run trims down to nothing; it collapses to one
        // space. Entity matches always start with '&'.
        if (token.length() == 0) {
            return " ";
        }

        String known = REPLACEMENTS.get(token);
        if (known != null) {
            return known;
        }

        if (token.startsWith("&#")) {
            return decodeNumericReference(token);
        }

        // Unknown named entity: dropped.
        return "";
    }

    /**
     * Decodes a numeric character reference such as {@code &#65;} or
     * {@code &#x41;}.
     *
     * @param reference
     *            the full reference, including the leading {@code &#} and the
     *            trailing {@code ;}
     * @return the referenced character(s), or an empty string when the number
     *         is missing, malformed or not a valid Unicode code point
     */
    private static String decodeNumericReference(String reference) {
        int end = reference.length() - 1; // index of the trailing ';'
        try {
            int code;
            if (reference.startsWith("&#x")) {
                code = Integer.parseInt(reference.substring(3, end), 16);
            } else {
                code = Integer.parseInt(reference.substring(2, end));
            }

            if (!Character.isValidCodePoint(code)) {
                return "";
            }
            // toChars yields a surrogate pair for code points above U+FFFF,
            // which a plain (char) cast would silently truncate.
            return new String(Character.toChars(code));
        } catch (NumberFormatException nfe) {
            return "";
        }
    }
}