// HtmlTools.java example
// (source explorer: freemind-mmx-master)
/*FreeMind - A Program for creating and viewing Mindmaps
 *Copyright (C) 2006  Christian Foltin <christianfoltin@users.sourceforge.net>
 *See COPYING for Details
 *
 *This program is free software; you can redistribute it and/or
 *modify it under the terms of the GNU General Public License
 *as published by the Free Software Foundation; either version 2
 *of the License, or (at your option) any later version.
 *
 *This program is distributed in the hope that it will be useful,
 *but WITHOUT ANY WARRANTY; without even the implied warranty of
 *MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *GNU General Public License for more details.
 *
 *You should have received a copy of the GNU General Public License
 *along with this program; if not, write to the Free Software
 *Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
/*$Id: HtmlTools.java,v 1.1.2.28 2010/12/04 21:07:23 christianfoltin Exp $*/

package freemind.main;

import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.swing.text.BadLocationException;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.InputSource;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler;

/** */
public class HtmlTools {

	/** The no-break space character (U+00A0) as a one-character string. */
	public static final String NBSP = "\u00A0";

	/** Logger of this class; assigned when the private constructor runs. */
	private static Logger logger;

	/** The shared instance handed out by {@link #getInstance()}. */
	private static HtmlTools sInstance = new HtmlTools();

	/**
	 * Matches text that, after optional leading whitespace, begins with an
	 * opening html tag. The (?s) flag lets '.' match line terminators too.
	 */
	private static final Pattern HTML_PATTERN = Pattern
			.compile("(?s)^\\s*<\\s*html.*?>.*");
	/**
	 * Group 1 is a (possibly empty) run of characters without an opening
	 * angle bracket, group 2 is the tag that directly follows it.
	 */
	private static final Pattern FIND_TAGS_PATTERN = Pattern
			.compile("([^<]*)(<[^>]+>)");
	/**
	 * Matches the self-closed form of the listed element names. Group 1 is
	 * the tag content (name plus optional attributes) without the trailing
	 * slash.
	 */
	private static final Pattern SLASHED_TAGS_PATTERN = Pattern.compile("<(("
			+ "br|area|base|basefont|" + "bgsound|button|col|colgroup|embed|hr"
			+ "|img|input|isindex|keygen|link|meta"
			+ "|object|plaintext|spacer|wbr" + ")(\\s[^>]*)?)/>");

	/** Matches one tag: an angle-bracket pair with no angle bracket inside. */
	private static final Pattern TAGS_PATTERN = Pattern.compile("(?s)<[^><]*>");

	/** A single ordinary space. */
	public static final String SP = " ";

	/**
     * 
     */
	private HtmlTools() {
		super();
		logger = Resources.getInstance().getLogger(HtmlTools.class.getName());
	}

	/** Returns the shared {@code HtmlTools} instance held in {@code sInstance}. */
	public static HtmlTools getInstance() {
		return sInstance;
	}

	/**
	 * Converts the HTML text of a node into XHTML.
	 *
	 * @param htmlText
	 *            the node text
	 * @return {@code null} if {@link #isHtmlNode(String)} rejects the text;
	 *         otherwise the converted XHTML when it is well-formed XML, and
	 *         the XML-escaped input text as a fallback when the conversion
	 *         throws or its result is not well-formed.
	 */
	public String toXhtml(String htmlText) {
		if (!isHtmlNode(htmlText)) {
			return null;
		}
		logger.fine("Enter toXhtml with " + htmlText);
		final StringReader htmlSource = new StringReader(htmlText);
		final StringWriter xhtmlSink = new StringWriter();
		try {
			XHTMLWriter.html2xhtml(htmlSource, xhtmlSink);
			String converted = xhtmlSink.toString();
			final boolean keepNonAscii = Resources.getInstance()
					.getBoolProperty("wh_nonascii_in_utf8");
			if (keepNonAscii) {
				converted = unescape_utf8(converted);
			}
			// Only hand out the conversion result if it really parses.
			if (isWellformedXml(converted)) {
				logger.fine("Leave toXhtml with " + converted);
				return converted;
			}
		} catch (IOException e) {
			Resources.getInstance().logException(e);
		} catch (BadLocationException e) {
			Resources.getInstance().logException(e);
		}
		// Reached after an exception or a malformed conversion result.
		final String escapedOriginal = toXMLEscapedText(htmlText);
		logger.fine("Leave toXhtml with fallback " + escapedOriginal);
		return escapedOriginal;
	}

	/**
	 * Turns self-closed XHTML tags of the elements listed in
	 * {@code SLASHED_TAGS_PATTERN} back into their HTML form by dropping the
	 * slash before the closing angle bracket.
	 */
	public String toHtml(String xhtmlText) {
		final Matcher slashedTags = SLASHED_TAGS_PATTERN.matcher(xhtmlText);
		// "$1" re-inserts the tag content that was captured without the '/'.
		return slashedTags.replaceAll("<$1>");
	}

	/**
	 * One segment of a text: its offsets in the original text and the
	 * corresponding offsets in the text with tags removed, plus a flag
	 * telling whether the segment is a tag.
	 */
	public static class IndexPair {
		/** Start offset of the segment in the original text. */
		public int originalStart;
		/** End offset of the segment in the original text. */
		public int originalEnd;
		/** Start offset of the segment in the tag-free text. */
		public int replacedStart;
		/** End offset of the segment in the tag-free text. */
		public int replacedEnd;
		/** True if the segment is a tag, false if it is plain text. */
		public boolean mIsTag;
		/** Starts out false; not read by the code of this class itself. */
		public boolean mIsAlreadyAppended = false;

		/**
		 * @param pOriginalStart
		 *            start offset in the original text
		 * @param pOriginalEnd
		 *            end offset in the original text
		 * @param pReplacedStart
		 *            start offset in the tag-free text
		 * @param pReplacedEnd
		 *            end offset in the tag-free text
		 * @param pIsTag
		 *            whether the segment is a tag
		 */
		public IndexPair(int pOriginalStart, int pOriginalEnd,
				int pReplacedStart, int pReplacedEnd, boolean pIsTag) {
			this.originalStart = pOriginalStart;
			this.originalEnd = pOriginalEnd;
			this.replacedStart = pReplacedStart;
			this.replacedEnd = pReplacedEnd;
			this.mIsTag = pIsTag;
		}

		/** Lists the four offsets and the tag flag; intended for debugging. */
		public String toString() {
			return "[IndexPair:"
					+ " originalStart: " + originalStart
					+ " originalEnd: " + originalEnd
					+ " replacedStart: " + replacedStart
					+ " replacedEnd: " + replacedEnd
					+ " is a tag: " + mIsTag
					+ "]";
		}
	}

	/**
	 * Replaces every match of a pattern in the text parts of a string while
	 * leaving the tags untouched. fc, 19.12.06: This method is very
	 * difficult. If you have a simpler method, please supply it. But make
	 * sure that it complies with FindTextTests!!!
	 *
	 * The method works in two passes. The first pass splits the input into a
	 * list of {@link IndexPair}s, alternating between text segments and tags
	 * as found by FIND_TAGS_PATTERN. The second pass copies tags verbatim
	 * and applies the pattern to each text segment on its own, so a match
	 * can never span a tag.
	 *
	 * @param pattern
	 *            the pattern to search for inside the text segments
	 * @param replacement
	 *            inserted literally for every match; it is appended as a
	 *            plain string, so group references are not expanded
	 * @param text
	 *            the input, possibly containing tags
	 * @return the input with all matches in text segments replaced
	 */
	public String getReplaceResult(Pattern pattern, String replacement,
			String text) {
		ArrayList splittedStringList = new ArrayList();
		// Receives the tag-free text below; it is not read again in this
		// method.
		String stringWithoutTags = null;
		// Pass 1: remove tags and record the position of every segment.
		{
			// sb accumulates the text with all tags removed.
			StringBuffer sb = new StringBuffer();
			Matcher matcher = FIND_TAGS_PATTERN.matcher(text);
			// End offset (in the original text) of the last recorded segment.
			int lastMatchEnd = 0;
			while (matcher.find()) {
				String textWithoutTag = matcher.group(1);
				// Append the text in front of the tag, dropping the tag
				// itself: "$1" keeps group 1 only.
				int replStart = sb.length();
				matcher.appendReplacement(sb, "$1");
				IndexPair indexPair;
				if (textWithoutTag.length() > 0) {
					// Text segment in front of the tag.
					indexPair = new IndexPair(lastMatchEnd, matcher.end(1),
							replStart, sb.length(), false);
					lastMatchEnd = matcher.end(1);
					splittedStringList.add(indexPair);
				}
				// The tag itself: it adds nothing to sb, so its replaced
				// start and end are equal.
				replStart = sb.length();
				indexPair = new IndexPair(lastMatchEnd, matcher.end(2),
						replStart, sb.length(), true);
				lastMatchEnd = matcher.end(2);
				splittedStringList.add(indexPair);
			}
			int replStart = sb.length();
			matcher.appendTail(sb);
			// Record the tail only if there is text after the last tag.
			if (sb.length() != replStart) {
				IndexPair indexPair = new IndexPair(lastMatchEnd,
						text.length(), replStart, sb.length(), false);
				splittedStringList.add(indexPair);
			}
			stringWithoutTags = sb.toString();
		}

		// Pass 2: tags are appended unchanged; in text segments every
		// occurrence of the pattern is replaced.
		StringBuffer sbResult = new StringBuffer();
		for (Iterator iter = splittedStringList.iterator(); iter.hasNext();) {
			IndexPair pair = (IndexPair) iter.next();

			if (pair.mIsTag)
				append(sbResult, text, pair.originalStart, pair.originalEnd);
			else {

				// Matcher offsets are relative to the segment, hence the
				// pair.originalStart added to them below.
				Matcher matcher = pattern.matcher(text.substring(
						pair.originalStart, pair.originalEnd));
				int mStart = 0;
				int mEnd = 0;
				// End of the previous match; 0 before the first match.
				int mEndOld = 0;
				// Assigned in the loop but never read.
				int mStartOld = 0;

				while (matcher.find()) {
					mStart = matcher.start();
					mEnd = matcher.end();

					// Copy the text between the previous match (or the
					// segment start on the first iteration) and this match.
					append(sbResult, text, pair.originalStart + mEndOld,
							pair.originalStart + mStart);
					// The matched text itself is dropped in favour of the
					// replacement.
					sbResult.append(replacement);
					mEndOld = mEnd;
					mStartOld = mStart;
				}
				// Copy what is left of the segment after the last match.
				append(sbResult, text, pair.originalStart + mEndOld,
						pair.originalEnd);
			}
		}
		return sbResult.toString();
	}

	/**
	 * Appends the characters of pText from pStart (inclusive) to pEnd
	 * (exclusive) to the buffer. Written by hand because the matching
	 * StringBuffer method only exists from Java 1.5 on.
	 */
	private void append(StringBuffer pSbResult, String pText, int pStart,
			int pEnd) {
		final String slice = pText.substring(pStart, pEnd);
		pSbResult.append(slice);
	}

	/**
	 * Maps a position in the tag-free text back to the original text, using
	 * the first segment of the list whose replaced range contains the
	 * position (both range ends inclusive).
	 *
	 * @param pI
	 *            position in the tag-free text
	 * @param pListOfIndices
	 *            list of {@link IndexPair}s
	 * @return the corresponding position in the original text
	 * @throws IllegalArgumentException
	 *             if no segment contains the position
	 */
	public int getMinimalOriginalPosition(int pI, ArrayList pListOfIndices) {
		final Iterator segments = pListOfIndices.iterator();
		while (segments.hasNext()) {
			final IndexPair segment = (IndexPair) segments.next();
			final boolean beforeSegment = pI < segment.replacedStart;
			final boolean afterSegment = pI > segment.replacedEnd;
			if (beforeSegment || afterSegment) {
				continue;
			}
			final int offsetInSegment = pI - segment.replacedStart;
			return segment.originalStart + offsetInSegment;
		}
		throw new IllegalArgumentException("Position " + pI + " not found.");
	}

	/**
	 * Walks the segment list from its end and uses the first segment that
	 * starts at or before the given position.
	 *
	 * @param pI
	 *            position in the tag-free text
	 * @param pListOfIndices
	 *            list of {@link IndexPair}s
	 * @return the maximal index i such that pI is mapped to i by removing
	 *         all tags from the original input: for a text segment the
	 *         position shifted into the original text, for a tag the end of
	 *         that tag.
	 * @throws IllegalArgumentException
	 *             if no segment starts at or before the position
	 */
	public int getMaximalOriginalPosition(int pI, ArrayList pListOfIndices) {
		int index = pListOfIndices.size();
		while (--index >= 0) {
			final IndexPair segment = (IndexPair) pListOfIndices.get(index);
			if (pI < segment.replacedStart) {
				continue;
			}
			if (segment.mIsTag) {
				return segment.originalEnd;
			}
			return segment.originalStart + pI - segment.replacedStart;
		}
		throw new IllegalArgumentException("Position " + pI + " not found.");
	}

	/**
	 * Tells whether a text is an HTML node, i.e. whether it starts, after
	 * optional whitespace, with an opening html tag (compared case
	 * insensitively).
	 *
	 * @param text
	 *            the text to inspect; may be {@code null}
	 * @return true if the text matches HTML_PATTERN; false otherwise,
	 *         including for {@code null} and for the empty string
	 */
	public static boolean isHtmlNode(String text) {
		if (text == null) {
			return false;
		}
		// Cheap pre-check: anything other than whitespace in front of the
		// first '<' rules HTML out, without lower-casing the whole text and
		// running the regular expression.
		final int length = text.length();
		for (int i = 0; i < length; i++) {
			final char ch = text.charAt(i);
			if (ch == '<') {
				break;
			}
			if (!Character.isWhitespace(ch)) {
				return false;
			}
		}
		return HTML_PATTERN.matcher(text.toLowerCase(Locale.ENGLISH)).matches();
	}

	/**
	 * Replaces characters by hexadecimal numeric character references.
	 * Characters below 32 are always replaced; characters above 126 are
	 * replaced unless the boolean property "wh_nonascii_in_utf8" is set.
	 * Opposite to {@link HtmlTools#unescapeHTMLUnicodeEntity(String)}
	 *
	 * @param text
	 *            the text to convert
	 * @param pPreserveNewlines
	 *            if true, line feed and carriage return are copied unchanged
	 * @return the converted text
	 */
	public static String unicodeToHTMLUnicodeEntity(String text, boolean pPreserveNewlines) {
		// The property does not change while the text is converted, so it
		// is read once instead of once per character.
		final boolean escapeNonAscii = !Resources.getInstance()
				.getBoolProperty("wh_nonascii_in_utf8");
		/*
		 * Heuristic reserve for expansion : factor 1.2
		 */
		StringBuffer result = new StringBuffer((int) (text.length() * 1.2));
		for (int i = 0; i < text.length(); ++i) {
			final char myChar = text.charAt(i);
			final boolean lineBreak = myChar == '\n' || myChar == '\r';
			boolean outOfRange = myChar < 32
					|| (escapeNonAscii && myChar > 126);
			if (pPreserveNewlines && lineBreak) {
				outOfRange = false;
			}
			if (outOfRange) {
				result.append("&#x").append(Integer.toString(myChar, 16))
						.append(';');
			} else {
				result.append(myChar);
			}
		}
		return result.toString();
	}

	/**
	 * Converts numeric character references (hexadecimal with a lower-case
	 * x, or decimal) into plain Java characters and removes characters that
	 * are not valid in XML. Named entities and references whose number
	 * cannot be parsed are copied unchanged. References above 0xFFFF and up
	 * to 0x10FFFF become a surrogate pair; references outside the valid
	 * range are removed.
	 *
	 * An entity that is still open at the end of the text is written out
	 * with a terminating semicolon added.
	 *
	 * Opposite to {@link HtmlTools#unicodeToHTMLUnicodeEntity(String, boolean)}
	 *
	 * @param text
	 *            input
	 * @return the converted output.
	 */
	public static String unescapeHTMLUnicodeEntity(String text) {
		StringBuffer result = new StringBuffer(text.length());
		StringBuffer entity = new StringBuffer();
		boolean readingEntity = false;
		for (int i = 0; i < text.length(); ++i) {
			final char myChar = text.charAt(i);
			if (!readingEntity) {
				if (myChar == '&') {
					readingEntity = true;
				} else if (isXMLValidCharacter(myChar)) {
					result.append(myChar);
				}
				continue;
			}
			if (myChar != ';') {
				if (isXMLValidCharacter(myChar)) {
					entity.append(myChar);
				}
				continue;
			}
			// ';' ends the entity collected so far. The length checks keep
			// an empty entity and a bare '#' from running past the buffer.
			if (entity.length() > 0 && entity.charAt(0) == '#') {
				try {
					final boolean hexadecimal = entity.length() > 1
							&& entity.charAt(1) == 'x';
					final int codePoint = hexadecimal
							? Integer.parseInt(entity.substring(2), 16)
							: Integer.parseInt(entity.substring(1), 10);
					if (codePoint >= 0 && codePoint <= 0xFFFF) {
						final char entityChar = (char) codePoint;
						if (isXMLValidCharacter(entityChar)) {
							result.append(entityChar);
						}
					} else if (codePoint > 0xFFFF && codePoint <= 0x10FFFF) {
						// Outside the 16-bit range: emit the surrogate pair
						// instead of truncating the value to a wrong char.
						final int offset = codePoint - 0x10000;
						result.append((char) (0xD800 + (offset >> 10)));
						result.append((char) (0xDC00 + (offset & 0x3FF)));
					}
					// Negative or too large values are dropped as invalid.
				} catch (NumberFormatException e) {
					result.append('&').append(entity).append(';');
				}
			} else {
				result.append('&').append(entity).append(';');
			}
			entity.setLength(0);
			readingEntity = false;
		}
		if (entity.length() > 0) {
			result.append('&').append(entity).append(';');
		}
		return result.toString();
	}

	/**
	 * Strips all tags from a text that is an HTML node, which makes such
	 * texts comparable; any other text is returned unchanged.
	 */
	public static String removeHtmlTagsFromString(String text) {
		final boolean looksLikeHtml = HtmlTools.isHtmlNode(text);
		return looksLikeHtml ? removeAllTagsFromString(text) : text;
	}

	/** Deletes everything that TAGS_PATTERN matches from the text. */
	public static String removeAllTagsFromString(String text) {
		final Matcher tags = TAGS_PATTERN.matcher(text);
		return tags.replaceAll("");
	}

	/**
	 * Converts HTML to plain text in strict mode: delegates to
	 * {@link #htmlToPlain(String, boolean)} with strictHTMLOnly set to true.
	 */
	public static String htmlToPlain(String text) {
		return htmlToPlain(text, /* strictHTMLOnly= */true);
	}

	public static String htmlToPlain(String text, boolean strictHTMLOnly) {
		// 0. remove all newlines
		// 1. replace newlines, paragraphs, and table rows
		// 2. remove XML tags
		// 3. replace HTML entities including  
		// 4. unescape unicode entities
		// This is a very basic conversion, fixing the most annoying
		// inconvenience. You can imagine much better conversion of
		// HTML to plain text. Most of HTML tags can be handled
		// sensibly, like web browsers do it.
		if (strictHTMLOnly && !isHtmlNode(text)) {
			return text;
		}
		// System.err.println("base:"+text);
		String intermediate = text
				.replaceAll("(?ims)[\n\t]", "")
				. // Remove newlines
				replaceAll("(?ims) +", " ")
				. // Condense spaces
				replaceAll("(?ims)<br.*?>", "\n")
				.replaceAll("(?ims)<p.*?>", "\n\n")
				. // Paragraph
				replaceAll("(?ims)<div.*?>", "\n")
				. // Div - block
				replaceAll("(?ims)<tr.*?>", "\n")
				.replaceAll("(?ims)<dt.*?>", "\n")
				. // Defined term
				replaceAll("(?ims)<dd.*?>", "\n   ")
				. // Definition of defined term
				replaceAll("(?ims)<td.*?>", " ")
				.replaceAll("(?ims)<[uo]l.*?>", "\n")
				. // Beginning of a list
				replaceAll("(?ims)<li.*?>", "\n   * ")
				.replaceAll("(?ims) *</[^>]*>", ""). // Remaining closing HTML
														// tags
				replaceAll("(?ims)<[^/][^>]*> *", ""). // Remaining opening HTML
														// tags
				// FIXME Dimitry: is removing of all new lines at the begin a
				// good idea?
				replaceAll("^\n+", "").
				// fc: to remove start and end spaces.
				trim();

		intermediate = HtmlTools.unescapeHTMLUnicodeEntity(intermediate);

		// Entities, with the exception of &.

		intermediate = intermediate.replaceAll("(?ims)<", "<")
				.replaceAll("(?ims)>", ">").replaceAll("(?ims)"", "\"")
				.replaceAll("(?ims) ", " ");
		// System.err.println("intermediate:"+intermediate);
		return intermediate.replaceAll("(?ims)&", "&");
	}

	/**
	 * Converts plain text into an HTML fragment that starts with the html,
	 * body and p opening tags; no closing tags are appended. Ampersand and
	 * angle brackets are escaped, newlines become br tags, and a space is
	 * written as the nbsp entity unless both of its neighbours have a
	 * character code above 32.
	 *
	 * @param text
	 *            the plain text
	 * @return the HTML fragment
	 */
	public static String plainToHTML(String text) {
		// Tabs are expanded to a fixed run of spaces.
		// NOTE(review): the original comment calls this eight spaces as tab
		// width; verify the length of the literal.
		final String textTabsExpanded = text.replaceAll("\t", "         ");
		final int length = textTabsExpanded.length();
		final int lengthMinus1 = length - 1;
		StringBuffer result = new StringBuffer(length); // Heuristic
		result.append("<html><body><p>");
		for (int i = 0; i < length; ++i) {
			final char myChar = textTabsExpanded.charAt(i);
			// Entities are written as '&' plus name so that a tool which
			// HTML-unescapes this source cannot collapse them again.
			switch (myChar) {
			case '&':
				result.append('&').append("amp;");
				break;
			case '<':
				result.append('&').append("lt;");
				break;
			case '>':
				result.append('&').append("gt;");
				break;
			case ' ':
				if (i > 0 && i < lengthMinus1
						&& (int) textTabsExpanded.charAt(i - 1) > 32
						&& (int) textTabsExpanded.charAt(i + 1) > 32) {
					// A single space between two visible characters.
					result.append(' ');
				} else {
					// Leading, trailing and repeated spaces would collapse
					// in HTML, so they become non-breaking spaces.
					result.append('&').append("nbsp;");
				}
				break;
			case '\n':
				result.append("<br>");
				break;
			default:
				result.append(myChar);
			}
		}
		return result.toString();
	}

	public static String toXMLUnescapedText(String text) {
		return text.replaceAll("<", "<").replaceAll(">", ">")
				.replaceAll(""", "\"").replaceAll("&", "&");
	}

	/**
	 * Escapes ampersand and angle brackets and makes whitespace survive in
	 * HTML: tabs are expanded to spaces, and a space is written as the nbsp
	 * entity unless both of its neighbours have a character code above 32.
	 * All other characters, newlines included, are copied unchanged.
	 */
	public static String toXMLEscapedTextExpandingWhitespace(String text) {
		// Tabs are expanded to a fixed run of spaces.
		// NOTE(review): the original comment calls this eight spaces as tab
		// width; verify the length of the literal.
		text = text.replaceAll("\t", "         ");
		final int len = text.length();
		StringBuffer result = new StringBuffer(len);
		for (int i = 0; i < len; ++i) {
			final char myChar = text.charAt(i);
			// Entities are written as '&' plus name so that a tool which
			// HTML-unescapes this source cannot collapse them again.
			switch (myChar) {
			case '&':
				result.append('&').append("amp;");
				break;
			case '<':
				result.append('&').append("lt;");
				break;
			case '>':
				result.append('&').append("gt;");
				break;
			case ' ':
				if (i > 0 && i < len - 1 && (int) text.charAt(i - 1) > 32
						&& (int) text.charAt(i + 1) > 32) {
					// A single space between two visible characters.
					result.append(' ');
				} else {
					result.append('&').append("nbsp;");
				}
				break;
			default:
				result.append(myChar);
			}
		}
		return result.toString();
	}

	public static String toXMLEscapedText(String text) {
		if(text == null) {
			return "ERROR: none";
		}
		return text.replaceAll("&", "&").replaceAll("<", "<")
				.replaceAll(">", ">").replaceAll("\"", """);
	}

	/**
	 * Parses the text with a non-validating SAX parser. Every failure is
	 * logged at level SEVERE; a parse error is logged together with its
	 * line number and the text itself.
	 *
	 * NOTE(review): the parser factory is used with its default features;
	 * check whether external entities should be disabled for this input.
	 *
	 * @return true, if well formed XML.
	 */
	public boolean isWellformedXml(String xml) {
		boolean wellformed = false;
		try {
			final SAXParserFactory parserFactory = SAXParserFactory
					.newInstance();
			parserFactory.setValidating(false);

			// Parsing is done for its side effect only: the handler ignores
			// all content, and a malformed text makes parse() throw.
			parserFactory.newSAXParser().parse(
					new InputSource(new StringReader(xml)),
					new DefaultHandler());
			wellformed = true;
		} catch (SAXParseException e) {
			final String message = "XmlParseError on line "
					+ e.getLineNumber() + " of " + xml;
			logger.log(Level.SEVERE, message, e);
		} catch (Exception e) {
			logger.log(Level.SEVERE, "XmlParseError", e);
		}
		return wellformed;
	}

	/**
	 * Removes what stands for the NUL character, which is not allowed in
	 * XML: the character itself and its decimal character reference
	 * (ampersand, hash, zero, semicolon).
	 */
	public static String makeValidXml(String pXmlNoteText) {
		// The reference is spelled as '&' plus the rest so that a tool
		// which HTML-unescapes this source cannot turn it into a character.
		return pXmlNoteText.replaceAll("\0", "").replaceAll("&" + "#0;", "");
	}

	/**
	 * Cleans numeric character references for control characters out of a
	 * file's contents: the hexadecimal reference for line feed (value A) is
	 * replaced by a newline, and every other reference in the range 0 - 1f
	 * (hexadecimal) or 0 - 31 (decimal) is removed. Leading zeros in the
	 * number are accepted.
	 */
	public static String replaceIllegalXmlCharacters(String fileContents) {
		// All patterns start with the prefix of a numeric character
		// reference. It is assembled from '&' and '#' so that a tool which
		// HTML-unescapes this source cannot turn it into a character.
		final String hexPrefix = "&" + "#x0*";
		final String decimalPrefix = "&" + "#0*";
		// replace the hexadecimal line feed reference by newline.
		fileContents = fileContents.replaceAll(hexPrefix + "[Aa];", "\n");
		/*
		 * The reference with hexadecimal value b is illegal, but sometimes
		 * occurs in 0.8.x maps. Thus, we exclude all from 0 - 1f and replace
		 * them by nothing. TODO: Which more are illegal??
		 */
		fileContents = fileContents.replaceAll(hexPrefix
				+ "1?[0-9A-Fa-f];", "");
		// decimal: 0-31
		fileContents = fileContents.replaceAll(decimalPrefix
				+ "[1-2]?[0-9];", "");
		fileContents = fileContents.replaceAll(decimalPrefix + "3[0-1];", "");
		return fileContents;
	}

	/**
	 * Determines whether the character is valid in XML. Invalid characters
	 * include most of the range x00-x1F, the surrogate range xD800-xDFFF,
	 * and xFFFE and xFFFF.
	 *
	 * The XML range x10000-x10FFFF is not tested here, because no value of
	 * the 16 bit type char can lie in it.
	 *
	 * @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char">XML
	 *      1.0, production Char</a>
	 */
	public static boolean isXMLValidCharacter(char character) {
		// Order the tests in such a sequence that the most probable
		// conditions are tested first.
		return (character >= 0x20 && character <= 0xD7FF)
				|| character == 0x9
				|| character == 0xA
				|| character == 0xD
				|| (character >= 0xE000 && character <= 0xFFFD);
	}

	/**
	 * Removes numeric character references and plain characters that are
	 * not valid in XML.
	 *
	 * Precondition: The input text contains XML unicode entities rather than
	 * Java unicode text.
	 *
	 * The algorithm: Search the string for XML entities. For each numeric
	 * entity inspect whether its value is valid; if valid, append the entity
	 * unchanged, otherwise drop it. Named entities and entities whose number
	 * cannot be parsed are appended unchanged. To be on the safe side, also
	 * inspect for no-entity unicode whether it is XML-valid, and pass on
	 * only XML-valid characters.
	 *
	 * The value of an entity is checked as a whole number: values up to
	 * xFFFF with the method isXMLValidCharacter, larger values against the
	 * upper bound x10FFFF. An entity that is still open at the end of the
	 * text is written out with a terminating semicolon added.
	 *
	 * @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char">XML
	 *      1.0, production Char</a>
	 */
	public static String removeInvalidXmlCharacters(String text) {
		StringBuffer result = new StringBuffer(text.length());
		StringBuffer entity = new StringBuffer();
		boolean readingEntity = false;
		for (int i = 0; i < text.length(); ++i) {
			final char myChar = text.charAt(i);
			if (!readingEntity) {
				if (myChar == '&') {
					readingEntity = true;
				} else if (isXMLValidCharacter(myChar)) {
					// This test is superfluous under the assumption that
					// the string only contains unicode in XML entities.
					result.append(myChar);
				}
				continue;
			}
			if (myChar != ';') {
				entity.append(myChar);
				continue;
			}
			// ';' ends the entity collected so far. The length checks keep
			// an empty entity and a bare '#' from running past the buffer.
			boolean keepEntity = true;
			if (entity.length() > 0 && entity.charAt(0) == '#') {
				try {
					final boolean hexadecimal = entity.length() > 1
							&& entity.charAt(1) == 'x';
					final int codePoint = hexadecimal
							? Integer.parseInt(entity.substring(2), 16)
							: Integer.parseInt(entity.substring(1), 10);
					// Checked as int: casting to char first would truncate
					// values above xFFFF and wrap negative ones.
					if (codePoint < 0) {
						keepEntity = false;
					} else if (codePoint <= 0xFFFF) {
						keepEntity = isXMLValidCharacter((char) codePoint);
					} else {
						keepEntity = codePoint <= 0x10FFFF;
					}
				} catch (NumberFormatException e) {
					// Not a number after all: pass it on unchanged.
					keepEntity = true;
				}
			}
			if (keepEntity) {
				result.append('&').append(entity).append(';');
			}
			entity.setLength(0);
			readingEntity = false;
		}
		if (entity.length() > 0) {
			result.append('&').append(entity).append(';');
		}
		return result.toString();
	}

	/**
	 * Extracts the content of the body element from an HTML text.
	 *
	 * The result starts after the opening body tag; if there is none, after
	 * the opening html tag; if there is neither, at the start of the text.
	 * It ends before the closing body tag; if there is none, before the
	 * closing html tag; if there is neither, at the end of the text. Tag
	 * names are matched in lower case only.
	 *
	 * @param output
	 *            the HTML text
	 * @return the body content
	 */
	public static String extractHtmlBody(String output) {
		// First position after the opening html tag, or 0 if the text does
		// not start with one. The end of the tag is searched for rather
		// than assumed, so an html tag with attributes works as well.
		int afterHtmlTag = 0;
		if (output.startsWith("<html")) {
			// indexOf yields -1 for an unterminated tag, which gives 0.
			afterHtmlTag = output.indexOf('>') + 1;
		}
		int start = afterHtmlTag;
		final int bodyTag = output.indexOf("<body", afterHtmlTag);
		if (bodyTag != -1) {
			// Skip the rest of the opening body tag, attributes included.
			start = output.indexOf('>', bodyTag + 5) + 1;
		}
		// Searching from start on guarantees end >= start.
		int end = output.indexOf("</body>", start);
		if (end == -1) {
			end = output.indexOf("</html>", start);
		}
		if (end == -1) {
			end = output.length();
		}
		return output.substring(start, end);
	}

	/**
	 * Is used from XSLT! Don't change, unless you change the
	 * freemind_version_updater.xslt, too.
	 *
	 * Keeps the first space of every run of spaces and replaces each
	 * further space of the run by {@link #NBSP}. All other characters are
	 * copied unchanged.
	 *
	 * @param input
	 *            the text to convert
	 * @return the converted text
	 */
	public static String replaceSpacesToNonbreakableSpaces(String input) {
		final int length = input.length();
		final StringBuffer converted = new StringBuffer(length);
		boolean previousWasSpace = false;
		for (int pos = 0; pos < length; ++pos) {
			final char current = input.charAt(pos);
			final boolean isSpace = current == ' ';
			if (isSpace && previousWasSpace) {
				converted.append(NBSP);
			} else {
				converted.append(current);
			}
			previousWasSpace = isSpace;
		}
		return converted.toString();
	}

	/**
	 * Replaces numeric character references by the characters they stand
	 * for, as far as doUnescapeUtf8 accepts them. A string without an
	 * ampersand is returned as it is. Code borrowed from
	 * org.apache.commons.lang.Entities.
	 */
	public String unescape_utf8(String str) {
		final int firstAmp = str.indexOf('&');
		if (firstAmp < 0) {
			return str;
		}
		final StringWriter unescaped = createStringWriter(str);
		try {
			doUnescapeUtf8(unescaped, str, firstAmp);
		} catch (IOException e) {
			// Not expected: the StringWriter methods that are called do not
			// throw IOExceptions. The input is returned unchanged then.
			return str;
		}
		return unescaped.toString();
	}

	/**
	 * Creates a StringWriter whose initial size is 10% above the length of
	 * the source String, to avoid growing the writer.
	 *
	 * @param str The source string
	 * @return A newly created StringWriter
	 */
	private StringWriter createStringWriter(String str) {
		final int sourceLength = str.length();
		final int initialSize = (int) (sourceLength + (sourceLength * 0.1));
		return new StringWriter(initialSize);
	}


	/**
	 * Underlying unescape method; it starts at the first ampersand instead
	 * of index 0 and copies everything in front of it in one go.
	 *
	 * A numeric character reference (decimal, or hexadecimal after x or X)
	 * with a value from 128 to 0xFFFF is replaced by its character. Every
	 * other entity is written back unchanged. An ampersand that has no
	 * semicolon after it, or that is followed by another ampersand before
	 * the next semicolon, is written as a plain character.
	 *
	 * @param writer
	 *            The <code>Writer</code> to write the results to; assumed to be non-null.
	 * @param str
	 *            The source <code>String</code> to unescape; assumed to be non-null.
	 * @param firstAmp
	 *            The <code>int</code> index of the first ampersand in the source String.
	 * @throws IOException
	 *             when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
	 *             methods.
	 */
	private void doUnescapeUtf8(Writer writer, String str, int firstAmp) throws IOException {
		writer.write(str, 0, firstAmp);
		final int length = str.length();
		for (int pos = firstAmp; pos < length; pos++) {
			final char current = str.charAt(pos);
			if (current != '&') {
				writer.write(current);
				continue;
			}
			final int contentStart = pos + 1;
			final int semicolon = str.indexOf(';', contentStart);
			if (semicolon == -1) {
				writer.write(current);
				continue;
			}
			final int nextAmp = str.indexOf('&', contentStart);
			if (nextAmp != -1 && nextAmp < semicolon) {
				// Then the text looks like &...&...;
				writer.write(current);
				continue;
			}
			final String content = str.substring(contentStart, semicolon);
			// -1 means: do not replace, write the entity back as it was.
			// This is also the outcome for entity names.
			int value = -1;
			if (content.length() > 1 && content.charAt(0) == '#') {
				try {
					final char marker = content.charAt(1);
					if (marker == 'x' || marker == 'X') {
						value = Integer.parseInt(content.substring(2), 16);
					} else {
						value = Integer.parseInt(content.substring(1), 10);
					}
					if (value < 128 || value > 0xFFFF) {
						value = -1;
					}
				} catch (NumberFormatException e) {
					value = -1;
				}
			}

			if (value == -1) {
				writer.write('&');
				writer.write(content);
				writer.write(';');
			} else {
				writer.write(value);
			}
			pos = semicolon; // continue behind the semicolon
		}
	}
}