/*FreeMind - A Program for creating and viewing Mindmaps
*Copyright (C) 2006 Christian Foltin <christianfoltin@users.sourceforge.net>
*See COPYING for Details
*
*This program is free software; you can redistribute it and/or
*modify it under the terms of the GNU General Public License
*as published by the Free Software Foundation; either version 2
*of the License, or (at your option) any later version.
*
*This program is distributed in the hope that it will be useful,
*but WITHOUT ANY WARRANTY; without even the implied warranty of
*MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
*GNU General Public License for more details.
*
*You should have received a copy of the GNU General Public License
*along with this program; if not, write to the Free Software
*Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
/*$Id: HtmlTools.java,v 1.1.2.28 2010/12/04 21:07:23 christianfoltin Exp $*/
package freemind.main;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.text.BadLocationException;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.InputSource;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler;
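/**
 * Helpers for converting between HTML, XHTML, XML-escaped text and plain
 * text, and for replacing text inside HTML content without touching its tags.
 * Most helpers are static; the rest are reached through {@link #getInstance()}.
 */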
public class HtmlTools {
/** The no-break space character (U+00A0) as a one-character string. */
public static final String NBSP = "\u00A0";
/** Class-wide logger; it is assigned by the private constructor, not at declaration. */
private static Logger logger;
/**
* The single instance handed out by {@link #getInstance()}. It is created
* during class initialisation, before the pattern constants declared below
* it are initialised.
*/
private static HtmlTools sInstance = new HtmlTools();
/**
* Matches text that consists of optional whitespace, a '&lt;', optional
* whitespace, the letters "html" and, somewhere later, a '&gt;'. (?s) lets
* '.' match line terminators as well.
*/
private static final Pattern HTML_PATTERN = Pattern
.compile("(?s)^\\s*<\\s*html.*?>.*");
/**
* Group 1 is a (possibly empty) run of characters without '&lt;', group 2
* the tag that directly follows it.
*/
private static final Pattern FIND_TAGS_PATTERN = Pattern
.compile("([^<]*)(<[^>]+>)");
/**
* Matches the self-closed form ("&lt;br/&gt;", "&lt;img .../&gt;", ...) of
* the listed element names. Group 1 is the element name together with its
* attributes, i.e. the tag content without the closing slash.
*/
private static final Pattern SLASHED_TAGS_PATTERN = Pattern.compile("<(("
+ "br|area|base|basefont|" + "bgsound|button|col|colgroup|embed|hr"
+ "|img|input|isindex|keygen|link|meta"
+ "|object|plaintext|spacer|wbr" + ")(\\s[^>]*)?)/>");
/**
* Matches a '&lt;', any characters other than '&lt;' and '&gt;' (line
* terminators included), and a closing '&gt;'.
*/
private static final Pattern TAGS_PATTERN = Pattern.compile("(?s)<[^><]*>");
public static final String SP = " ";
/**
*
*/
private HtmlTools() {
super();
logger = Resources.getInstance().getLogger(HtmlTools.class.getName());
}
/**
* Gives access to the non-static helpers of this class.
*
* @return the instance stored in the static sInstance field.
*/
public static HtmlTools getInstance() {
return sInstance;
}
/**
 * Converts the HTML text of a node into XHTML.
 *
 * @param htmlText
 *            the text to convert.
 * @return <code>null</code> if the text is not recognised as HTML by
 *         {@link #isHtmlNode(String)}; the XHTML produced by the converter
 *         if that result is well-formed XML; otherwise the XML-escaped
 *         input text as a fallback.
 */
public String toXhtml(String htmlText) {
    if (!isHtmlNode(htmlText)) {
        return null;
    }
    logger.fine("Enter toXhtml with " + htmlText);
    final StringReader htmlSource = new StringReader(htmlText);
    final StringWriter xhtmlSink = new StringWriter();
    try {
        XHTMLWriter.html2xhtml(htmlSource, xhtmlSink);
        String xhtml = xhtmlSink.toString();
        final boolean nonAsciiAsUtf8 = Resources.getInstance()
                .getBoolProperty("wh_nonascii_in_utf8");
        if (nonAsciiAsUtf8) {
            xhtml = unescape_utf8(xhtml);
        }
        // Only hand out the converter's result if a parser accepts it.
        if (isWellformedXml(xhtml)) {
            logger.fine("Leave toXhtml with " + xhtml);
            return xhtml;
        }
    } catch (IOException ioFailure) {
        freemind.main.Resources.getInstance().logException(ioFailure);
    } catch (BadLocationException locationFailure) {
        freemind.main.Resources.getInstance().logException(locationFailure);
    }
    // Conversion failed or was not well-formed: escape the raw input instead.
    final String escapedInput = toXMLEscapedText(htmlText);
    logger.fine("Leave toXhtml with fallback " + escapedInput);
    return escapedInput;
}
/**
 * Turns XHTML into HTML by removing the '/' from the self-closed form
 * (&lt;.../&gt;) of those elements that are written without it in HTML.
 *
 * @param xhtmlText
 *            the XHTML text.
 * @return the text with the affected tags rewritten as &lt;...&gt;.
 */
public String toHtml(String xhtmlText) {
    final Matcher selfClosedTags = SLASHED_TAGS_PATTERN.matcher(xhtmlText);
    // Group 1 is the element name plus attributes, i.e. all but the slash.
    return selfClosedTags.replaceAll("<$1>");
}
/**
 * Describes one segment of a text that was split into tags and text between
 * tags: where the segment lies in the original text and where it lies in the
 * text with the tags removed.
 */
public static class IndexPair {
    /** Start of the segment in the original text. */
    public int originalStart;
    /** End of the segment in the original text. */
    public int originalEnd;
    /** Start of the segment in the text without tags. */
    public int replacedStart;
    /** End of the segment in the text without tags. */
    public int replacedEnd;
    /** <code>true</code> if the segment is a tag, <code>false</code> for text. */
    public boolean mIsTag;
    /** Initially <code>false</code>; not read anywhere in this file. */
    public boolean mIsAlreadyAppended = false;

    /**
     * @param pOriginalStart
     *            start of the segment in the original text.
     * @param pOriginalEnd
     *            end of the segment in the original text.
     * @param pReplacedStart
     *            start of the segment in the text without tags.
     * @param pReplacedEnd
     *            end of the segment in the text without tags.
     * @param pIsTag
     *            whether the segment is a tag.
     */
    public IndexPair(int pOriginalStart, int pOriginalEnd,
            int pReplacedStart, int pReplacedEnd, boolean pIsTag) {
        this.originalStart = pOriginalStart;
        this.originalEnd = pOriginalEnd;
        this.replacedStart = pReplacedStart;
        this.replacedEnd = pReplacedEnd;
        this.mIsTag = pIsTag;
    }

    /**
     * @return a one-line dump of the four positions and the tag flag.
     */
    public String toString() {
        return "[IndexPair:"
                + " originalStart: " + originalStart
                + " originalEnd: " + originalEnd
                + " replacedStart: " + replacedStart
                + " replacedEnd: " + replacedEnd
                + " is a tag: " + mIsTag
                + "]";
    }
}
/**
* Replaces every match of <code>pattern</code> in the text between tags
* with <code>replacement</code> and leaves the tags themselves untouched.
* <p>
* The input is first split, with FIND_TAGS_PATTERN, into alternating text
* and tag segments. Each text segment is then searched on its own, so a
* match never spans a tag. The replacement is appended literally; group
* references such as <code>$1</code> in it are not expanded.
* <p>
* fc, 19.12.06: This method is very difficult. If you have a simpler
* method, please supply it. But make sure that it complies with
* FindTextTests!!!
*
* @param pattern
*            the pattern to search for in the text between tags.
* @param replacement
*            the literal text written in place of each match.
* @param text
*            the text (with tags) to process.
* @return the text with all matches outside of tags replaced.
*/
public String getReplaceResult(Pattern pattern, String replacement,
String text) {
// Segments (IndexPair) of the input in the order in which they occur.
ArrayList splittedStringList = new ArrayList();
// NOTE(review): assigned further down but never read in this method.
String stringWithoutTags = null;
// remove tags and denote their positions:
{
// Receives the input with all tags dropped; its length supplies the
// "replaced" positions stored in the pairs.
StringBuffer sb = new StringBuffer();
Matcher matcher = FIND_TAGS_PATTERN.matcher(text);
int lastMatchEnd = 0;
while (matcher.find()) {
String textWithoutTag = matcher.group(1);
// Append text without tags:
int replStart = sb.length();
// Appends whatever lies between the previous match and this one,
// followed by group 1 only, which drops the tag (group 2).
matcher.appendReplacement(sb, "$1");
IndexPair indexPair;
if (textWithoutTag.length() > 0) {
// The text segment starts at the end of the previous segment,
// not at the start of group 1.
indexPair = new IndexPair(lastMatchEnd, matcher.end(1),
replStart, sb.length(), false);
lastMatchEnd = matcher.end(1);
// System.out.println(sb.toString()
// + ", "
// + input.substring(indexPair.originalStart,
// indexPair.originalEnd) + ", " + indexPair);
splittedStringList.add(indexPair);
}
// String tag = matcher.group(2);
// A tag adds nothing to sb, so its replaced start and end coincide.
replStart = sb.length();
indexPair = new IndexPair(lastMatchEnd, matcher.end(2),
replStart, sb.length(), true);
lastMatchEnd = matcher.end(2);
// System.out.println(sb.toString() + ", " +
// input.substring(indexPair.originalStart,
// indexPair.originalEnd)+ ", " + indexPair);
splittedStringList.add(indexPair);
}
int replStart = sb.length();
matcher.appendTail(sb);
// append tail only if there is a tail
if (sb.length() != replStart) {
IndexPair indexPair = new IndexPair(lastMatchEnd,
text.length(), replStart, sb.length(), false);
// System.out.println(sb.toString() + ", " + indexPair);
splittedStringList.add(indexPair);
}
// System.out.println(sb.toString());
stringWithoutTags = sb.toString();
}
// // give it out:
// for (Iterator iter = splittedStringList.iterator(); iter.hasNext();)
// {
// IndexPair pair = (IndexPair) iter.next();
// System.out.println(text.substring(pair.originalStart,
// pair.originalEnd) + ", " + pair);
// }
/**
* For each pair which is not a tag we find the occurrences of the
* pattern and replace them; if the pair is a tag we just append it.
*/
StringBuffer sbResult = new StringBuffer();
for (Iterator iter = splittedStringList.iterator(); iter.hasNext();) {
IndexPair pair = (IndexPair) iter.next();
if (pair.mIsTag)
append(sbResult, text, pair.originalStart, pair.originalEnd);
else {
// Match positions below are relative to the start of this segment.
Matcher matcher = pattern.matcher(text.substring(
pair.originalStart, pair.originalEnd));
int mStart = 0;
int mEnd = 0;
int mEndOld = 0;
// NOTE(review): mStartOld is written in the loop but never read.
int mStartOld = 0;
while (matcher.find()) {
mStart = matcher.start();
mEnd = matcher.end();
append(sbResult, text, pair.originalStart + mEndOld,
pair.originalStart + mStart);
/**
* On the first iteration this appended the text between the
* start of the segment and the first occurrence; on later
* iterations (mEndOld != 0) the text between two
* occurrences.
*/
// sbResult.append(text, pair.originalStart + mStart,
// pair.originalStart + mEnd);
// original text
sbResult.append(replacement);
mEndOld = mEnd;
mStartOld = mStart;
}
append(sbResult, text, pair.originalStart + mEndOld,
pair.originalEnd);
// append tail
}
}
// System.out.println("Result:'"+sbResult.toString()+"'");
return sbResult.toString();
}
/**
* Appends the characters of <code>pText</code> from <code>pStart</code>
* (inclusive) to <code>pEnd</code> (exclusive) to the buffer.
* <p>
* Needs to be programmed here, as the corresponding StringBuffer method
* first appears in Java 1.5.
*
* @param pSbResult
*            the buffer to append to.
* @param pText
*            the text to take the characters from.
* @param pStart
*            index of the first character to append.
* @param pEnd
*            index after the last character to append.
*/
private void append(StringBuffer pSbResult, String pText, int pStart,
int pEnd) {
pSbResult.append(pText.substring(pStart, pEnd));
}
/**
 * Maps a position in the text without tags back to the original text,
 * using the first pair (in list order) whose replaced range contains the
 * position; both ends of the range count as contained.
 *
 * @param pI
 *            position in the text without tags.
 * @param pListOfIndices
 *            list of {@link IndexPair} elements.
 * @return the corresponding position in the original text.
 * @throws IllegalArgumentException
 *             if no pair contains the position.
 */
public int getMinimalOriginalPosition(int pI, ArrayList pListOfIndices) {
    final int pairCount = pListOfIndices.size();
    for (int idx = 0; idx < pairCount; idx++) {
        final IndexPair candidate = (IndexPair) pListOfIndices.get(idx);
        final boolean liesBefore = pI < candidate.replacedStart;
        final boolean liesBehind = pI > candidate.replacedEnd;
        if (liesBefore || liesBehind) {
            continue;
        }
        return candidate.originalStart + pI - candidate.replacedStart;
    }
    throw new IllegalArgumentException("Position " + pI + " not found.");
}
/**
 * Maps a position in the text without tags back to the original text,
 * searching the pairs from the end of the list. For the last pair that
 * starts at or before the position, a text pair yields the position
 * shifted into the original text and a tag pair yields the end of the tag.
 *
 * @param pI
 *            position in the text without tags.
 * @param pListOfIndices
 *            list of {@link IndexPair} elements.
 * @return the maximal index i such that pI is mapped to i by removing all
 *         tags from the original input.
 * @throws IllegalArgumentException
 *             if every pair starts behind the position.
 */
public int getMaximalOriginalPosition(int pI, ArrayList pListOfIndices) {
    int idx = pListOfIndices.size();
    while (--idx >= 0) {
        final IndexPair candidate = (IndexPair) pListOfIndices.get(idx);
        if (pI < candidate.replacedStart) {
            continue;
        }
        return candidate.mIsTag
                ? candidate.originalEnd
                : candidate.originalStart + pI - candidate.replacedStart;
    }
    throw new IllegalArgumentException("Position " + pI + " not found.");
}
/**
 * Tells whether the text is HTML, i.e. whether it matches HTML_PATTERN
 * (optional whitespace followed by an opening html tag) when compared
 * case-insensitively.
 *
 * @param text
 *            the text to inspect; may be <code>null</code>.
 * @return <code>true</code> if the text is HTML; <code>false</code>
 *         otherwise, including for <code>null</code>, empty and
 *         whitespace-only text.
 */
public static boolean isHtmlNode(String text) {
    if (text == null) {
        return false;
    }
    // Cheap pre-check so that lower-casing and the regular expression are
    // only paid for text whose first non-whitespace character is '<'.
    final int length = text.length();
    int firstVisible = 0;
    while (firstVisible < length
            && Character.isWhitespace(text.charAt(firstVisible))) {
        firstVisible++;
    }
    if (firstVisible == length || text.charAt(firstVisible) != '<') {
        return false;
    }
    return HTML_PATTERN.matcher(text.toLowerCase(Locale.ENGLISH)).matches();
}
/**
 * Replaces characters by hexadecimal character references of the form
 * <code>&#xHH;</code>. Characters below 32 are always replaced; characters
 * above 126 are replaced unless the property "wh_nonascii_in_utf8" is set.
 * Opposite to {@link HtmlTools#unescapeHTMLUnicodeEntity(String)}
 *
 * @param text
 *            the text to convert.
 * @param pPreserveNewlines
 *            if <code>true</code>, '\n' and '\r' are copied unchanged
 *            instead of being replaced.
 * @return the converted text.
 */
public static String unicodeToHTMLUnicodeEntity(String text, boolean pPreserveNewlines) {
    // The property does not change while the text is processed, so it is
    // looked up once instead of once per character.
    final boolean keepNonAscii = Resources.getInstance().getBoolProperty(
            "wh_nonascii_in_utf8");
    /*
     * Heuristic reserve for expansion : factor 1.2
     */
    StringBuffer result = new StringBuffer((int) (text.length() * 1.2));
    for (int i = 0; i < text.length(); ++i) {
        final char myChar = text.charAt(i);
        boolean outOfRange = myChar < 32 || (!keepNonAscii && myChar > 126);
        if (pPreserveNewlines && (myChar == '\n' || myChar == '\r')) {
            outOfRange = false;
        }
        if (outOfRange) {
            result.append("&#x").append(Integer.toString(myChar, 16))
                    .append(';');
        } else {
            result.append(myChar);
        }
    }
    return result.toString();
}
/**
 * Converts numeric character references (<code>&#xHH;</code> and
 * <code>&#DD;</code>) into plain Java characters; for example,
 * <code>&#xff;</code> becomes the character U+00FF. References to
 * characters that are not valid in XML, such as <code>&#8;</code>, are
 * removed, and so are XML-invalid characters in the text itself. Named
 * entities and anything that does not parse as a number are copied
 * unchanged.
 * <p>
 * References above U+FFFF are written as a surrogate pair. An ampersand
 * that is never closed by ';' is copied as it was read, without adding a
 * semicolon.
 *
 * Opposite to {@link HtmlTools#unicodeToHTMLUnicodeEntity(String, boolean)}
 *
 * @param text
 *            input
 * @return the converted output.
 */
public static String unescapeHTMLUnicodeEntity(String text) {
    StringBuffer result = new StringBuffer(text.length());
    StringBuffer entity = new StringBuffer();
    boolean readingEntity = false;
    for (int i = 0; i < text.length(); ++i) {
        final char myChar = text.charAt(i);
        if (!readingEntity) {
            if (myChar == '&') {
                readingEntity = true;
            } else if (isXMLValidCharacter(myChar)) {
                result.append(myChar);
            }
            continue;
        }
        if (myChar != ';') {
            if (isXMLValidCharacter(myChar)) {
                entity.append(myChar);
            }
            continue;
        }
        // ';' closes the entity. The length check keeps "&;" and "&#;"
        // from running past the end of the entity buffer.
        if (entity.length() > 1 && entity.charAt(0) == '#') {
            try {
                final int codePoint;
                if (entity.charAt(1) == 'x') {
                    // Hexadecimal
                    codePoint = Integer.parseInt(entity.substring(2), 16);
                } else {
                    // Decimal
                    codePoint = Integer.parseInt(entity.substring(1), 10);
                }
                if (codePoint >= 0x10000 && codePoint <= 0x10FFFF) {
                    // Does not fit into one char: write the surrogate pair.
                    final int offset = codePoint - 0x10000;
                    result.append((char) (0xD800 + (offset >> 10)));
                    result.append((char) (0xDC00 + (offset & 0x3FF)));
                } else if (codePoint >= 0 && codePoint <= 0xFFFF
                        && isXMLValidCharacter((char) codePoint)) {
                    result.append((char) codePoint);
                }
                // Everything else is not a valid XML character: drop it.
            } catch (NumberFormatException e) {
                // Not a number after all: keep the text as it was.
                result.append('&').append(entity).append(';');
            }
        } else {
            result.append('&').append(entity).append(';');
        }
        entity.setLength(0);
        readingEntity = false;
    }
    if (readingEntity) {
        // The text ended before the entity was closed.
        result.append('&').append(entity);
    }
    return result.toString();
}
/**
 * Strips all tags (&lt;..&gt;) from a text that is recognised as HTML by
 * {@link #isHtmlNode(String)}, to make it comparable. Any other text is
 * returned unchanged.
 *
 * @param text
 *            the text to strip.
 * @return the text without tags, or the unchanged text if it is not HTML.
 */
public static String removeHtmlTagsFromString(String text) {
    return HtmlTools.isHtmlNode(text) ? removeAllTagsFromString(text) : text;
}
/**
 * Removes everything that TAGS_PATTERN matches, whether or not the text is
 * HTML.
 *
 * @param text
 *            the text to strip.
 * @return the text without tags.
 */
public static String removeAllTagsFromString(String text) {
    final Matcher tags = TAGS_PATTERN.matcher(text);
    return tags.replaceAll("");
}
public static String htmlToPlain(String text) {
return htmlToPlain(text, /* strictHTMLOnly= */true);
}
public static String htmlToPlain(String text, boolean strictHTMLOnly) {
// 0. remove all newlines
// 1. replace newlines, paragraphs, and table rows
// 2. remove XML tags
// 3. replace HTML entities including
// 4. unescape unicode entities
// This is a very basic conversion, fixing the most annoying
// inconvenience. You can imagine much better conversion of
// HTML to plain text. Most of HTML tags can be handled
// sensibly, like web browsers do it.
if (strictHTMLOnly && !isHtmlNode(text)) {
return text;
}
// System.err.println("base:"+text);
String intermediate = text
.replaceAll("(?ims)[\n\t]", "")
. // Remove newlines
replaceAll("(?ims) +", " ")
. // Condense spaces
replaceAll("(?ims)<br.*?>", "\n")
.replaceAll("(?ims)<p.*?>", "\n\n")
. // Paragraph
replaceAll("(?ims)<div.*?>", "\n")
. // Div - block
replaceAll("(?ims)<tr.*?>", "\n")
.replaceAll("(?ims)<dt.*?>", "\n")
. // Defined term
replaceAll("(?ims)<dd.*?>", "\n ")
. // Definition of defined term
replaceAll("(?ims)<td.*?>", " ")
.replaceAll("(?ims)<[uo]l.*?>", "\n")
. // Beginning of a list
replaceAll("(?ims)<li.*?>", "\n * ")
.replaceAll("(?ims) *</[^>]*>", ""). // Remaining closing HTML
// tags
replaceAll("(?ims)<[^/][^>]*> *", ""). // Remaining opening HTML
// tags
// FIXME Dimitry: is removing of all new lines at the begin a
// good idea?
replaceAll("^\n+", "").
// fc: to remove start and end spaces.
trim();
intermediate = HtmlTools.unescapeHTMLUnicodeEntity(intermediate);
// Entities, with the exception of &.
intermediate = intermediate.replaceAll("(?ims)<", "<")
.replaceAll("(?ims)>", ">").replaceAll("(?ims)"", "\"")
.replaceAll("(?ims) ", " ");
// System.err.println("intermediate:"+intermediate);
return intermediate.replaceAll("(?ims)&", "&");
}
/**
 * Converts plain text into HTML: '&amp;', '&lt;' and '&gt;' are escaped,
 * newlines become &lt;br&gt;, tabs are expanded to eight spaces, and every
 * space that is not a single space between two visible characters becomes
 * a non-breaking space entity, so that indentation and runs of spaces
 * survive rendering.
 * <p>
 * The result starts with "&lt;html&gt;&lt;body&gt;&lt;p&gt;"; no closing
 * tags are written.
 *
 * @param text
 *            the plain text.
 * @return the HTML text.
 */
public static String plainToHTML(String text) {
    // Use eight spaces as tab width.
    final String textTabsExpanded = text.replaceAll("\t", "        ");
    final int length = textTabsExpanded.length();
    final int lengthMinus1 = length - 1;
    StringBuffer result = new StringBuffer(length); // Heuristic
    result.append("<html><body><p>");
    for (int i = 0; i < length; ++i) {
        final char myChar = textTabsExpanded.charAt(i);
        switch (myChar) {
        case '&':
            result.append("&amp;");
            break;
        case '<':
            result.append("&lt;");
            break;
        case '>':
            result.append("&gt;");
            break;
        case ' ':
            // Only a space with a visible character on both sides may stay
            // an ordinary space.
            if (i > 0 && i < lengthMinus1
                    && (int) textTabsExpanded.charAt(i - 1) > 32
                    && (int) textTabsExpanded.charAt(i + 1) > 32) {
                result.append(' ');
            } else {
                result.append("&nbsp;");
            }
            break;
        case '\n':
            result.append("<br>");
            break;
        default:
            result.append(myChar);
        }
    }
    return result.toString();
}
public static String toXMLUnescapedText(String text) {
return text.replaceAll("<", "<").replaceAll(">", ">")
.replaceAll(""", "\"").replaceAll("&", "&");
}
/**
 * Escapes '&amp;', '&lt;' and '&gt;' and makes whitespace survive
 * rendering: tabs are expanded to eight spaces, and every space that is
 * not a single space between two visible characters becomes a
 * non-breaking space entity.
 *
 * @param text
 *            the text to escape.
 * @return the escaped text.
 */
public static String toXMLEscapedTextExpandingWhitespace(String text) {
    // Spaces and tabs are handled
    text = text.replaceAll("\t", "        "); // Use eight spaces as tab width.
    final int len = text.length();
    StringBuffer result = new StringBuffer(len);
    for (int i = 0; i < len; ++i) {
        final char myChar = text.charAt(i);
        switch (myChar) {
        case '&':
            result.append("&amp;");
            break;
        case '<':
            result.append("&lt;");
            break;
        case '>':
            result.append("&gt;");
            break;
        case ' ':
            // Only a space with a visible character on both sides may stay
            // an ordinary space.
            if (i > 0 && i < len - 1 && (int) text.charAt(i - 1) > 32
                    && (int) text.charAt(i + 1) > 32) {
                result.append(' ');
            } else {
                result.append("&nbsp;");
            }
            break;
        default:
            result.append(myChar);
        }
    }
    return result.toString();
}
public static String toXMLEscapedText(String text) {
if(text == null) {
return "ERROR: none";
}
return text.replaceAll("&", "&").replaceAll("<", "<")
.replaceAll(">", ">").replaceAll("\"", """);
}
/**
 * Checks whether the text can be parsed as XML by a non-validating SAX
 * parser. Parse failures are logged at level SEVERE.
 * <p>
 * The parser is asked not to resolve external entities and not to load
 * external DTDs, so checking a text neither reads local files nor
 * accesses the network.
 *
 * @param xml
 *            the text to check.
 * @return true, if well formed XML.
 */
public boolean isWellformedXml(String xml) {
    try {
        // Create a parser factory
        SAXParserFactory factory = SAXParserFactory.newInstance();
        factory.setValidating(false);
        disableParserFeature(factory,
                "http://xml.org/sax/features/external-general-entities");
        disableParserFeature(factory,
                "http://xml.org/sax/features/external-parameter-entities");
        disableParserFeature(factory,
                "http://apache.org/xml/features/nonvalidating/load-external-dtd");
        // Create the parser and parse the text
        factory.newSAXParser().parse(
                new InputSource(new StringReader(xml)),
                new DefaultHandler());
        return true;
    } catch (SAXParseException e) {
        logger.log(
                Level.SEVERE,
                "XmlParseError on line " + e.getLineNumber() + " of " + xml,
                e);
    } catch (Exception e) {
        logger.log(Level.SEVERE, "XmlParseError", e);
    }
    return false;
}

/**
 * Switches a parser feature off if the parser implementation knows it.
 */
private static void disableParserFeature(SAXParserFactory factory,
        String feature) {
    try {
        factory.setFeature(feature, false);
    } catch (javax.xml.parsers.ParserConfigurationException ignored) {
        // Best effort: the check must keep working with parsers that do
        // not offer this feature.
    } catch (org.xml.sax.SAXException ignored) {
        // Feature not recognised or not supported by this parser: see above.
    }
}
/**
 * Removes the NUL character, which is not allowed in XML, both as a raw
 * character (\0) and as the character reference <code>&#0;</code>.
 *
 * @param pXmlNoteText
 *            the XML text.
 * @return the text without NUL characters and NUL references.
 */
public static String makeValidXml(String pXmlNoteText) {
    return pXmlNoteText.replaceAll("\0", "").replaceAll("&#0;", "");
}
/**
 * Cleans character references to control characters out of file contents:
 * the hexadecimal reference to the line feed (<code>&#xa;</code>, with any
 * number of leading zeros) is replaced by a newline character, and all
 * other hexadecimal and decimal references to the characters 0 to 31 are
 * removed.
 *
 * @param fileContents
 *            the contents to clean.
 * @return the cleaned contents.
 */
public static String replaceIllegalXmlCharacters(String fileContents) {
    // replace &#xa; by newline.
    fileContents = fileContents.replaceAll("&#x0*[Aa];", "\n");
    /*
     * &#xb; is illegal, but sometimes occurs in 0.8.x maps. Thus, we
     * exclude all from 0 - 1f and replace them by nothing. TODO: Which more
     * are illegal??
     */
    fileContents = fileContents.replaceAll("&#x0*1?[0-9A-Fa-f];", "");
    // decimal: 0-31
    fileContents = fileContents.replaceAll("&#0*[1-2]?[0-9];", "");
    fileContents = fileContents.replaceAll("&#0*3[0-1];", "");
    return fileContents;
}
/**
 * Determines whether the character is valid in XML. Invalid characters
 * include most of the range x00-x1F, the surrogate range xD800-xDFFF, and
 * xFFFE and xFFFF. The range above xFFFF that the XML specification also
 * allows cannot occur in a single <code>char</code> and is therefore not
 * tested here.
 *
 * @param character
 *            the character to test.
 * @return <code>true</code> if the character is valid in XML.
 * @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char">XML
 *      1.0, production Char</a>
 */
public static boolean isXMLValidCharacter(char character) {
    // The most probable case is tested first.
    if (character >= 0x20 && character <= 0xD7FF) {
        return true;
    }
    // Tab, line feed and carriage return are the only valid control
    // characters.
    if (character == 0x9 || character == 0xA || character == 0xD) {
        return true;
    }
    return character >= 0xE000 && character <= 0xFFFD;
}
/**
 * Removes numeric character references to characters that are not valid
 * in XML, and XML-invalid characters in the text itself.
 * <p>
 * Precondition: The input text contains XML unicode entities rather than
 * Java unicode text.
 * <p>
 * The algorithm: Search the string for XML entities. For each numeric
 * entity inspect whether the character it refers to is valid, and copy
 * the entity unchanged if it is. Named entities and entities that do not
 * parse as a number are copied unchanged, too. To be on the safe side,
 * also inspect for no-entity unicode whether it is XML-valid, and pass on
 * only XML-valid characters. An ampersand that is never closed by ';' is
 * copied as it was read, without adding a semicolon.
 * <p>
 * Validity follows http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char:
 * values up to xFFFF are checked with isXMLValidCharacter, values from
 * x10000 to x10FFFF are valid.
 *
 * @param text
 *            the XML text.
 * @return the text without invalid characters and references.
 */
public static String removeInvalidXmlCharacters(String text) {
    StringBuffer result = new StringBuffer(text.length());
    StringBuffer entity = new StringBuffer();
    boolean readingEntity = false;
    for (int i = 0; i < text.length(); ++i) {
        final char myChar = text.charAt(i);
        if (!readingEntity) {
            if (myChar == '&') {
                readingEntity = true;
            } else if (isXMLValidCharacter(myChar)) {
                // This test is superfluous under the assumption that the
                // string only contains unicode in XML entities.
                result.append(myChar);
            }
            continue;
        }
        if (myChar != ';') {
            entity.append(myChar);
            continue;
        }
        // ';' closes the entity. The length check keeps "&;" and "&#;"
        // from running past the end of the entity buffer.
        boolean keepEntity = true;
        if (entity.length() > 1 && entity.charAt(0) == '#') {
            try {
                final int codePoint;
                if (entity.charAt(1) == 'x') {
                    // Hexadecimal
                    codePoint = Integer.parseInt(entity.substring(2), 16);
                } else {
                    // Decimal
                    codePoint = Integer.parseInt(entity.substring(1), 10);
                }
                // The value is checked as an int; casting it to char first
                // would cut off everything above xFFFF.
                if (codePoint >= 0x10000) {
                    keepEntity = codePoint <= 0x10FFFF;
                } else {
                    keepEntity = codePoint >= 0
                            && isXMLValidCharacter((char) codePoint);
                }
            } catch (NumberFormatException e) {
                // Not a number after all: keep the text as it was.
                keepEntity = true;
            }
        }
        if (keepEntity) {
            result.append('&').append(entity).append(';');
        }
        entity.setLength(0);
        readingEntity = false;
    }
    if (readingEntity) {
        // The text ended before the entity was closed.
        result.append('&').append(entity);
    }
    return result.toString();
}
/**
 * Returns the content of the body element of an HTML text.
 * <p>
 * The content starts behind the opening body tag or, if there is none,
 * behind the opening html tag, whatever attributes that tag carries. It
 * ends before the first "&lt;/body&gt;" that follows or, failing that,
 * before the first "&lt;/html&gt;", or at the end of the text. Text that
 * contains neither an html nor a body tag is returned from its beginning.
 *
 * @param output
 *            the HTML text.
 * @return the part of the text between the opening and the closing tag.
 */
public static String extractHtmlBody(String output) {
    final int bodyTag = output.indexOf("<body");
    final int openingTag = bodyTag != -1 ? bodyTag : output.indexOf("<html");
    int start = 0;
    if (openingTag != -1) {
        // "<body" and "<html" are both five characters long. If the tag is
        // never closed, indexOf gives -1 and the content starts at 0.
        start = output.indexOf('>', openingTag + 5) + 1;
    }
    // Searching from start on keeps end from lying before start.
    int end = output.indexOf("</body>", start);
    if (end == -1) {
        end = output.indexOf("</html>", start);
    }
    if (end == -1) {
        end = output.length();
    }
    return output.substring(start, end);
}
/**
 * Replaces every space that directly follows another space by a
 * non-breaking space (NBSP); the first space of each run stays an
 * ordinary space.
 * <p>
 * Is used from XSLT! Don't change, unless you change the
 * freemind_version_updater.xslt, too.
 *
 * @param input
 *            the text to convert.
 * @return the text with all but the first space of each run replaced.
 */
public static String replaceSpacesToNonbreakableSpaces(String input) {
    final int length = input.length();
    final StringBuffer converted = new StringBuffer(length);
    boolean previousWasSpace = false;
    for (int pos = 0; pos < length; pos++) {
        final char current = input.charAt(pos);
        final boolean isSpace = current == ' ';
        if (isSpace && previousWasSpace) {
            converted.append(NBSP);
        } else {
            converted.append(current);
        }
        previousWasSpace = isSpace;
    }
    return converted.toString();
}
/* Code borrowed from org.apache.commons.lang.Entities */
/**
 * Replaces numeric character references (decimal, or hexadecimal with 'x'
 * or 'X') whose value lies between 128 and 0xFFFF by the character they
 * refer to. All other references, named entities and ampersands that do
 * not start an entity are copied unchanged.
 *
 * @param str
 *            the text to unescape.
 * @return the unescaped text; the unchanged input if it contains no
 *         ampersand.
 */
public String unescape_utf8(String str) {
    final int firstAmp = str.indexOf('&');
    if (firstAmp < 0) {
        return str;
    }
    final StringWriter unescaped = createStringWriter(str);
    try {
        doUnescapeUtf8(unescaped, str, firstAmp);
    } catch (IOException e) {
        // This should never happen because none of the StringWriter
        // methods called while unescaping throws an IOException.
        return str;
    }
    return unescaped.toString();
}

/**
 * Make the StringWriter 10% larger than the source String to avoid growing the writer
 *
 * @param str The source string
 * @return A newly created StringWriter
 */
private StringWriter createStringWriter(String str) {
    final int sourceLength = str.length();
    return new StringWriter((int) (sourceLength + (sourceLength * 0.1)));
}

/**
 * Underlying unescape method that allows the optimisation of not starting from the 0 index again.
 *
 * @param writer
 *            The <code>Writer</code> to write the results to; assumed to be non-null.
 * @param str
 *            The source <code>String</code> to unescape; assumed to be non-null.
 * @param firstAmp
 *            The <code>int</code> index of the first ampersand in the source String.
 * @throws IOException
 *             when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
 *             methods.
 */
private void doUnescapeUtf8(Writer writer, String str, int firstAmp) throws IOException {
    // Nothing in front of the first ampersand needs to be inspected.
    writer.write(str, 0, firstAmp);
    final int len = str.length();
    for (int pos = firstAmp; pos < len; pos++) {
        final char current = str.charAt(pos);
        if (current != '&') {
            writer.write(current);
            continue;
        }
        final int contentStart = pos + 1;
        final int semiColonIdx = str.indexOf(';', contentStart);
        if (semiColonIdx == -1) {
            // No semicolon follows, so this ampersand starts no entity.
            writer.write(current);
            continue;
        }
        final int nextAmpIdx = str.indexOf('&', contentStart);
        if (nextAmpIdx != -1 && nextAmpIdx < semiColonIdx) {
            // Then the text looks like &...&...;
            writer.write(current);
            continue;
        }
        final String entityContent = str.substring(contentStart, semiColonIdx);
        final int entityValue = unescapableValue(entityContent);
        if (entityValue == -1) {
            writer.write('&');
            writer.write(entityContent);
            writer.write(';');
        } else {
            writer.write(entityValue);
        }
        pos = semiColonIdx; // move index up to the semi-colon
    }
}

/**
 * Evaluates the text between '&amp;' and ';'.
 *
 * @return the referenced value if the text is a numeric reference between
 *         128 and 0xFFFF; -1 for everything else (entity names, malformed
 *         numbers, values outside of that range).
 */
private int unescapableValue(String entityContent) {
    if (entityContent.length() < 2 || entityContent.charAt(0) != '#') {
        return -1;
    }
    final char radixMarker = entityContent.charAt(1);
    final boolean hexadecimal = radixMarker == 'x' || radixMarker == 'X';
    try {
        final int value = hexadecimal
                ? Integer.parseInt(entityContent.substring(2), 16)
                : Integer.parseInt(entityContent.substring(1), 10);
        return (value > 0xFFFF || value < 128) ? -1 : value;
    } catch (NumberFormatException e) {
        return -1;
    }
}
}