/*FreeMind - A Program for creating and viewing Mindmaps *Copyright (C) 2006 Christian Foltin <christianfoltin@users.sourceforge.net> *See COPYING for Details * *This program is free software; you can redistribute it and/or *modify it under the terms of the GNU General Public License *as published by the Free Software Foundation; either version 2 *of the License, or (at your option) any later version. * *This program is distributed in the hope that it will be useful, *but WITHOUT ANY WARRANTY; without even the implied warranty of *MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *GNU General Public License for more details. * *You should have received a copy of the GNU General Public License *along with this program; if not, write to the Free Software *Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ /*$Id: HtmlTools.java,v 1.1.2.28 2010/12/04 21:07:23 christianfoltin Exp $*/ package freemind.main; import java.io.IOException; import java.io.StringReader; import java.io.StringWriter; import java.io.Writer; import java.util.ArrayList; import java.util.Iterator; import java.util.Locale; import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.swing.text.BadLocationException; import javax.xml.parsers.SAXParserFactory; import org.xml.sax.InputSource; import org.xml.sax.SAXParseException; import org.xml.sax.helpers.DefaultHandler; /** */ public class HtmlTools { public static final String NBSP = "\u00A0"; private static Logger logger; private static HtmlTools sInstance = new HtmlTools(); private static final Pattern HTML_PATTERN = Pattern .compile("(?s)^\\s*<\\s*html.*?>.*"); private static final Pattern FIND_TAGS_PATTERN = Pattern .compile("([^<]*)(<[^>]+>)"); private static final Pattern SLASHED_TAGS_PATTERN = Pattern.compile("<((" + "br|area|base|basefont|" + "bgsound|button|col|colgroup|embed|hr" + "|img|input|isindex|keygen|link|meta" + "|object|plaintext|spacer|wbr" + ")(\\s[^>]*)?)/>"); private static final Pattern TAGS_PATTERN = Pattern.compile("(?s)<[^><]*>"); public static final String SP = " "; /** * */ private HtmlTools() { super(); logger = Resources.getInstance().getLogger(HtmlTools.class.getName()); } public static HtmlTools getInstance() { return sInstance; } public String toXhtml(String htmlText) { if (!isHtmlNode(htmlText)) { return null; } logger.fine("Enter toXhtml with " + htmlText); StringReader reader = new StringReader(htmlText); StringWriter writer = new StringWriter(); try { XHTMLWriter.html2xhtml(reader, writer); String resultXml = writer.toString(); if (Resources.getInstance().getBoolProperty("wh_nonascii_in_utf8")) { resultXml = unescape_utf8(resultXml); } // for safety: if (isWellformedXml(resultXml)) { logger.fine("Leave toXhtml with " + resultXml); return resultXml; } } catch (IOException e) { freemind.main.Resources.getInstance().logException(e); } catch (BadLocationException e) { freemind.main.Resources.getInstance().logException(e); } // fallback: String fallbackText = toXMLEscapedText(htmlText); logger.fine("Leave toXhtml with fallback " + fallbackText); return fallbackText; } public String toHtml(String xhtmlText) { // Remove '/' from <.../> of elements that do not have '/' there in HTML return SLASHED_TAGS_PATTERN.matcher(xhtmlText).replaceAll("<$1>"); } public static class IndexPair { public int originalStart; public int originalEnd; public int replacedStart; public int replacedEnd; public boolean mIsTag; public boolean mIsAlreadyAppended = false; /** * @param pIsTag * TODO */ public IndexPair(int pOriginalStart, int pOriginalEnd, int pReplacedStart, int pReplacedEnd, boolean pIsTag) { super(); originalStart = pOriginalStart; originalEnd = pOriginalEnd; replacedStart = pReplacedStart; replacedEnd = pReplacedEnd; mIsTag = pIsTag; } /** * generated by CodeSugar http://sourceforge.net/projects/codesugar */ public String toString() { StringBuffer buffer = new StringBuffer(); buffer.append("[IndexPair:"); buffer.append(" originalStart: "); buffer.append(originalStart); buffer.append(" originalEnd: "); buffer.append(originalEnd); buffer.append(" replacedStart: "); buffer.append(replacedStart); buffer.append(" replacedEnd: "); buffer.append(replacedEnd); buffer.append(" is a tag: "); buffer.append(mIsTag); buffer.append("]"); return buffer.toString(); } } /** * Replaces text in node content without replacing tags. fc, 19.12.06: This * method is very difficult. If you have a simplier method, please supply * it. But look that it complies with FindTextTests!!! */ public String getReplaceResult(Pattern pattern, String replacement, String text) { ArrayList splittedStringList = new ArrayList(); String stringWithoutTags = null; // remove tags and denote their positions: { StringBuffer sb = new StringBuffer(); Matcher matcher = FIND_TAGS_PATTERN.matcher(text); int lastMatchEnd = 0; while (matcher.find()) { String textWithoutTag = matcher.group(1); // Append text without tags: int replStart = sb.length(); matcher.appendReplacement(sb, "$1"); IndexPair indexPair; if (textWithoutTag.length() > 0) { indexPair = new IndexPair(lastMatchEnd, matcher.end(1), replStart, sb.length(), false); lastMatchEnd = matcher.end(1); // System.out.println(sb.toString() // + ", " // + input.substring(indexPair.originalStart, // indexPair.originalEnd) + ", " + indexPair); splittedStringList.add(indexPair); } // String tag = matcher.group(2); replStart = sb.length(); indexPair = new IndexPair(lastMatchEnd, matcher.end(2), replStart, sb.length(), true); lastMatchEnd = matcher.end(2); // System.out.println(sb.toString() + ", " + // input.substring(indexPair.originalStart, // indexPair.originalEnd)+ ", " + indexPair); splittedStringList.add(indexPair); } int replStart = sb.length(); matcher.appendTail(sb); // append tail only if there is a tail if (sb.length() != replStart) { IndexPair indexPair = new IndexPair(lastMatchEnd, text.length(), replStart, sb.length(), false); // System.out.println(sb.toString() + ", " + indexPair); splittedStringList.add(indexPair); } // System.out.println(sb.toString()); stringWithoutTags = sb.toString(); } // // give it out: // for (Iterator iter = splittedStringList.iterator(); iter.hasNext();) // { // IndexPair pair = (IndexPair) iter.next(); // System.out.println(text.substring(pair.originalStart, // pair.originalEnd) + ", " + pair); // } /** * For each pair which is not a tag we find concurrences and replace * them, if pair is a tag then we just append */ StringBuffer sbResult = new StringBuffer(); for (Iterator iter = splittedStringList.iterator(); iter.hasNext();) { IndexPair pair = (IndexPair) iter.next(); if (pair.mIsTag) append(sbResult, text, pair.originalStart, pair.originalEnd); else { Matcher matcher = pattern.matcher(text.substring( pair.originalStart, pair.originalEnd)); int mStart = 0; int mEnd = 0; int mEndOld = 0; int mStartOld = 0; while (matcher.find()) { mStart = matcher.start(); mEnd = matcher.end(); append(sbResult, text, pair.originalStart + mEndOld, pair.originalStart + mStart); /** * If it's a first iteration then we append text between * start and first concurrence, and when it's not first * iteration (mEndOld != 0) we append text between two * concurrences */ // sbResult.append(text, pair.originalStart + mStart, // pair.originalStart + mEnd); // original text sbResult.append(replacement); mEndOld = mEnd; mStartOld = mStart; } append(sbResult, text, pair.originalStart + mEndOld, pair.originalEnd); // append tail } } // System.out.println("Result:'"+sbResult.toString()+"'"); return sbResult.toString(); } /** * Need to program this, as the stringbuffer method appears in java 1.5 * first. * */ private void append(StringBuffer pSbResult, String pText, int pStart, int pEnd) { pSbResult.append(pText.substring(pStart, pEnd)); } public int getMinimalOriginalPosition(int pI, ArrayList pListOfIndices) { for (Iterator iter = pListOfIndices.iterator(); iter.hasNext();) { IndexPair pair = (IndexPair) iter.next(); if (pI >= pair.replacedStart && pI <= pair.replacedEnd) { return pair.originalStart + pI - pair.replacedStart; } } throw new IllegalArgumentException("Position " + pI + " not found."); } /** * @return the maximal index i such that pI is mapped to i by removing all * tags from the original input. */ public int getMaximalOriginalPosition(int pI, ArrayList pListOfIndices) { for (int i = pListOfIndices.size() - 1; i >= 0; --i) { IndexPair pair = (IndexPair) pListOfIndices.get(i); if (pI >= pair.replacedStart) { if (!pair.mIsTag) { return pair.originalStart + pI - pair.replacedStart; } else { return pair.originalEnd; } } } throw new IllegalArgumentException("Position " + pI + " not found."); } /** */ public static boolean isHtmlNode(String text) { for (int i = 0; i < text.length(); i++) { final char ch = text.charAt(i); if (ch == '<') { break; } if (!Character.isWhitespace(ch) || i == text.length()) { return false; } } return HTML_PATTERN.matcher(text.toLowerCase(Locale.ENGLISH)).matches(); } /** * Changes all unicode characters into &#xxx values. * Opposite to {@link HtmlTools#unescapeHTMLUnicodeEntity(String)} */ public static String unicodeToHTMLUnicodeEntity(String text, boolean pPreserveNewlines) { /* * Heuristic reserve for expansion : factor 1.2 */ StringBuffer result = new StringBuffer((int) (text.length() * 1.2)); int intValue; char myChar; for (int i = 0; i < text.length(); ++i) { myChar = text.charAt(i); intValue = (int) text.charAt(i); boolean outOfRange = intValue < 32 || !Resources.getInstance().getBoolProperty("wh_nonascii_in_utf8") && intValue > 126; if(pPreserveNewlines && myChar == '\n') { outOfRange = false; } if(pPreserveNewlines && myChar == '\r') { outOfRange = false; } if (outOfRange) { result.append("&#x").append(Integer.toString(intValue, 16)) .append(';'); } else { result.append(myChar); } } return result.toString(); } /** * Converts XML unicode entity-encoded characters into plain Java unicode * characters; for example, ''&#xff;'' gets converted. Removes all * XML-invalid entity characters, such as &#xb;. * * Opposite to {@link HtmlTools#unicodeToHTMLUnicodeEntity(String, boolean)} * * @param text * input * @return the converted output. */ public static String unescapeHTMLUnicodeEntity(String text) { StringBuffer result = new StringBuffer(text.length()); StringBuffer entity = new StringBuffer(); boolean readingEntity = false; char myChar; char entityChar; for (int i = 0; i < text.length(); ++i) { myChar = text.charAt(i); if (readingEntity) { if (myChar == ';') { if (entity.charAt(0) == '#') { try { if (entity.charAt(1) == 'x') { // Hexadecimal entityChar = (char) Integer.parseInt( entity.substring(2), 16); } else { // Decimal entityChar = (char) Integer.parseInt( entity.substring(1), 10); } if (isXMLValidCharacter(entityChar)) result.append(entityChar); } catch (NumberFormatException e) { result.append('&').append(entity).append(';'); } } else { result.append('&').append(entity).append(';'); } entity.setLength(0); readingEntity = false; } else { if (isXMLValidCharacter(myChar)) entity.append(myChar); } } else { if (myChar == '&') { readingEntity = true; } else { if (isXMLValidCharacter(myChar)) result.append(myChar); } } } if (entity.length() > 0) { result.append('&').append(entity).append(';'); } return result.toString(); } /** * Removes all tags (<..>) from a string if it starts with "<html>..." to * make it compareable. */ public static String removeHtmlTagsFromString(String text) { if (HtmlTools.isHtmlNode(text)) { return removeAllTagsFromString(text); // (?s) enables that . matches // newline. } else { return text; } } public static String removeAllTagsFromString(String text) { return TAGS_PATTERN.matcher(text).replaceAll(""); } public static String htmlToPlain(String text) { return htmlToPlain(text, /* strictHTMLOnly= */true); } public static String htmlToPlain(String text, boolean strictHTMLOnly) { // 0. remove all newlines // 1. replace newlines, paragraphs, and table rows // 2. remove XML tags // 3. replace HTML entities including   // 4. unescape unicode entities // This is a very basic conversion, fixing the most annoying // inconvenience. You can imagine much better conversion of // HTML to plain text. Most of HTML tags can be handled // sensibly, like web browsers do it. if (strictHTMLOnly && !isHtmlNode(text)) { return text; } // System.err.println("base:"+text); String intermediate = text .replaceAll("(?ims)[\n\t]", "") . // Remove newlines replaceAll("(?ims) +", " ") . // Condense spaces replaceAll("(?ims)<br.*?>", "\n") .replaceAll("(?ims)<p.*?>", "\n\n") . // Paragraph replaceAll("(?ims)<div.*?>", "\n") . // Div - block replaceAll("(?ims)<tr.*?>", "\n") .replaceAll("(?ims)<dt.*?>", "\n") . // Defined term replaceAll("(?ims)<dd.*?>", "\n ") . // Definition of defined term replaceAll("(?ims)<td.*?>", " ") .replaceAll("(?ims)<[uo]l.*?>", "\n") . // Beginning of a list replaceAll("(?ims)<li.*?>", "\n * ") .replaceAll("(?ims) *</[^>]*>", ""). // Remaining closing HTML // tags replaceAll("(?ims)<[^/][^>]*> *", ""). // Remaining opening HTML // tags // FIXME Dimitry: is removing of all new lines at the begin a // good idea? replaceAll("^\n+", ""). // fc: to remove start and end spaces. trim(); intermediate = HtmlTools.unescapeHTMLUnicodeEntity(intermediate); // Entities, with the exception of &. intermediate = intermediate.replaceAll("(?ims)<", "<") .replaceAll("(?ims)>", ">").replaceAll("(?ims)"", "\"") .replaceAll("(?ims) ", " "); // System.err.println("intermediate:"+intermediate); return intermediate.replaceAll("(?ims)&", "&"); } public static String plainToHTML(String text) { char myChar; String textTabsExpanded = text.replaceAll("\t", " "); // Use // eight // spaces // as // tab // width. StringBuffer result = new StringBuffer(textTabsExpanded.length()); // Heuristic int lengthMinus1 = textTabsExpanded.length() - 1; result.append("<html><body><p>"); for (int i = 0; i < textTabsExpanded.length(); ++i) { myChar = textTabsExpanded.charAt(i); switch (myChar) { case '&': result.append("&"); break; case '<': result.append("<"); break; case '>': result.append(">"); break; case ' ': if (i > 0 && i < lengthMinus1 && (int) textTabsExpanded.charAt(i - 1) > 32 && (int) textTabsExpanded.charAt(i + 1) > 32) { result.append(' '); } else { result.append(" "); } break; case '\n': result.append("<br>"); break; default: result.append(myChar); } } return result.toString(); } public static String toXMLUnescapedText(String text) { return text.replaceAll("<", "<").replaceAll(">", ">") .replaceAll(""", "\"").replaceAll("&", "&"); } public static String toXMLEscapedTextExpandingWhitespace(String text) { // Spaces and tabs are handled text = text.replaceAll("\t", " "); // Use eight spaces as tab // width. int len = text.length(); StringBuffer result = new StringBuffer(len); char myChar; for (int i = 0; i < len; ++i) { myChar = text.charAt(i); switch (myChar) { case '&': result.append("&"); break; case '<': result.append("<"); break; case '>': result.append(">"); break; case ' ': if (i > 0 && i < len - 1 && (int) text.charAt(i - 1) > 32 && (int) text.charAt(i + 1) > 32) { result.append(' '); } else { result.append(" "); } break; default: result.append(myChar); } } return result.toString(); } public static String toXMLEscapedText(String text) { if(text == null) { return "ERROR: none"; } return text.replaceAll("&", "&").replaceAll("<", "<") .replaceAll(">", ">").replaceAll("\"", """); } /** * @return true, if well formed XML. */ public boolean isWellformedXml(String xml) { try { // Create a builder factory SAXParserFactory factory = SAXParserFactory.newInstance(); factory.setValidating(false); // Create the builder and parse the file factory.newSAXParser().parse( new InputSource(new StringReader(xml)), new DefaultHandler()); return true; } catch (SAXParseException e) { logger.log( Level.SEVERE, "XmlParseError on line " + e.getLineNumber() + " of " + xml, e); } catch (Exception e) { logger.log(Level.SEVERE, "XmlParseError", e); } return false; } /** \0 is not allowed: */ public static String makeValidXml(String pXmlNoteText) { return pXmlNoteText.replaceAll("\0", "").replaceAll("�", ""); } public static String replaceIllegalXmlCharacters(String fileContents) { // replace &xa; by newline. fileContents = fileContents.replaceAll("�*[Aa];", "\n"); /* * is illegal, but sometimes occurs in 0.8.x maps. Thus, we * exclude all from 0 - 1f and replace them by nothing. TODO: Which more * are illegal?? */ fileContents = fileContents.replaceAll("�*1?[0-9A-Fa-f];", ""); // decimal: 0-31 fileContents = fileContents.replaceAll("�*[1-2]?[0-9];", ""); fileContents = fileContents.replaceAll("�*3[0-1];", ""); return fileContents; } /** * Determines whether the character is valid in XML. Invalid characters * include most of the range x00-x1F, and more. * * @see http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char. */ public static boolean isXMLValidCharacter(char character) { // Order the tests in such a sequence that the most probable // conditions are tested first. return character >= 0x20 && character <= 0xD7FF || character == 0x9 || character == 0xA || character == 0xD || character >= 0xE000 && character <= 0xFFFD || character >= 0x10000 && character <= 0x10FFFF; } /** Precondition: The input text contains XML unicode entities rather than Java unicode text. The algorithm: Search the string for XML entities. For each XML entity inspect whether it is valid. If valid, append it. To be on the safe side, also inspect for no-entity unicode whether it is XML-valid, and pass on only XML-valid characters. This method uses the method isXMLValidCharacter, which makes use of http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char. */ public static String removeInvalidXmlCharacters(String text) { StringBuffer result = new StringBuffer(text.length()); StringBuffer entity = new StringBuffer(); boolean readingEntity = false; char myChar; char entityChar; for (int i = 0; i < text.length(); ++i) { myChar = text.charAt(i); if (readingEntity) { if (myChar == ';') { if (entity.charAt(0) == '#') { try { if (entity.charAt(1) == 'x') { // Hexadecimal entityChar = (char) Integer.parseInt( entity.substring(2), 16); } else { // Decimal entityChar = (char) Integer.parseInt( entity.substring(1), 10); } if (isXMLValidCharacter(entityChar)) result.append('&').append(entity).append(';'); } catch (NumberFormatException e) { result.append('&').append(entity).append(';'); } } else { result.append('&').append(entity).append(';'); } entity.setLength(0); readingEntity = false; } else { entity.append(myChar); } } else { if (myChar == '&') { readingEntity = true; } else { // The following test is superfluous under the assumption // that the string only contains unicode in XML entities. // Removing this test could significantly speed up this // method; maybe. if (isXMLValidCharacter(myChar)) result.append(myChar); } } } if (entity.length() > 0) { result.append('&').append(entity).append(';'); } return result.toString(); } public static String extractHtmlBody(String output) { if (output.startsWith("<html")) { output = output.substring(6); // do not write } int start = output.indexOf("<body"); if (start == -1) { start = output.indexOf('>') + 1; } else { start = output.indexOf('>', start + 5) + 1; } int end = output.indexOf("</body>"); if (end == -1) { end = output.indexOf("</html>"); } if (end == -1) { end = output.length(); } output = output.substring(start, end); return output; } /** * Is used from XSLT! Don't change, unless you change the freemind_version_updater.xslt, too. * @param input * @return */ public static String replaceSpacesToNonbreakableSpaces(String input) { StringBuffer result = new StringBuffer(input.length()); boolean readingSpaces = false; char myChar; for (int i = 0; i < input.length(); ++i) { myChar = input.charAt(i); if (myChar == ' ') { if (readingSpaces) { result.append(NBSP); } else { result.append(myChar); readingSpaces = true; } } else { readingSpaces = false; result.append(myChar); } } return result.toString(); } /* Borrow code from org.apache.commons.lang.Entities */ public String unescape_utf8(String str) { int firstAmp = str.indexOf('&'); if (firstAmp < 0) { return str; } else { StringWriter stringWriter = createStringWriter(str); try { this.doUnescapeUtf8(stringWriter, str, firstAmp); } catch (IOException e) { // This should never happen because ALL the StringWriter methods called by #escape(Writer, String) // do not throw IOExceptions. return str; } return stringWriter.toString(); } } /** * Make the StringWriter 10% larger than the source String to avoid growing the writer * * @param str The source string * @return A newly created StringWriter */ private StringWriter createStringWriter(String str) { return new StringWriter((int) (str.length() + (str.length() * 0.1))); } /** * Underlying unescape method that allows the optimisation of not starting from the 0 index again. * * @param writer * The <code>Writer</code> to write the results to; assumed to be non-null. * @param str * The source <code>String</code> to unescape; assumed to be non-null. * @param firstAmp * The <code>int</code> index of the first ampersand in the source String. * @throws IOException * when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)} * methods. */ private void doUnescapeUtf8(Writer writer, String str, int firstAmp) throws IOException { writer.write(str, 0, firstAmp); int len = str.length(); for (int i = firstAmp; i < len; i++) { char c = str.charAt(i); if (c == '&') { int nextIdx = i + 1; int semiColonIdx = str.indexOf(';', nextIdx); if (semiColonIdx == -1) { writer.write(c); continue; } int amphersandIdx = str.indexOf('&', i + 1); if (amphersandIdx != -1 && amphersandIdx < semiColonIdx) { // Then the text looks like &...&...; writer.write(c); continue; } String entityContent = str.substring(nextIdx, semiColonIdx); int entityValue = -1; int entityContentLen = entityContent.length(); if (entityContentLen > 0) { if (entityContent.charAt(0) == '#') { // escaped value content is an integer (decimal or // hexidecimal) if (entityContentLen > 1) { char isHexChar = entityContent.charAt(1); try { switch (isHexChar) { case 'X' : case 'x' : { entityValue = Integer.parseInt(entityContent.substring(2), 16); break; } default : { entityValue = Integer.parseInt(entityContent.substring(1), 10); } } if (entityValue > 0xFFFF || entityValue < 128 ) { entityValue = -1; } } catch (NumberFormatException e) { entityValue = -1; } } } else { // escaped value content is an entity name //entityValue = this.entityValue(entityContent); entityValue = -1; } } if (entityValue == -1) { writer.write('&'); writer.write(entityContent); writer.write(';'); } else { writer.write(entityValue); } i = semiColonIdx; // move index up to the semi-colon } else { writer.write(c); } } } }