/* This file belongs to the Servoy development and deployment environment, Copyright (C) 1997-2010 Servoy BV This program is free software; you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program; if not, see http://www.gnu.org/licenses or write to the Free Software Foundation,Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 */ package com.servoy.j2db.util; import java.util.StringTokenizer; public class HtmlUtils { /** tags in HTML 4.0.1 (some of them are deprecated) */ @SuppressWarnings("nls") public static final String[] tags = { "<!--", "<!DOCTYPE", "<a", "<abbr", "<acronym", "<address", "<applet", "<area", "<b", "<base", "<basefont", "<bdo", "<big", "<blockquote", "<body", "<br", "<button", "<caption", "<center", "<cite", "<code", "<col", "<colgroup", "<dd", "<del", "<dir", "<div", "<dfn", "<dl", "<dt", "<em", "<fieldset", "<font", "<form", "<frame", "<frameset", "<h1", "<h2", "<h3", "<h4", "<h5", "<h6", "<head", "<hr", "<html", "<i", "<iframe", "<img", "<input", "<ins", "<isindex", "<kbd", "<label", "<legend", "<li", "<link", "<map", "<menu", "<meta", "<noframes", "<noscript", "<object", "<ol", "<optgroup", "<option", "<p", "<param", "<pre", "<q", "<s", "<samp", "<script", "<select", "<small", "<span", "<strike", "<strong", "<style", "<sub", "<sup", "<table", "<tbody", "<td", "<textarea", "<tfoot", "<th", "<thead", "<title", "<tr", "<tt", "<u", "<ul", "<var", "<xmp" }; /** tags that don't support inline style attributes (or tags that I want to align (and I must give the style in its parent)); */ @SuppressWarnings("nls") public static final String[] alignNotSupportedTags = { "<b", "<u", "<i", "<br", "<hr", "<html", "<head", "<menu", "<sub", "<sup", "<style", "<button", "<a", "<center", "<blockquote", "<img", "<font", "<applet", "<bdo", "<big", "<button", "<abbr", "<area", "<!--", "<xmp", "<dir", "<script", "<meta" }; @SuppressWarnings("nls") public static final String[] specialCaseTags = { "table", "tr", "td", "p", "div", "title", "h1", "h2", "h3", "h4", "h5", "h6", "ul", "ol", "li" }; public static boolean startsWithHtml(Object object) { if (object instanceof CharSequence) { return startsWithHtml((CharSequence)object); } else if (object != null) { return startsWithHtml(object.toString()); } return false; } public static boolean startsWithHtml(CharSequence charsequence) { if (charsequence == null || charsequence.length() == 0) return false; int charsequenceLen = charsequence.length(); int charIndex = 0; // first trim while (charIndex < charsequenceLen && charsequence.charAt(charIndex) == ' ') charIndex++; if (charIndex >= charsequenceLen || charsequence.charAt(charIndex) != '<') return false; if (charIndex >= charsequenceLen - 1 || Character.toLowerCase(charsequence.charAt(++charIndex)) != 'h') return false; if (charIndex >= charsequenceLen - 1 || Character.toLowerCase(charsequence.charAt(++charIndex)) != 't') return false; if (charIndex >= charsequenceLen - 1 || Character.toLowerCase(charsequence.charAt(++charIndex)) != 'm') return false; if (charIndex >= charsequenceLen - 1 || Character.toLowerCase(charsequence.charAt(++charIndex)) != 'l') return false; return true; } /** * searches for content thats is to be shown in a web browser and returns true if that kind of content is found * * @param html the HTML content to be checked * @return True if and only if the HTML content has some "display-able" content */ public static boolean hasUsefulHtmlContent(String html) { if (Utils.stringIsEmpty(html)) return false; String lowercaseText = html.toLowerCase(); if (lowercaseText.indexOf("<html") == -1) //$NON-NLS-1$ { return false; } int beginIndex = lowercaseText.indexOf("<body"); //$NON-NLS-1$ String usefulContent = ""; //$NON-NLS-1$ String enclosingTag = ""; //$NON-NLS-1$ if (beginIndex == -1) { beginIndex = lowercaseText.indexOf("<html"); //$NON-NLS-1$ beginIndex = lowercaseText.indexOf(">", beginIndex) + 1; //$NON-NLS-1$ if (beginIndex == 0) return false; enclosingTag = "<html"; //$NON-NLS-1$ } else { beginIndex = lowercaseText.indexOf(">", beginIndex) + 1; //$NON-NLS-1$ if (beginIndex == 0) return false; enclosingTag = "<body"; //$NON-NLS-1$ } try { usefulContent = findUsefulContent(enclosingTag, lowercaseText, beginIndex); return !Utils.stringIsEmpty(usefulContent); } catch (Exception e) { return false; } } public static boolean hasHtmlTag(String content) { if (content != null) { if (!content.contains("<")) return false; //$NON-NLS-1$ for (String tag : tags) { if (content.contains(tag)) return true; } } return false; } /** * returns the content wrapped by a given tag * * @param enclosingTag * @param htmlContent * @param beginIndex * @return some "useful" content wrapped by the enclosing tag */ private static String findUsefulContent(String enclosingTag, String htmlContent, int beginIndex) { String wrapper = enclosingTag.replaceFirst("<", ""); //$NON-NLS-1$ //$NON-NLS-2$ wrapper = wrapper.trim(); int finalWrapperPosition = htmlContent.indexOf(wrapper, beginIndex); if (finalWrapperPosition == -1) return htmlContent.substring(beginIndex); boolean found = false; int position = finalWrapperPosition - 2; String contentWithWhiteSpaces = ""; //$NON-NLS-1$ while (!found && position >= beginIndex) { contentWithWhiteSpaces = htmlContent.substring(position--, finalWrapperPosition); if (contentWithWhiteSpaces.trim().equals("</")) //$NON-NLS-1$ { found = true; position++; } } if (found) { finalWrapperPosition = position; } String usefulContent = finalWrapperPosition >= beginIndex ? htmlContent.substring(beginIndex, finalWrapperPosition) : htmlContent.substring(beginIndex); if (usefulContent.trim().startsWith("</")) usefulContent = ""; //$NON-NLS-1$ //$NON-NLS-2$ return usefulContent; } public static String stripHTML(String html) { String result = html.replace("<br />", "\n"); //$NON-NLS-1$ //$NON-NLS-2$ result = result.replace("<br/>", "\n"); //$NON-NLS-1$ //$NON-NLS-2$ result = result.replace("<br>", "\n"); //$NON-NLS-1$ //$NON-NLS-2$ result = result.replaceAll("<[^>]*>", ""); //$NON-NLS-1$ //$NON-NLS-2$ return result; } /** * Converts a Java String to an HTML markup string, but does not convert normal spaces to non-breaking space entities (<nbsp>). * * @param s The string to be escaped * @see Utils#escapeMarkup(String, boolean) * @return The escaped string */ public static CharSequence escapeMarkup(final String s) { return escapeMarkup(s, false); } /** * Converts a Java String to an HTML markup String by replacing illegal characters with HTML entities where appropriate. Spaces are converted to * non-breaking spaces (<nbsp>) if escapeSpaces is true, tabs are converted to four non-breaking spaces, less than signs are converted to &lt; * entities and greater than signs to &gt; entities. * * @param s The string to escape * @param escapeSpaces True to replace ' ' with nonbreaking space * @return The escaped string */ public static CharSequence escapeMarkup(final String s, final boolean escapeSpaces) { return escapeMarkup(s, escapeSpaces, false); } /** * Converts a Java String to an HTML markup String by replacing illegal characters with HTML entities where appropriate. Spaces are converted to * non-breaking spaces (<nbsp>) if escapeSpaces is true, tabs are converted to four non-breaking spaces, less-than signs are converted to &lt; * entities and greater-than signs to &gt; entities. * * @param s The string to escape * @param escapeSpaces True to replace ' ' with nonbreaking space * @param convertToHtmlUnicodeEscapes True to convert non-7 bit characters to unicode HTML (&#...) * @return The escaped string */ public static CharSequence escapeMarkup(final String s, final boolean escapeSpaces, final boolean convertToHtmlUnicodeEscapes) { if (s == null) { return null; } else { int len = s.length(); final StringBuffer buffer = new StringBuffer((int)(len * 1.1)); for (int i = 0; i < len; i++) { final char c = s.charAt(i); switch (c) { case '\t' : if (escapeSpaces) { // Assumption is four space tabs (sorry, but that's // just how it is!) buffer.append("    "); //$NON-NLS-1$ } else { buffer.append(c); } break; case ' ' : if (escapeSpaces) { buffer.append(" "); //$NON-NLS-1$ } else { buffer.append(c); } break; case '<' : buffer.append("<"); //$NON-NLS-1$ break; case '>' : buffer.append(">"); //$NON-NLS-1$ break; case '&' : // if this is an entity (&#), then do not convert if ((i < len - 1) && (s.charAt(i + 1) == '#')) { buffer.append(c); } else { // it is not an entity, so convert it to & buffer.append("&"); //$NON-NLS-1$ } break; case '"' : buffer.append("""); //$NON-NLS-1$ break; case '\'' : buffer.append("'"); //$NON-NLS-1$ break; default : if (convertToHtmlUnicodeEscapes) { int ci = 0xffff & c; if (ci < 160) { // nothing special only 7 Bit buffer.append(c); } else { // Not 7 Bit use the unicode system buffer.append("&#"); //$NON-NLS-1$ buffer.append(new Integer(ci).toString()); buffer.append(';'); } } else { buffer.append(c); } break; } } return buffer; } } /* * raw unescape */ public static String unescape(String s) { String result = s; result = result.replaceAll("<", "<"); //$NON-NLS-1$ //$NON-NLS-2$ result = result.replaceAll(">", ">"); //$NON-NLS-1$ //$NON-NLS-2$ result = result.replaceAll("&", "&"); //$NON-NLS-1$ //$NON-NLS-2$ result = result.replaceAll(" ", " "); //$NON-NLS-1$ //$NON-NLS-2$ result = result.replaceAll("    ", "\t"); //$NON-NLS-1$ //$NON-NLS-2$ result = result.replaceAll(""", "\""); //$NON-NLS-1$//$NON-NLS-2$ result = result.replaceAll("'", "\'"); //$NON-NLS-1$ //$NON-NLS-2$ return result; } public static String getValidFontFamilyValue(String cssValue) { StringBuffer sb = new StringBuffer(); StringTokenizer tk = new StringTokenizer(cssValue, ","); //$NON-NLS-1$ while (tk.hasMoreTokens()) { String fontFamily = tk.nextToken(); if (sb.toString().length() != 0) { sb.append(", "); //$NON-NLS-1$ } fontFamily = fontFamily.trim(); if (!fontFamily.startsWith("'") && !fontFamily.startsWith("\"")) //$NON-NLS-1$ //$NON-NLS-2$ { for (int i = 0; i < fontFamily.length(); i++) { boolean validCharacter = (fontFamily.charAt(i) >= 'a' && fontFamily.charAt(i) <= 'z') || (fontFamily.charAt(i) >= 'A' && fontFamily.charAt(i) <= 'Z') || (fontFamily.charAt(i) >= '0' && fontFamily.charAt(i) <= '9') || fontFamily.charAt(i) == '_' || fontFamily.charAt(i) == '-'; if (!validCharacter) { fontFamily = "\"" + fontFamily + "\""; //$NON-NLS-1$//$NON-NLS-2$ break; } } } sb.append(fontFamily); } return sb.toString(); } /** * Replaces all urls in a html document to make them absolute * @param url the url of the document * @param the html content of the document * @return the document with absolute urls */ public static String htmlURLAbsEnhancer(String url, String htmldoc) { String currURL = url; int ind_currURL = url.lastIndexOf("/"); if (ind_currURL != -1 && ind_currURL > 10) { currURL = url.substring(0, ind_currURL); } String baseURL = url; if (url.length() > 10) { int ind_baseURL = url.indexOf("/", 10); if (ind_baseURL != -1) { baseURL = url.substring(0, ind_baseURL); } } StringBuffer retval = new StringBuffer(); String lowerCaseContent = htmldoc.toLowerCase(); int index = 0; int old_index = 0; while (index != -1) { boolean image = false; boolean link = false; int formindex = lowerCaseContent.indexOf("<form", index); int aindex = lowerCaseContent.indexOf("<a", index); int imgindex = lowerCaseContent.indexOf("<img", index); int frameindex = lowerCaseContent.indexOf("<frame", index); int metaindex = lowerCaseContent.indexOf("<meta", index); int linkindex = lowerCaseContent.indexOf("<link", index); if (aindex != -1 && imgindex != -1) { int i = Math.min(aindex, imgindex); if (i != aindex) { aindex = -1; } else { imgindex = -1; } } if (frameindex != -1) { index = frameindex; int newindex = lowerCaseContent.indexOf("src", index); if (newindex == -1) { index++; continue; } else { index = newindex; } link = true; } else if (metaindex != -1) { index = metaindex; int newindex = lowerCaseContent.indexOf(";url", index); if (newindex == -1) { index++; continue; } else { index = newindex; } link = true; } else if (linkindex != -1) { index = linkindex; int newindex = lowerCaseContent.indexOf("href", index); if (newindex == -1) { index++; continue; } else { index = newindex; } link = true; } else if (aindex != -1) { index = aindex; int newindex = lowerCaseContent.indexOf("href", index); if (newindex == -1) { index++; continue; } else { index = newindex; } link = true; } else if (imgindex != -1) { index = imgindex; int newindex = lowerCaseContent.indexOf("src", index); if (newindex == -1) { index++; continue; } else { index = newindex; } image = true; } else if (formindex != -1) { index = formindex; int newindex = lowerCaseContent.indexOf("action", index); if (newindex == -1) { index++; continue; } else { index = newindex; } link = true; } else { break; } if ((index = lowerCaseContent.indexOf("=", index)) == -1) continue; index++; //skip '=' String remaining = htmldoc.substring(index); StringTokenizer st = new StringTokenizer(remaining, "\t\n\r\"'>#"); String strLink = st.nextToken(); retval.append(htmldoc.substring(old_index, index)); retval.append("\""); if (strLink.startsWith("/")) { retval.append(baseURL + strLink); } else if (!strLink.startsWith(baseURL)) { retval.append(currURL + "/" + strLink); } else { retval.append(strLink); } retval.append("\""); old_index = index + 1 + strLink.length() + 1; } retval.append(htmldoc.substring(old_index)); return retval.toString(); } }