/***************************************************************************** * Copyright (c) 2009 CEA LIST. * * All rights reserved. This program and the accompanying materials * are made available under the terms of the Eclipse Public License v1.0 * which accompanies this distribution, and is available at * http://www.eclipse.org/legal/epl-v10.html * * Contributors: * Remi Schnekenburger (CEA LIST) remi.schnekenburger@cea.fr - Initial API and implementation * *****************************************************************************/ package org.eclipse.papyrus.uml.diagram.common.parser; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; /** * Class that provides a html cleaner. */ public class HTMLCleaner { /** map of special html strings and their real value */ protected static final Map<String, String> specials = new HashMap<String, String>(); /** symbol requiring new lines */ protected static final List<String> newLine = new ArrayList<String>(); /** map of special html strings and their real value */ protected static final Map<String, String> xmlSpecials = new HashMap<String, String>(); static { // initialize the special character map specials.put("nbsp", " "); // no-break space specials.put("iexcl", "¡"); //inverted exclamation mark specials.put("cent", "¢"); //cent sign specials.put("pound", "£"); //pound sign specials.put("curren", "¤"); //currency sign specials.put("yen", "¥"); //yen sign = yuan sign specials.put("brvbar", "¦"); //broken bar = brolen vertical bar specials.put("sect", "§"); // section sign specials.put("uml", "¨"); //diaeresis = spacing diaeresis specials.put("copy", "©"); //copyright sign specials.put("ordf", "ª"); //feminine ordinal indicator specials.put("laquo", "«"); //left-pointing double angle quotation mark = left pointing guillemet specials.put("not", "¬"); //not sign = discretionary hyphen specials.put("shy", "­"); //soft hyphen = discretionary hyphen specials.put("reg", "®"); //registered sign = registered trade mark sign specials.put("macr", "¯"); //macron = spacing macron = overline = APL overbar specials.put("deg", "°"); //degree sign specials.put("plusmn", "±"); //plus-minus sign = plus-or-minus sign specials.put("sup2", "²"); // superscript two = superscript digit two = squared specials.put("sup3", "³"); // superscript three = superscript digit three = cubed specials.put("acute", "´"); // acute accent = spacing acute specials.put("micro", "µ"); // micro sign specials.put("para", "¶"); // pilcrow sign = paragraph sign specials.put("middot", "·"); // middle dot = Georgian comma = Greek middle dot specials.put("ccedil", "¸"); // cedilla = spacing cedilla specials.put("sup1", "¹"); // superscript one = superscript digit one specials.put("ordm", "º"); // masculine ordinal indicator specials.put("raquo", "»"); // right-pointing double angle quotation mark = right pointing guillemet specials.put("frac14", "¼"); // vulgar fraction one quarter = fraction one quarter specials.put("frac12", "½"); // vulgar fraction one half = fraction one half specials.put("frac34", "¾"); // vulgar fraction three quarters = fraction three quarters specials.put("iquest", "¿"); // inverted question mark = turned question mark specials.put("Agrave", "À"); // latin capital letter A with grave = latin capital letter A grave specials.put("Aacute", "Á"); // latin capital letter A with acute specials.put("Acirc", "Â"); // latin capital letter A with circumflex specials.put("Atilde", "Ã"); // latin capital letter A with tilde specials.put("Auml", "Ä"); // latin capital letter A with diaeresis specials.put("Aring", "Å"); // latin capital letter A with ring above = latin capital letter A ring specials.put("AElig", "Æ"); // latin capital letter AE = latin capital ligature AE specials.put("Ccedil", "Ç"); // latin capital letter C with cedilla specials.put("Egrave", "È"); // latin capital letter E with grave specials.put("Eacute", "É"); // latin capital letter E with acute specials.put("Ecirc", "Ê"); // latin capital letter E with circumflex specials.put("Euml", "Ë"); //latin capital letter E with diaeresis specials.put("Igrave", "Ì"); //latin capital letter I with grave specials.put("Iacute", "Í"); //latin capital letter I with acute specials.put("Icirc", "Î"); //latin capital letter I with circumflex specials.put("Iuml", "Ï"); // latin capital letter I with diaeresis specials.put("ETH", "Ð"); //latin capital letter ETH specials.put("Ntilde", "Ñ"); //latin capital letter N with tilde specials.put("Ograve", "Ò"); //latin capital letter O with grave specials.put("Oacute", "Ó"); //latin capital letter O with acute specials.put("Ocirc", "Ô"); //latin capital letter O with circumflex specials.put("Otilde", "Õ"); //latin capital letter O with tilde specials.put("Ouml", "Ö"); //latin capital letter O with diaeresis specials.put("times", "×"); //multiplication sign specials.put("Oslash", "Ø"); //latin capital letter O with stroke = latin capital letter O slash specials.put("Ugrave", "Ù"); //latin capital letter U with grave specials.put("Uacute", "Ú"); //latin capital letter U with acute specials.put("Ucirc", "Û"); //latin capital letter U with circumflex specials.put("Uuml", "Ü"); // latin capital letter U with diaeresis specials.put("Yacute", "Ý"); // latin capital letter Y with acute specials.put("THORN", "Þ"); // latin capital letter THORN specials.put("szlig", "ß"); // latin small letter sharp s = ess-zed specials.put("agrave", "à"); // latin small letter a with grave = latin small letter a grave specials.put("aacute", "á"); // latin small letter a with acute specials.put("acirc", "â"); // latin small letter a with circumflex specials.put("atilde", "ã"); // latin small letter a with tilde specials.put("auml", "ä"); //latin small letter a with diaeresis specials.put("aring", "å"); // latin small letter a with ring above = latin small letter a ring specials.put("aelig", "æ"); // latin small letter ae = latin small ligature ae specials.put("ccedil", "ç"); // latin small letter c with cedilla specials.put("egrave", "è"); // latin small letter e with grave specials.put("eacute", "é"); // latin small letter e with acute specials.put("ecirc", "ê"); // latin small letter e with circumflex specials.put("euml", "ë"); // latin small letter e with diaeresis specials.put("igrave", "ì"); // latin small letter i with grave specials.put("iacute", "í"); // latin small letter i with acute specials.put("icirc", "î"); // latin small letter i with circumflex specials.put("iuml", "ï"); //latin small letter i with diaeresis specials.put("eth", "ð"); // latin small letter eth specials.put("ntilde", "ñ"); // latin small letter n with tilde specials.put("ograve", "ò"); // latin small letter o with grave specials.put("oacute", "ó"); // latin small letter o with acute specials.put("ocirc", "ô"); // latin small letter o with circumflex specials.put("otilde", "õ"); // latin small letter o with tilde specials.put("ouml", "ö"); //latin small letter o with diaeresis specials.put("divide", "÷"); // division sign specials.put("oslash", "ø"); // latin small letter o with stroke = latin small letter o slash specials.put("ugrave", "ù"); // latin small letter u with grave specials.put("uacute", "ú"); // latin small letter u with acute specials.put("ucirc", "û"); // latin small letter u with circumflex specials.put("uuml", "ü"); // latin small letter u with diaeresis specials.put("yacute", "ý"); // latin small letter y with acute specials.put("thorn", "þ"); // latin small letter thorn with specials.put("yuml", "ÿ"); // latin small letter y with diaeresis specials.put("quot", "\""); //quotation mark = APL quote specials.put("radic", "√"); //square root = radical sign specials.put("infin", "∞"); //infinity specials.put("cap", "∩"); //intersection = cap specials.put("int", "∫"); //integral xmlSpecials.put("amp", "&"); //ampersand xmlSpecials.put("lt", "<"); //less-than sign xmlSpecials.put("gt", ">"); //greater-than sign // new line list newLine.add("BR"); // new line request newLine.add("br"); newLine.add("BR/"); newLine.add("br/"); newLine.add("br /"); newLine.add("BR /"); newLine.add("/H1"); // end of header newLine.add("/H2"); newLine.add("/H3"); newLine.add("/h1"); newLine.add("/h2"); newLine.add("/h3"); newLine.add("/p"); // end of paragraph newLine.add("/P"); newLine.add("/li"); // end of item list } public static String removeHTMLTags(String htmlString) { StringBuffer buffer = new StringBuffer(); // indicating if parser is in tag boolean inTag = false; // indicating if parser is in special character boolean inSpecial = false; // skip the next character boolean skip = false; // ignore or keep whitespace ? boolean keepWhitespace = true; // ignore or keep whitespace ? boolean keepCarriageReturn = false; int length = htmlString.length(); for(int i = 0; i < length; i++) { skip = false; char c = htmlString.charAt(i); if(c == '<') { // opening a new tag... inTag = true; // should do specific check for new lines (<BR>, <P>, <H1>, // <H2>, etc..) // get tag value String tagValue = htmlString.substring(i + 1, htmlString.indexOf('>', i)); if(newLine.contains(tagValue)) { if(keepCarriageReturn) { buffer.append("\n"); keepCarriageReturn = false; } keepWhitespace = false; } } else if(c == '>' && inTag) { // closing tag. must be in tag to // close it... inTag = false; skip = true; } else if(c == '&') { inSpecial = true; // this is a special character // look for next ';', which closes the special character String specialCharacter = htmlString.substring(i + 1, htmlString.indexOf(';', i)); // replace the value with the specified String replacement = specials.get(specialCharacter); if(replacement == null) { replacement = xmlSpecials.get(specialCharacter); } if(replacement != null) { buffer.append(replacement); } } else if(c == ';' && inSpecial) { inSpecial = false; skip = true; keepWhitespace = true; } else if(c == ' ' || c == '\t') { if(keepWhitespace) { buffer.append(" "); } keepWhitespace = false; } else if(c == '\n' || c == '\r') { if(keepCarriageReturn) { buffer.append("\n"); keepCarriageReturn = false; keepWhitespace = false; } } else if(!skip && !inSpecial && !inTag) { buffer.append(c); keepWhitespace = true; keepCarriageReturn = true; } } return buffer.toString(); } /** * Returns a string derived from the specified string. It removes htlm tags, * adding new line separator when useful. * * @param htmlString * the string to clean. It should be neither <code>null</code>, * nor empty * @return a cleaned string. */ public static String cleanHTMLTags(String htmlString) { StringBuffer buffer = new StringBuffer(); // indicating if parser is in tag boolean inTag = false; // indicating if parser is in special character boolean inSpecial = false; // skip the next character boolean skip = false; // ignore or keep whitespace ? boolean keepWhitespace = true; // ignore or keep whitespace ? boolean keepCarriageReturn = false; int length = htmlString.length(); for(int i = 0; i < length; i++) { skip = false; char c = htmlString.charAt(i); if(c == ' ' || c == '\t') { if(keepWhitespace) { buffer.append(" "); } keepWhitespace = false; } else if(c == '\n' || c == '\r') { if(keepCarriageReturn) { buffer.append("\n"); keepCarriageReturn = false; keepWhitespace = false; } } else if(!skip && !inSpecial && !inTag) { buffer.append(c); keepWhitespace = true; keepCarriageReturn = true; } } return buffer.toString(); } /** * Pre-clean the specified string * * @param htmlString * the string to clean * @return the cleaned string */ public static String preClean(String htmlString) { if(htmlString == null) { return ""; } StringBuffer buffer = new StringBuffer(); // indicating if parser is in special character boolean inSpecial = false; // skip the next character boolean skip = false; int length = htmlString.length(); for(int i = 0; i < length; i++) { skip = false; char c = htmlString.charAt(i); if(c == '&') { inSpecial = true; // this is a special character // look for next ';', which closes the special character String specialCharacter = htmlString.substring(i + 1, htmlString.indexOf(';', i)); // replace the value with the specified String replacement = specials.get(specialCharacter); if(replacement != null) { buffer.append(replacement); } else if(xmlSpecials.get(specialCharacter) != null) { buffer.append("&" + specialCharacter + ";"); } } else if(c == ';' && inSpecial) { inSpecial = false; skip = true; } else if(!skip && !inSpecial) { buffer.append(c); } } return buffer.toString(); } }