/* * This library is part of OpenCms - * the Open Source Content Management System * * Copyright (c) Alkacon Software GmbH (http://www.alkacon.com) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * For further information about Alkacon Software GmbH, please see the * company website: http://www.alkacon.com * * For further information about OpenCms, please see the * project website: http://www.opencms.org * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package org.opencms.util; import org.opencms.file.CmsObject; import org.opencms.file.CmsProperty; import org.opencms.file.CmsPropertyDefinition; import org.opencms.file.CmsResource; import org.opencms.i18n.CmsEncoder; import org.opencms.main.CmsException; import org.opencms.main.CmsLog; import org.opencms.main.OpenCms; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import org.apache.commons.logging.Log; /** * HTML cleaner and pretty printer.<p> * * Used to clean up HTML code (e.g. remove word tags) and optionally create XHTML from HTML.<p> * * @since 6.0.0 */ public class CmsHtmlConverter { /** Parameter value for disabled mode. **/ public static final String PARAM_DISABLED = CmsStringUtil.FALSE; /** Parameter value for enabled mode. **/ public static final String PARAM_ENABLED = CmsStringUtil.TRUE; /** Parameter value for replace paragraph mode. */ public static final String PARAM_REPLACE_PARAGRAPHS = "replace-paragraphs"; /** Parameter value for WORD mode. **/ public static final String PARAM_WORD = "cleanup"; /** Parameter value for XHTML mode. **/ public static final String PARAM_XHTML = "xhtml"; /** The separator used for the configured modes String. */ public static final char SEPARATOR_MODES = ';'; /** The log object for this class. */ private static final Log LOG = CmsLog.getLog(CmsHtmlConverter.class); /** The encoding used for the HTML code conversion. */ private String m_encoding; /** The conversion mode for the converter. */ private String m_mode; /** * Constructor, creates a new CmsHtmlConverter.<p> * * The encoding used by default is {@link CmsEncoder#ENCODING_UTF_8}.<p> */ public CmsHtmlConverter() { init(CmsEncoder.ENCODING_UTF_8, PARAM_ENABLED); } /** * Constructor, creates a new CmsHtmlConverter.<p> * * Possible values for the default conversion mode are:<ul> * <li>{@link #PARAM_DISABLED}: The conversion is disabled.</li> * <li>{@link #PARAM_ENABLED}: Conversion is enabled without transformation, so HTML is pretty printed only.</li> * <li>{@link #PARAM_XHTML}: Conversion from HTML to XHTML is enabled.</li> * <li>{@link #PARAM_WORD}: Cleanup of word like HTML tags is enabled.</li> * <li>Other values can be used by the implementing converter class.</li> * </ul> * Values can be combined with the <code>;</code> separator, so it is e.g. possible to convert * to XHTML and clean from word at the same time.<p> * * @param encoding the encoding used for the HTML code conversion * @param mode the conversion mode to use */ public CmsHtmlConverter(String encoding, String mode) { init(encoding, mode); } /** * Reads the content conversion property of a given resource and returns its value.<p> * * A default value (disabled) is returned if the property could not be read.<p> * * @param cms the CmsObject * @param resource the resource in the VFS * @return the content conversion property value */ public static String getConversionSettings(CmsObject cms, CmsResource resource) { // read the content-conversion property String contentConversion; try { String resourceName = cms.getSitePath(resource); CmsProperty contentConversionProperty = cms.readPropertyObject( resourceName, CmsPropertyDefinition.PROPERTY_CONTENT_CONVERSION, true); contentConversion = contentConversionProperty.getValue(CmsHtmlConverter.PARAM_DISABLED); } catch (CmsException e) { // if there was an error reading the property, choose a default value contentConversion = CmsHtmlConverter.PARAM_DISABLED; } return contentConversion; } /** * Tests if the content conversion is enabled.<p> * * @param conversionMode the content conversion mode string * @return true or false */ public static boolean isConversionEnabled(String conversionMode) { boolean value = true; if ((conversionMode == null) || (conversionMode.indexOf(PARAM_DISABLED) != -1)) { value = false; } return value; } /** * Converts the given HTML code according to the settings of this converter.<p> * * @param htmlInput HTML input stored in an array of bytes * @return array of bytes containing the converted HTML * * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported */ public byte[] convertToByte(byte[] htmlInput) throws UnsupportedEncodingException { return convertToByte(new String(htmlInput, getEncoding())); } /** * Converts the given HTML code according to the settings of this converter.<p> * * @param htmlInput HTML input stored in a string * @return array of bytes containing the converted HTML * * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported */ public byte[] convertToByte(String htmlInput) throws UnsupportedEncodingException { return convertToString(htmlInput).getBytes(getEncoding()); } /** * Converts the given HTML code according to the settings of this converter.<p> * * If an any error occurs during the conversion process, the original input is returned unmodified.<p> * * @param htmlInput HTML input stored in an array of bytes * @return array of bytes containing the converted HTML */ public byte[] convertToByteSilent(byte[] htmlInput) { try { return convertToByte(htmlInput); } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e); } return htmlInput; } } /** * Converts the given HTML code according to the settings of this converter.<p> * * If an any error occurs during the conversion process, the original input is returned unmodified.<p> * * @param htmlInput HTML input stored in a string * @return array of bytes containing the converted HTML */ public byte[] convertToByteSilent(String htmlInput) { try { return convertToByte(htmlInput.getBytes(getEncoding())); } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e); } try { return htmlInput.getBytes(getEncoding()); } catch (UnsupportedEncodingException e1) { if (LOG.isWarnEnabled()) { LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e1); } return htmlInput.getBytes(); } } } /** * Converts the given HTML code according to the settings of this converter.<p> * * @param htmlInput HTML input stored in an array of bytes * @return string containing the converted HTML * * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported */ public String convertToString(byte[] htmlInput) throws UnsupportedEncodingException { return convertToString(new String(htmlInput, getEncoding())); } /** * Converts the given HTML code according to the settings of the converter.<p> * * @param htmlInput HTML input stored in a string * @return string containing the converted HTML * * @throws UnsupportedEncodingException if the encoding set for the conversion is not supported */ public String convertToString(String htmlInput) throws UnsupportedEncodingException { // first: collect all converter classes to use on the input Map converters = new HashMap(); for (Iterator i = getModes().iterator(); i.hasNext();) { String mode = (String)i.next(); String converterClass = OpenCms.getResourceManager().getHtmlConverter(mode); List modes = new ArrayList(); if (converters.containsKey(converterClass)) { // converter class already defined for a previous mode, get mode list modes = (List)converters.get(converterClass); } // add mode name to list for the converter modes.add(mode); // store converter with modes in map converters.put(converterClass, modes); } // second: convert the content with all found converter classes for (Iterator i = converters.entrySet().iterator(); i.hasNext();) { Map.Entry entry = (Map.Entry)i.next(); String className = (String)entry.getKey(); List modes = (List)entry.getValue(); try { I_CmsHtmlConverter converter = (I_CmsHtmlConverter)Class.forName(className).newInstance(); // initialize converter converter.init(getEncoding(), modes); // convert input String htmlInput = converter.convertToString(htmlInput); } catch (ClassNotFoundException e) { LOG.error(org.opencms.loader.Messages.get().getBundle().key( org.opencms.loader.Messages.LOG_HTML_CONVERTER_CLASS_NOT_FOUND_1, className), e); } catch (IllegalAccessException e) { LOG.error(org.opencms.loader.Messages.get().getBundle().key( org.opencms.loader.Messages.LOG_HTML_CONVERTER_CLASS_NOT_FOUND_1, className), e); } catch (InstantiationException e) { LOG.error(org.opencms.loader.Messages.get().getBundle().key( org.opencms.loader.Messages.LOG_HTML_CONVERTER_CLASS_NOT_FOUND_1, className), e); } } return htmlInput; } /** * Converts the given HTML code according to the settings of this converter.<p> * * If an any error occurs during the conversion process, the original input is returned unmodified.<p> * * @param htmlInput HTML input stored in an array of bytes * * @return string containing the converted HTML */ public String convertToStringSilent(byte[] htmlInput) { try { return convertToString(htmlInput); } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e); } try { return new String(htmlInput, getEncoding()); } catch (UnsupportedEncodingException e1) { if (LOG.isWarnEnabled()) { LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e1); } return new String(htmlInput); } } } /** * Converts the given HTML code according to the settings of this converter.<p> * * If an any error occurs during the conversion process, the original input is returned unmodified.<p> * * @param htmlInput HTML input stored in string * * @return string containing the converted HTML */ public String convertToStringSilent(String htmlInput) { try { return convertToString(htmlInput); } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn(Messages.get().getBundle().key(Messages.LOG_CONVERSION_BYTE_FAILED_0), e); } return htmlInput; } } /** * Returns the encoding used for the HTML code conversion.<p> * * @return the encoding used for the HTML code conversion */ public String getEncoding() { return m_encoding; } /** * Returns the conversion mode to use.<p> * * @return the conversion mode to use */ public String getMode() { return m_mode; } /** * Returns the conversion modes to use as List of String parameters.<p> * * @return the conversion modes to use as List of String parameters */ private List getModes() { List modes = new ArrayList(); try { modes = CmsStringUtil.splitAsList(getMode(), SEPARATOR_MODES, true); } catch (Exception e) { // error generating list, an empty list will be returned } return modes; } /** * Initializes the HTML converter instance.<p> * * Possible values for the conversion mode are dependent from the converter implementation.<p> * * Values can be combined with the <code>;</code> separator, so that it is e.g. possible to convert * to XHTML and clean from word at the same time.<p> * * @param encoding the encoding used for the HTML code conversion * @param mode the conversion mode to use */ private void init(String encoding, String mode) { if (encoding == null) { m_encoding = CmsEncoder.ENCODING_UTF_8; } else { m_encoding = encoding; } if (CmsStringUtil.isEmptyOrWhitespaceOnly(mode)) { m_mode = ""; } else { m_mode = mode; } } }