CmsEncoder.java example

Explorer
opencms-core-master
/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

package org.opencms.i18n;

import org.opencms.main.CmsLog;
import org.opencms.main.OpenCms;
import org.opencms.util.CmsStringUtil;

import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;

/**
 * The OpenCms CmsEncoder class provides static methods to decode and encode data.<p>
 * 
 * The methods in this class are substitutes for <code>java.net.URLEncoder.encode()</code> and
 * <code>java.net.URLDecoder.decode()</code>. Use the methods from this class in all OpenCms 
 * core classes to ensure the encoding is always handled the same way.<p>
 * 
 * The de- and encoding uses the same coding mechanism as JavaScript, special characters are
 * replaced with <code>%hex</code> where hex is a two digit hex number.<p>
 * 
 * <b>Note:</b> On the client side (browser), instead of using the corresponding <code>escape</code>
 * and <code>unescape</code> JavaScript functions, better use the <code>encodeURIComponent</code> and
 * <code>decodeURIComponent</code> functions, which work properly with unicode characters.
 * These functions are supported in IE 5.5+ and NS 6+ only.<p>
 * 
 * @since 6.0.0 
 */
public final class CmsEncoder {

    /** Constant for the standard <code>ISO-8859-1</code> encoding. */
    public static final String ENCODING_ISO_8859_1 = "ISO-8859-1";

    /** Constant for the standard <code>US-ASCII</code> encoding. */
    public static final String ENCODING_US_ASCII = "US-ASCII";

    /** 
     * Constant for the standard <code>UTF-8</code> encoding.<p>
     * 
     * Default encoding for JavaScript decodeUriComponent methods is <code>UTF-8</code> by w3c standard.
     * This is also the fallback used by the <code>decode</code> and <code>encode</code> methods of this class.
     */
    public static final String ENCODING_UTF_8 = "UTF-8";

    /** 
     * The regex pattern to match decimal numeric HTML entities like <code>&amp;#8364;</code>.<p>
     * 
     * Named entities and hexadecimal entities are not matched by this pattern.
     */
    private static final Pattern ENTITIY_PATTERN = Pattern.compile("\\&#\\d+;");

    /** The prefix for (numeric) HTML entities. */
    private static final String ENTITY_PREFIX = "&#";

    /** 
     * The replacement for the HTML entity prefix in parameters, 
     * see {@link #encodeParameter(String)} and {@link #decodeParameter(String)}. 
     */
    private static final String ENTITY_REPLACEMENT = "$$";

    /** The log object for this class. */
    private static final Log LOG = CmsLog.getLog(CmsEncoder.class);

    /** 
     * A cache for encoding name lookup, maps the name passed to {@link #lookupEncoding(String, String)}
     * to the canonical charset name.<p>
     * 
     * NOTE(review): this is a plain HashMap that is read and written from a static method
     * without synchronization - confirm if concurrent access needs to be handled.
     */
    private static Map<String, String> m_encodingCache = new HashMap<String, String>(16);

    /** The numeric HTML entity for the plus sign, used by {@link #encodeParameter(String)}. */
    private static final String PLUS_ENTITY = ENTITY_PREFIX + "043;";

    /**
     * Hidden constructor, all methods of this class are static so no instance is required.<p>
     */
    private CmsEncoder() {

        // empty
    }

    /**
     * Normalizes the HTML entities in the given String for the given charset.<p>
     * 
     * Characters that can be represented in the charset end up as plain chars, 
     * all other characters end up as numeric HTML entities.<p> 
     * 
     * This is done by first passing the input through {@link #decodeHtmlEntities(String, String)}
     * and then passing the intermediate result through {@link #encodeHtmlEntities(String, String)}.<p>
     *  
     * @param input the input to adjust the HTML encoding for
     * @param encoding the charset to encode the result with
     * 
     * @return the input with the decoded/encoded HTML entities
     */
    public static String adjustHtmlEncoding(String input, String encoding) {

        String decoded = decodeHtmlEntities(input, encoding);
        return encodeHtmlEntities(decoded, encoding);
    }

    /**
     * Converts a byte array that represents a String from one encoding to another.<p>
     * 
     * The input array is returned unchanged if one of the encoding names is <code>null</code>,
     * if both names are equal (ignoring case and surrounding white space), 
     * or if one of the encodings is not supported.<p>
     * 
     * @param input the byte array to convert
     * @param oldEncoding the current encoding of the byte array
     * @param newEncoding the new encoding of the byte array
     * 
     * @return the byte array encoded in the new encoding
     */
    public static byte[] changeEncoding(byte[] input, String oldEncoding, String newEncoding) {

        boolean conversionRequired = (oldEncoding != null)
            && (newEncoding != null)
            && !oldEncoding.trim().equalsIgnoreCase(newEncoding.trim());
        if (!conversionRequired) {
            return input;
        }
        try {
            String decoded = new String(input, oldEncoding);
            return decoded.getBytes(newEncoding);
        } catch (UnsupportedEncodingException e) {
            // best effort: an unknown encoding leaves the input untouched
            return input;
        }
    }

    /**
     * Creates a String out of a byte array with the specified encoding, falling back
     * to the system default in case the encoding name is <code>null</code> or not valid.<p>
     * 
     * Use this method as a replacement for <code>new String(byte[], encoding)</code>
     * to avoid possible encoding problems.<p>
     * 
     * @param bytes the bytes to decode 
     * @param encoding the encoding scheme to use for decoding the bytes
     * 
     * @return the bytes decoded to a String, or <code>null</code> in the (practically impossible) 
     *      case that the resolved encoding is rejected by the VM
     */
    public static String createString(byte[] bytes, String encoding) {

        String defaultEncoding = OpenCms.getSystemInfo().getDefaultEncoding();
        String enc = encoding;
        if ((enc != null) && !enc.equals(defaultEncoding)) {
            // resolve aliases to the canonical name, result is null if the charset is not supported
            enc = lookupEncoding(enc, null);
        }
        if (enc == null) {
            if (LOG.isWarnEnabled()) {
                LOG.warn(Messages.get().getBundle().key(Messages.ERR_UNSUPPORTED_VM_ENCODING_1, encoding));
            }
            enc = defaultEncoding;
        }
        try {
            return new String(bytes, enc);
        } catch (UnsupportedEncodingException e) {
            // should not happen, the charset was either looked up first or is the system default
            LOG.error(Messages.get().getBundle().key(Messages.ERR_ENCODING_ISSUES_1, encoding), e);
            return null;
        }
    }

    /**
     * Decodes a String using UTF-8 encoding, which is the standard for http data transmission
     * with GET and POST requests.<p>
     * 
     * Shortcut for {@link #decode(String, String)} with {@link #ENCODING_UTF_8} as encoding.<p>
     * 
     * @param source the String to decode
     * 
     * @return String the decoded source String
     */
    public static String decode(String source) {

        final String charsetName = ENCODING_UTF_8;
        return decode(source, charsetName);
    }

    /**
     * This method is a substitute for <code>URLDecoder.decode()</code>.
     * Use this in all OpenCms core classes to ensure the encoding is
     * always handled the same way.<p>
     * 
     * In case you don't know what encoding to use, set the value of 
     * the <code>encoding</code> parameter to <code>null</code>. 
     * This method will then default to UTF-8 encoding, which is probably the right one.
     * UTF-8 is also used in case the given encoding is not supported.<p>
     * 
     * @param source The string to decode
     * @param encoding The encoding to use (if <code>null</code>, UTF-8 is used)
     * 
     * @return The decoded source String, <code>null</code> if the source was <code>null</code>
     */
    public static String decode(String source, String encoding) {

        if (source == null) {
            return null;
        }
        // the requested encoding (if any) is tried first, UTF-8 is always the last resort
        String[] candidates;
        if (encoding == null) {
            candidates = new String[] {ENCODING_UTF_8};
        } else {
            candidates = new String[] {encoding, ENCODING_UTF_8};
        }
        for (String candidate : candidates) {
            try {
                return URLDecoder.decode(source, candidate);
            } catch (UnsupportedEncodingException e) {
                // this candidate is not supported, continue with the next one
            }
        }
        // no candidate was usable, hand back the input unchanged
        return source;
    }

    /**
     * Decodes decimal HTML entity references like <code>&amp;#8364;</code> that are contained in the 
     * String to a regular character, but only if that character is contained in the given 
     * encodings charset.<p> 
     * 
     * Entities that can not be decoded (value out of the Unicode range, 
     * or character not available in the charset) are left unchanged in the result.
     * Entities for characters outside of the basic multilingual plane are decoded to the
     * corresponding surrogate pair.<p>
     * 
     * @param input the input to decode the HTML entities in
     * @param encoding the charset to decode the input for
     * @return the input with the decoded HTML entities
     * 
     * @see #encodeHtmlEntities(String, String)
     */
    public static String decodeHtmlEntities(String input, String encoding) {

        Matcher matcher = ENTITIY_PATTERN.matcher(input);
        StringBuffer result = new StringBuffer(input.length());
        Charset charset = Charset.forName(encoding);
        CharsetEncoder encoder = charset.newEncoder();

        while (matcher.find()) {
            String entity = matcher.group();
            // strip the leading "&#" and the trailing ";"
            String value = entity.substring(2, entity.length() - 1);
            int c;
            try {
                c = Integer.parseInt(value);
            } catch (NumberFormatException e) {
                // digit sequence too long for an int, this can not be a valid code point
                c = -1;
            }
            String replacement = entity;
            if ((c >= 0) && (c < 128)) {
                // first 128 chars are contained in almost every charset
                // this is intended as performance improvement since 
                // the canEncode() operation appears quite CPU heavy
                replacement = String.valueOf((char)c);
            } else if (Character.isValidCodePoint(c)) {
                // may be a surrogate pair in case of a supplementary code point
                String decoded = new String(Character.toChars(c));
                if (encoder.canEncode(decoded)) {
                    replacement = decoded;
                }
            }
            // quote the replacement, otherwise '$' and '\' are interpreted by the matcher
            matcher.appendReplacement(result, Matcher.quoteReplacement(replacement));
        }
        matcher.appendTail(result);
        return result.toString();
    }

    /**
     * Decodes a string used as parameter in an uri in a way independent of other encodings/decodings applied before.<p>
     * 
     * The entity prefix replacement is first turned back into the HTML entity prefix, 
     * then the HTML entities are decoded using the system default encoding.<p>
     * 
     * @param input the encoded parameter string
     * 
     * @return the decoded parameter string
     * 
     * @see #encodeParameter(String)
     */
    public static String decodeParameter(String input) {

        String withEntities = CmsStringUtil.substitute(input, ENTITY_REPLACEMENT, ENTITY_PREFIX);
        String systemEncoding = OpenCms.getSystemInfo().getDefaultEncoding();
        return decodeHtmlEntities(withEntities, systemEncoding);
    }

    /**
     * Encodes a String using UTF-8 encoding, which is the standard for http data transmission
     * with GET and POST requests.<p>
     * 
     * Shortcut for {@link #encode(String, String)} with {@link #ENCODING_UTF_8} as encoding.<p>
     * 
     * @param source the String to encode
     * 
     * @return String the encoded source String
     */
    public static String encode(String source) {

        final String charsetName = ENCODING_UTF_8;
        return encode(source, charsetName);
    }

    /**
     * This method is a substitute for <code>URLEncoder.encode()</code>.
     * Use this in all OpenCms core classes to ensure the encoding is
     * always handled the same way.<p>
     * 
     * In case you don't know what encoding to use, set the value of 
     * the <code>encoding</code> parameter to <code>null</code>. 
     * This method will then default to UTF-8 encoding, which is probably the right one.
     * UTF-8 is also used in case the given encoding is not supported.<p>
     * 
     * @param source the String to encode
     * @param encoding the encoding to use (if <code>null</code>, UTF-8 is used)
     * 
     * @return the encoded source String, <code>null</code> if the source was <code>null</code>
     */
    public static String encode(String source, String encoding) {

        if (source == null) {
            return null;
        }
        // the requested encoding (if any) is tried first, UTF-8 is always the last resort
        String[] candidates;
        if (encoding == null) {
            candidates = new String[] {ENCODING_UTF_8};
        } else {
            candidates = new String[] {encoding, ENCODING_UTF_8};
        }
        for (String candidate : candidates) {
            try {
                return URLEncoder.encode(source, candidate);
            } catch (UnsupportedEncodingException e) {
                // this candidate is not supported, continue with the next one
            }
        }
        // no candidate was usable, hand back the input unchanged
        return source;
    }

    /**
     * Encodes all characters that are contained in the String which can not be displayed 
     * in the given encodings charset with HTML entity references
     * like <code>&amp;#8364;</code>.<p>
     * 
     * This is required since a Java String is 
     * internally always stored as Unicode, meaning it can contain almost every character, but 
     * the HTML charset used might not support all such characters.<p>
     * 
     * The input is processed by Unicode code point, so a character outside of the basic 
     * multilingual plane (stored as a surrogate pair) is either kept as it is, or 
     * written as one single entity with the value of the code point.<p>
     * 
     * @param input the input to encode for HTML
     * @param encoding the charset to encode the result with
     * 
     * @return the input with the encoded HTML entities
     * 
     * @see #decodeHtmlEntities(String, String)
     */
    public static String encodeHtmlEntities(String input, String encoding) {

        StringBuffer result = new StringBuffer(input.length() * 2);
        Charset charset = Charset.forName(encoding);
        CharsetEncoder encoder = charset.newEncoder();
        int length = input.length();
        int i = 0;
        while (i < length) {
            // a valid surrogate pair is read as one code point, an unpaired surrogate as itself
            int c = input.codePointAt(i);
            int next = i + Character.charCount(c);
            if (c < 128) {
                // first 128 chars are contained in almost every charset
                // this is intended as performance improvement since 
                // the canEncode() operation appears quite CPU heavy
                result.append((char)c);
            } else if (encoder.canEncode(input.subSequence(i, next))) {
                // encoder can encode this character, copy it (one or two chars) unchanged
                result.append(input, i, next);
            } else {
                // append HTML entity reference
                result.append(ENTITY_PREFIX);
                result.append(c);
                result.append(";");
            }
            i = next;
        }
        return result.toString();
    }

    /**
     * Encodes all characters that are contained in the String which can not be displayed 
     * in the given encodings charset with Java escaping like <code>&#92;u20ac</code>.<p>
     * 
     * The escape always consists of the prefix followed by the hex value of the char 
     * in lower case, left padded with zeros to 4 digits.
     * This can be used to escape values used in Java property files.<p>
     * 
     * @param input the input to encode for Java
     * @param encoding the charset to encode the result with
     * 
     * @return the input with the encoded Java entities
     */
    public static String encodeJavaEntities(String input, String encoding) {

        StringBuffer escaped = new StringBuffer(input.length() * 2);
        CharsetEncoder targetEncoder = Charset.forName(encoding).newEncoder();
        for (char current : input.toCharArray()) {
            // chars below 128 are contained in almost every charset, so the
            // CPU heavy canEncode() check is skipped for them
            if ((current < 128) || targetEncoder.canEncode(current)) {
                escaped.append(current);
                continue;
            }
            // not representable in the charset: write the Java escape sequence
            String hex = Integer.toHexString(current);
            escaped.append("\\u");
            for (int missing = 4 - hex.length(); missing > 0; missing--) {
                escaped.append('0');
            }
            escaped.append(hex);
        }
        return escaped.toString();
    }

    /**
     * Encodes a string used as parameter in an uri in a way independent of other encodings/decodings applied later.<p>
     * 
     * Used to ensure that GET parameters are not wrecked by wrong or incompatible configuration settings.
     * In order to ensure this, the String is first encoded with HTML entities for any character that 
     * can not be encoded in US-ASCII; additionally, the plus sign is also encoded to avoid problems 
     * with the white-space replacer.
     * Finally, the entity prefix is replaced with characters not used as delimiters in urls.<p>
     * 
     * @param input the parameter string
     * 
     * @return the encoded parameter string
     * 
     * @see #decodeParameter(String)
     */
    public static String encodeParameter(String input) {

        String asciiOnly = encodeHtmlEntities(input, ENCODING_US_ASCII);
        String plusEncoded = CmsStringUtil.substitute(asciiOnly, "+", PLUS_ENTITY);
        return CmsStringUtil.substitute(plusEncoded, ENTITY_PREFIX, ENTITY_REPLACEMENT);
    }

    /**
     * Encodes a String in a way that is compatible with the JavaScript escape function.<p>
     * 
     * The source is URL encoded with {@link #encode(String, String)}, after that every 
     * <code>+</code> in the result is replaced with <code>%20</code>.<p>
     * 
     * @param source The text to be encoded
     * @param encoding the encoding type
     * 
     * @return The JavaScript escaped string
     */
    public static String escape(String source, String encoding) {

        // the standard encode call turns a blank into "+" and not into "%20"
        String urlEncoded = encode(source, encoding);
        return CmsStringUtil.substitute(urlEncoded, "+", "%20");
    }

    /**
     * Escapes special characters in a HTML-String with their number-based 
     * entity representation, for example &amp; becomes &amp;#38;.<p>
     * 
     * A character <code>ch</code> is replaced if<br>
     * {@code ((ch != 32) && ((ch > 122) || (ch < 48) || (ch == 60) || (ch == 62)))}<p>
     * 
     * Entities that are already present in the source (decimal numeric entities as well as
     * <code>&amp;lt;</code>, <code>&amp;gt;</code>, <code>&amp;amp;</code> and <code>&amp;quot;</code>) 
     * are copied unchanged so they are not escaped twice.
     * The source is processed by Unicode code point, so a character stored as a surrogate 
     * pair results in one single entity.<p>
     * 
     * @param source the String to escape
     * 
     * @return String the escaped String, <code>null</code> if the source was <code>null</code>
     * 
     * @see #escapeXml(String)
     */
    public static String escapeHtml(String source) {

        if (source == null) {
            return null;
        }
        int length = source.length();
        StringBuffer result = new StringBuffer(length * 2);
        int i = 0;
        while (i < length) {
            int ch = source.codePointAt(i);
            // avoid escaping already escaped characters            
            if (ch == 38) {
                int terminatorIndex = source.indexOf(";", i);
                // "quote" is not a HTML entity name but is still accepted for backward compatibility
                if ((terminatorIndex > 0)
                    && source.substring(i + 1, terminatorIndex).matches("#[0-9]+|lt|gt|amp|quot|quote")) {
                    result.append(source, i, terminatorIndex + 1);
                    // continue after the terminating ";"
                    i = terminatorIndex + 1;
                    continue;
                }
            }
            if ((ch != 32) && ((ch > 122) || (ch < 48) || (ch == 60) || (ch == 62))) {
                result.append(ENTITY_PREFIX);
                result.append(ch);
                result.append(";");
            } else {
                // only reached for values of 122 or below, so the cast is safe
                result.append((char)ch);
            }
            i += Character.charCount(ch);
        }
        return result.toString();
    }

    /**
     * Escapes all characters above the ISO-8859-1 (Latin-1) range in a HTML-String with their 
     * number-based entity representation, for example the Euro sign becomes &amp;#8364;.<p>
     * 
     * A character <code>ch</code> is replaced if<br>
     * {@code (ch > 255)}<p>
     * 
     * Please note that despite the name of this method, the characters from 128 to 255 
     * are <b>not</b> escaped.
     * The source is processed by Unicode code point, so a character stored as a surrogate 
     * pair results in one single entity.<p>
     * 
     * @param source the String to escape
     * 
     * @return String the escaped String, <code>null</code> if the source was <code>null</code>
     * 
     * @see #escapeXml(String)
     */
    public static String escapeNonAscii(String source) {

        if (source == null) {
            return null;
        }
        int length = source.length();
        StringBuffer result = new StringBuffer(length * 2);
        int i = 0;
        while (i < length) {
            // a valid surrogate pair is read as one code point, an unpaired surrogate as itself
            int ch = source.codePointAt(i);
            if (ch > 255) {
                result.append(ENTITY_PREFIX);
                result.append(ch);
                result.append(";");
            } else {
                result.append((char)ch);
            }
            i += Character.charCount(ch);
        }
        return result.toString();
    }

    /**
     * A simple method to avoid injection.<p>
     * 
     * Replaces all single quotes with double single quotes in the value parameter of the SQL statement.<p> 
     * 
     * This only handles the single quote, so whenever possible use a 
     * <code>java.sql.PreparedStatement</code> with bound parameters instead.<p>
     * 
     * @param source the String to escape SQL from
     * @return the escaped value of the parameter source, <code>null</code> if the source was <code>null</code>
     */
    public static String escapeSql(String source) {

        if (source == null) {
            return null;
        }
        // literal replacement, no regular expression is required here
        return source.replace("'", "''");
    }

    /**
     * Escapes the wildcard characters in a string which will be used as the pattern for a SQL LIKE clause.<p>
     * 
     * Every occurrence of the escape character itself, of <code>%</code> and of <code>_</code>
     * is prefixed with the escape character. All characters are treated literally, so any
     * escape character can be used, including the backslash or the dollar sign.<p>
     * 
     * @param pattern the pattern 
     * @param escapeChar the character which should be used as the escape character 
     * 
     * @return the escaped pattern, <code>null</code> if the pattern was <code>null</code>
     */
    public static String escapeSqlLikePattern(String pattern, char escapeChar) {

        if (pattern == null) {
            return null;
        }
        StringBuilder result = new StringBuilder(pattern.length() + 8);
        for (int i = 0; i < pattern.length(); i++) {
            char c = pattern.charAt(i);
            // single pass over the input, so an inserted escape char is never escaped again
            if ((c == escapeChar) || (c == '%') || (c == '_')) {
                result.append(escapeChar);
            }
            result.append(c);
        }
        return result.toString();
    }

    /**
     * Encodes a String in a way that is compatible with the JavaScript escape function.
     * Multiple blanks are encoded <i>multiply</i> with <code>%20</code>.<p>
     * 
     * An empty source (as checked by <code>CmsStringUtil.isEmpty</code>) is returned unchanged.<p>
     * 
     * @param source The text to be encoded
     * @param encoding the encoding type
     * 
     * @return The JavaScript escaped string
     */
    public static String escapeWBlanks(String source, String encoding) {

        if (CmsStringUtil.isEmpty(source)) {
            return source;
        }
        StringBuffer escaped = new StringBuffer(source.length() * 2);

        // URL encoding the text produces a result very similar to the JavaScript encoding, 
        // except for the blank, which is encoded as "+" and not as "%20"
        String urlEncoded = encode(source, encoding);

        // copy the text between the "+" signs, writing "%20" for every "+" found
        int start = 0;
        int plus = urlEncoded.indexOf('+');
        while (plus >= 0) {
            escaped.append(urlEncoded, start, plus);
            escaped.append("%20");
            start = plus + 1;
            plus = urlEncoded.indexOf('+', start);
        }
        escaped.append(urlEncoded, start, urlEncoded.length());
        return escaped.toString();
    }

    /**
     * Escapes a String so it may be printed as text content or attribute
     * value in a HTML page or an XML file.<p>
     * 
     * This method replaces the following characters in a String:
     * <ul>
     * <li><b>&lt;</b> with &amp;lt;
     * <li><b>&gt;</b> with &amp;gt;
     * <li><b>&amp;</b> with &amp;amp;
     * <li><b>&quot;</b> with &amp;quot;
     * </ul><p>
     * 
     * Shortcut for {@link #escapeXml(String, boolean)} with <code>false</code> as second parameter.<p>
     * 
     * @param source the string to escape
     * 
     * @return the escaped string
     * 
     * @see #escapeHtml(String)
     */
    public static String escapeXml(String source) {

        final boolean doubleEscape = false;
        return escapeXml(source, doubleEscape);
    }

    /**
     * Escapes a String so it may be printed as text content or attribute
     * value in a HTML page or an XML file.<p>
     * 
     * This method replaces the following characters in a String:
     * <ul>
     * <li><b><</b> with &lt;
     * <li><b>></b> with &gt;
     * <li><b>&</b> with &amp;
     * <li><b>"</b> with &quot;
     * </ul><p>
     * 
     * @param source the string to escape
     * @param doubleEscape if <code>false</code>, all entities that already are escaped are left untouched
     * 
     * @return the escaped string
     * 
     * @see #escapeHtml(String)
     */
    public static String escapeXml(String source, boolean doubleEscape) {

        if (source == null) {
            return null;
        }
        StringBuffer result = new StringBuffer(source.length() * 2);

        for (int i = 0; i < source.length(); ++i) {
            char ch = source.charAt(i);
            switch (ch) {
                case '<':
                    result.append("<");
                    break;
                case '>':
                    result.append(">");
                    break;
                case '&':
                    // don't escape already escaped international and special characters
                    if (!doubleEscape) {
                        int terminatorIndex = source.indexOf(";", i);
                        if (terminatorIndex > 0) {
                            if (source.substring(i + 1, terminatorIndex).matches("#[0-9]+")) {
                                result.append(ch);
                                break;
                            }
                        }
                    }
                    // note that to other "break" in the above "if" block
                    result.append("&");
                    break;
                case '"':
                    result.append(""");
                    break;
                default:
                    result.append(ch);
            }
        }
        return new String(result);
    }

    /**
     * Checks if a given encoding name is actually supported, and if so
     * resolves it to its canonical name, if not it returns the given fallback 
     * value.<p> 
     * 
     * Charsets have a set of aliases. For example, valid aliases for "UTF-8"
     * are "UTF8", "utf-8" or "utf8". This method resolves any given valid charset name 
     * to its "canonical" form, so that simple String comparison can be used
     * when checking charset names internally later.<p>
     * 
     * Successfully resolved names are kept in an internal cache, failed lookups are not cached.<p>
     * 
     * Please see <a href="http://www.iana.org/assignments/character-sets">http://www.iana.org/assignments/character-sets</a> 
     * for a list of valid charset alias names.<p>
     * 
     * @param encoding the encoding to check and resolve
     * @param fallback the fallback encoding scheme
     * 
     * @return the resolved encoding name, or the fallback value
     */
    public static String lookupEncoding(String encoding, String fallback) {

        String cached = m_encodingCache.get(encoding);
        if (cached == null) {
            try {
                String canonical = Charset.forName(encoding).name();
                m_encodingCache.put(encoding, canonical);
                return canonical;
            } catch (Throwable t) {
                // the name could not be resolved, use the fallback value
                return fallback;
            }
        }
        return cached;
    }

    /**
     * Re-decodes a String that has not been correctly decoded and thus has scrambled
     * character bytes.<p>
     * 
     * This is an equivalent to the JavaScript "decodeURIComponent" function.
     * It converts from the default "UTF-8" to the currently selected system encoding.<p>
     * 
     * Please note that both the bytes of the input and the resulting String are created 
     * with the no-argument <code>getBytes()</code> / <code>new String(byte[])</code> calls,
     * which means the platform default charset of the VM is used for these steps.<p>
     * 
     * @param input the String to convert
     * 
     * @return String the converted String, <code>null</code> if the input was <code>null</code>
     */
    public static String redecodeUriComponent(String input) {

        if (input == null) {
            return null;
        }
        byte[] raw = input.getBytes();
        byte[] converted = changeEncoding(raw, ENCODING_UTF_8, OpenCms.getSystemInfo().getDefaultEncoding());
        return new String(converted);
    }

    /**
     * Decodes a String in a way that is compatible with the JavaScript 
     * unescape function.<p>
     * 
     * Every <code>+</code> in the source is replaced with <code>%20</code> first, 
     * the result is then decoded with {@link #decode(String, String)}.<p>
     * 
     * @param source The String to be decoded
     * @param encoding the encoding type
     * 
     * @return The JavaScript unescaped String, <code>null</code> if the source was <code>null</code>
     */
    public static String unescape(String source, String encoding) {

        if (source == null) {
            return null;
        }
        // to use standard decoder we need to replace '+' with "%20" (space)
        StringBuffer prepared = new StringBuffer(source.length());
        for (char current : source.toCharArray()) {
            if (current != '+') {
                prepared.append(current);
            } else {
                prepared.append("%20");
            }
        }
        return decode(prepared.toString(), encoding);
    }
}