// EscapeChars.java example
//
// Explorer
// kbot-master
// - src
/*	
	Copyright 2012 Jan Ove Saltvedt
	
	This file is part of KBot.

    KBot is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    KBot is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with KBot.  If not, see <http://www.gnu.org/licenses/>.
	
*/



package com.kbotpro.utils;

import java.net.URLEncoder;
import java.io.UnsupportedEncodingException;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

/**
* Convenience methods for escaping special characters related to HTML, XML,
* URLs, and regular expressions.
*
* <P>This class originates from WEB4J. To keep you safe by default, WEB4J goes
* to some effort to escape characters in your data when appropriate, such that
* you <em>usually</em> don't need to think too much about escaping special
* characters. Thus, you shouldn't need to <em>directly</em> use the services of
* this class very often.
*
* <P><span class='highlight'>For Model Objects containing free form user input,
* it is highly recommended that you use <tt>SafeText</tt>, not <tt>String</tt></span>.
* Free form user input is open to malicious use, such as
* <a href='http://www.owasp.org/index.php/Cross_Site_Scripting'>Cross Site Scripting</a>
* attacks.
* Using <tt>SafeText</tt> will protect you from such attacks, by always escaping
* special characters automatically in its <tt>toString()</tt> method.
*
* <P>The following WEB4J facilities will automatically escape special characters
* for you, when needed :
* <ul>
* <li>the <tt>SafeText</tt> class, used as a building block class for your
* application's Model Objects, for modeling all free form user input
* <li>the WEB4J tag used with forms
* <li>the WEB4J class used for creating quick reports
* <li>the WEB4J custom tags used for translation
* </ul>
*
* <P>All methods are static and keep no state. Every method throws
* <tt>NullPointerException</tt> when passed a <tt>null</tt> argument.
*/
public final class EscapeChars {

  /**
   * Escape characters for text appearing in HTML markup.
   *
   * <P>This method exists as a defence against Cross Site Scripting (XSS) hacks.
   * The idea is to neutralize control characters commonly used by scripts, such that
   * they will not be executed by the browser. This is done by replacing the control
   * characters with their escaped equivalents.
   *
   * <P>Replacements performed :
   * <ul>
   * <li>{@code <} becomes {@code &lt;}
   * <li>{@code >} becomes {@code &gt;}
   * <li>{@code &} becomes {@code &amp;}
   * <li>{@code "} becomes {@code &quot;}
   * <li>a line feed becomes {@code <br>} followed by the line feed itself, so
   *     that line breaks in the input remain visible in the rendered HTML
   * <li>the tab character, and each of
   *     {@code ! # $ % ' ( ) * + , - . / : ; = ? @ [ \ ] ^ _ `}, the opening
   *     and closing curly braces, {@code |} and {@code ~}, becomes a decimal
   *     numeric character reference, zero-padded to three digits
   *     (for example {@code !} becomes {@code &#033;})
   * </ul>
   * All other characters are copied unchanged.
   *
   * <P>Note that JSTL's {@code <c:out>} escapes a much smaller set of
   * characters (see {@link #forXML(String)}).
   *
   * @param aText the raw text; must not be <tt>null</tt>
   * @return the escaped text, never <tt>null</tt>
   */
  public static String forHTML(String aText){
    final int length = aText.length();
    final StringBuilder result = new StringBuilder(length + 16);
    // Index-based iteration: a CharacterIterator would stop early at a
    // literal U+FFFF, which is also its DONE sentinel.
    for (int idx = 0; idx < length; ++idx) {
      final char character = aText.charAt(idx);
      switch (character) {
        case '<':
          result.append("&lt;");
          break;
        case '>':
          result.append("&gt;");
          break;
        case '&':
          result.append("&amp;");
          break;
        case '"':
          result.append("&quot;");
          break;
        case '\n':
          result.append("<br>\n");
          break;
        default:
          if (HTML_NUMERIC_ENTITY_CHARS.indexOf(character) >= 0) {
            addCharEntity(character, result);
          }
          else {
            //the char is not a special one - add it to the result as is
            result.append(character);
          }
          break;
      }
    }
    return result.toString();
  }


  /**
   * Escape all ampersand characters in a URL.
   *
   * <P>Replaces all {@code &} characters with {@code &amp;}.
   *
   * <P>An ampersand character may appear in the query string of a URL.
   * The ampersand character is indeed valid in a URL.
   * <em>However, URLs usually appear as an <tt>HREF</tt> attribute, and
   * such attributes have the additional constraint that ampersands
   * must be escaped.</em>
   *
   * <P>The JSTL {@code <c:url>} tag does indeed perform proper URL encoding of
   * query parameters. But it does not, in general, produce text which
   * is valid as an <tt>HREF</tt> attribute, simply because it does
   * not escape the ampersand character. This is a nuisance when
   * multiple query parameters appear in the URL, since it requires a little
   * extra work.
   *
   * <P>Every ampersand is replaced, including one that already begins an
   * entity; do not pass text that has already been escaped.
   *
   * @param aURL the URL text; must not be <tt>null</tt>
   * @return the URL with each ampersand escaped
   */
  public static String forHrefAmpersand(String aURL){
    return aURL.replace("&", "&amp;");
  }

  /**
   * Synonym for <tt>URLEncoder.encode(String, "UTF-8")</tt>.
   *
   * <P>Used to ensure that HTTP query strings are in proper form, by escaping
   * special characters such as spaces.
   *
   * <P>It is important to note that if a query string appears in an <tt>HREF</tt>
   * attribute, then there are two issues - ensuring the query string is valid HTTP
   * (it is URL-encoded), and ensuring it is valid HTML (ensuring the
   * ampersand is escaped).
   *
   * @param aURLFragment the text to encode; must not be <tt>null</tt>
   * @return the URL-encoded text
   * @throws RuntimeException if the platform does not support UTF-8
   */
  public static String forURL(String aURLFragment){
    try {
      return URLEncoder.encode(aURLFragment, "UTF-8");
    }
    catch (UnsupportedEncodingException ex){
      throw new RuntimeException("UTF-8 not supported", ex);
    }
  }

  /**
   * Escape characters for text appearing as XML data, between tags.
   *
   * <P>Replacements performed :
   * <ul>
   * <li>{@code <} becomes {@code &lt;}
   * <li>{@code >} becomes {@code &gt;}
   * <li>{@code &} becomes {@code &amp;}
   * <li>{@code "} becomes {@code &quot;}
   * <li>{@code '} becomes {@code &#039;}
   * </ul>
   * All other characters are copied unchanged.
   *
   * <P>Note that JSTL's {@code <c:out>} escapes the exact same set of
   * characters as this method. <span class='highlight'>That is, {@code <c:out>}
   * is good for escaping to produce valid XML, but not for producing safe
   * HTML.</span>
   *
   * @param aText the raw text; must not be <tt>null</tt>
   * @return the escaped text, never <tt>null</tt>
   */
  public static String forXML(String aText){
    final int length = aText.length();
    final StringBuilder result = new StringBuilder(length + 16);
    for (int idx = 0; idx < length; ++idx) {
      final char character = aText.charAt(idx);
      switch (character) {
        case '<':
          result.append("&lt;");
          break;
        case '>':
          result.append("&gt;");
          break;
        case '"':
          result.append("&quot;");
          break;
        case '\'':
          result.append("&#039;");
          break;
        case '&':
          result.append("&amp;");
          break;
        default:
          //the char is not a special one - add it to the result as is
          result.append(character);
          break;
      }
    }
    return result.toString();
  }

  /**
   * Return <tt>aText</tt> with all {@code <} and {@code >} characters
   * replaced by {@code &lt;} and {@code &gt;} respectively.
   *
   * @param aText the raw text; must not be <tt>null</tt>
   * @return the text with angle brackets escaped
   */
  public static String toDisableTags(String aText){
    final int length = aText.length();
    final StringBuilder result = new StringBuilder(length + 16);
    for (int idx = 0; idx < length; ++idx) {
      final char character = aText.charAt(idx);
      if (character == '<') {
        result.append("&lt;");
      }
      else if (character == '>') {
        result.append("&gt;");
      }
      else {
        //the char is not a special one - add it to the result as is
        result.append(character);
      }
    }
    return result.toString();
  }


  /**
   * Replace characters having special meaning in regular expressions
   * with their escaped equivalents, preceded by a '\' character.
   *
   * <P>The escaped characters include :
   * <ul>
   * <li>.
   * <li>\
   * <li>?, * , and +
   * <li>{@code &}
   * <li>:
   * <li>the opening and closing curly braces
   * <li>[ and ]
   * <li>( and )
   * <li>^ and $
   * <li>| (otherwise a fragment such as <tt>a|b</tt> would still act as an
   *     alternation instead of matching literally)
   * </ul>
   *
   * @param aRegexFragment the literal text; must not be <tt>null</tt>
   * @return text usable inside a regular expression to match
   *         <tt>aRegexFragment</tt> literally
   */
  public static String forRegex(String aRegexFragment){
    final int length = aRegexFragment.length();
    final StringBuilder result = new StringBuilder(length + 16);
    for (int idx = 0; idx < length; ++idx) {
      final char character = aRegexFragment.charAt(idx);
      if (REGEX_SPECIAL_CHARS.indexOf(character) >= 0) {
        // A backslash before a non-alphabetic character always quotes it.
        result.append('\\');
      }
      result.append(character);
    }
    return result.toString();
  }

  /**
   * Escape <tt>'$'</tt> and <tt>'\'</tt> characters in replacement strings.
   *
   * <P>Synonym for <tt>Matcher.quoteReplacement(String)</tt>.
   *
   * <P>The following methods use replacement strings which treat
   * <tt>'$'</tt> and <tt>'\'</tt> as special characters:
   * <ul>
   * <li><tt>String.replaceAll(String, String)</tt>
   * <li><tt>String.replaceFirst(String, String)</tt>
   * <li><tt>Matcher.appendReplacement(StringBuffer, String)</tt>
   * </ul>
   *
   * <P>If replacement text can contain arbitrary characters, then you
   * will usually need to escape that text, to ensure special characters
   * are interpreted literally.
   *
   * @param aInput the literal replacement text; must not be <tt>null</tt>
   * @return the quoted replacement text
   */
  public static String forReplacementString(String aInput){
    return Matcher.quoteReplacement(aInput);
  }

  /**
   * Disable all {@code <SCRIPT>} tags in <tt>aText</tt>, by replacing each
   * opening tag with {@code &lt;SCRIPT&gt;} and each closing tag with
   * {@code &lt;/SCRIPT&gt;}.
   *
   * <P>Insensitive to case.
   *
   * <P>NOTE(review): only the exact forms {@code <SCRIPT>} and
   * {@code </SCRIPT>} are matched. A script tag carrying attributes or inner
   * whitespace is left untouched, so this method alone is not a sufficient
   * XSS defence; prefer {@link #forHTML(String)} for untrusted input.
   *
   * @param aText the raw text; must not be <tt>null</tt>
   * @return the text with plain script tags escaped
   */
  public static String forScriptTagsOnly(String aText){
    final String openingDisabled = SCRIPT.matcher(aText).replaceAll("&lt;SCRIPT&gt;");
    return SCRIPT_END.matcher(openingDisabled).replaceAll("&lt;/SCRIPT&gt;");
  }

  // PRIVATE //

  private EscapeChars(){
    //empty - prevent construction
  }

  private static final Pattern SCRIPT = Pattern.compile(
    "<SCRIPT>", Pattern.CASE_INSENSITIVE
  );
  private static final Pattern SCRIPT_END = Pattern.compile(
    "</SCRIPT>", Pattern.CASE_INSENSITIVE
  );

  /** Characters which {@link #forHTML(String)} emits as numeric character references. */
  private static final String HTML_NUMERIC_ENTITY_CHARS =
    "\t!#$%'()*+,-./:;=?@[\\]^_`{|}~"
  ;

  /** Characters which {@link #forRegex(String)} prefixes with a backslash. */
  private static final String REGEX_SPECIAL_CHARS = "\\.?*+&:{}[]()^$|";

  /**
   * Append the decimal numeric character reference for <tt>aIdx</tt> to
   * <tt>aBuilder</tt>, left-padding the number with zeros to at least three
   * digits (for example <tt>9</tt> is appended as {@code &#009;}).
   */
  private static void addCharEntity(int aIdx, StringBuilder aBuilder){
    aBuilder.append("&#");
    if (aIdx <= 9) {
      aBuilder.append("00");
    }
    else if (aIdx <= 99) {
      aBuilder.append('0');
    }
    aBuilder.append(aIdx).append(';');
  }
}