/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.wicket.util.encoding;

import java.io.CharArrayWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.BitSet;

import org.apache.wicket.util.lang.Args;

/**
 * Adapted from java.net.URLEncoder, but defines instances for query string encoding versus URL path
 * component encoding.
 * <p/>
 * The difference is important because a space is encoded as a + in a query string, but this is a
 * valid value in a path component (and is therefore not decoded back to a space).
 *
 * @author Doug Donohoe
 * @see java.net.URLEncoder
 * @see <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC-2396</a>
 */
public class UrlEncoder
{
	/**
	 * encoder types
	 */
	public enum Type
	{
		/** encodes the name or value components of a query string */
		QUERY,
		/** encodes a single segment of a URL path */
		PATH,
		/** encodes a header value */
		HEADER
	}

	/**
	 * List of what not to encode, i.e. characters (e.g. A-Z) and other allowed signs (e.g. !)
	 * that are allowed but don't have a special meaning.
	 */
	protected BitSet dontNeedEncoding;

	/**
	 * Distance between a lower case ASCII letter and its upper case counterpart. This class no
	 * longer reads it (hex digits are upper-cased with {@link Character#toUpperCase(char)}); it
	 * is kept because it is visible to subclasses.
	 */
	protected static final int caseDiff = ('a' - 'A');

	/**
	 * Encoder used to encode name or value components of a query string.<br/>
	 * <br/>
	 *
	 * For example: http://org.acme/notthis/northis/oreventhis?buthis=isokay&asis=thispart
	 */
	public static final UrlEncoder QUERY_INSTANCE = new UrlEncoder(Type.QUERY);

	/**
	 * Encoder used to encode segments of a path.<br/>
	 * <br/>
	 *
	 * For example: http://org.acme/foo/thispart/orthispart?butnot=thispart
	 */
	public static final UrlEncoder PATH_INSTANCE = new UrlEncoder(Type.PATH);

	/**
	 * Encoder used to encode a header.
	 */
	public static final UrlEncoder HEADER_INSTANCE = new UrlEncoder(Type.HEADER);

	/**
	 * Allow subclass to call constructor. Fills {@link #dontNeedEncoding} with the characters
	 * that are emitted verbatim for the given encoder type.
	 *
	 * @param type
	 *            encoder type
	 */
	protected UrlEncoder(final Type type)
	{
		/*
		 * This note from java.net.URLEncoder ==================================
		 *
		 * The list of characters that are not encoded has been determined as follows:
		 *
		 * RFC 2396 states: ----- Data characters that are allowed in a URI but do not have a
		 * reserved purpose are called unreserved. These include upper and lower case letters,
		 * decimal digits, and a limited set of punctuation marks and symbols.
		 *
		 * unreserved = alphanum | mark
		 *
		 * mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
		 *
		 * Unreserved characters can be escaped without changing the semantics of the URI, but this
		 * should not be done unless the URI is being used in a context that does not allow the
		 * unescaped character to appear. -----
		 *
		 * It appears that both Netscape and Internet Explorer escape all special characters from
		 * this list with the exception of "-", "_", ".", "*". While it is not clear why they are
		 * escaping the other characters, perhaps it is safest to assume that there might be
		 * contexts in which the others are unsafe if not escaped. Therefore, we will use the same
		 * list. It is also noteworthy that this is consistent with O'Reilly's
		 * "HTML: The Definitive Guide" (page 164).
		 *
		 * As a last note, Internet Explorer does not encode the "@" character which is clearly not
		 * unreserved according to the RFC. We are being consistent with the RFC in this matter, as
		 * is Netscape.
		 *
		 * This bit added by Doug Donohoe ================================== RFC 3986 (2005) updates
		 * this (http://tools.ietf.org/html/rfc3986):
		 *
		 * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
		 *
		 * pct-encoded = "%" HEXDIG HEXDIG
		 *
		 * reserved = gen-delims / sub-delims
		 *
		 * gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
		 *
		 * sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
		 *
		 * // -- PATH COMPONENT -- //
		 *
		 * path = (see RFC for all variations)
		 * path-abempty = *( "/" segment )
		 * segment = *pchar
		 * pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
		 *
		 * // -- QUERY COMPONENT -- //
		 *
		 * query = *( pchar / "/" / "?" )
		 */

		// unreserved
		dontNeedEncoding = new BitSet(256);
		dontNeedEncoding.set('a', 'z' + 1);
		dontNeedEncoding.set('A', 'Z' + 1);
		dontNeedEncoding.set('0', '9' + 1);
		dontNeedEncoding.set('-');
		dontNeedEncoding.set('.');
		dontNeedEncoding.set('_');
		// tilde encoded by java.net.URLEncoder version, but RFC is clear on this
		dontNeedEncoding.set('~');

		// sub-delims
		dontNeedEncoding.set('!');
		dontNeedEncoding.set('$');

		// encoding type-specific
		switch (type)
		{
			case QUERY :
				// this code consistent with java.net.URLEncoder version
				// encoding a space to a + is done in the encode() method
				dontNeedEncoding.set(' ');

				// sub-delims continued
				dontNeedEncoding.set('*');
				dontNeedEncoding.set('/'); // to allow direct passing of URL in query
				dontNeedEncoding.set(',');
				// "'" doesn't need encoding, but it will make it easier to use in JavaScript
				// "(" and ")" don't need encoding, but we'll be conservative

				dontNeedEncoding.set(':'); // allowed and used in wicket interface
				dontNeedEncoding.set('@');

				/*
				 * the below encoding of a ? is disabled because it interferes in portlet
				 * environments. as far as i can tell it will not interfere with the ability to
				 * pass around urls in the query string. however, should it cause problems we can
				 * re-enable it as portlet environments are not high priority. we can also add a
				 * switch somewhere to enable/disable this on application level. (WICKET-4019)
				 */
				// dontNeedEncoding.set('?'); // to allow direct passing of URL in query
				break;

			case PATH :
				// this added to deal with encoding a PATH segment

				// sub-delims continued
				dontNeedEncoding.set('*');
				dontNeedEncoding.set('&');
				dontNeedEncoding.set('+');
				// "'" doesn't need encoding, but it will make it easier to use in JavaScript
				// "(" and ")" don't need encoding, but we'll be conservative
				dontNeedEncoding.set(',');
				dontNeedEncoding.set(';'); // semicolon is used in ;jsessionid=
				dontNeedEncoding.set('=');

				dontNeedEncoding.set(':'); // allowed and used in wicket interface
				dontNeedEncoding.set('@');
				break;

			case HEADER :
				// this added to deal with encoding of header

				// ' ' is encoded

				// sub-delims continued
				dontNeedEncoding.set('#');
				dontNeedEncoding.set('&');
				dontNeedEncoding.set('+');

				dontNeedEncoding.set('^');
				dontNeedEncoding.set('`');
				dontNeedEncoding.set('|');
				break;
		}
	}

	/**
	 * Encodes the given string using the name of the given charset.
	 *
	 * @param s
	 *            string to encode
	 * @param charset
	 *            charset to use for encoding
	 * @return encoded string
	 * @see java.net.URLEncoder#encode(String, String)
	 */
	public String encode(final String s, final Charset charset)
	{
		return encode(s, charset.name());
	}

	/**
	 * Encodes the given string. Characters listed in {@link #dontNeedEncoding} are copied as they
	 * are (a space, where allowed, becomes {@code +}); every other run of characters is converted
	 * to bytes in the requested charset and written as upper case {@code %XX} escapes. A NUL
	 * character in the input is replaced by the literal text {@code NULL} before encoding.
	 *
	 * @param unsafeInput
	 *            string to encode
	 * @param charsetName
	 *            encoding to use
	 * @return encoded string
	 * @throws NullPointerException
	 *             if {@code unsafeInput} is {@code null}
	 * @throws RuntimeException
	 *             wrapping an {@link UnsupportedEncodingException} if {@code charsetName} is not
	 *             a legal or supported charset name; the original charset exception is attached
	 *             as the cause of the {@link UnsupportedEncodingException}
	 * @see java.net.URLEncoder#encode(String, String)
	 */
	public String encode(final String unsafeInput, final String charsetName)
	{
		final String s = unsafeInput.replace("\0", "NULL");
		Args.notNull(charsetName, "charsetName");

		final Charset charset;
		try
		{
			charset = Charset.forName(charsetName);
		}
		catch (IllegalCharsetNameException | UnsupportedCharsetException e)
		{
			// keep the exception types callers already see, but do not lose the root cause
			final UnsupportedEncodingException unsupported = new UnsupportedEncodingException(
				charsetName);
			unsupported.initCause(e);
			throw new RuntimeException(unsupported);
		}

		final int length = s.length();
		final StringBuilder out = new StringBuilder(length);
		final CharArrayWriter pending = new CharArrayWriter();

		int i = 0;
		while (i < length)
		{
			int c = s.charAt(i);

			if (dontNeedEncoding.get(c))
			{
				// only encoders that whitelist the space (QUERY) reach this with c == ' '
				out.append(c == ' ' ? '+' : (char)c);
				i++;
				continue;
			}

			// Collect the whole run of characters that need encoding, so that multi-char
			// sequences are converted to the external encoding together.
			do
			{
				pending.write(c);

				/*
				 * If this character represents the start of a Unicode surrogate pair, then pass
				 * in two characters. It's not clear what should be done if a char reserved in
				 * the surrogate pairs range occurs outside of a legal surrogate pair. For now,
				 * just treat it as if it were any other character.
				 */
				if (Character.isHighSurrogate((char)c) && (i + 1) < length)
				{
					final char next = s.charAt(i + 1);
					if (Character.isLowSurrogate(next))
					{
						pending.write(next);
						i++;
					}
				}
				i++;
			}
			while ((i < length) && !dontNeedEncoding.get(c = s.charAt(i)));

			for (final byte b : pending.toString().getBytes(charset))
			{
				appendHexEscaped(out, b);
			}
			pending.reset();
		}

		return out.toString();
	}

	/**
	 * Appends one byte as a percent escape with two upper case hex digits, e.g. {@code %2F}.
	 *
	 * @param out
	 *            buffer to append to
	 * @param b
	 *            byte to escape
	 */
	private static void appendHexEscaped(final StringBuilder out, final byte b)
	{
		out.append('%');
		out.append(Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, 16)));
		out.append(Character.toUpperCase(Character.forDigit(b & 0xF, 16)));
	}
}