/* Copyright (c) 2008 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.gdata.util.common.net;
import com.google.gdata.util.common.base.CharEscapers;
import com.google.gdata.util.common.base.CharMatcher;
import com.google.gdata.util.common.base.Charsets;
import com.google.gdata.util.httputil.FastURLEncoder;
import static com.google.gdata.util.common.base.Preconditions.checkNotNull;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.nio.charset.Charset;
/**
* Implements <a href="http://en.wikipedia.org/wiki/Percent-encoding"
* >percent-encoding</a>, specifying how to encode non-US-ASCII and reserved
* characters in URIs.
*
* <p>Per Section 2.1 of <a href="http://tools.ietf.org/html/rfc3986">RFC
* 3986</a>, URIs should contain only characters that are part of US-ASCII, and
* some characters are further reserved to delimit components or subcomponents;
* therefore, characters that are outside the allowed set need to be encoded.
* This is done using the escape sequence "%<i>XX</i>" where <i>XX</i> is the
* hexadecimal value of the bytewise representation of the character.
*
* <p>This encoding format is used for the application/x-www-form-urlencoded
* content type, as defined by section 17.13.4 of the W3C's <a
* href="http://www.w3.org/TR/REC-html40/interact/forms.html#h-17.13.4.1">HTML
* 4.01 Specification</a>.
*
* <p>For example, the Unicode string "flambé" is represented as the byte
* sequence {@code [0x66, 0x6c, 0x61, 0x6d, 0x62, 0xe9]} in ISO-8859-1. In
* UTF-8, it is represented as {@code [0x66, 0x6c, 0x61, 0x6d, 0x62, 0xc3,
* 0xa9]}. The first five characters are unreserved and do not require encoding,
* but the last character is outside US-ASCII and must be encoded, so the URI
* representation is "flamb%E9" in ISO-8859-1 and "flamb%C3%A9" in UTF-8. The
* hexadecimal digits of an escape sequence are not case-sensitive.
*
* @see Uri
*
*/
public final class UriEncoder {

  /**
   * The character encoding used when a caller does not name one: UTF-8, per
   * Section 2.5 of <a href="http://tools.ietf.org/html/rfc3986">RFC 3986</a>.
   *
   * @see Charsets
   */
  public static final Charset DEFAULT_ENCODING = Charsets.UTF_8;

  /** Static utility holder; not meant to be instantiated. */
  private UriEncoder() {}

  /**
   * Percent-encodes a Unicode string into a US-ASCII string using the
   * {@link #DEFAULT_ENCODING} (UTF-8). Non-US-ASCII and reserved characters
   * come out as consecutive sequences of the form "%<i>XX</i>".
   *
   * <p>A ' ' is written as '+', i.e. the output follows the
   * application/x-www-form-urlencoded convention. Do not use this method for
   * URI parts that are not form-encoded, such as host and path.
   *
   * @param string a Unicode string
   * @return a percent-encoded US-ASCII string
   * @throws NullPointerException if {@code string} is null
   */
  public static String encode(String string) {
    // The work is done entirely by the shared URI escaper.
    return CharEscapers.uriEscaper().escape(string);
  }

  /**
   * Percent-encodes a Unicode string into a US-ASCII string, using the given
   * encoding to decide which bytes represent each non-US-ASCII or reserved
   * character; every such byte is written as "%<i>XX</i>".
   *
   * <p>When {@code encoding} equals {@link #DEFAULT_ENCODING} the call is
   * forwarded to {@link #encode(String)}; any other encoding is handled by
   * {@link FastURLEncoder}, which is given the charset's name.
   *
   * <p>A ' ' is written as '+', i.e. the output follows the
   * application/x-www-form-urlencoded convention. Do not use this method for
   * URI parts that are not form-encoded, such as host and path.
   *
   * @param string a Unicode string
   * @param encoding a character encoding
   * @return a percent-encoded US-ASCII string
   * @throws NullPointerException if any argument is null
   */
  public static String encode(String string, Charset encoding) {
    checkNotNull(string);
    checkNotNull(encoding);

    final boolean usesDefault = encoding.equals(DEFAULT_ENCODING);
    if (usesDefault) {
      return encode(string);
    }

    final String charsetName = encoding.name();
    try {
      return FastURLEncoder.encode(string, charsetName);
    } catch (UnsupportedEncodingException cannotHappen) {
      // The name was taken from a live Charset instance, so it is expected to
      // be a valid encoding name; reaching this point is treated as a
      // programming error.
      throw new AssertionError(cannotHappen);
    }
  }

  /**
   * Percent-decodes a US-ASCII string into a Unicode string, interpreting any
   * consecutive "%<i>XX</i>" sequences with the {@link #DEFAULT_ENCODING}
   * (UTF-8). Equivalent to {@code decode(string, DEFAULT_ENCODING)}.
   *
   * <p>A '+' is read as ' ', i.e. the input is expected to follow the
   * application/x-www-form-urlencoded convention. Do not use this method for
   * URI parts that are not form-encoded, such as host and path.
   *
   * @param string a percent-encoded US-ASCII string
   * @return a Unicode string
   * @throws NullPointerException if {@code string} is null
   */
  public static String decode(String string) {
    return decode(string, DEFAULT_ENCODING);
  }

  /**
   * Percent-decodes a US-ASCII string into a Unicode string, interpreting any
   * consecutive "%<i>XX</i>" sequences with the given encoding. Decoding is
   * strict: an invalid "%XX" sequence (for example, "%HH") results in an
   * exception rather than being passed through.
   *
   * <p>A '+' is read as ' ', i.e. the input is expected to follow the
   * application/x-www-form-urlencoded convention. Do not use this method for
   * URI parts that are not form-encoded, such as host and path.
   *
   * @param string a percent-encoded US-ASCII string
   * @param encoding a character encoding
   * @return a Unicode string
   * @throws NullPointerException if any argument is null
   * @throws RuntimeException if the decoding failed because some "%XX"
   *     sequence in {@code string} is invalid (for example, "%HH"), or if
   *     the underlying decoder rejects the encoding name
   */
  public static String decode(String string, Charset encoding) {
    checkNotNull(string);
    checkNotNull(encoding);

    final String charsetName = encoding.name();
    try {
      return URLDecoder.decode(string, charsetName);
    } catch (UnsupportedEncodingException unsupported) {
      // Checked exception from the JDK decoder; rethrown unchecked with the
      // original kept as the cause.
      throw new RuntimeException(unsupported);
    }
  }
}