/* * ModeShape (http://www.modeshape.org) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.modeshape.common.text; import java.io.UnsupportedEncodingException; import java.text.CharacterIterator; import java.text.StringCharacterIterator; import java.util.BitSet; import org.modeshape.common.annotation.Immutable; /** * An encoder useful for converting text to be used within a URL, as defined by Section 2.3 of <a * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>. Note that this class does not encode a complete URL ( * {@link java.net.URLEncoder} and {@link java.net.URLDecoder} should be used for such purposes). */ @Immutable public class UrlEncoder implements TextEncoder, TextDecoder { public static final char ESCAPE_CHARACTER = '%'; /** * Data characters that are allowed in a URI but do not have a reserved purpose are called unreserved. These include upper and * lower case letters, decimal digits, and a limited set of punctuation marks and symbols. * * <pre> * unreserved = alphanum | mark * mark = "-" | "_" | "." | "!" | "˜" | "*" | "'" | "(" | ")" * </pre> * * Unreserved characters can be escaped without changing the semantics of the URI, but this should not be done unless the URI * is being used in a context that does not allow the unescaped character to appear. */ private static final BitSet RFC2396_UNRESERVED_CHARACTERS = new BitSet(256); private static final BitSet RFC2396_UNRESERVED_WITH_SLASH_CHARACTERS; /** * Lookup table which is used to determine, based on a hex char, how many bytes were needed in UTF-8 encoding to store that char */ private static final byte[] BYTES_PER_CHAR = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4}; static { RFC2396_UNRESERVED_CHARACTERS.set('a', 'z' + 1); RFC2396_UNRESERVED_CHARACTERS.set('A', 'Z' + 1); RFC2396_UNRESERVED_CHARACTERS.set('0', '9' + 1); RFC2396_UNRESERVED_CHARACTERS.set('-'); RFC2396_UNRESERVED_CHARACTERS.set('_'); RFC2396_UNRESERVED_CHARACTERS.set('.'); RFC2396_UNRESERVED_CHARACTERS.set('!'); RFC2396_UNRESERVED_CHARACTERS.set('~'); RFC2396_UNRESERVED_CHARACTERS.set('*'); RFC2396_UNRESERVED_CHARACTERS.set('\''); RFC2396_UNRESERVED_CHARACTERS.set('('); RFC2396_UNRESERVED_CHARACTERS.set(')'); RFC2396_UNRESERVED_WITH_SLASH_CHARACTERS = (BitSet)RFC2396_UNRESERVED_CHARACTERS.clone(); RFC2396_UNRESERVED_WITH_SLASH_CHARACTERS.set('/'); } private boolean slashEncoded = true; @Override public String encode( String text ) { if (text == null) return null; if (text.length() == 0) return text; return encode(text, isSlashEncoded() ? RFC2396_UNRESERVED_CHARACTERS : RFC2396_UNRESERVED_WITH_SLASH_CHARACTERS); } protected String encode( String text, BitSet safeChars ) { final StringBuilder result = new StringBuilder(); final CharacterIterator iter = new StringCharacterIterator(text); for (char c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) { if (safeChars.get(c)) { // Safe character, so just pass through ... result.append(c); } else { try { // The character is not a safe character, and must be escaped in UTF-8 form (see http://tools.ietf.org/html/rfc3629) byte[] utf8Bytes = Character.toString(c).getBytes("UTF-8"); for (byte utf8Byte : utf8Bytes) { result.append(ESCAPE_CHARACTER); int high = (utf8Byte & 0xf0) >> 4; int low = utf8Byte & 0x0f; result.append(Integer.toHexString(high)); result.append(Integer.toHexString(low)); } } catch (UnsupportedEncodingException e) { //should never happen throw new IllegalStateException(e); } } } return result.toString(); } @Override public String decode( String encodedText ) { if (encodedText == null) return null; if (encodedText.length() == 0) return encodedText; final StringBuilder result = new StringBuilder(); final CharacterIterator iter = new StringCharacterIterator(encodedText); byte[] escapedCharBytes = new byte[4]; int byteIdx = 0; int bytesPerChar = -1; for (char c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) { if (c == ESCAPE_CHARACTER) { boolean foundEscapedCharacter = false; // Found the first character in a potential escape sequence, so grab the next two characters ... char hexChar1 = iter.next(); char hexChar2 = hexChar1 != CharacterIterator.DONE ? iter.next() : CharacterIterator.DONE; if (hexChar2 != CharacterIterator.DONE) { // We found two more characters, but ensure they form a valid hexadecimal number ... int hexNum1 = Character.digit(hexChar1, 16); int hexNum2 = Character.digit(hexChar2, 16); if (hexNum1 > -1 && hexNum2 > -1) { foundEscapedCharacter = true; //since we're dealing with UTF-8, we need to figure out how many bytes were used to encode the original //character by reading the number of leading 1 bits from the 1st high order byte if (bytesPerChar == -1) { bytesPerChar = BYTES_PER_CHAR[hexNum1]; } //record the next byte into the array escapedCharBytes[byteIdx++] = (byte) (hexNum1 * 16 + hexNum2); if (byteIdx == bytesPerChar) { //we've filled the buffer of bytes try { result.append(new String(escapedCharBytes, 0, bytesPerChar, "UTF-8")); } catch (UnsupportedEncodingException e) { //should never happen throw new IllegalStateException(e); } byteIdx = 0; bytesPerChar = -1; } } } if (!foundEscapedCharacter) { result.append(c); if (hexChar1 != CharacterIterator.DONE) result.append(hexChar1); if (hexChar2 != CharacterIterator.DONE) result.append(hexChar2); } } else { result.append(c); } } return result.toString(); } /** * @return slashEncoded */ public boolean isSlashEncoded() { return this.slashEncoded; } /** * @param slashEncoded Sets slashEncoded to the specified value. * @return this object, for method chaining */ public UrlEncoder setSlashEncoded( boolean slashEncoded ) { this.slashEncoded = slashEncoded; return this; } }