/* * Part of the CCNx Java Library. * * Copyright (C) 2008-2013 Palo Alto Research Center, Inc. * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License version 2.1 * as published by the Free Software Foundation. * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. You should have received * a copy of the GNU Lesser General Public License along with this library; * if not, write to the Free Software Foundation, Inc., 51 Franklin Street, * Fifth Floor, Boston, MA 02110-1301 USA. */ package org.ccnx.ccn.protocol; import static org.ccnx.ccn.profiles.CommandMarker.COMMAND_MARKER_NONCE; import java.math.BigInteger; import java.net.URISyntaxException; import java.nio.ByteBuffer; import java.util.Random; import org.bouncycastle.util.Arrays; import org.ccnx.ccn.impl.support.DataUtils; import org.ccnx.ccn.protocol.ContentName.ComponentProvider; /** * Wrapper class to store immutable name components. */ public class Component implements ComponentProvider { byte[] component; protected Component(byte[] comp) { this.component = comp; } /** * Create a component from a native string. * @param text native text string. */ public Component(String text) { this.component = parseNative(text); } @Override public byte[] getComponent() { return this.component; } /** * Parse native string component: just UTF-8 encode * For full names in native strings only "/" is special * but for an individual component we will even allow that. * This method intentionally throws no declared exceptions * so you can be confident in encoding any native Java String * TODO make this use Java string escaping rules? * @param name Component as native Java string */ public static byte[] parseNative(String name) { // Handle exception s around missing UTF-8 return DataUtils.getBytesFromUTF8String(name); } /** * Indicates an attempt to parse a .. component. */ public static class DotDot extends Exception { // Need to strip off a component private static final long serialVersionUID = 4667513234636853164L; } private static final boolean uriReserved(char ch) { if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || ('0' <= ch && ch <= '9') || ch == '-' || ch == '.' || ch == '_' || ch == '~') return false; return true; } /** * Parse the URI Generic Syntax of RFC 3986. * Including handling percent encoding of sequences that are not legal character * encodings in any character set. This method is the inverse of * printComponent() and for any input sequence of bytes it must be the case * that parseComponent(printComponent(input)) == input. Note that the inverse * is NOT true printComponent(parseComponent(input)) != input in general. * * @see fromURI(String) * * Note in particular that this method interprets sequences of more than * two dots ('.') as representing an empty component or dot component value * as encoded by componentPrint. That is, the component value will be * the value obtained by removing three dots. * @param name a single component of a name, URI encoded * @return a name component */ public static byte[] parseURI(String name) throws DotDot, URISyntaxException { byte[] decodedName = null; boolean alldots = true; // does this component contain only dots after unescaping? boolean quitEarly = false; boolean hexEncoding = false; int b1, b2; ByteBuffer result = ByteBuffer.allocate(name.length()); for (int i = 0; i < name.length() && !quitEarly; i++) { char ch = name.charAt(i); switch (ch) { case '%': // This is a byte string %xy where xy are hex digits // Since the input string must be compatible with the output // of componentPrint(), we may convert the character values directly. if (name.length()-1 < i+2) { throw new URISyntaxException(name, "malformed %xy byte representation: too short", i); } b1 = Character.digit(name.charAt(++i), 16); // consume x b2 = Character.digit(name.charAt(++i), 16); // consume y if (b1 < 0 || b2 < 0) throw new URISyntaxException(name, "malformed %xy byte representation: not legal hex number: " + name.substring(i-2, i+1), i-2); result.put((byte)((b1 * 16) + b2)); break; // Note in C lib case 0 is handled like the two general delimiters below that terminate processing // but that case should never arise in Java which uses real unicode characters. case '/': case '?': case '#': quitEarly = true; // early exit from containing loop break; case '=': if (name.length()-1 < i+2 || ((name.length() - i) & 1) == 0) { throw new URISyntaxException(name, "malformed =xy byte representation: too short", i); } hexEncoding = true; break; case ':': case '[': case ']': case '@': case '!': case '$': case '&': case '\'': case '(': case ')': case '*': case '+': case ',': case ';': // Permit unescaped reserved characters result.put((byte)ch); break; default: if (uriReserved(ch)) throw new URISyntaxException(name, "Illegal characters in URI", i); if (hexEncoding) { b1 = Character.digit(ch, 16); // consume x b2 = Character.digit(name.charAt(++i), 16); // consume y if (b1 < 0 || b2 < 0) throw new URISyntaxException(name, "malformed =xy byte representation: not legal hex number: " + name.substring(i-1, i), i-1); result.put((byte)((b1 * 16) + b2)); } else { // This character remains the same result.put((byte)ch); } break; } if (!quitEarly && result.position() > 0 && result.get(result.position()-1) != '.') { alldots = false; } } result.flip(); if (alldots) { if (result.limit() <= 1) { return null; } else if (result.limit() == 2) { throw new DotDot(); } else { // Remove the three '.' extra result.limit(result.limit()-3); } } decodedName = new byte[result.limit()]; System.arraycopy(result.array(), 0, decodedName, 0, result.limit()); return decodedName; } public static String hexPrint(byte [] bs) { if (null == bs) return new String(); BigInteger bi = new BigInteger(1,bs); return bi.toString(16); } public static String printNative(byte[] bs) { // Native string print is the one place where we can just use // Java native platform decoding. Note that this is not // necessarily invertible, since there may be byte sequences // that do not correspond to any legal native character encoding // that may be converted to e.g. Unicode "Replacement Character" U+FFFD. return new String(bs); } /** * Internal flag signalling the use of the old-style percent-encoding, * or the new mixed-style using percent-encoding and strings of hexadecimal digits. * */ static enum URIEscape { /** Use RFC-3986 S2.1 percent-encoding for unprintable characters in the component name. */ PERCENT, /** Use mixed-form of percent-encoding and '='{digits} encoding for unprintable characters in the component name. */ MIXED } static final char HEX_DIGITS[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' }; public static String printURI(byte [] bs) { return printURI(bs, 0, bs.length, URIEscape.MIXED); } public static String printURI(byte [] bs, int offset, int length) { return printURI(bs, offset, length, URIEscape.MIXED); } /** * Print bytes in the URI Generic Syntax of RFC 3986 * including byte sequences that are not legal character * encodings in any character set and byte sequences that have special * meaning for URI resolution per RFC 3986. This is designed to match * the C library URI encoding. * <p> * This method must be invertible by parseComponent() so * for any input sequence of bytes it must be the case * that parseComponent(printComponent(input)) == input. * </p> * <p> * All bytes that are unreserved characters per RFC 3986 are left unescaped. * Other bytes are percent encoded. * </p> * <p> * Empty path components and path components "." and ".." have special * meaning for relative URI resolution per RFC 3986. To guarantee * these component variations are preserved and recovered exactly when * the URI is parsed by parseComponent() we use a convention that * components that are empty or consist entirely of '.' characters will * have "..." appended. This is intended to be consistent with the CCN C * library handling of URI representation of names. * </p> * @param bs input byte array. * @return */ private static String printURI(byte[] bs, int offset, int length, URIEscape escape) { int i; boolean hexEncoding = false; if (null == bs || bs.length == 0) { // Empty component represented by three '.' return "..."; } // To get enough control over the encoding, we use // our own loop and NOT simply new String(bs) (or java.net.URLEncoder) because // the String constructor will decode illegal UTF-8 sub-sequences // with Unicode "Replacement Character" U+FFFD. We could use a CharsetDecoder // to detect the illegal UTF-8 sub-sequences and handle them separately, // except that this is almost certainly less efficient and some versions of Java // have bugs that prevent flagging illegal overlong UTF-8 encodings (CVE-2008-2938). // Also, it is much easier to verify what this is doing and compare to the C library implementation. // // Initial allocation is based on the documented behavior of StringBuilder's buffer // expansion algorithm being 2+2*length if expansion is required. StringBuilder result = new StringBuilder((1 + 3 * bs.length) / 2); for (i = 0; i < bs.length && bs[i] == '.'; i++) { continue; } if (i == bs.length) { // all dots result.append("..."); } // components starting in either %00 (segments) or %FD (\375, versions) should // be displayed as hex encoded regardless of whether the next byte is // a printable character. Should match the corresponding code in the C library. if (escape == URIEscape.MIXED && (bs[0] == (byte)'\000' || bs[0] == (byte)'\375')) { hexEncoding = true; result.append("="); } // If the option of limiting escaping to percent disappears this // branch of the if can also disappear. if (escape == URIEscape.PERCENT) { for (i = 0; i < bs.length; i++) { char ch = (char) bs[i]; if (!uriReserved(ch)) { result.append(ch); } else { result.append('%'); result.append(HEX_DIGITS[(ch >> 4) & 0xF]); result.append(HEX_DIGITS[ch & 0xF]); } } } else { for (i = 0; i < bs.length; i++) { char ch = (char) bs[i]; if (hexEncoding) { result.append(HEX_DIGITS[(ch >> 4) & 0xF]); result.append(HEX_DIGITS[ch & 0xF]); } else if (!uriReserved(ch)) result.append(ch); else { if (bs.length == (i + 1) || !uriReserved((char)bs[i + 1])) result.append('%'); else { result.append('='); hexEncoding = true; } result.append(HEX_DIGITS[(ch >> 4) & 0xF]); result.append(HEX_DIGITS[ch & 0xF]); } } } return result.toString(); } private static Random random = new Random(); /** * Generates a random nonce component (with a nonce CommandMarker header). * Can be used in ContentName constructors where a nonce is required. * Note: the nonce component generated will be different every time this * is used. */ public static final ComponentProvider NONCE = new ComponentProvider() { @Override public byte[] getComponent() { byte [] nonce = new byte[8]; random.nextBytes(nonce); return COMMAND_MARKER_NONCE.addBinaryData(nonce); } }; private static byte[] emptyComponent = new byte[]{ }; /** * This object generates an empty component (length = 0). */ public static final ComponentProvider EMPTY = new ComponentProvider() { @Override public byte[] getComponent() { return emptyComponent; } }; @Override public boolean equals(Object obj) { if (obj instanceof byte[]) return Arrays.areEqual( (byte[])obj, this.getComponent() ); if (obj instanceof ComponentProvider) return Arrays.areEqual( ((ComponentProvider)obj).getComponent(), this.getComponent() ); if (obj instanceof String) return Arrays.areEqual( ((String)obj).getBytes(), this.getComponent() ); return super.equals(obj); } @Override public int hashCode() { return Arrays.hashCode(this.getComponent()); } @Override public String toString() { return printURI(this.component); } }