/* * #! * Ontopia Engine * #- * Copyright (C) 2001 - 2013 The Ontopia Project * #- * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * !# */ package net.ontopia.infoset.impl.basic; import java.io.Externalizable; import java.io.File; import java.io.IOException; import java.io.ObjectInput; import java.io.ObjectOutput; import java.net.MalformedURLException; import java.net.URL; import net.ontopia.infoset.core.LocatorIF; import net.ontopia.utils.OntopiaRuntimeException; import net.ontopia.utils.StringUtils; /** * PUBLIC: A Uniform Resource Identifier locator. Only URI locators * should be used with this locator class. The notation is 'URI'.<p> * * The address is always normalized by the constructor. The address * given to the constructor <b>must</b> be absolute.<p> */ public class URILocator extends AbstractLocator implements Externalizable { protected String address; protected short schemeEnd; // the ':' char in the scheme part protected short authorityEnd; // last char in authority part protected short lastSlash; // last slash in directory path protected short fragmentStart; // index of fragment '#' static { try { net.ontopia.net.data.Handler.install(); } catch (SecurityException e) { // Fail silently if there are security issues. } catch (NoClassDefFoundError e) { // This happens on Google AppEngine, but is not really a problem // since the data-URL handler is rarely used. See // https://github.com/ontopia/ontopia/issues/118 } } /** * INTERNAL: No-argument constructor used by serialization. Do not * use this constructor in application code. */ public URILocator() { } /** * PUBLIC: Creates a URILocator representing the URI given. Note * that the URI string should be in external form, and that it * must be absolute. */ public URILocator(String address) throws MalformedURLException { this.address = normalize(address); } /** * PUBLIC: Creates a URILocator representing the URL given. */ public URILocator(URL url) throws MalformedURLException { this.address = normalize(url.toExternalForm()); } /** * PUBLIC: Creates a URILocator containing a file URL referring * to the file represented by the File object.<p> * * @since 1.3.4 */ public URILocator(File file) { try { String path = file.getAbsolutePath(); if (File.separatorChar != '/') path = path.replace(File.separatorChar, '/'); if (!path.startsWith("/")) path = "/" + path; if (!path.endsWith("/") && file.isDirectory()) path = path + "/"; path = "file:" + path; this.address = normalize(escapeFilePath(path)); } catch (MalformedURLException e) { throw new OntopiaRuntimeException("INTERNAL ERROR: File " + file + " produced malformed URL", e); } } /** * INTERNAL: Special constructor used when resolving a URI relative * to a base URI. Since the base URI is already normalized we can * avoid repeating the normalization, and thus save time. */ protected URILocator(String normalized, short schemeEnd, short authorityEnd, short lastSlash, short fragmentStart) { this.address = normalized; this.schemeEnd = schemeEnd; this.authorityEnd = authorityEnd; this.lastSlash = lastSlash; this.fragmentStart = fragmentStart; } protected String normalize(String address) throws MalformedURLException { authorityEnd = -1; lastSlash = -1; fragmentStart = -1; char[] uri = new char[address.length() + 100]; // working buffer address.getChars(0, address.length(), uri, 0); // copy into buffer int length = decodeURI(uri, address.length()); schemeEnd = (short) getScheme(uri, length); if (schemeEnd == -1) throw new MalformedURLException("No valid scheme in URI: " + address); if (StringUtils.regionEquals("file", uri, 0, 4) || StringUtils.regionEquals("jar:file", uri, 0, 8) || StringUtils.regionEquals("classpath", uri, 0, 9)) length = parseFileUrl(uri, schemeEnd, length); else if (StringUtils.regionEquals("//", uri, schemeEnd+1, 2)) length = parseHierarchicalUrl(uri, schemeEnd, length); return new String(uri, 0, length); } // -------------------------------------------------------------------------- // LocatorIF implementation // -------------------------------------------------------------------------- public String getNotation() { return "URI"; } public String getAddress() { return address; } public LocatorIF resolveAbsolute(String rel) { int length = rel.length(); if (length == 0) { if (fragmentStart == -1) return this; else return new URILocator(address.substring(0, fragmentStart), schemeEnd, authorityEnd, lastSlash, (short) -1); } switch(rel.charAt(0)) { case '#': if (fragmentStart == -1) return new URIFragmentLocator(address.intern(), rel.substring(1), schemeEnd, authorityEnd, lastSlash); else return new URIFragmentLocator(address.substring(0, fragmentStart).intern(), rel.substring(1), schemeEnd, authorityEnd, lastSlash); case '/': if (length != 1 && rel.charAt(1) == '/') { // begins with "//" if (authorityEnd == -1) throw new OntopiaRuntimeException(new MalformedURLException("Base URI is not hierarchical")); return new URILocator(address.substring(0, schemeEnd+1) + rel, schemeEnd, authorityEnd, lastSlash, fragmentStart); } else // FIXME: should normalize absolute path return new URILocator(address.substring(0, authorityEnd) + rel, schemeEnd, authorityEnd, lastSlash, fragmentStart); } // no default needed; the rest of the method _is_ the default try { char[] relative = rel.toCharArray(); // does the URI have a scheme? if (getScheme(relative, relative.length) != -1) return new URILocator(rel); // scan for slashes in URI int ix; for (ix = 0; ix < length && relative[ix] != '/'; ix++) ; // there were slashes, use constructor for unnormalized URIs, // so that the normalizer resolves the directory for us // (also do this if rel is "." or "..") if (ix < length || rel.equals(".") || rel.equals("..")) { if (lastSlash == -1) // no directory part // the "/" here is important, as it was normalized away and needs // to be added back return new URILocator(address.substring(0, authorityEnd + 1) + "/" + rel); else return new URILocator(address.substring(0, lastSlash + 1) + rel); } // there were no slashes, so this is a pure file name if (lastSlash == -1) // base has no directory part return new URILocator(address + rel, schemeEnd, authorityEnd, lastSlash, fragmentStart); else return new URILocator(address.substring(0, lastSlash + 1) + rel, schemeEnd, authorityEnd, lastSlash, fragmentStart); } catch (MalformedURLException e) { throw new OntopiaRuntimeException(e); } } public String getExternalForm() { return toExternalForm(address); } static String toExternalForm(String address) { // need to escape characters that are not unreserved or reserved char[] tmp = new char[address.length() * 6]; // worst case scenario int pos = 0; // we don't escape % because if it's present in the URI it's because // we didn't unescape it on the way in. for (int ix = 0; ix < address.length(); ix++) { char ch = address.charAt(ix); if ((ch >= 'a' && ch <= 'z') || // a-z (ch >= '?' && ch <= 'Z') || // ? @ A-Z (ch >= '%' && ch <= ';') || // % & ' ( ) * + , - . / 0-9 : ; ch == '#' | ch == '!' || ch == '$' || ch == '=' || ch == '_' || ch == '~' || (ch == '|' && ix == 7)) // file:/X|/; special case... tmp[pos++] = ch; else { // have to escape tmp[pos++] = '%'; if (ch <= 0x7F) { // 0xxxxxxx addByte(tmp, pos, ch); pos += 2; } else if (ch <= 0x07FF) { // 110xxxxx 10xxxxxx addByte(tmp, pos, (ch >> 6) | 0xC0); pos += 2; tmp[pos++] = '%'; addByte(tmp, pos, (ch & 0x3F) | 0x80); pos += 2; } else { // 1110xxxx 10xxxxxx 10xxxxxx addByte(tmp, pos, (ch >> 12) | 0xE0); pos += 2; tmp[pos++] = '%'; addByte(tmp, pos, ((ch >> 6) & 0x3F) | 0x80); pos += 2; tmp[pos++] = '%'; addByte(tmp, pos, (ch & 0x3F) | 0x80); pos += 2; } } } return new String(tmp, 0, pos); } private static void addByte(char[] tmp, int pos, int ch) { tmp[pos] = encodeHexDigit((ch & 0x00F0) >> 4); tmp[pos + 1] = encodeHexDigit(ch & 0x000F); } // -------------------------------------------------------------------------- // URI parsing // -------------------------------------------------------------------------- /** * INTERNAL: Parses and normalizes a file:/ URL. * @param ix The index of the last character in the scheme (':') * @return Index of last character in URI. */ private int parseFileUrl(char[] uri, int ix, int length) throws MalformedURLException { if (ix+2 >= length) throw new MalformedURLException("File URL has only scheme name."); // STEP 1: deal with hostname and initial slashes // file:///home/ -> file:/home/ // file://localhost/home -> file://localhost/home/ // file:/home/ -> file:/home/ // file://graph/tmp/ -> file://graph/tmp/ ix++; // skip ':' if (uri[ix] == '/') ix++; // skip ':/' int chars = -1; if (uri[ix] == '/') { // three cases: '://server/home/', ':///home/' and '://localhost/home/' if (ix+1 < length && uri[ix+1] == '/') chars = 2; // it's ':///home/'; strip '//' else chars = 0; // it's '://server/home/', leave it System.arraycopy(uri, ix+chars, uri, ix, length - (ix+chars)); length -= chars; } // STEP 2: deal with directory part // INVARIANT: ix now index of first char after 'file:/' if (chars == 0) { authorityEnd = (short) ix; return parseDirectoryPart(uri, ix, length); } else { authorityEnd = (short) (ix-1); return parseDirectoryPart(uri, ix-1, length); } } /** * INTERNAL: Parses and normalizes a hierarchical URL. * @param schemeEnd The index of the last character in the scheme (':') * @return Index of last character in URI. */ private int parseHierarchicalUrl(char[] uri, int schemeEnd, int length) throws MalformedURLException { // ---parse authority // [ [ userinfo "@" ] host [ : port ] ] // the only thing we care about is the port number // algorithm: // scan outwards, stop on first '/' or the end // after each ':' keep track of where it was and whether it was // followed by non-digits int ix = schemeEnd + 3; // skip over the '//' int portStart = -1; int hostStart = ix; String port = null; while (ix < length && uri[ix] != '/' && uri[ix] != '?' && uri[ix] != '#') { if (uri[ix] == ':') { // may be port number, check out ix++; portStart = ix; while (ix < length && uri[ix] >= '0' && uri[ix] <= '9') ix++; // port numbers are pure digits, so scan for those if (ix >= length || uri[ix] == '/' || uri[ix] == '?' || uri[ix] == '#') { // terminated with correct char, so it's a port number port = new String(uri, portStart, ix - portStart); break; // this means we're done with the authority part } } else if (uri[ix] == '@') hostStart = ix + 1; ix++; } if (port != null && findPortDefault(uri, schemeEnd).equals(port)) { // default port number used; remove int offset = (ix - portStart) + 1; System.arraycopy(uri, ix, uri, portStart - 1, length - ix); ix -= offset; length -= offset; } StringUtils.downCaseAscii(uri, hostStart, ix - hostStart); // make sure authority part ends with a slash no matter what if (uri[ix] != '/') { length++; // we just lengthened the URI... if (ix+1 < length) // have to shift part after '/' out one notch System.arraycopy(uri, ix, uri, ix+1, (length - ix) - 1); uri[ix++] = '/'; } authorityEnd = (short) ix; if (ix+1 >= length) return length; return parseDirectoryPart(uri, ix, length); } public int parseDirectoryPart(char[] uri, int ix, int length) throws MalformedURLException { if (ix == length) { // we are at the last character, so just stop lastSlash = -1; return length; } int[] slashpos = new int[(length - authorityEnd) / 2 + 2]; slashpos[0] = authorityEnd; int slashix = 0; while (ix < length && uri[ix] != '?' && uri[ix] != '#') { if (uri[ix] == '/') { if (slashpos[slashix] == ix - 1) {// two successive slashes, remove one System.arraycopy(uri, ix, uri, ix - 1, length - ix); ix--; length--; } // WARNING: This loop is time-critical in the extreme. Minor // rearrangements to the tests here can cause the time needed // to create URIs to double. This will then affect import // times and other important operations as well. Care, and // stopwatches, must be exercised if changes are made. if (ix+2 < length && uri[ix+1] == '.') { // handling ./ in URI if (uri[ix+2] == '/') { System.arraycopy(uri, ix+3, uri, ix+1, length - (ix+3)); length -= 2; continue; } // handling ../ in URI and .. at end of URI if (uri[ix+2] == '.' && ((ix+3 < length && uri[ix+3] == '/') || ix+3 == length)) { // removing 3 chars if ../, 2 chars if .. int chars = 3; if (ix+3 == length) chars = 2; int offset; if (ix == authorityEnd) offset = chars; else offset = (ix+chars) - slashpos[slashix]; //debugPrint(uri, length, slashpos, slashix+1); System.arraycopy(uri, ix+(chars+1), uri, slashpos[slashix] + 1, length - (ix+(chars+1))); ix = slashpos[slashix]; length -= offset; if (slashix != 0) slashix--; continue; } } // end of ../ and ./ checking if (ix != authorityEnd) slashpos[++slashix] = ix; } ix++; } // last we check for /. at the end of the directory part, and remove it if (slashpos[slashix] + 2 == ix && uri[ix-1] == '.') { if (slashix != 0) slashix--; System.arraycopy(uri, ix, uri, ix-1, length - ix); length--; ix--; } lastSlash = (short) slashpos[slashix]; // ---parse query, and fragment while (ix < length && uri[ix] != '#') ix++; if (ix < length && uri[ix] == '#') { fragmentStart = (short) ix; // fragment syntax, RFC 2396, page 27 // // fragment = *uric // uric = reserved | unreserved | escaped // reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | // "$" | "," // unreserved = alphanum | mark // mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | // "(" | ")" ix++; // skip the '#' to begin checking while (ix < length && ((uri[ix] >= 'a' && uri[ix] <= 'z') || (uri[ix] >= '?' && uri[ix] <= 'Z') || // ? @ A-Z (uri[ix] >= '&' && uri[ix] <= '9') || // & ' ( ) * + , - . / 0-9 uri[ix] == '!' || uri[ix] == '$' || uri[ix] == ':' || uri[ix] == ';' || uri[ix] == '=' || uri[ix] == '_' || uri[ix] == '~' || uri[ix] == '%')) { // to support percent-escaping if (uri[ix] == '%') ix += 2; ix++; } if (ix < length) throw new MalformedURLException("Illegal character in fragment: '" + uri[ix] + "' at position " + ix + " of: '" + new String(uri) + "'"); } return length; } /** * Parses the scheme part of a URI. * @return The index of the last char in the scheme, which will be ':' or * -1 if there is no scheme. */ private int getScheme(char[] uri, int length) { // RFC 2396, section 3.1 // scheme = alpha *( alpha | digit | "+" | "-" | "." ) int index = 0; while((index < length) && ((uri[index] >= 'a' && uri[index] <= 'z') || // lowalpha (uri[index] >= 'A' && uri[index] <= 'Z') || // upalpha (uri[index] >= '0' && uri[index] <= '9') || // digit uri[index] == '+' || uri[index] == '-' || uri[index] == '.')) index++; if (index == 0 || index >= length || uri[index] != ':') return -1; return index; } /** * Decodes escape codes in URIs in place in the character array. Returns * length of URI in the character array. */ private int decodeURI(char[] uri, int length) throws MalformedURLException { while (length > 0 && uri[length-1] == ' ') length--; int pos = 0; // pos to write int ix; // index to read for (ix = 0; ix < length && uri[ix] == ' '; ix++) ; for (; ix < length; ix++) { switch(uri[ix]) { case '%': if (ix + 2 >= length) throw new MalformedURLException("Incomplete percent-escape at end of URI"); char ch = (char) (decodeHexDigit(uri[ix+1]) * 16 + decodeHexDigit(uri[ix+2])); if (ch != 38 && ch != 37 && ch != 35) { // it's not #, & or %, so we can unescape it uri[pos++] = ch; ix += 2; } else // it *is* #, & or %. therefore must leave alone uri[pos++] = '%'; break; case '+': uri[pos++] = ' '; break; default: uri[pos++] = uri[ix]; } } return pos; } private int decodeHexDigit(char ch) throws MalformedURLException { if (ch >= '0' && ch <= '9') return ch - '0'; else if (ch >= 'A' && ch <= 'F') return (ch - 'A') + 10; else if (ch >= 'a' && ch <= 'f') return (ch - 'a') + 10; else throw new MalformedURLException("Invalid percent-escape code containing '" + ch + "' as hex digit in"); } private String findPortDefault(char[] uri, int schemeEnd) { if (StringUtils.regionEquals("http", uri, 0, schemeEnd)) return "80"; else if (StringUtils.regionEquals("https", uri, 0, schemeEnd)) return "443"; else if (StringUtils.regionEquals("shttp", uri, 0, schemeEnd)) return "80"; else if (StringUtils.regionEquals("ftp", uri, 0, schemeEnd)) return "21"; else if (StringUtils.regionEquals("ldap", uri, 0, schemeEnd)) return "389"; else if (StringUtils.regionEquals("gopher", uri, 0, schemeEnd)) return "70"; else return "dummy value"; } /** * Escapes the given file path so that illegal characters in the * path are correctly escaped. */ private static String escapeFilePath(String path) { // only the following does not need to be escaped // unreserved = alphanum | mark // mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")" // we don't escape slashes, because those are not allowed in file names char[] tmp = new char[path.length() * 6]; // more than enough int pos = 0; for (int ix = 0; ix < path.length(); ix++) { char ch = path.charAt(ix); if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch >= '0' && ch <= '9') || (ch >= '\'' && ch <= '*') || ch == '!' || ch == '-' || ch == '.' || ch == '_' || ch == '~') tmp[pos++] = ch; else if (ch > 0x7F) { // UTF-8-encode the character if (ch < 0x07FF) { // 0000 0080-0000 07FF 110xxxxx 10xxxxxx int codeval = (ch >> 6) | 0xC0; tmp[pos++] = '%'; tmp[pos++] = encodeHexDigit(codeval >> 4); tmp[pos++] = encodeHexDigit(codeval & 0x0F); codeval = (ch & 0x003F) | 0x80; tmp[pos++] = '%'; tmp[pos++] = encodeHexDigit(codeval >> 4); tmp[pos++] = encodeHexDigit(codeval & 0x0F); } else if (ch < 0xFFFF) { // 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx int codeval = (ch >> 12) | 0xE0; tmp[pos++] = '%'; tmp[pos++] = encodeHexDigit(codeval >> 4); tmp[pos++] = encodeHexDigit(codeval & 0x0F); codeval = ((ch & 0x0FFF) >> 6) | 0x80; tmp[pos++] = '%'; tmp[pos++] = encodeHexDigit(codeval >> 4); tmp[pos++] = encodeHexDigit(codeval & 0x0F); codeval = ((ch & 0x003F) >> 6) | 0x80; tmp[pos++] = '%'; tmp[pos++] = encodeHexDigit(codeval >> 4); tmp[pos++] = encodeHexDigit(codeval & 0x0F); } else throw new OntopiaRuntimeException("INTERNAL ERROR: Only BMP characters supported"); } else { tmp[pos++] = '%'; tmp[pos++] = encodeHexDigit(ch >> 4); tmp[pos++] = encodeHexDigit(ch & 0x0F); } } return new String(tmp, 0, pos); } private static char encodeHexDigit(int value) { if (value <= 9) return (char) ('0' + value); else return (char) ('A' + (value - 10)); } // --- Debugging methods @SuppressWarnings("unused") private void debugPrint(char[] uri, int length, int[] indexes, int count) { System.out.println("\n" + new String(uri, 0, length)); int next = 0; for (int ix = 0; ix < length; ix++) { if (indexes[next] == ix) { System.out.print("^"); next++; } else System.out.print(" "); } System.out.println(""); } // -------------------------------------------------------------------------- // Misc // -------------------------------------------------------------------------- public int hashCode() { return address.hashCode(); } public boolean equals(Object object) { try { LocatorIF locator = (LocatorIF)object; return address.equals(locator.getAddress()) && locator.getNotation().equals("URI"); } catch (ClassCastException e) { return false; // In case the object is not a locator } catch (NullPointerException e) { return false; // In case the object is null } } // -------------------------------------------------------------------------- // Externalization // -------------------------------------------------------------------------- public void writeExternal(ObjectOutput out) throws IOException { out.writeUTF(address); } public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException { address = in.readUTF(); } // -------------------------------------------------------------------------- // Utility method // -------------------------------------------------------------------------- /** * INTERNAL: Parses the URI and returns an instance of URILocator if * the URI is valid. If the URI is invalid null is returned. * * @since 3.0 */ public static URILocator create(String uriAddress) { try { return new URILocator(uriAddress); } catch (MalformedURLException e) { return null; } } }