/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.jena.util; import org.apache.jena.graph.Node ; import org.apache.jena.rdf.model.impl.Util ; //import org.apache.jena.riot.system.RiotChars ; /** * Code to split an URI or IRI into prefix and local part. * Historically, 'prefix' is referred to as 'namespace' * reflecting RDF/XML history. * <p> * For display, use {@link #localname} and {@link #namespace}. * This follows Turtle, adds some pragmatic rulesm but does not escape * any characters. A URI is split never split before the last {@code /} * or last {@code #}, if present. * See {@link #splitpoint} for more details. * <p> * This code form the machinary behind {@link Node#getLocalName} * {@link Node#getNameSpace} for URI Nodes. * <p> * {@link #localnameTTL} is strict Turtle; it is the same local name as * before, but escaped if necessary. * <p> * The functions {@link #namespaceXML} and {@link #localnameXML} * apply the rules for XML qnames. */ public class SplitIRI { /** Return the 'namespace' (prefix) for a URI string. * Use with {@link #localname} */ public static String namespace(String string) { int i = splitpoint(string) ; if ( i < 0 ) return string ; return string.substring(0, i) ; } /** Calculate a localname - do not escape PN_LOCAL_ESC. * This is not guaranteed to be legal Turtle. * Use with {@link #namespace} */ public static String localname(String string) { int i = splitpoint(string) ; if ( i < 0 ) return "" ; return string.substring(i) ; } /** Return the 'namespace' (prefix) for a URI string, * legal for Turtle and goes with {@link #localnameTTL} */ public static String namespaceTTL(String string) { return namespace(string) ; } /** Calculate a localname - enforce legal Turle * escape PN_LOCAL_ESC, check for final '.' * Use with {@link #namespaceTTL} */ public static String localnameTTL(String string) { String x = localname(string) ; if ( x.isEmpty()) return x ; return escape_PN_LOCAL_ESC(x) ; } private static String escape_PN_LOCAL_ESC(String x) { // Assume that escapes are rare so scan once to make sure there // is work to do then scan again doing the work. //'\' ('_' | '~' | '.' | '-' | '!' | '$' | '&' | "'" | '(' | ')' | '*' | '+' | ',' | ';' | '=' | '/' | '?' | '#' | '@' | '%') int N = x.length() ; boolean escchar = false ; for ( int i = 0 ; i < N ; i++ ) { char ch = x.charAt(i) ; if ( needsEscape(ch, (i==N-1)) ) { escchar = true ; break ; } } if ( ! escchar ) return x ; StringBuilder sb = new StringBuilder(N+10) ; for ( int i = 0 ; i < N ; i++ ) { char ch = x.charAt(i) ; // DOT only needs escaping at the end if ( needsEscape(ch, (i==N-1) ) ) sb.append('\\') ; sb.append(ch) ; } return sb.toString() ; } private static boolean needsEscape(char ch, boolean finalChar) { if ( ch == '.' ) return finalChar ; return isPN_LOCAL_ESC(ch) ; } // @formatter:off /* From the RDF 1.1 Turtle specification: [136s] PrefixedName ::= PNAME_LN | PNAME_NS Productions for terminals [163s] PN_CHARS_BASE ::= [A-Z] | [a-z] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x02FF] | [#x0370-#x037D] | [#x037F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] [164s] PN_CHARS_U ::= PN_CHARS_BASE | '_' [166s] PN_CHARS ::= PN_CHARS_U | '-' | [0-9] | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040] [167s] PN_PREFIX ::= PN_CHARS_BASE ((PN_CHARS | '.')* PN_CHARS)? [168s] PN_LOCAL ::= (PN_CHARS_U | ':' | [0-9] | PLX) ((PN_CHARS | '.' | ':' | PLX)* (PN_CHARS | ':' | PLX))? [169s] PLX ::= PERCENT | PN_LOCAL_ESC [170s] PERCENT ::= '%' HEX HEX [171s] HEX ::= [0-9] | [A-F] | [a-f] [172s] PN_LOCAL_ESC ::= '\' ('_' | '~' | '.' | '-' | '!' | '$' | '&' | "'" | '(' | ')' | '*' | '+' | ',' | ';' | '=' | '/' | '?' | '#' | '@' | '%') */ // @formatter:on /** Find the URI split point, return the index into the string that is the * first character of a legal Turtle local name. * <p> * This is a pragmatic choice, not just finding the maximal point. * For example, with escaping '/' can be included but that means * {@code http://example/path/abc} could split to give {@code http://example/} * and {@code path/abc} . * <p> * Split URN's after ':'. * * @param uri URI string * @return The split point, or -1 for "not found". */ public static int splitpoint(String uri) { boolean isURN = uri.startsWith("urn:") ; // Fast track. Still need to check validity of the prefix part. int idx1 = uri.lastIndexOf('#') ; // Not so simple - \/ in local names int idx2 = isURN ? uri.lastIndexOf(':') : uri.lastIndexOf('/') ; // If absolute. int idx3 = uri.indexOf(':') ; // Note: local names can't end in "." in Turtle. // This is handled by escape_PN_LOCAL_ESC which will escape it as "\." // Test the discovered local part. // Limit is exclusive. int limit = Math.max(idx1, idx2) ; limit = Math.max(limit, idx3) ; limit = Math.max(-1, limit) ; int splitPoint = -1 ; // Work backwards, checking for // ((PN_CHARS | '.' | ':' | PLX)* for ( int i = uri.length()-1 ; i > limit ; i-- ) { char ch = uri.charAt(i) ; if ( /*RiotChars.*/isPNChars_U_N(ch) || /*RiotChars.*/isPN_LOCAL_ESC(ch) || ch == ':' || ch == '-' || ch == '.' ) continue ; splitPoint = i+1 ; break ; } // limit was at the end. No split point (we could escape the limit point) if ( splitPoint == -1 ) splitPoint = limit+1 ; // No split point. if ( splitPoint >= uri.length() ) return -1 ; // Check the first character of the local name. // All character are legal localname name characters but may not satisfy the additional // first character rule. Move forward to first legal first character. int ch = uri.charAt(splitPoint) ; while ( ch == '.' || ch == '-' ) { splitPoint++ ; if ( splitPoint >= uri.length() ) return -1 ; ch = uri.charAt(splitPoint) ; } // Checking the final '.' is done when checking for escapes. return splitPoint ; } private static boolean checkhex(String uri, int i) { return /*RiotChars.*/isHexChar(uri.charAt(i)) ; } // Assuming legal URIs, there is no work to be done // for %XX. If illegal (e.g. %X), the best we can do // is not mess them up. /* // % - just need to check that it is followed by two hex. if ( ch == '%' ) { if ( i+2 >= uri.length() ) { // Too short return -1 ; } if ( ! checkhex(uri, i+1) || ! checkhex(uri, i+2) ) return -1 ; } */ /** Split point, according to XML qname rules. * This is the longest NCName at the end of the uri. * See {@link Util#splitNamespaceXML}. */ public static int splitXML(String string) { return Util.splitNamespaceXML(string) ; } /** Namespace, according to XML qname rules. * Use with {@link #localnameXML}. */ public static String namespaceXML(String string) { int i = splitXML(string) ; return string.substring(0, i) ; } /** Localname, according to XML qname rules. */ public static String localnameXML(String string) { int i = splitXML(string) ; return string.substring(i) ; } // Extracted from RiotChars // When/if RIOT becomes accessible to this code, then refactor private static boolean /*RiotChars.*/isPN_LOCAL_ESC(char ch) { switch (ch) { case '\\': case '_': case '~': case '.': case '-': case '!': case '$': case '&': case '\'': case '(': case ')': case '*': case '+': case ',': case ';': case '=': case '/': case '?': case '#': case '@': case '%': return true ; default: return false ; } } /** ASCII 0-9 */ private static boolean isDigit(int ch) { return range(ch, '0', '9') ; } private static boolean isPNCharsBase(int ch) { // PN_CHARS_BASE ::= [A-Z] | [a-z] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x02FF] | // [#x0370-#x037D] | [#x037F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | // [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | // [#x10000-#xEFFFF] return r(ch, 'a', 'z') || r(ch, 'A', 'Z') || r(ch, 0x00C0, 0x00D6) || r(ch, 0x00D8, 0x00F6) || r(ch, 0x00F8, 0x02FF) || r(ch, 0x0370, 0x037D) || r(ch, 0x037F, 0x1FFF) || r(ch, 0x200C, 0x200D) || r(ch, 0x2070, 0x218F) || r(ch, 0x2C00, 0x2FEF) || r(ch, 0x3001, 0xD7FF) || // Surrogate pairs r(ch, 0xD800, 0xDFFF) || r(ch, 0xF900, 0xFDCF) || r(ch, 0xFDF0, 0xFFFD) || r(ch, 0x10000, 0xEFFFF) ; // Outside the basic plain. } private static boolean isPNChars_U(int ch) { //PN_CHARS_BASE | '_' return isPNCharsBase(ch) || ( ch == '_' ) ; } private static boolean isPNChars_U_N(int ch) { // PN_CHARS_U | [0-9] return isPNCharsBase(ch) || ( ch == '_' ) || isDigit(ch) ; } private static boolean isPNChars(int ch) { // PN_CHARS ::= PN_CHARS_U | '-' | [0-9] | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040] return isPNChars_U(ch) || isDigit(ch) || ( ch == '-' ) || ch == 0x00B7 || r(ch, 0x300, 0x036F) || r(ch, 0x203F, 0x2040) ; } /** Hexadecimal character */ private static boolean isHexChar(int ch) { return range(ch, '0', '9') || range(ch, 'a', 'f') || range(ch, 'A', 'F') ; } private static int valHexChar(int ch) { if ( range(ch, '0', '9') ) return ch - '0' ; if ( range(ch, 'a', 'f') ) return ch - 'a' + 10 ; if ( range(ch, 'A', 'F') ) return ch - 'A' + 10 ; return -1 ; } private static boolean r(int ch, int a, int b) { return ( ch >= a && ch <= b ) ; } private static boolean range(int ch, char a, char b) { return (ch >= a && ch <= b) ; } }