/*
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is the Kowari Metadata Store.
 *
 * The Initial Developer of the Original Code is Plugged In Software Pty
 * Ltd (http://www.pisoftware.com, mailto:info@pisoftware.com). Portions
 * created by Plugged In Software Pty Ltd are Copyright (C) 2001,2002
 * Plugged In Software Pty Ltd. All Rights Reserved.
 *
 * Contributor(s): N/A.
 *
 * [NOTE: The text of this Exhibit A may differ slightly from the text
 * of the notices in the Source Code files of the Original Code. You
 * should use the text of this Exhibit A rather than the text found in the
 * Original Code Source Code for Your Modifications.]
 *
 */

package org.mulgara.resolver;

// Java 2 standard packages
import java.net.URI;
import java.net.URISyntaxException;
import java.text.ParseException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Third party packages
import org.apache.log4j.Logger;  // Apache Log4J
import org.jrdf.graph.Literal;
import org.jrdf.graph.Node;
import org.jrdf.graph.URIReference;

// Local packages
import org.mulgara.query.rdf.LiteralImpl;
import org.mulgara.query.rdf.URIReferenceImpl;

/**
 * Static library for converting N-Triples serialization to and from JRDF
 * {@link Node}s.
 *
 * @created 2004-09-22
 * @author <a href="http://www.pisoftware.com/raboczi">Simon Raboczi</a>
 * @author <a href="mailto:pgearon@users.sourceforge.net">Paula Gearon</a>
 * @version $Revision: 1.8 $
 * @modified $Date: 2005/01/05 04:58:24 $
 * @maintenanceAuthor $Author: newmana $
 * @company <a href="mailto:info@PIsoftware.com">Plugged In Software</a>
 * @copyright ©2004 <a href="http://www.tucanatech.com/">Tucana
 *   Technology, Inc</a>
 * @licence <a href="{@docRoot}/../../LICENCE">Mozilla Public License v1.1</a>
 * Portions by Paula Gearon.
 * @copyright ©2006 <a href="http://www.herzumsoftware.com/">Herzum Software LLC</a>
 */
abstract class NTriples {

  /**
   * Logger.
   *
   * This is named after the class.
   */
  private static final Logger logger =
      Logger.getLogger(NTriples.class.getName());

  /**
   * A regular expression matching NTriples literals.
   *
   * In the following pattern:
   * <ul>
   * <li>Group 0 is the entire literal serialization</li>
   * <li>Group 1 is the lexical form</li>
   * <li>Group 3 is the language clause</li>
   * <li>Group 4 is the language code</li>
   * <li>Group 5 is the datatype clause</li>
   * <li>Group 6 is the datatype URI</li>
   * </ul>
   *
   * Unicode escapes must carry exactly 4 (lowercase marker) or 8 (uppercase
   * marker) hexadecimal digits, matching what {@link #escapePattern} is able
   * to decode.  The language code may contain hyphen-separated subtags
   * (for example <code>en-US</code>); the subtag repetition uses a
   * non-capturing group so that the group numbers above stay stable.
   */
  private static final Pattern literalPattern = Pattern.compile(
    "\\x22(([^\\\\]|\\\\[tnr\\x22\\\\]|\\\\u\\p{XDigit}{4}|\\\\U\\p{XDigit}{8})*)\\x22" +  // lexical form
    "(@(\\w+(?:-\\w+)*))?" +                                                               // optional language
    "(\\^\\^<([^>]*)>)?"                                                                   // optional datatype
  );

  /**
   * A regular expression to pick out characters needing escape from Unicode
   * to ASCII.
   */
  private static final Pattern escapedCharacterPattern = Pattern.compile(
    "[\ud800\udc00-\udbff\udfff]" +            // surrogate pairs
    "|" +                                      // ...or...
    "[\\x00-\\x1F\\x22\\\\\\x7F-\\uFFFF]"      // all other escaped chars
  );

  /**
   * A regular expression to pick out ASCII escapes for Unicode characters.
   *
   * In the following pattern:
   * <ul>
   * <li>Group 0 is one entire escape sequence, including the leading
   *     backslash</li>
   * <li>Group 1 is the escape sequence without the leading backslash</li>
   * <li>Group 2 is any single character escape</li>
   * <li>Group 3 is any 4-digit Unicode escape</li>
   * <li>Group 4 is any 8-digit Unicode escape</li>
   * </ul>
   */
  private static final Pattern escapePattern = Pattern.compile(
    "\\\\" +                  // all escapes start with a backslash
    "(" +
      "([tnr\\\\\\\"])" +     // tab, newline, return, backslash, quote
      "|" +                   // ...or...
      "u(\\p{XDigit}{4})" +   // a 16-bit hexadecimal Unicode
      "|" +                   // ...or...
      "U(\\p{XDigit}{8})" +   // a 32-bit hexadecimal Unicode
    ")"
  );

  /**
   * Convert N-Triples to JRDF.
   *
   * A term starting with <code>&lt;</code> is parsed as a URI reference and
   * resolved against <var>baseURI</var>; a term starting with a double quote
   * is parsed as a literal.  Anything else is rejected.
   *
   * @param string  a string in N-Triples format, never <code>null</code>
   * @param baseURI  the base URI against which to resolve relative URI
   *   references, which must be absolute
   * @return a JRDF node equivalent to the <var>string</var>
   * @throws IllegalArgumentException if <var>string</var> is
   *   <code>null</code>, or if <var>baseURI</var> is <code>null</code> or
   *   not absolute
   * @throws ParseException if <var>string</var> isn't valid N-Triples,
   *   including when it is empty
   */
  public static Node toNode(String string, URI baseURI) throws ParseException {
    // Validate "string" parameter
    if (string == null) {
      throw new IllegalArgumentException("Null \"string\" parameter");
    }

    // Validate "baseURI" parameter
    if (baseURI == null || !baseURI.isAbsolute()) {
      throw new IllegalArgumentException(
        "Illegal \"baseURI\" parameter: " + baseURI
      );
    }

    // An empty term has no initial character to dispatch on; report it as a
    // parse failure rather than letting charAt(0) fail.
    if (string.length() == 0) {
      throw new ParseException("Empty N-Triples term", 0);
    }

    switch (string.charAt(0)) {
      case '<':
        return toURIReference(string, baseURI);
      case '"':
        return toLiteral(string);
      default:
        throw new ParseException(
          "Unrecognized initial character in " + string, 1
        );
    }
  }

  /**
   * Parse a URI reference term of the form <code>&lt;...&gt;</code>.
   *
   * @param string  the term, whose first character is <code>&lt;</code>
   * @param baseURI  the absolute URI against which to resolve the reference
   * @return the URI reference node
   * @throws ParseException if the term is not terminated by
   *   <code>&gt;</code> or its content is not a legal URI reference
   */
  private static Node toURIReference(String string, URI baseURI)
      throws ParseException {
    if (string.length() < 2 || string.charAt(string.length() - 1) != '>') {
      throw new ParseException("No terminating '>' in " + string, 1);
    }

    String reference = string.substring(1, string.length() - 1);

    URI uri;
    if (reference.length() == 0) {
      // The URI.resolve() method does not work correctly in this case.
      // The absolute URI is the database URI.
      uri = baseURI;
    } else {
      // Resolve the (possibly) relative uri against the database URI.
      // URI.resolve(String) signals a malformed reference with an unchecked
      // exception; convert it so callers see the documented ParseException,
      // the same way an invalid datatype URI is reported.
      try {
        uri = baseURI.resolve(reference);
      } catch (IllegalArgumentException e) {
        ParseException parseException =
            new ParseException("Invalid URI reference: " + string, 1);
        parseException.initCause(e);
        throw parseException;
      }
    }
    assert uri != null;
    assert uri.isAbsolute() : uri + " is not absolute";

    return new URIReferenceImpl(uri);
  }

  /**
   * Parse a literal term: a quoted lexical form optionally followed by a
   * language clause and/or a datatype clause.
   *
   * When a datatype is present it takes precedence and any language code is
   * discarded.  When neither is present the language is the empty string.
   *
   * @param string  the term, whose first character is a double quote
   * @return the literal node
   * @throws ParseException if the term does not match the literal syntax,
   *   has an invalid datatype URI, or contains an escape beyond the Unicode
   *   range
   */
  private static Node toLiteral(String string) throws ParseException {
    Matcher matcher = literalPattern.matcher(string);
    if (!matcher.matches()) {
      throw new ParseException("Invalid literal: " + string, -1);
    }

    // Determine the datatype URI
    URI datatypeURI = null;
    if (matcher.group(6) != null) {
      try {
        datatypeURI = new URI(matcher.group(6));
      } catch (URISyntaxException e) {
        ParseException parseException =
            new ParseException("Invalid datatype URI", -1);
        parseException.initCause(e);
        throw parseException;
      }
    }

    // Determine the lexical form
    String lexicalForm;
    try {
      lexicalForm = unescapeLexicalForm(matcher.group(1));
    } catch (IllegalArgumentException e) {
      ParseException parseException =
          new ParseException("Invalid escape in literal: " + string, -1);
      parseException.initCause(e);
      throw parseException;
    }

    if (datatypeURI != null) {
      return new LiteralImpl(lexicalForm, datatypeURI);
    }

    // Determine the language code; plain literals use the empty string
    String language = matcher.group(4);
    if (language == null) {
      language = "";
    }
    return new LiteralImpl(lexicalForm, language);
  }

  /**
   * Convert JRDF to N-Triples.
   *
   * @param node  a JRDF node, never <code>null</code>
   * @param baseURI  the base URI against which to relativize URI references;
   *   if not <code>null</code> it must be absolute, and if <code>null</code>
   *   URI references are written without being relativized
   * @return the N-Triples serialization for the <var>node</var>
   * @throws IllegalArgumentException if <var>node</var> is <code>null</code>
   *   or is neither a {@link URIReference} nor a {@link Literal}, or if
   *   <var>baseURI</var> is relative
   */
  public static String toString(Node node, URI baseURI) {
    // Validate "node" parameter
    if (node == null) {
      throw new IllegalArgumentException("Null \"node\" parameter");
    }

    // Validate "baseURI" parameter
    if (baseURI != null && !baseURI.isAbsolute()) {
      throw new IllegalArgumentException(
        "Relative \"baseURI\" parameter: " + baseURI
      );
    }

    if (node instanceof URIReference) {
      URI uri = ((URIReference) node).getURI();
      URI relativeURI = (baseURI != null) ? baseURI.relativize(uri) : uri;

      // Be suspicious about relative URIs -- we're only expecting the
      // names of models from this server, or the name of the server itself
      if (!relativeURI.isAbsolute()) {
        if ((relativeURI.getAuthority() != null) ||
            ((relativeURI.getPath() != null) &&
             (relativeURI.getPath().length() > 0)) ||
            (relativeURI.getFragment() == null)) {
          logger.warn("Unusual relative URI in backup: " + relativeURI +
                      " authority=\"" + relativeURI.getAuthority() + "\"" +
                      " path=\"" + relativeURI.getPath() + "\"" +
                      " fragment=\"" + relativeURI.getFragment() + "\"");
        }
      }

      return "<" + relativeURI + ">";
    } else if (node instanceof Literal) {
      Literal literal = (Literal) node;

      // Lexical form
      StringBuffer buffer = new StringBuffer();
      buffer.append('"')
            .append(escapeLexicalForm(literal.getLexicalForm()))
            .append('"');

      // Language code
      String lang = literal.getLanguage();
      if (lang != null && !lang.equals("")) {
        buffer.append('@').append(lang);
      }

      // Datatype URI
      if (literal.getDatatypeURI() != null) {
        buffer.append("^^<")
              .append(literal.getDatatypeURI().toString())
              .append('>');
      }

      return buffer.toString();
    } else {
      throw new IllegalArgumentException(
        "Unsupported node of class " + node.getClass() + ": " + node
      );
    }
  }

  /**
   * Escape an arbitrary unicode lexical form into N-Triples serialization.
   *
   * Tab, newline, carriage return, double quote and backslash receive their
   * single-character escapes.  Every other character matched by
   * {@link #escapedCharacterPattern} is written as a 4-digit hexadecimal
   * escape, and surrogate pairs are written as a single 8-digit hexadecimal
   * escape of the code point they encode.
   *
   * @param string  a string to escape, never <code>null</code>
   * @return a version of the <var>string</var> with N-Triples escapes applied
   * @throws IllegalArgumentException if <var>string</var> is <code>null</code>
   */
  public static String escapeLexicalForm(String string) {
    // Validate "string" parameter
    if (string == null) {
      throw new IllegalArgumentException("Null \"string\" parameter");
    }

    // Obtain a matcher
    Matcher matcher = escapedCharacterPattern.matcher(string);

    // Try to short-circuit the whole process -- maybe nothing needs escaping?
    if (!matcher.find()) {
      return string;
    }

    // Copy the text between matches verbatim and append the escape for each
    // match.  Appending directly (rather than via
    // Matcher.appendReplacement) means the escape text needs no second level
    // of backslash quoting.
    StringBuffer buffer = new StringBuffer(string.length() + 16);
    int copiedTo = 0;
    do {
      buffer.append(string.substring(copiedTo, matcher.start()));

      String groupString = matcher.group();
      switch (groupString.length()) {
        case 1: {  // 16-bit characters requiring escaping
          char c = groupString.charAt(0);
          switch (c) {
            case '\t':  // tab
              buffer.append("\\t");
              break;
            case '\n':  // newline
              buffer.append("\\n");
              break;
            case '\r':  // carriage return
              buffer.append("\\r");
              break;
            case '"':   // quote
              buffer.append("\\\"");
              break;
            case '\\':  // backslash
              buffer.append("\\\\");
              break;
            default:    // other characters use 4-digit hex escapes
              appendHexEscape(buffer, 'u', c, 4);
              break;
          }
          break;
        }

        case 2: {  // surrogate pairs are represented as 8-digit hex escapes
          assert Character.getType(groupString.charAt(0)) == Character.SURROGATE;
          assert Character.getType(groupString.charAt(1)) == Character.SURROGATE;

          int codePoint =
              ((groupString.charAt(0) & 0x3FF) << 10) +  // high surrogate
              (groupString.charAt(1) & 0x3FF) +          // low surrogate
              0x10000;                                   // base codepoint U+10000
          appendHexEscape(buffer, 'U', codePoint, 8);
          break;
        }

        default:
          throw new Error("Escape sequence " + groupString + " has no handler");
      }

      copiedTo = matcher.end();
    } while (matcher.find());

    // Finish off by appending any remaining text that didn't require escaping,
    // and return the assembled buffer
    buffer.append(string.substring(copiedTo));
    return buffer.toString();
  }

  /**
   * Append a backslash, a marker character and a zero-padded uppercase
   * hexadecimal number to a buffer.
   *
   * @param buffer  the buffer to append to
   * @param marker  the character following the backslash
   * @param value  the non-negative value to write in hexadecimal
   * @param digits  the number of hexadecimal digits to pad to
   */
  private static void appendHexEscape(StringBuffer buffer, char marker,
                                      int value, int digits) {
    String hexString = Integer.toHexString(value).toUpperCase();
    assert hexString.length() <= digits;

    buffer.append('\\').append(marker);
    for (int i = hexString.length(); i < digits; i++) {
      buffer.append('0');
    }
    buffer.append(hexString);
  }

  /**
   * Unescape N-Triples serialization of a lexical form back to unicode.
   *
   * Sequences that do not match {@link #escapePattern} are copied through
   * unchanged.  An 8-digit escape may denote any code point from 0 to
   * 10FFFF (hexadecimal); code points below 10000 (hexadecimal) decode to a
   * single character and the rest decode to a surrogate pair.
   *
   * @param string  an ASCII string formatted with N-Triples lexical form
   *   escape codes, never <code>null</code>
   * @return a version of the <var>string</var> with N-Triples escapes
   *   evaluated
   * @throws IllegalArgumentException if <var>string</var> is
   *   <code>null</code>, or contains an 8-digit escape whose value is beyond
   *   the Unicode range
   */
  public static String unescapeLexicalForm(String string) {
    // Validate "string" parameter
    if (string == null) {
      throw new IllegalArgumentException("Null \"string\" parameter");
    }

    // Obtain a matcher
    Matcher matcher = escapePattern.matcher(string);

    // Try to short-circuit the whole process -- maybe nothing needs escaping?
    if (!matcher.find()) {
      return string;
    }

    // Copy the text between matches verbatim and append the decoded
    // character(s) for each match.  The decoded text is deliberately NOT
    // passed through Matcher.appendReplacement, which would interpret a
    // decoded '$' or backslash as replacement syntax.
    StringBuffer buffer = new StringBuffer(string.length());
    int copiedTo = 0;
    do {
      buffer.append(string.substring(copiedTo, matcher.start()));

      if (matcher.group(2) != null) {
        switch (matcher.group(2).charAt(0)) {
          case 't':   // tab
            buffer.append('\t');
            break;
          case 'n':   // newline
            buffer.append('\n');
            break;
          case 'r':   // return
            buffer.append('\r');
            break;
          case '"':   // quote
            buffer.append('"');
            break;
          case '\\':  // backslash
            buffer.append('\\');
            break;
          default:
            throw new Error("Impossible condition in unescape parsing");
        }
      } else if (matcher.group(3) != null) {
        // Four hex digits always fit in a single char
        buffer.append((char) parseHex(matcher.group(3)));
      } else if (matcher.group(4) != null) {
        appendCodePoint(buffer, parseHex(matcher.group(4)), matcher.group());
      } else {
        throw new Error("Impossible condition in unescape parsing");
      }

      copiedTo = matcher.end();
    } while (matcher.find());

    // Finish off by appending any remaining text that didn't require
    // unescaping, and return the assembled buffer
    buffer.append(string.substring(copiedTo));
    return buffer.toString();
  }

  /**
   * Parse up to 8 hexadecimal digits.
   *
   * A <code>long</code> is used because 8 hexadecimal digits can exceed the
   * positive range of an <code>int</code>.
   *
   * @param digits  hexadecimal digits already validated by
   *   {@link #escapePattern}
   * @return the non-negative value of the digits
   */
  private static long parseHex(String digits) {
    try {
      return Long.parseLong(digits, 16);
    } catch (NumberFormatException e) {
      Error error = new Error("Impossible condition in unescape parsing");
      error.initCause(e);
      throw error;
    }
  }

  /**
   * Append the UTF-16 encoding of a code point to a buffer.
   *
   * @param buffer  the buffer to append to
   * @param codePoint  the non-negative code point to append
   * @param escape  the escape sequence the code point was read from, used
   *   only for the error message
   * @throws IllegalArgumentException if <var>codePoint</var> is greater than
   *   10FFFF (hexadecimal)
   */
  private static void appendCodePoint(StringBuffer buffer, long codePoint,
                                      String escape) {
    if (codePoint > 0x10FFFFL) {
      throw new IllegalArgumentException(
        "Escape " + escape + " is beyond the Unicode range"
      );
    }

    if (codePoint < 0x10000L) {
      // Basic Multilingual Plane: a single 16-bit char, no surrogates
      buffer.append((char) codePoint);
      return;
    }

    int offset = (int) (codePoint - 0x10000L);

    int highSurrogate = 0xD800 + (offset >> 10);
    assert highSurrogate >= 0xD800 && highSurrogate < 0xDC00 :
      "Bad high surrogate U+" + Integer.toHexString(highSurrogate);

    int lowSurrogate = 0xDC00 + (offset & 0x3FF);
    assert lowSurrogate >= 0xDC00 && lowSurrogate < 0xE000 :
      "Bad low surrogate U+" + Integer.toHexString(lowSurrogate);

    buffer.append((char) highSurrogate).append((char) lowSurrogate);
  }
}