NTriples.java example

Explorer
mulgara-master
- src
  - jar
  - war
    - server-http
      - java
        HttpServer.java
        HttpServerServlet.java
- tools
  - src
    - org
      - mulgara
        tools
        Sparql.java
        Tql.java
/*
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is the Kowari Metadata Store.
 *
 * The Initial Developer of the Original Code is Plugged In Software Pty
 * Ltd (http://www.pisoftware.com, mailto:info@pisoftware.com). Portions
 * created by Plugged In Software Pty Ltd are Copyright (C) 2001,2002
 * Plugged In Software Pty Ltd. All Rights Reserved.
 *
 * Contributor(s): N/A.
 *
 * [NOTE: The text of this Exhibit A may differ slightly from the text
 * of the notices in the Source Code files of the Original Code. You
 * should use the text of this Exhibit A rather than the text found in the
 * Original Code Source Code for Your Modifications.]
 *
 */

package org.mulgara.resolver;

// Java 2 standard packages
import java.net.URI;
import java.net.URISyntaxException;
import java.text.ParseException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Third party packages
import org.apache.log4j.Logger;  // Apache Log4J
import org.jrdf.graph.Literal;
import org.jrdf.graph.Node;
import org.jrdf.graph.URIReference;

// Local packages
import org.mulgara.query.rdf.LiteralImpl;
import org.mulgara.query.rdf.URIReferenceImpl;

/**
 * Static library for converting N-Triples serialization to and from JRDF
 * {@link Node}s.
 *
 * @created 2004-09-22
 * @author <a href="http://www.pisoftware.com/raboczi">Simon Raboczi</a>
 * @author <a href="mailto:pgearon@users.sourceforge.net">Paula Gearon</a>
 * @version $Revision: 1.8 $
 * @modified $Date: 2005/01/05 04:58:24 $
 * @maintenanceAuthor $Author: newmana $
 * @company <a href="mailto:info@PIsoftware.com">Plugged In Software</a>
 * @copyright ©2004 <a href="http://www.tucanatech.com/">Tucana
 *   Technology, Inc</a>
 * @licence <a href="{@docRoot}/../../LICENCE">Mozilla Public License v1.1</a>
 * Portions by Paula Gearon.
 * @copyright ©2006 <a href="http://www.herzumsoftware.com/">Herzum Software LLC</a>
 */
abstract class NTriples
{
  /**
   * Logger.
   *
   * This is named after the class.
   */
  private static final Logger logger =
    Logger.getLogger(NTriples.class.getName());

  /**
   * The highest code point defined by Unicode; 8-digit escapes above this
   * value cannot be represented and are rejected.
   */
  private static final int MAX_CODE_POINT = 0x10FFFF;

  /**
   * The first code point outside the Basic Multilingual Plane; code points
   * from here upwards are stored as a UTF-16 surrogate pair.
   */
  private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;

  /**
   * A regular expression matching NTriples literals.
   *
   * In the following pattern:
   * <ul>
   * <li>Group 0 is the entire literal serialization</li>
   * <li>Group 1 is the lexical form</li>
   * <li>Group 3 is the language clause</li>
   * <li>Group 4 is the language code</li>
   * <li>Group 5 is the datatype clause</li>
   * <li>Group 6 is the datatype URI</li>
   * </ul>
   *
   * Short Unicode escapes must carry exactly 4 hexadecimal digits and long
   * ones exactly 8.  The language code may contain hyphenated subtags (for
   * example <code>en-US</code>); the subtag repetition uses a non-capturing
   * group so that the group numbering above is unaffected.
   */
  private static final Pattern literalPattern = Pattern.compile(
    "\\x22(([^\\\\]|\\\\[tnr\\x22\\\\]|\\\\u\\p{XDigit}{4}|\\\\U\\p{XDigit}{8})*)\\x22" +  // lexical form
    "(@(\\w+(?:-\\w+)*))?" +  // optional language, with optional subtags
    "(\\^\\^<([^>]*)>)?"      // optional datatype
  );

  /**
   * A regular expression to pick out characters needing escape from Unicode
   * to ASCII.
   *
   * A match is either two UTF-16 code units long (a surrogate pair, i.e. a
   * character outside the Basic Multilingual Plane) or one code unit long
   * (a control character, quote, backslash, or anything from DEL upwards).
   */
  private static final Pattern escapedCharacterPattern = Pattern.compile(
    "[\ud800\udc00-\udbff\udfff]" +                // surrogate pairs
    "|" +                                          // ...or...
    "[\\x00-\\x1F\\x22\\\\\\x7F-\\uFFFF]"          // all other escaped chars
  );

  /**
   * A regular expression to pick out ASCII escapes for Unicode characters.
   *
   * In the following pattern:
   * <ul>
   * <li>Group 0 is the escaped lexical form</li>
   * <li>Group 2 is any single character escape</li>
   * <li>Group 3 is any 4-digit Unicode escape</li>
   * <li>Group 4 is any 8-digit Unicode escape</li>
   * </ul>
   */
  private static final Pattern escapePattern = Pattern.compile(
    "\\\\" +               // all escapes start with a backslash
    "(" +
    "([tnr\\\\\\\"])" +    // tab, newline, return, backslash, quote
    "|" +                  // ...or...
    "u(\\p{XDigit}{4})" +  // a 16-bit hexadecimal Unicode
    "|" +                  // ...or...
    "U(\\p{XDigit}{8})" +  // a 32-bit hexadecimal Unicode
    ")"
  );

  /**
   * Convert N-Triples to JRDF.
   *
   * Only URI references (<code>&lt;...&gt;</code>) and literals
   * (<code>"..."</code>) are recognized.
   *
   * @param string  a string in N-Triples format, never <code>null</code>
   * @param baseURI  the base URI against which to resolve relative URI
   *   references, which must be absolute
   * @return  a JRDF node equivalent to the <var>string</var>
   * @throws IllegalArgumentException if <var>string</var> is
   *   <code>null</code>, or if <var>baseURI</var> is <code>null</code> or not
   *   absolute
   * @throws ParseException if <var>string</var> isn't valid N-Triples,
   *   including when it is empty
   */
  public static Node toNode(String string, URI baseURI) throws ParseException
  {
    // Validate "string" parameter
    if (string == null) {
      throw new IllegalArgumentException("Null \"string\" parameter");
    }

    // Validate "baseURI" parameter
    if (baseURI == null || !baseURI.isAbsolute()) {
      throw new IllegalArgumentException(
        "Illegal \"baseURI\" parameter: " + baseURI
      );
    }

    // An empty string has no initial character to dispatch on; report it as
    // a parse failure rather than letting charAt(0) fail
    if (string.length() == 0) {
      throw new ParseException("Empty string is not a valid N-Triples node", 0);
    }

    switch (string.charAt(0)) {
      case '<':  // a named resource
        return toURIReference(string, baseURI);
      case '"':  // a literal
        return toLiteral(string);
      default:
        throw new ParseException(
          "Unrecognized initial character in " + string, 1
        );
    }
  }

  /**
   * Parse the N-Triples serialization of a URI reference.
   *
   * @param string  a string starting with <code>&lt;</code>
   * @param baseURI  the absolute URI against which to resolve the reference
   * @return the URI reference node
   * @throws ParseException if the closing <code>&gt;</code> is missing or
   *   the enclosed text isn't a syntactically valid URI reference
   */
  private static Node toURIReference(String string, URI baseURI)
    throws ParseException
  {
    if (string.length() < 2 || string.charAt(string.length() - 1) != '>') {
      throw new ParseException("No terminating '>' in " + string, 1);
    }
    String reference = string.substring(1, string.length() - 1);

    URI uri;
    if (reference.length() == 0) {
      // The URI.resolve() method does not work correctly in this case.
      // The absolute URI is the database URI.
      uri = baseURI;
    }
    else {
      // Resolve the (possibly) relative uri against the database URI.
      // URI.resolve(String) reports malformed input with an unchecked
      // exception, so translate it into the documented checked one.
      try {
        uri = baseURI.resolve(reference);
      }
      catch (IllegalArgumentException e) {
        ParseException parseException =
          new ParseException("Invalid URI reference: " + string, 1);
        parseException.initCause(e);
        throw parseException;
      }
    }
    assert uri != null;
    assert uri.isAbsolute() : uri + " is not absolute";

    return new URIReferenceImpl(uri);
  }

  /**
   * Parse the N-Triples serialization of a literal.
   *
   * If a datatype is present it takes precedence and any language code is
   * ignored; with neither present the literal gets the empty language code.
   *
   * @param string  a string starting with a double quote
   * @return the literal node
   * @throws ParseException if <var>string</var> doesn't match the literal
   *   syntax, its datatype isn't a valid URI, or it contains an escape that
   *   cannot be decoded
   */
  private static Node toLiteral(String string) throws ParseException
  {
    Matcher matcher = literalPattern.matcher(string);
    if (!matcher.matches()) {
      throw new ParseException("Invalid literal: " + string, -1);
    }

    // Determine the datatype URI
    URI datatypeURI = null;
    if (matcher.group(6) != null) {
      try {
        datatypeURI = new URI(matcher.group(6));
      }
      catch (URISyntaxException e) {
        ParseException parseException =
          new ParseException("Invalid datatype URI", -1);
        parseException.initCause(e);
        throw parseException;
      }
    }

    // Decode the lexical form; an escape naming a code point beyond the
    // Unicode range is a syntax error in the input
    String lexicalForm;
    try {
      lexicalForm = unescapeLexicalForm(matcher.group(1));
    }
    catch (IllegalArgumentException e) {
      ParseException parseException =
        new ParseException("Invalid literal: " + string, -1);
      parseException.initCause(e);
      throw parseException;
    }

    if (datatypeURI != null) {
      return new LiteralImpl(lexicalForm, datatypeURI);
    }

    // Untyped literal: an absent language clause means the empty language
    String language = matcher.group(4);
    if (language == null) {
      language = "";
    }
    return new LiteralImpl(lexicalForm, language);
  }

  /**
   * Convert JRDF to N-Triples.
   *
   * @param node  a JRDF node, never <code>null</code>
   * @param baseURI  the base URI against which to relativize URI references;
   *   either absolute, or <code>null</code> to leave URI references as they
   *   are
   * @return  the N-Triples serialization for the <var>node</var>
   * @throws IllegalArgumentException if <var>node</var> is <code>null</code>
   *   or is neither a {@link URIReference} nor a {@link Literal}, or if
   *   <var>baseURI</var> is relative
   */
  public static String toString(Node node, URI baseURI)
  {
    // Validate "node" parameter
    if (node == null) {
      throw new IllegalArgumentException("Null \"node\" parameter");
    }

    // Validate "baseURI" parameter
    if (baseURI != null && !baseURI.isAbsolute()) {
      throw new IllegalArgumentException(
        "Relative \"baseURI\" parameter: " + baseURI
      );
    }

    if (node instanceof URIReference) {
      URI uri = ((URIReference) node).getURI();
      URI relativeURI = (baseURI != null) ? baseURI.relativize(uri) : uri;

      // Be suspicious about relative URIs -- we're only expecting the
      // names of models from this server, or the name of the server itself
      if (!relativeURI.isAbsolute()) {
        if ((relativeURI.getAuthority() != null) ||
            ((relativeURI.getPath() != null) &&
             (relativeURI.getPath().length() > 0)) ||
            (relativeURI.getFragment() == null))
        {
          logger.warn("Unusual relative URI in backup: " + relativeURI +
              " authority=\"" + relativeURI.getAuthority() + "\"" +
              " path=\"" + relativeURI.getPath() + "\"" +
              " fragment=\"" +
              relativeURI.getFragment() + "\"");
        }
      }

      return "<" + relativeURI + ">";
    }
    else if (node instanceof Literal) {
      Literal literal = (Literal) node;

      // Lexical form
      StringBuffer buffer = new StringBuffer();
      buffer.append('"')
            .append(escapeLexicalForm(literal.getLexicalForm()))
            .append('"');

      // Language code
      String lang = literal.getLanguage();
      if (lang != null && !lang.equals("")) {
        buffer.append('@').append(lang);
      }

      // Datatype URI
      if (literal.getDatatypeURI() != null) {
        buffer.append("^^<")
              .append(literal.getDatatypeURI().toString())
              .append('>');
      }

      return buffer.toString();
    }
    else {
      throw new IllegalArgumentException(
        "Unsupported node of class " + node.getClass() + ": " + node
      );
    }
  }

  /**
   * Escape an arbitrary unicode lexical form into N-Triples serialization.
   *
   * Tab, newline, carriage return, quote and backslash get their
   * single-character escapes; every other character matched by
   * {@link #escapedCharacterPattern} becomes a 4-digit hexadecimal escape,
   * or an 8-digit one if it is a surrogate pair.
   *
   * @param string  a string to escape, never <code>null</code>
   * @return a version of the <var>string</var> with N-Triples escapes applied
   * @throws IllegalArgumentException if <var>string</var> is <code>null</code>
   */
  public static String escapeLexicalForm(String string)
  {
    // Validate "string" parameter
    if (string == null) {
      throw new IllegalArgumentException("Null \"string\" parameter");
    }

    // Obtain a matcher
    Matcher matcher = escapedCharacterPattern.matcher(string);

    // Try to short-circuit the whole process -- maybe nothing needs escaping?
    if (!matcher.find()) {
      return string;
    }

    // Copy the unescaped stretches verbatim and substitute an escape for
    // each match.  The text is appended directly rather than through
    // Matcher.appendReplacement, so no second level of backslash quoting is
    // needed.
    StringBuffer buffer = new StringBuffer(string.length() + 16);
    int copiedUpTo = 0;
    do {
      buffer.append(string.substring(copiedUpTo, matcher.start()));
      copiedUpTo = matcher.end();

      String match = matcher.group();
      switch (match.length()) {
        case 1: // 16-bit characters requiring escaping
          appendCharacterEscape(buffer, match.charAt(0));
          break;

        case 2: { // surrogate pairs are represented as 8-digit hex escapes
          assert Character.getType(match.charAt(0)) == Character.SURROGATE;
          assert Character.getType(match.charAt(1)) == Character.SURROGATE;

          int codePoint =
              ((match.charAt(0) & 0x3FF) << 10) +  // high surrogate
              (match.charAt(1) & 0x3FF) +          // low surrogate
              MIN_SUPPLEMENTARY_CODE_POINT;        // base codepoint U+10000
          assert codePoint <= MAX_CODE_POINT;

          buffer.append("\\U");
          appendHex(buffer, codePoint, 8);
          break;
        }

        default:
          throw new Error("Escape sequence " + match + " has no handler");
      }
    }
    while (matcher.find());

    // Finish off by appending any remaining text that didn't require escaping,
    // and return the assembled buffer
    buffer.append(string.substring(copiedUpTo));
    return buffer.toString();
  }

  /**
   * Append the N-Triples escape for a single 16-bit character.
   *
   * @param buffer  the buffer to append to
   * @param c  the character to escape
   */
  private static void appendCharacterEscape(StringBuffer buffer, char c)
  {
    switch (c) {
      case '\t': // tab
        buffer.append("\\t");
        break;
      case '\n': // newline
        buffer.append("\\n");
        break;
      case '\r': // carriage return
        buffer.append("\\r");
        break;
      case '"':  // quote
        buffer.append("\\\"");
        break;
      case '\\': // backslash
        buffer.append("\\\\");
        break;
      default:   // other characters use 4-digit hex escapes
        buffer.append("\\u");
        appendHex(buffer, c, 4);
        break;
    }
  }

  /**
   * Append a value as upper-case hexadecimal, left-padded with zeros.
   *
   * @param buffer  the buffer to append to
   * @param value  the non-negative value to format
   * @param width  the minimum number of digits to produce
   */
  private static void appendHex(StringBuffer buffer, int value, int width)
  {
    String hex = Integer.toHexString(value).toUpperCase();
    for (int i = hex.length(); i < width; i++) {
      buffer.append('0');
    }
    buffer.append(hex);
  }

  /**
   * Unescape N-Triples serialization of a lexical form back to unicode.
   *
   * Backslash sequences that {@link #escapePattern} doesn't recognize are
   * left in the result unchanged.
   *
   * @param string  an ASCII string formatted with N-Triples lexical form
   *   escape codes, never <code>null</code>
   * @return a version of the <var>string</var> with N-Triples escapes
   *   evaluated
   * @throws IllegalArgumentException if <var>string</var> is
   *   <code>null</code>, or contains an 8-digit escape naming a value beyond
   *   the last Unicode code point (hexadecimal 10FFFF)
   */
  public static String unescapeLexicalForm(String string)
  {
    // Validate "string" parameter
    if (string == null) {
      throw new IllegalArgumentException("Null \"string\" parameter");
    }

    // Obtain a matcher
    Matcher matcher = escapePattern.matcher(string);

    // Try to short-circuit the whole process -- maybe nothing needs escaping?
    if (!matcher.find()) {
      return string;
    }

    // Copy the literal stretches verbatim and substitute the decoded
    // character for each escape.  The decoded text is appended directly:
    // Matcher.appendReplacement would treat a decoded dollar sign or
    // backslash as replacement syntax and fail.
    StringBuffer buffer = new StringBuffer(string.length());
    int copiedUpTo = 0;
    do {
      buffer.append(string.substring(copiedUpTo, matcher.start()));
      copiedUpTo = matcher.end();

      if (matcher.group(2) != null) {
        switch (matcher.group(2).charAt(0)) {
          case 't':   // tab
            buffer.append('\t');
            break;
          case 'n':   // newline
            buffer.append('\n');
            break;
          case 'r':   // return
            buffer.append('\r');
            break;
          case '"':   // quote
            buffer.append('"');
            break;
          case '\\':  // backslash
            buffer.append('\\');
            break;
          default:
            throw new IllegalStateException(
              "Impossible condition in unescape parsing"
            );
        }
      }
      else if (matcher.group(3) != null) {
        // Exactly 4 hex digits, so the value always fits in a char
        buffer.append((char) Integer.parseInt(matcher.group(3), 16));
      }
      else if (matcher.group(4) != null) {
        // 8 hex digits can exceed the int range, so parse as a long
        long codePoint = Long.parseLong(matcher.group(4), 16);
        if (codePoint > MAX_CODE_POINT) {
          throw new IllegalArgumentException(
            "Escape " + matcher.group() + " is beyond the Unicode range"
          );
        }

        if (codePoint < MIN_SUPPLEMENTARY_CODE_POINT) {
          // The long form was used for a character that fits in 16 bits
          buffer.append((char) codePoint);
        }
        else {
          int offset = (int) codePoint - MIN_SUPPLEMENTARY_CODE_POINT;

          int highSurrogate = 0xD800 + (offset >> 10);
          assert highSurrogate >= 0xD800 && highSurrogate < 0xDC00:
            "Bad high surrogate U+" + Integer.toHexString(highSurrogate);

          int lowSurrogate = 0xDC00 + (offset & 0x3FF);
          assert lowSurrogate >= 0xDC00 && lowSurrogate < 0xE000:
            "Bad low surrogate U+" + Integer.toHexString(lowSurrogate);

          buffer.append((char) highSurrogate).append((char) lowSurrogate);
        }
      }
      else {
        throw new IllegalStateException(
          "Impossible condition in unescape parsing"
        );
      }
    }
    while (matcher.find());

    // Finish off by appending any remaining text that didn't require
    // unescaping, and return the assembled buffer
    buffer.append(string.substring(copiedUpTo));
    return buffer.toString();
  }
}