Parser.java example

Explorer
mulgara-master
- src
  - jar
  - war
    - server-http
      - java
        HttpServer.java
        HttpServerServlet.java
- tools
  - src
    - org
      - mulgara
        tools
        Sparql.java
        Tql.java
/*
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is the Kowari Metadata Store.
 *
 * The Initial Developer of the Original Code is Plugged In Software Pty
 * Ltd (http://www.pisoftware.com, mailto:info@pisoftware.com). Portions
 * created by Plugged In Software Pty Ltd are Copyright (C) 2001,2002
 * Plugged In Software Pty Ltd. All Rights Reserved.
 *
 * Contributor(s): N/A.
 *
 * [NOTE: The text of this Exhibit A may differ slightly from the text
 * of the notices in the Source Code files of the Original Code. You
 * should use the text of this Exhibit A rather than the text found in the
 * Original Code Source Code for Your Modifications.]
 *
 */

package org.mulgara.content.n3;

// Java 2 standard packages
import java.io.InputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;

// Third party packages
import antlr.collections.AST;        // ANTLR compiler-compiler
import com.hp.hpl.jena.n3.N3Parser;  // Jena
import com.hp.hpl.jena.n3.N3ParserEventHandler;
import org.apache.log4j.Logger;      // Apache Log4J
import org.jrdf.graph.BlankNode;     // JRDF
import org.jrdf.graph.Node;
import org.jrdf.graph.ObjectNode;
import org.jrdf.graph.PredicateNode;
import org.jrdf.graph.SubjectNode;
import org.jrdf.graph.Triple;
import org.jrdf.graph.URIReference;
import org.jrdf.vocabulary.OWL;
import org.jrdf.vocabulary.RDF;

// Locally written packages
import org.mulgara.content.Content;
import org.mulgara.content.NotModifiedException;
import org.mulgara.parser.MulgaraParserException;
import org.mulgara.query.TuplesException;
import org.mulgara.query.rdf.BlankNodeImpl;
import org.mulgara.query.rdf.LiteralImpl;
import org.mulgara.query.rdf.Mulgara;
import org.mulgara.query.rdf.TripleImpl;
import org.mulgara.query.rdf.URIReferenceImpl;
import org.mulgara.resolver.spi.LocalizeException;
import org.mulgara.resolver.spi.ResolverSession;
import org.mulgara.util.IntFile;
import org.mulgara.util.NumberUtil;
import org.mulgara.util.StringToLongMap;
import org.mulgara.util.TempDir;

/**
 * <p>This class parses N3 data. It is implemented as a {@link Runnable} so that it can run in
 * the background filling a queue, while a consumer thread drains the queue.</p>
 *
 * <p>Because ResolverSession (and the underlying StringPoolSession) may not be accessed
 * concurrently from multiple threads, there is some extra complication when creating blank nodes,
 * whereby blank-node instances are created in the parser thread but their id's are allocated later
 * in the app-thread.</p>
 *
 * @created 2004-04-02
 * @author <a href="http://staff.pisoftware.com/anewman">Andrew Newman</a>
 * @author <a href="http://staff.pisoftware.com/davidm">David Makepeace</a>
 * @author <a href="http://staff.pisoftware.com/raboczi">Simon Raboczi</a>
 * @copyright © 2004 <a href="http://www.PIsoftware.com/">Plugged In Software Pty Ltd</a>
 */
class Parser extends Thread implements N3ParserEventHandler {
  /** Logger. */
  private static final Logger logger = Logger.getLogger(Parser.class.getName());

  /** Prefix that marks a label as a blank node (see {@link #isAnonymous}). */
  private static final String ANON_TAG = "_:";

  /** Longer blank node prefix which {@link #parseAnonId} strips before reading a numeric ID. */
  private static final String LOCAL_ANON_TAG = ANON_TAG + "node";

  /**
   * Maximum size that the {@link #triples} buffer can attain without the
   * parser deliberately blocking and waiting for it to drain.
   */
  private static final int MAX_TRIPLES = 1000;

  /**
   * Mapping between parsed blank node IDs and local node numbers.
   *
   * NOTE(review): this file-backed map is opened in the constructor but is never
   * closed or deleted by this class -- confirm whether the owner is expected to
   * clean it up.
   */
  private IntFile blankNodeIdMap;

  /** Mapping between blank node rdf:nodeIDs and local node numbers. */
  private StringToLongMap blankNodeNameMap;

  /** Mapping between blank node IDs and blank-node instances which have no node number yet. */
  private final Map<Long, BlankNodeImpl> unallocBlankNodeIdMap = new HashMap<Long, BlankNodeImpl>();

  /** Mapping between blank node rdf:nodeIDs and blank-node instances which have no node number yet. */
  private final Map<String, BlankNodeImpl> unallocBlankNodeNameMap = new HashMap<String, BlankNodeImpl>();

  /** The resolverSession to create new internal identifiers for blank nodes. */
  private final ResolverSession resolverSession;

  /** The stream containing the data to be parsed. */
  private final InputStream inputStream;

  /** The queue of triples generated by the Notation-3 parser. */
  private final LinkedList<Triple> triples = new LinkedList<Triple>();

  /**
   * The number of statements parsed so far.
   *
   * When {@link #complete} is <code>true</code>, this will be the number of
   * statements in the Notation-3 document.
   */
  private long statementCount = 0;

  /**
   * <code>true</code> if statementCount is the count of the total number of statements in
   * the entire file because the parser has reached the end of the file without error.
   */
  private boolean statementCountIsTotal = false;

  /** Flag used to indicate that the end of the N3 file has been reached. */
  private boolean complete = false;

  /** The exception which interrupted parsing, or <code>null</code> if parsing is successful. */
  private Throwable exception = null;

  /**
   * The base URI from which the {@link #inputStream} came and where any
   * relative URI references within the stream should be resolved to absolute
   * form.
   *
   * The constructor falls back to {@link Mulgara#NAMESPACE} when the content
   * has no URI of its own.
   */
  private final URI baseURI;

  /**
   * Map of <code>prefix</code> directives.
   *
   * Keys are {@link String}s of the form <code>p3p:</code>.
   * Values are also {@link String}s, and of the form
   * <code>http://www.example.org/meeting_organization#</code>.
   */
  private final Map<String,String> prefixMap = new HashMap<String,String>();

  //
  // Constructor
  //

  /**
   * Sole constructor.
   *
   * @param content  the content to parse; never <code>null</code>
   * @param resolverSession  session used to allocate blank node identifiers; never <code>null</code>
   * @throws IllegalArgumentException if either parameter is <code>null</code>
   * @throws NotModifiedException declared for the call which opens the content's stream
   * @throws TuplesException if the input stream or the blank node maps can't be set up
   */
  Parser(Content content, ResolverSession resolverSession) throws NotModifiedException, TuplesException {
    // Validate parameters
    if (content == null) throw new IllegalArgumentException("Null \"content\" parameter");
    if (resolverSession == null) throw new IllegalArgumentException("Null \"resolverSession\" parameter");

    // Initialize fields
    this.resolverSession = resolverSession;
    this.baseURI = content.getURI() != null ? content.getURI() : URI.create(Mulgara.NAMESPACE);

    InputStream in = null;
    try {
      // Open the stream before creating any temporary files, so that a failure
      // to obtain the content doesn't leave an orphaned blank node map behind.
      in = content.newInputStream();
      this.blankNodeIdMap = IntFile.open(TempDir.createTempFile("n3idmap", null), true);
      this.blankNodeNameMap = new StringToLongMap();
    } catch (IOException e) {
      if (in == null) {
        throw new TuplesException("Unable to obtain input stream from " + baseURI, e);
      }
      // The stream was opened but the maps couldn't be built: don't leak the stream.
      try {
        in.close();
      } catch (IOException ignored) {
        // The original failure is the one worth reporting.
      }
      throw new TuplesException("Unable to create blank node maps for " + baseURI, e);
    }
    this.inputStream = in;
  }

  /**
   * @return the number of statements parsed so far
   * @throws TuplesException if the parser has failed
   */
  synchronized long getStatementCount() throws TuplesException {
    checkForException();
    return statementCount;
  }

  /**
   * Blocks until parsing has finished, discarding buffered triples as they arrive.
   *
   * @return the total number of statements in the file
   * @throws TuplesException if the parser has failed, or the calling thread is interrupted
   */
  synchronized long waitForStatementTotal() throws TuplesException {
    while (!complete) {
      checkForException();

      // Keep the LinkedList drained.
      triples.clear();
      unallocBlankNodeIdMap.clear();
      unallocBlankNodeNameMap.clear();
      notifyAll();

      try {
        wait();
      } catch (InterruptedException ex) {
        // Restore the interrupt status for the caller and keep the cause.
        Thread.currentThread().interrupt();
        throw new TuplesException("Abort", ex);
      }
    }
    checkForException();
    assert statementCountIsTotal;
    return statementCount;
  }

  /**
   * Returns true if getStatementCount() would return the total number
   * of statements in the file.
   *
   * @throws TuplesException if the parser has failed
   */
  synchronized boolean isStatementCountTotal() throws TuplesException {
    checkForException();
    return statementCountIsTotal;
  }

  //
  // Method implementing Runnable
  //

  /**
   * Parses the whole input stream, then records the outcome, wakes every
   * waiting thread and closes the stream.
   */
  @Override
  public void run() {
    Throwable t = null;

    try {
      (new N3Parser(inputStream, this)).parse();
      if (logger.isDebugEnabled()) logger.debug("Parsed Notation-3");
      return;
    } catch (Throwable th) {
      t = th;
    } finally {
      synchronized (this) {
        if (t != null) {
          exception = t;
        } else if (exception == null) {
          // End of file has been reached without error.
          statementCountIsTotal = true;
        }
        complete = true;
        notifyAll();
      }
      // Close only after the state is published, so a failing close can't
      // leave consumers blocked.
      closeInputStream();
    }

    if (logger.isDebugEnabled()) logger.debug("Exception while parsing N3", exception);
  }

  //
  // Methods implementing N3ParserEventHandler
  //

  /** Resets the prefix map at the start of a document. */
  public void startDocument() {
    if (logger.isDebugEnabled()) logger.debug("Start N3 document");
    prefixMap.clear();
  }

  /** Logs the end of the document. */
  public void endDocument() {
    if (logger.isDebugEnabled()) logger.debug("End N3 document");
  }

  /**
   * Reports an error raised by the N3 parser.
   *
   * The error is logged at warning level so that it is visible in a default
   * logging configuration; it does not stop the parse.
   */
  public void error(Exception ex, String message) {
    logger.warn("Error parsing N3 from " + baseURI + ": " + message, ex);
  }

  /** Logs the start of a formula. */
  public void startFormula(int line, String context) {
    if (logger.isDebugEnabled()) logger.debug("Start formula " + context);
  }

  /** Logs the end of a formula. */
  public void endFormula(int line, String context) {
    if (logger.isDebugEnabled()) logger.debug("End formula " + context);
  }

  /**
   * Converts a parsed statement into a {@link Triple} and queues it, blocking
   * while the queue holds {@link #MAX_TRIPLES} entries.
   *
   * A statement whose terms raise a {@link MulgaraParserException} is logged
   * and skipped.
   *
   * @throws RuntimeException if the thread is interrupted while waiting for the queue to drain
   */
  public void quad(int    line,
                   AST    subj,
                   AST    pred,
                   AST    obj,
                   String context) {

    if (logger.isDebugEnabled()) {
      logger.debug("Parsing " + subj + " " + pred + " " + obj + " from " + baseURI);
    }

    // convert the triple components to JRDF Nodes
    SubjectNode   subjectNode   = null;
    PredicateNode predicateNode = null;
    ObjectNode    objectNode    = null;
    try {
      subjectNode   = (SubjectNode)   toNode(subj);
      predicateNode = (PredicateNode) toNode(pred);
      objectNode    = (ObjectNode)    toNode(obj);
    } catch (MulgaraParserException e) {
      logger.error("Unable to parse at line " + line + ": " + e.getMessage());
      return;
    }

    if (logger.isDebugEnabled()) {
      logger.debug("Parsed " + subjectNode + " " + predicateNode + " " + objectNode + " from " + baseURI);
    }

    synchronized (this) {
      // Wait for the triples buffer to drain if it's too full
      while (triples.size() >= MAX_TRIPLES) {
        try {
          wait();
        } catch (InterruptedException ex) {
          // Restore the interrupt status and keep the cause.
          Thread.currentThread().interrupt();
          throw new RuntimeException("Abort", ex);
        }
      }

      // Buffer the statement
      triples.addLast(new TripleImpl(subjectNode, predicateNode, objectNode));
      statementCount++;
      notifyAll();
    }
  }

  /**
   * Records <code>@prefix</code> directives in {@link #prefixMap}; any other
   * directive is logged and ignored.
   */
  public void directive(int line, AST directive, AST[] args, String context) {
    switch (directive.getType()) {
    case N3Parser.AT_PREFIX:
      assert args.length == 2;
      assert args[0].getType() == N3Parser.QNAME;
      assert args[1].getType() == N3Parser.URIREF;
      prefixMap.put(args[0].toString(), args[1].toString());
      return;

    default:
      // A directive may carry no arguments, so only describe the first one if it exists.
      String firstArgType = (args != null && args.length > 0 && args[0] != null)
                          ? String.valueOf(args[0].getType())
                          : "none";
      logger.warn(
        "Ignoring directive at line " + line +
        ": directive=" + directive + " (type " + directive.getType() + ") " +
        "args=" + (args == null ? "null" : Arrays.asList(args).toString()) + " (type " + firstArgType + ") " +
        "context=" + context
      );
    }
  }

  //
  // Internal methods
  //

  /**
   * Convert and validate an AST object into a node.
   *
   * @param ast  The AST object to convert.
   * @return a {@link Node} matching the AST object.
   * @throws MulgaraParserException An unhandled element was encountered.
   * @throws IllegalArgumentException if <var>ast</var> is <code>null</code>
   */
  private Node toNode(AST ast) throws MulgaraParserException {
    if (ast == null) throw new IllegalArgumentException("Unable to load NULL nodes");

    switch (ast.getType()) {
      case N3Parser.LITERAL:

        // check if this is a literal type
        URI datatype = null;
        String lang = null;

        // get any modifiers, which follow the literal as siblings
        AST a1 = ast.getNextSibling();
        AST a2 = (a1 == null ? null : a1.getNextSibling());

        // find the language
        lang = getLang(a1);
        if (lang == null) lang = getLang(a2);
        if (lang == null) lang = "";

        // find the datatype
        datatype = getDatatype(a1);
        if (datatype == null) datatype = getDatatype(a2);

        // a datatype takes precedence over a language tag
        if (datatype == null) {
          return new LiteralImpl(ast.toString(), lang);
        } else {
          return new LiteralImpl(ast.toString(), datatype);
        }

      case N3Parser.NUMBER:
        datatype = NumberUtil.getXSD(NumberUtil.parseNumber(ast.toString()));
        return new LiteralImpl(ast.toString(), datatype);

      case N3Parser.ANON:
        return getBlankNode(ast);
      case N3Parser.QNAME:
        String s = ast.toString();
        if (isAnonymous(ast)) {
          return getBlankNode(ast);
        } else {
          // expand the prefix (which includes the trailing colon) via the @prefix directives
          int colonIndex = s.indexOf(':');
          assert colonIndex != -1;
          String qnamePrefix = s.substring(0, colonIndex + 1);
          String uriPrefix = prefixMap.get(qnamePrefix);
          if (uriPrefix == null) throw new RuntimeException("No @prefix for " + s);
          return toURIReference(uriPrefix + s.substring(colonIndex + 1));
        }
      case N3Parser.URIREF:
        return toURIReference(ast.toString());
      case N3Parser.KW_A:
        return toURIReference(RDF.TYPE);
      case N3Parser.TK_LIST_FIRST:
        return toURIReference(RDF.FIRST);
      case N3Parser.TK_LIST_REST:
        return toURIReference(RDF.REST);
      case N3Parser.TK_LIST_NIL:
        return toURIReference(RDF.NIL);
      case N3Parser.TK_LIST:
        return toURIReference(RDF.LIST);
      case N3Parser.EQUAL:
        return toURIReference(OWL.SAME_AS);
      case N3Parser.FORMULA:
        throw new MulgaraParserException("Formulas are not supported");
      default:
        throw new Error("Unsupported N3 parser token type: " + ast.getType());
    }
  }

  /**
   * Builds a URI reference from a string.
   *
   * @param string  the text of the URI
   * @return the matching URI reference
   * @throws RuntimeException if <var>string</var> is not a valid URI
   */
  private URIReference toURIReference(String string) {
    try {
      return toURIReference(new URI(string));
    } catch (URISyntaxException e) {
      throw new RuntimeException("Invalid URI reference generated", e);
    }
  }

  /**
   * Builds a URI reference from a URI, resolving a non-absolute URI against
   * {@link #baseURI} first.
   *
   * @param u  the URI to wrap
   * @return the matching URI reference
   */
  private URIReference toURIReference(URI u) {
    if (!u.isAbsolute() && baseURI != null) u = baseURI.resolve(u);
    return new URIReferenceImpl(u);
  }

  /**
   * Tests if a node is anonymous.
   *
   * This is done by looking for the {@link #ANON_TAG} prefix.
   *
   * @param node The node to test.
   * @return <code>true</code> if the node is anonymous.
   */
  private boolean isAnonymous(AST node) {
    return node.toString().startsWith(ANON_TAG);
  }

  /**
   * Create a blank node from an AST object.
   *
   * A label which already has a node number yields a fresh instance carrying
   * that number. Otherwise the same not-yet-allocated instance is handed out
   * each time, until {@link #allocateBlankNodes} assigns its number.
   *
   * @param n The AST node to convert to an anonymous node.
   * @return An anonymous node that the AST node maps to.
   * @throws RuntimeException if the blank node maps can't be read
   */
  private BlankNode getBlankNode(AST n) {
    // this is anonymous, so parse its ID; a negative ID means "look it up by name"
    long anonId = parseAnonId(n);
    boolean numbered = anonId >= 0;
    String anonIdStr = numbered ? null : n.toString();
    try {
      synchronized (this) {
        // look up the id in the blank node maps
        long resourceNodeId = numbered ? blankNodeIdMap.getLong(anonId)
                                       : blankNodeNameMap.get(anonIdStr);

        // Found the ID, so need to recreate the anonymous resource for it
        if (resourceNodeId != 0) return new BlankNodeImpl(resourceNodeId);

        // no node number yet, so check whether an instance is already pending
        BlankNodeImpl blankNode = numbered ? unallocBlankNodeIdMap.get(anonId)
                                           : unallocBlankNodeNameMap.get(anonIdStr);

        if (blankNode == null) {
          // need a new anonymous node for this ID
          blankNode = new BlankNodeImpl();
          // need to put this node into a map
          if (numbered) {
            unallocBlankNodeIdMap.put(anonId, blankNode);
          } else {
            unallocBlankNodeNameMap.put(anonIdStr, blankNode);
          }
        }

        return blankNode;
      }
    } catch (IOException e) {
      throw new RuntimeException("Couldn't generate anonymous resource", e);
    }
  }

  /**
   * Parse out the node ID used by a blank node.
   *
   * @param node The node to get the ID from.
   * @return The number part of the node, or -1 if the label doesn't start
   *   with {@link #ANON_TAG} or has no parsable number after the prefix.
   */
  private long parseAnonId(AST node) {
    String str = node.toString();
    if (!str.startsWith(ANON_TAG)) return -1;
    int startPoint = str.startsWith(LOCAL_ANON_TAG) ? LOCAL_ANON_TAG.length() : ANON_TAG.length();
    try {
      return Long.parseLong(str.substring(startPoint));
    } catch (NumberFormatException nfe) {
      // not a numbered label
      return -1;
    }
  }

  /**
   * Get the language of a node.
   *
   * @param a node to test for language.  May be null.
   * @return The string representing the language, or <code>null</code> if this
   *   is not available.
   */
  private String getLang(AST a) {
    // empty nodes have no info
    if (a == null) return null;
    // the first character of the token text is dropped
    return a.getType() == N3Parser.AT_LANG ? a.getText().substring(1) : null;
  }

  /**
   * Get the type of a node.
   *
   * @param a node to test for type.  May be null.
   * @return The URI representing the type, or <code>null</code> if this is not
   *   available.
   */
  private URI getDatatype(AST a) {
    // empty nodes have no info
    if (a == null) return null;
    // check if this is a datatype node
    if (a.getType() != N3Parser.DATATYPE) return null;

    // get the datatype details
    AST dt = a.getFirstChild();
    try {
      if (dt == null) return null;
      String uri = dt.toString();
      // check for QName
      int colonIndex = uri.indexOf(':');
      // relative URI, so just return
      if (colonIndex == -1) return new URI(uri);

      // look for possible prefix
      String qnamePrefix = uri.substring(0, colonIndex + 1);
      String uriPrefix = prefixMap.get(qnamePrefix);
      // if known prefix, then use it, otherwise just return the string as a URI
      return uriPrefix == null ? new URI(uri) : new URI(uriPrefix + uri.substring(colonIndex + 1));
    } catch (URISyntaxException e) {
      logger.warn("Error parsing N3 datatype: " + dt.toString(), e);
      return null;
    }
  }

  /**
   * If an exception occurred in the parser, throws a TuplesException that
   * wraps the exception.
   */
  private void checkForException() throws TuplesException {
    if (exception != null) {
      throw new TuplesException("Exception while reading " + baseURI, exception);
    }
  }

  /**
   * Closes the input stream, logging rather than propagating any failure.
   */
  private void closeInputStream() {
    try {
      inputStream.close();
    } catch (IOException e) {
      logger.warn("Unable to close input stream from " + baseURI, e);
    }
  }

  /**
   * Returns a new triple from the queue or null if there are no more triples.
   *
   * @throws TuplesException if the parser has failed, or the calling thread is
   *   interrupted while waiting for a triple
   */
  synchronized Triple getTriple() throws TuplesException {
    while (triples.isEmpty()) {
      checkForException();
      if (complete) {
        // No more triples.
        return null;
      }

      // Wait for more triples.
      try {
        wait();
      } catch (InterruptedException ex) {
        // Restore the interrupt status for the caller and keep the cause.
        Thread.currentThread().interrupt();
        throw new TuplesException("Abort", ex);
      }
    }
    checkForException();
    // Blank node ids are allocated here, on the consumer's thread.
    allocateBlankNodes();

    notifyAll();
    return triples.removeFirst();
  }

  /**
   * Allocate the ids for the new blank nodes.
   *
   * @throws RuntimeException if a node can't be localized or a map can't be written
   */
  private synchronized void allocateBlankNodes() {
    try {
      for (Map.Entry<Long, BlankNodeImpl> entry : unallocBlankNodeIdMap.entrySet()) {
        resolverSession.localize(entry.getValue());     // This sets and returns the node ID
        blankNodeIdMap.putLong(entry.getKey(), entry.getValue().getNodeId());
      }
      unallocBlankNodeIdMap.clear();

      for (Map.Entry<String, BlankNodeImpl> entry : unallocBlankNodeNameMap.entrySet()) {
        resolverSession.localize(entry.getValue());     // This sets and returns the node ID
        blankNodeNameMap.put(entry.getKey(), entry.getValue().getNodeId());
      }
      unallocBlankNodeNameMap.clear();
    } catch (LocalizeException le) {
      throw new RuntimeException("Unable to create blank node", le);
    } catch (IOException ioe) {
      throw new RuntimeException("Unable to create blank node", ioe);
    }
  }

  /**
   * Stops the thread.
   */
  synchronized void abort() {
    interrupt();

    // Clear the triples list and notify, in case the parsing thread is blocked
    // in quad() on the list being MAX_TRIPLES in size.
    triples.clear();
    notifyAll();
  }

}