StatementParser.java example

Explorer
mulgara-master
- src
  - jar
  - war
    - server-http
      - java
        HttpServer.java
        HttpServerServlet.java
- tools
  - src
    - org
      - mulgara
        tools
        Sparql.java
        Tql.java
/*
 * Copyright 2010 Paula Gearon
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.mulgara.content.rdfa;

// Java 2 standard packages
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

import javax.activation.MimeType;

// Third party packages
import org.apache.log4j.Logger;      // Apache Log4J
import org.jrdf.graph.BlankNode;     // JRDF
import org.jrdf.graph.Literal;
import org.jrdf.graph.Node;
import org.jrdf.graph.ObjectNode;
import org.jrdf.graph.PredicateNode;
import org.jrdf.graph.SubjectNode;
import org.jrdf.graph.Triple;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;

import net.rootdev.javardfa.ParserFactory.Format;
import net.rootdev.javardfa.StatementSink;
import net.rootdev.javardfa.ParserFactory;


// Locally written packages
import org.mulgara.content.Content;
import org.mulgara.content.NotModifiedException;
import org.mulgara.parser.MulgaraParserException;
import org.mulgara.query.TuplesException;
import org.mulgara.query.rdf.BlankNodeImpl;
import org.mulgara.query.rdf.LiteralImpl;
import org.mulgara.query.rdf.MimeTypes;
import org.mulgara.query.rdf.TripleImpl;
import org.mulgara.query.rdf.URIReferenceImpl;
import org.mulgara.resolver.spi.ResolverSession;
import org.mulgara.util.IntFile;
import org.mulgara.util.TempDir;

/**
 * Parses an RDFa document and hands the resulting triples to a consumer through
 * a bounded queue.
 * <p>
 * {@link #run()} is the entry point of the parsing (producer) thread. While it
 * runs, the RDFa reader calls back into this object through the
 * {@link StatementSink} methods, which convert the reported strings into nodes
 * and place {@link Triple}s on the queue. The consumer retrieves them with
 * {@link #getTriple()}; the end of the data is marked by {@link #TERMINATOR}.
 * A failure in the parser is recorded and rethrown to the consumer, wrapped in
 * a {@link TuplesException}, from {@link #getTriple()}.
 *
 * @created 2010-08-09
 * @author Paula Gearon
 */
class StatementParser implements Runnable, StatementSink {

  /** Logger. */
  private static final Logger logger = Logger.getLogger(StatementParser.class.getName());

  /** Text prefix for blank nodes. */
  @SuppressWarnings("unused")
  private static final String BLANK_PREFIX = "_:";

  /** The prefix that rdfa-java uses */
  private static final String RJ_PREFIX = "_:node";

  /** The period of time to wait in ms for the parser to provide some data. */
  private static final int TIMEOUT = 30000;

  /**
   * Maximum size for the {@link #triples} buffer. Any larger and the parser will
   * block and drain.
   */
  private static final int BUFFER_SIZE = 1000;

  /**
   * Mapping between parsed blank node IDs and local node numbers.
   * NOTE(review): this temporary file is opened in the constructor and nothing in
   * this class closes or deletes it - confirm whether a caller or TempDir cleans it up.
   */
  private IntFile blankNodeIdMap;

  /**
   * Mapping between blank node IDs and blank-node instances that haven't been stored.
   * Touched by both the parser thread and the consumer thread, so every access
   * is made while holding the monitor of this object.
   */
  private final Map<Long,BlankNodeImpl> blankNodeInstMap = new HashMap<Long,BlankNodeImpl>();

  /** The resolverSession to create new internal identifiers for blank nodes. */
  private ResolverSession resolverSession;

  /** The data to be parsed and its metadata */
  private final Content content;

  /** The stream containing the data to be parsed. */
  private InputStream inputStream;

  /** The parser for the input stream. */
  private XMLReader reader;

  /** The base of the document. */
  private URI base;

  /** Resolves relative URIs and IRIs to absolute URIs/IRIs */
  private BasedResolver parseResolver;

  /** The queue of triples generated by the parser. */
  private final LinkedBlockingQueue<Triple> triples = new LinkedBlockingQueue<Triple>(BUFFER_SIZE);

  /** A marker to indicate that the end of the data has been reached. */
  static final Triple TERMINATOR = new TripleImpl(null, null, null);

  /**
   * The number of parsed statements. Volatile because it is incremented where
   * triples are enqueued and read through {@link #getStatementCount()}, which
   * may be called from a different thread.
   */
  private volatile long statementCount = 0;

  /** Indicates that parsing is complete */
  private volatile boolean finished = false;

  /**
   * Used to asynchronously indicate an exception. Volatile because it is written
   * by one thread and read by another.
   */
  private volatile Throwable exception = null;

  /**
   * Thread used for parsing. This is the producer thread. Volatile because it is
   * set in {@link #run()} and read in {@link #terminate()}.
   */
  private volatile Thread parserThread;

  /**
   * Sets up the sink to start receiving triples.
   * @param content Contains the data for parsing and its metadata.
   * @param resolverSession Access to the database for inserting data.
   * @throws IllegalArgumentException If either parameter is <code>null</code>.
   * @throws NotModifiedException Passed through unchanged from the content when it
   *         reports that it has not been modified.
   * @throws TuplesException If the blank node map, input stream or reader could
   *         not be set up.
   */
  StatementParser(Content content, ResolverSession resolverSession) throws NotModifiedException, TuplesException {
    if (content == null) throw new IllegalArgumentException("Null \"content\" parameter");
    if (resolverSession == null) throw new IllegalArgumentException("Null \"resolverSession\" parameter");

    this.content = content;
    this.resolverSession = resolverSession;
    try {
      this.blankNodeIdMap = IntFile.open(TempDir.createTempFile("rdfaidmap", null), true);
      this.inputStream = content.newInputStream();
      this.base = content.getURI();
      parseResolver = new BasedResolver(content.getURIString());
      reader = ParserFactory.createReaderForFormat(this, getType(content), parseResolver);
    } catch (NotModifiedException e) {
      // This is part of the declared contract: do not bury it in a TuplesException.
      closeInput();
      throw e;
    } catch (Exception e) {
      closeInput();
      throw new TuplesException("Unable to obtain input stream from " + content.getURI(), e);
    }
  }

  /**
   * @return the number of statements parsed so far
   */
  long getStatementCount() {
    return statementCount;
  }

  /**
   * Do the parsing. This is the entry point of the parsing thread.
   * Any failure is recorded for the consumer before the {@link #TERMINATOR} is
   * queued, so that a consumer which sees the terminator can also see the failure.
   */
  public void run() {
    parserThread = Thread.currentThread();
    Throwable failure = null;

    try {
      reader.parse(new InputSource(inputStream));
      if (logger.isDebugEnabled()) logger.debug("Parsed RDFa on " + content.getURI());
    } catch (Throwable th) {
      logger.error("Error parsing RDFa", th);
      failure = th;
    } finally {
      closeInput();
      // Publish the failure first: once the terminator is taken the consumer may never ask again.
      if (failure != null) exception = failure;
      // A pending interrupt would abort the put immediately, so clear it here and restore it below.
      boolean interrupted = Thread.interrupted();
      try {
        triples.put(TERMINATOR);
      } catch (InterruptedException e) {
        interrupted = true;
        logger.error("Error ending RDFa parse", e);
        // Keep the original parse failure if there was one; it is the more useful cause.
        if (exception == null) exception = e;
      }
      finished = true;
      if (interrupted) Thread.currentThread().interrupt();
    }

    if (failure != null && logger.isDebugEnabled()) logger.debug("Exception while parsing RDFa", exception);
  }

  /** Callback from the parser at the start of a document. Only logs. */
  public void start() {
    if (logger.isDebugEnabled()) logger.debug("Started RDFa document");
  }

  /** Callback from the parser at the end of a document. Marks the parse as finished. */
  public void end() {
    if (logger.isDebugEnabled()) logger.debug("End RDFa document");
    finished = true;
  }

  /**
   * Callback from the parser for a prefix declaration. The prefix is only logged.
   * @param prefix The declared prefix.
   * @param uri The namespace the prefix is bound to.
   */
  public void addPrefix(String prefix, String uri) {
    if (logger.isDebugEnabled()) logger.debug("@prefix " + prefix + ": <" + uri + "> .");
  }

  /**
   * Callback from the parser for a new document base. A <code>null</code> base is
   * ignored, and a base rejected by the resolver is logged and otherwise ignored.
   * @param base The new base for resolving relative references.
   */
  public void setBase(String base) {
    try {
      if (base != null) parseResolver.setBase(base);
    } catch (IllegalArgumentException e) {
      logger.warn("Invalid base in RDFa file: " + base);
    }
  }

  /**
   * Adds an triple with a Literal as the object.
   * A triple whose parts cannot be converted is logged and dropped.
   * @param subject string form of the subject.
   * @param predicate string form of the predicate.
   * @param lex The lexical form of the literal in the object.
   * @param lang The language code of the literal in the object. May be <code>null</code>.
   * @param datatype The datatype of the literal in the object. May be <code>null</code>.
   */
  public void addLiteral(String subject, String predicate, String lex, String lang, String datatype) {
    try {
      enqueue((SubjectNode)toNode(subject), (PredicateNode)toNode(predicate), toLiteral(lex, lang, datatype));
    } catch (MulgaraParserException e) {
      // Pass the exception as well so that the cause is not lost from the log.
      logger.error("Unable to parse. " + e.getMessage(), e);
    }
  }


  /**
   * Adds an triple with a URI or blank node as the object.
   * A triple whose parts cannot be converted is logged and dropped.
   * @param subject string form of the subject.
   * @param predicate string form of the predicate.
   * @param object string form of the object.
   */
  public void addObject(String subject, String predicate, String object) {
    try {
      enqueue((SubjectNode)toNode(subject), (PredicateNode)toNode(predicate), (ObjectNode)toNode(object));
    } catch (MulgaraParserException e) {
      // Pass the exception as well so that the cause is not lost from the log.
      logger.error("Unable to parse. " + e.getMessage(), e);
    }
  }


  /**
   * Add a parsed triple to the queue. Blocks while the queue is full.
   * @param subjectNode The subject of the triple.
   * @param predicateNode The predicate of the triple.
   * @param objectNode The object of the triple.
   * @throws RuntimeException If the thread is interrupted while waiting for space
   *         in the queue. The interrupt status is restored before throwing.
   */
  void enqueue(SubjectNode subjectNode, PredicateNode predicateNode, ObjectNode objectNode) {
    if (logger.isDebugEnabled()) {
      logger.debug("Parsed " + subjectNode + " " + predicateNode + " " + objectNode + " from " + content.getURI());
    }

    try {
      triples.put(new TripleImpl(subjectNode, predicateNode, objectNode));
    } catch (InterruptedException e) {
      // Keep the interrupt visible so that any later attempt to enqueue also stops promptly.
      Thread.currentThread().interrupt();
      throw new RuntimeException("Unable to record parsed triple", e);
    }
    statementCount++;
  }

  /**
   * Convert the text reported by the parser into a node.
   *
   * @param text The text of the node that was parsed. A <code>null</code> value
   *        is mapped to the URI of the content.
   * @return a {@link Node} formed from the text
   * @throws MulgaraParserException The text is neither a valid URI nor a
   *         recognized blank node label.
   */
  private Node toNode(String text) throws MulgaraParserException {
    if (text == null) return new URIReferenceImpl(base);

    if (text.startsWith(RJ_PREFIX)) return getBlankNode(text);
    return toUri(text);
  }

  /**
   * Creates a URIReference out of a string.
   * @param text The string to convert.
   * @return A new URIReference containing the URI from the string.
   * @throws MulgaraParserException The text was not a valid URI.
   */
  private Node toUri(String text) throws MulgaraParserException {
    try {
      return new URIReferenceImpl(new URI(text));
    } catch (URISyntaxException e) {
      throw new MulgaraParserException("Invalid URI: " + text, e);
    }
  }

  /**
   * Create a blank node from a blank node label of the form used by the parser.
   *
   * @param n The label to convert to an anonymous node.
   * @return An anonymous node that the label maps to.
   * @throws MulgaraParserException The label does not end in a non-negative number.
   */
  private BlankNode getBlankNode(String n) throws MulgaraParserException {
    long anonId;
    try {
      anonId = Long.parseLong(n.substring(RJ_PREFIX.length()));
    } catch (NumberFormatException nfe) {
      throw new MulgaraParserException("Invalid blank node: " + n, nfe);
    }
    if (anonId < 0) throw new MulgaraParserException("Unexpected blank node format: " + n);

    synchronized (this) {
      // look up the id in the blank node map
      long internalId = blankNodeIdMap.getLong(anonId);

      // check if the node was found
      BlankNodeImpl blankNode;
      if (internalId == 0) {
        // Not allocated yet: reuse the pending instance for this label, or create one
        blankNode = blankNodeInstMap.get(anonId);
        if (blankNode == null) {
          blankNode = new BlankNodeImpl();
          blankNodeInstMap.put(anonId, blankNode);
        }
      } else {
        // Found the ID, so need to recreate the anonymous resource for it
        blankNode = new BlankNodeImpl(internalId);
      }

      return blankNode;
    }
  }

  /**
   * Creates a literal out of three components.
   * @param text The lexical value of the literal.
   * @param lang The language code of the literal, or <code>null</code> if not an
   *        untyped literal with a language code.
   * @param datatype The URI of the datatype of the literal, or <code>null</code>
   *        if an untyped literal.
   * @return A new literal.
   * @throws MulgaraParserException The datatype was not a valid URI.
   */
  Literal toLiteral(String text, String lang, String datatype) throws MulgaraParserException {
    if (datatype != null) {
      assert lang == null;
      try {
        return new LiteralImpl(text, new URI(datatype));
      } catch (URISyntaxException e) {
        throw new MulgaraParserException("Invalid datatype on literal: " + text + "^^" + datatype, e);
      }
    }
    if (lang != null) return new LiteralImpl(text, lang);
    return new LiteralImpl(text);
  }

  /**
   * If an exception occurred in the parser, throws a TuplesException that
   * wraps the exception.
   */
  private void checkForException() throws TuplesException {
    Throwable cause = exception;
    if (cause != null) {
      throw new TuplesException("Exception while reading " + content.getURIString(), cause);
    }
  }


  /**
   * Returns the next triple from the queue, waiting up to {@link #TIMEOUT}
   * milliseconds for one to arrive.
   * @return The oldest triple in the queue, or {@link #TERMINATOR} if there are
   *         no more triples.
   * @throws TuplesException If the parser recorded a failure, if no data arrived
   *         within the timeout, or if the wait was interrupted.
   */
  Triple getTriple() throws TuplesException {
    checkForException();
    allocateBlankNodes();
    Triple result;
    try {
      result = triples.poll(TIMEOUT, TimeUnit.MILLISECONDS);
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      throw new TuplesException("Unable to retrieve data from the parser", e);
    }
    if (result == null) throw new TuplesException("Timeout waiting for data from parser");
    // The terminator may be the result of a failed parse, so do not report a normal end in that case.
    if (result == TERMINATOR) checkForException();
    return result;
  }

  /**
   * Allocate the ids for the new blank nodes.
   * Holds the same lock as {@link #getBlankNode(String)}, since the parser thread
   * may be adding to the pending map while this runs.
   */
  private void allocateBlankNodes() {
    synchronized (this) {
      try {
        for (Map.Entry<Long, BlankNodeImpl> entry : blankNodeInstMap.entrySet()) {
          resolverSession.localize(entry.getValue());     // This sets and returns the node ID
          blankNodeIdMap.putLong(entry.getKey(), entry.getValue().getNodeId());
        }
        blankNodeInstMap.clear();

      } catch (Exception le) {
        throw new RuntimeException("Unable to create blank node", le);
      }
    }
  }

  /**
   * Stops the thread. Interrupts the parser thread if it is running, discards any
   * queued triples and queues a {@link #TERMINATOR}.
   */
  void terminate() {
    finished = true;
    Thread producer = parserThread;
    if (producer != null && producer.isAlive()) producer.interrupt();
    triples.clear();
    try {
      triples.put(TERMINATOR);
    } catch (InterruptedException e) {
      exception = e;
      Thread.currentThread().interrupt();
    }
  }

  /**
   * Tests if the parse is complete.
   * @return <code>true</code> if parsing is over.
   */
  boolean isFinished() {
    return finished;
  }

  /**
   * Determine the type of parsing to be done, based on the content.
   * The MIME type is checked first, then the extension of the location, and
   * HTML is assumed if neither identifies the format.
   * @param c The Content to be parsed.
   * @return Either <code>Format.XHTML</code> or <code>Format.HTML</code>.
   * @throws NotModifiedException Declared because the content accessors used here
   *         may throw it.
   */
  private Format getType(Content c) throws NotModifiedException {
    MimeType t = c.getContentType();
    if (t != null) {
      if (MimeTypes.APPLICATION_XHTML.match(t)) return Format.XHTML;
      if (MimeTypes.TEXT_HTML.match(t)) return Format.HTML;
    }
    String loc = c.getURIString();
    if (loc != null) {
      if (loc.endsWith(RdfaContentHandler.XHTML_EXT)) return Format.XHTML;
      if (loc.endsWith(RdfaContentHandler.HTML_EXT)) return Format.HTML;
    }
    logger.warn("Guessing HTML for unknown MIME type: " + t);
    return Format.HTML;
  }

  /**
   * Closes the input stream, if one was opened. A failure to close is logged
   * and otherwise ignored, since there is nothing further to be done with it.
   */
  private void closeInput() {
    if (inputStream == null) return;
    try {
      inputStream.close();
    } catch (IOException e) {
      logger.warn("Unable to close RDFa input from " + content.getURI(), e);
    }
  }
}