/*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is the Kowari Metadata Store.
*
* The Initial Developer of the Original Code is Plugged In Software Pty
* Ltd (http://www.pisoftware.com, mailto:info@pisoftware.com). Portions
* created by Plugged In Software Pty Ltd are Copyright (C) 2001,2002
* Plugged In Software Pty Ltd. All Rights Reserved.
*
* Contributor(s): N/A.
*
* [NOTE: The text of this Exhibit A may differ slightly from the text
* of the notices in the Source Code files of the Original Code. You
* should use the text of this Exhibit A rather than the text found in the
* Original Code Source Code for Your Modifications.]
*
*/
package org.mulgara.content.rdfxml;
// Java 2 standard packages
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.LinkedList;
import javax.activation.MimeType;
// logging
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
// JRDF
import org.jrdf.graph.BlankNode;
import org.jrdf.graph.Literal;
import org.jrdf.graph.Node;
// local
import org.mulgara.content.Content;
import org.mulgara.content.NotModifiedException;
import org.mulgara.query.TuplesException;
import org.mulgara.query.rdf.BlankNodeImpl;
import org.mulgara.query.rdf.LiteralImpl;
import org.mulgara.query.rdf.Mulgara;
import org.mulgara.query.rdf.URIReferenceImpl;
import org.mulgara.resolver.spi.LocalizeException;
import org.mulgara.resolver.spi.ResolverSession;
import org.mulgara.util.IntFile;
import org.mulgara.util.StringToLongMap;
import org.mulgara.util.TempDir;
// XML
import org.xml.sax.ErrorHandler;
import org.xml.sax.SAXParseException;
// Jena
import com.hp.hpl.jena.rdf.arp.ALiteral;
import com.hp.hpl.jena.rdf.arp.ARP;
import com.hp.hpl.jena.rdf.arp.AResource;
import com.hp.hpl.jena.rdf.arp.StatementHandler;
/**
 * A {@link Thread} which parses an RDF/XML stream using ARP, localizes every
 * parsed statement through a {@link ResolverSession} and queues the resulting
 * triples in buffers, from which they are retrieved via {@link #getTriple}.
 *
 * The thread acts as the producer: ARP invokes the {@link StatementHandler}
 * callbacks during {@link #run}, which fill buffers of {@link #BUFFER_SIZE}
 * triples and append them to a bounded queue.  The producer blocks once
 * {@link #QUEUE_MAX_BUFFERS} buffers are waiting.  All waiting and
 * notification is done on this object's monitor.
 *
 * @created 2004-04-02
 * @author <a href="http://staff.pisoftware.com/davidm">David Makepeace</a>
 * @author <a href="http://staff.pisoftware.com/raboczi">Simon Raboczi</a>
 * @version $Revision: 1.8 $
 * @modified $Date: 2005/01/05 04:58:02 $ @maintenanceAuthor $Author: newmana $
 * @company <a href="mailto:info@PIsoftware.com">Plugged In Software</a>
 * @copyright &copy; 2004 <a href="http://www.PIsoftware.com/">Plugged In
 *   Software Pty Ltd</a>
 * @licence <a href="{@docRoot}/../../LICENCE">Mozilla Public License v1.1</a>
 */
class Parser extends Thread implements ErrorHandler, StatementHandler {

  /** Logger. */
  private static final Logger logger =
      Logger.getLogger(Parser.class.getName());

  /** The number of triples collected into a buffer before it is queued. */
  private static final int BUFFER_SIZE = 1000;

  /**
   * Maximum size that {@link #queue} can attain without the
   * parser deliberately blocking and waiting for it to drain.
   */
  private static final int QUEUE_MAX_BUFFERS = 10;

  /**
   * The number of statements per batch for performance
   * metrics results.
   */
  private static final double STATEMENT_COUNT_BATCH = 250000;

  /**
   * The ARP parser instance to use.
   */
  private final ARP arp = new ARP();

  /**
   * Map ARP anonymous node IDs to {@link BlankNode}s.  Used for IDs which
   * {@link #parseAnonId} cannot convert to a number.
   */
  private final StringToLongMap blankNodeNameMap;

  /**
   * Mapping between numeric blank node IDs generated by ARP and the local
   * nodes allocated for them.
   */
  private final IntFile blankNodeIdMap;

  /**
   * Supplied inputstream for reading.  Closed when {@link #run} finishes.
   */
  private final InputStream inputStream;

  /** The buffer triples are currently being handed out from. */
  private long[][] headBuffer = null;

  /** Index of the next triple to hand out from {@link #headBuffer}. */
  private int headIndex = 0;

  /** The buffer currently being filled by the parser. */
  private long[][] tailBuffer = null;

  /** Index of the next free slot in {@link #tailBuffer}. */
  private int tailIndex = 0;

  /**
   * The queue of <code>long[][3]</code> buffers of triples generated by the
   * RDF/XML parser.
   */
  private LinkedList<long[][]> queue = new LinkedList<long[][]>();

  /**
   * The number of statements parsed so far.
   *
   * When {@link #statementCountIsTotal} is <code>true</code>, this will be
   * the number of statements in the RDF/XML document.
   */
  private long statementCount = 0;

  /**
   * true if statementCount is the count of the total number of statements in
   * the entire file because the parser has reached the end of the file without
   * error.
   */
  private boolean statementCountIsTotal = false;

  /**
   * Flag used to indicate that parsing has finished, whether successfully or
   * because of an error.
   */
  private boolean complete = false;

  /**
   * The exception which interrupted parsing, or <code>null</code> if parsing
   * is successful.
   */
  private Throwable exception = null;

  /**
   * The base URI from which the {@link #inputStream} came and where any
   * relative URI references within the stream should be resolved to absolute
   * form.
   *
   * The constructor substitutes {@link Mulgara#NAMESPACE} when the content
   * has no URI, so the <code>null</code> checks made elsewhere in this class
   * are purely defensive.
   */
  private final URI baseURI;

  /** The data type on the stream provided. If provided then this should be application/rdf+xml. */
  private MimeType contentType;

  /**
   * The context in which to localize incoming RDF nodes.
   */
  private final ResolverSession resolverSession;

  /** The initial start time for performance metrics results. */
  private double startTime;

  /** The time the last batch of statements inserted */
  private double lastStatementLoadTime;

  /** Whether INFO logging was enabled when this parser was constructed. */
  private boolean isInfoEnabled = false;

  //
  // Constructor
  //

  /**
   * Sole constructor.
   *
   * If construction fails after the blank node maps or the input stream have
   * been obtained, they are released again before the exception propagates.
   *
   * @param content  the content to parse; never <code>null</code>
   * @param resolverSession  the session used to localize parsed nodes; never
   *   <code>null</code>
   * @throws IllegalArgumentException if either parameter is <code>null</code>
   * @throws NotModifiedException if the <var>content</var> model is already cached
   * @throws TuplesException if the {@link #blankNodeIdMap} or
   *   {@link #blankNodeNameMap} can't be created or the <var>content</var>
   *   can't be read
   */
  Parser(Content content, ResolverSession resolverSession)
      throws NotModifiedException, TuplesException {
    // Validate "content" parameter
    if (content == null) {
      throw new IllegalArgumentException("Null \"content\" parameter");
    }

    // Validate "resolverSession" parameter
    if (resolverSession == null) {
      throw new IllegalArgumentException("Null \"resolverSession\" parameter");
    }

    // Initialize fields
    this.baseURI = content.getURI() != null ? content.getURI() : URI.create(Mulgara.NAMESPACE);

    // Build the resources in locals so that a failure part-way through can
    // release whatever has already been acquired.
    StringToLongMap nameMap = null;
    IntFile idMap = null;
    InputStream in = null;
    boolean initialized = false;
    try {
      try {
        nameMap = new StringToLongMap();
        idMap = IntFile.open(TempDir.createTempFile("rdfidmap", null));
        idMap.clear();
      } catch (IOException e) {
        throw new TuplesException("Unable to create blank node map", e);
      }

      try {
        in = content.newInputStream();
      } catch (IOException e) {
        throw new TuplesException("Unable to obtain input stream from " + baseURI, e);
      }

      // Deliberately queried after the stream has been obtained, as before.
      this.contentType = content.getContentType();
      initialized = true;
    } finally {
      if (!initialized) {
        releaseResources(nameMap, idMap, in);
      }
    }

    this.blankNodeNameMap = nameMap;
    this.blankNodeIdMap = idMap;
    this.inputStream = in;
    this.resolverSession = resolverSession;

    // Configure the RDF/XML parser
    arp.getOptions().setEmbedding(true);
    arp.getOptions().setLaxErrorMode();
    arp.getHandlers().setErrorHandler(this);
    arp.getHandlers().setStatementHandler(this);

    // is info enabled
    isInfoEnabled = logger.isInfoEnabled();

    // Used for statistics during a load
    startTime = System.currentTimeMillis();
    lastStatementLoadTime = startTime;
  }

  /**
   * @return the number of statements parsed so far
   * @throws TuplesException if parsing has failed
   */
  synchronized long getStatementCount() throws TuplesException {
    checkForException();
    return statementCount;
  }

  /**
   * Blocks until parsing has finished.  Queued triples are discarded while
   * waiting so that the parser is never held up by a full queue.
   *
   * @return the total number of statements in the file
   * @throws TuplesException if parsing has failed or the calling thread is
   *   interrupted while waiting
   */
  synchronized long waitForStatementTotal() throws TuplesException {
    while (!complete) {
      checkForException();

      // Keep the LinkedList drained.
      queue.clear();
      notifyAll();

      try {
        wait();
      } catch (InterruptedException ex) {
        // NOTE(review): the interrupt status is left cleared, as it always
        // has been; only the cause is now preserved.
        throw new TuplesException("Abort", ex);
      }
    }
    checkForException();
    assert statementCountIsTotal;
    return statementCount;
  }

  /**
   * Returns true if getStatementCount() would return the total number
   * of statements in the file.
   *
   * @throws TuplesException if parsing has failed
   */
  synchronized boolean isStatementCountTotal() throws TuplesException {
    checkForException();
    return statementCountIsTotal;
  }

  //
  // Method implementing Runnable
  //

  /**
   * Parses the whole input stream.  Whatever the outcome, the blank node
   * maps and the input stream are released and the queue is flushed so that
   * waiting consumers are woken.
   */
  @Override
  public void run() {
    Throwable t = null;

    try {
      arp.load(inputStream, baseURI == null ? "" : baseURI.toString());
      if (logger.isDebugEnabled()) {
        logger.debug("Parsed RDF/XML");
      }
      return;
    } catch (Throwable th) {
      t = th;
    } finally {
      try {
        releaseResources(blankNodeNameMap, blankNodeIdMap, inputStream);
      } finally {
        // Must always run, otherwise consumers would wait forever.
        flushQueue(t);
      }
    }

    // Only reached when parsing failed; success returns from the try block.
    if (logger.isDebugEnabled()) {
      logger.debug("Exception while parsing RDF/XML", t);
    }
  }

  //
  // Methods implementing StatementHandler
  //

  /**
   * Localizes and buffers a statement whose object is a literal.
   *
   * @throws RuntimeException if the statement can't be localized
   */
  public void statement(AResource subject,
                        AResource predicate,
                        ALiteral object) {
    // Localize the statement
    long[] triple;
    try {
      triple = new long[] { toLocalNode(subject),
                            toLocalNode(predicate),
                            toLocalNode(object) };
    } catch (IOException e) {
      throw new RuntimeException("Unable to localize parsed triple", e);
    } catch (LocalizeException e) {
      throw new RuntimeException("Unable to localize parsed triple", e);
    }

    // Buffer the statement
    addTriple(triple);
  }

  /**
   * Localizes and buffers a statement whose object is a resource.
   *
   * @throws RuntimeException if the statement can't be localized
   */
  public void statement(AResource subject,
                        AResource predicate,
                        AResource object) {
    // Localize the statement
    long[] triple;
    try {
      triple = new long[] { toLocalNode(subject),
                            toLocalNode(predicate),
                            toLocalNode(object) };
    } catch (IOException e) {
      throw new RuntimeException("Unable to localize parsed triple", e);
    } catch (LocalizeException e) {
      throw new RuntimeException("Unable to localize parsed triple", e);
    }

    // Buffer the statement
    addTriple(triple);
  }

  //
  // Methods implementing ErrorHandler
  //

  /**
   * Recoverable error.  Only logged; parsing is not marked as failed.
   *
   * @param e  The exception being handled.
   */
  public synchronized void error(SAXParseException e) {
    if (logger.isEnabledFor(Level.WARN)) {
      logger.warn("Recoverable error, line " + e.getLineNumber() + ", column " +
                  e.getColumnNumber() + ": " + e.getMessage());
    }
    if (logger.isDebugEnabled()) {
      logger.debug("Recoverable error, line " + e.getLineNumber() + ", column " +
                   e.getColumnNumber() + ": " + e.getMessage(), e);
    }
  }

  /**
   * Non-recoverable error.  Records the exception so that consumers see the
   * failure, and wakes any thread waiting on this parser so that it notices
   * straight away.
   *
   * @param e  The exception being handled
   */
  public synchronized void fatalError(SAXParseException e) {
    exception = e;

    // Every wait() in this class re-checks its condition in a loop, so an
    // extra notification is harmless.
    notifyAll();

    if (logger.isEnabledFor(Level.ERROR)) {
      logger.error("Fatal error, line " + e.getLineNumber() + ", column " +
                   e.getColumnNumber() + ": " + e.getMessage());
    }
    if (logger.isDebugEnabled()) {
      logger.debug("Fatal error, line " + e.getLineNumber() + ", column " +
                   e.getColumnNumber() + ": " + e.getMessage(), e);
    }
  }

  /**
   * Warning.  Only logged.
   *
   * @param e  The exception being warned about
   */
  public void warning(SAXParseException e) {
    if (logger.isEnabledFor(Level.WARN)) {
      logger.warn("Warning, line " + e.getLineNumber() + ", column " +
                  e.getColumnNumber() + ": " + e.getMessage());
    }
    if (logger.isDebugEnabled()) {
      logger.debug("Warning, line " + e.getLineNumber() + ", column " +
                   e.getColumnNumber() + ": " + e.getMessage(), e);
    }
  }

  //
  // Internal methods
  //

  /**
   * Create a JRDF {@link Literal} object from an ARP literal object and
   * localize it.  A literal with a datatype is created without a language;
   * an untyped literal uses its language, or the empty string if it has none.
   *
   * @param literal  the ARP literal
   * @return a local node corresponding to the literal
   * @throws LocalizeException if the literal can't be localized
   * @throws Error if the datatype reported by ARP is not a valid URI
   */
  private long toLocalNode(ALiteral literal) throws LocalizeException {
    String datatype = literal.getDatatypeURI();
    if (datatype != null) {
      URI type;
      try {
        type = new URI(datatype);
      } catch (URISyntaxException e) {
        throw new Error("ARP generated datatype for " + literal + " which isn't a URI", e);
      }
      return resolverSession.localize(new LiteralImpl(literal.toString(), type));
    }

    String lang = literal.getLang();
    if (lang == null) {
      lang = "";
    }
    return resolverSession.localize(new LiteralImpl(literal.toString(), lang));
  }

  /**
   * Create a JRDF {@link Node} from an ARP resource object and localize it.
   *
   * Anonymous resources are mapped to blank nodes.  The mapping is remembered
   * in {@link #blankNodeIdMap} when the ARP ID is numeric (see
   * {@link #parseAnonId}) and in {@link #blankNodeNameMap} otherwise, so that
   * repeated occurrences of an ID yield the same local node.  A stored value
   * of zero is treated as "not yet mapped".
   *
   * @param resource  the ARP resource.
   * @return a local node corresponding the ARP resource (either a URI
   *   reference or a blank node)
   * @throws IOException declared for callers, but failures to read or store
   *   blank node IDs are currently rethrown as {@link RuntimeException}
   * @throws LocalizeException if the resource can't be localized
   * @throws Error if ARP supplies a malformed URI
   */
  private long toLocalNode(AResource resource)
      throws IOException, LocalizeException {
    if (resource.isAnonymous()) {
      String anonIdStr = resource.getAnonymousID();
      long anonId = parseAnonId(anonIdStr);

      try {
        long resourceNodeId;
        if (anonId >= 0) {
          resourceNodeId = blankNodeIdMap.getLong(anonId);
        } else {
          // Try the StringToLongMap instead.
          resourceNodeId = blankNodeNameMap.get(anonIdStr);
        }

        // If it's not found add it.
        if (resourceNodeId == 0) {
          // Create a new blank node.
          resourceNodeId = resolverSession.localize(new BlankNodeImpl());
          if (anonId >= 0) {
            blankNodeIdMap.putLong(anonId, resourceNodeId);
          } else {
            blankNodeNameMap.put(anonIdStr, resourceNodeId);
          }
        }
        return resourceNodeId;
      } catch (IOException e) {
        throw new RuntimeException("Couldn't generate anonymous resource", e);
      }
    } else {
      try {
        assert resource.getURI() != null;
        return resolverSession.localize(new URIReferenceImpl(new URI(resource.getURI())));
      } catch (URISyntaxException e) {
        throw new Error("ARP generated a malformed URI: '" + resource.getURI() + "'", e);
      }
    }
  }

  /**
   * If an exception occurred in the parser, discards all pending triples and
   * throws a TuplesException that wraps the exception.
   *
   * @throws TuplesException if {@link #exception} has been set
   */
  private void checkForException() throws TuplesException {
    if (exception != null) {
      queue.clear();
      headIndex = 0;
      headBuffer = null;
      if (baseURI == null) {
        throw new TuplesException("Exception while reading stream of type: " + contentType, exception);
      }
      throw new TuplesException("Exception while reading " + baseURI, exception);
    }
  }

  /**
   * Hands out the next parsed triple, blocking until one is available or
   * parsing has finished.
   *
   * NOTE(review): {@link #headBuffer} and {@link #headIndex} are accessed
   * without synchronization here, so presumably only a single consumer
   * thread is expected to call this method - confirm against callers.
   *
   * @return a new <code>long[3]</code> triple from the queue or
   *   <code>null</code> if there are no more triples.
   * @throws TuplesException if parsing has failed or the calling thread is
   *   interrupted while waiting
   */
  long[] getTriple() throws TuplesException {
    if (headBuffer == null || headIndex >= headBuffer.length) {
      // Get another buffer from the queue.  The exhausted buffer is dropped
      // first so it is not retained while waiting.
      headIndex = 0;
      headBuffer = null;
      headBuffer = getBufferFromQueue();
      if (headBuffer == null) {
        // No more triples.
        return null;
      }
      assert headBuffer.length > 0;
    }

    // Get a triple from the headBuffer.
    long[] triple = headBuffer[headIndex];
    headBuffer[headIndex++] = null;
    assert triple != null;
    assert triple.length == 3;
    return triple;
  }

  /**
   * Removes the oldest buffer from the queue, waiting for one if necessary.
   *
   * @return the next buffer, or <code>null</code> if parsing has finished and
   *   the queue is empty
   * @throws TuplesException if parsing has failed or the calling thread is
   *   interrupted while waiting
   */
  private synchronized long[][] getBufferFromQueue() throws TuplesException {
    while (queue.isEmpty()) {
      checkForException();
      if (complete) {
        // No more buffers in the queue.
        return null;
      }

      // Wait for a buffer.
      try {
        wait();
      } catch (InterruptedException ex) {
        // NOTE(review): the interrupt status is left cleared, as it always
        // has been; only the cause is now preserved.
        throw new TuplesException("Abort", ex);
      }
    }
    checkForException();

    // Wake the parser in case it is waiting for room in the queue.
    notifyAll();
    return queue.removeFirst();
  }

  /**
   * Appends a triple to the buffer being filled, queueing the buffer once it
   * holds {@link #BUFFER_SIZE} triples.
   *
   * @param triple  the localized triple; never <code>null</code>
   */
  private void addTriple(long[] triple) {
    assert triple != null;
    if (tailBuffer == null) {
      tailBuffer = new long[BUFFER_SIZE][];
      tailIndex = 0;
    }
    tailBuffer[tailIndex++] = triple;

    if (tailIndex >= tailBuffer.length) {
      // Add the buffer to the queue.
      addBufferToQueue(tailBuffer);
      tailBuffer = null;
      tailIndex = 0;
    }
  }

  /**
   * Finishes parsing: records any failure, queues the final partially filled
   * buffer on success, marks the parser complete and wakes all waiters.
   *
   * @param t  the throwable which ended parsing, or <code>null</code> if
   *   parsing ended normally
   */
  private synchronized void flushQueue(Throwable t) {
    // Thread.interrupted() also clears the interrupt status of this thread.
    if (Thread.interrupted()) {
      if (t == null) {
        t = new InterruptedException();
      }
    }

    if (t != null) {
      exception = t;
      queue.clear();
    } else if (exception == null) {
      // End of file has been reached without error.
      if (tailBuffer != null) {
        // There is at least one triple in the tailBuffer.
        assert tailIndex > 0;
        long[][] buf = new long[tailIndex][];
        System.arraycopy(tailBuffer, 0, buf, 0, tailIndex);
        // Also logs statement activity when the final count is a batch
        // multiple.
        addBufferToQueue(buf);
      }
      statementCountIsTotal = true;
    } else {
      // An exception has already been reported.
      queue.clear();
    }
    tailBuffer = null;
    tailIndex = 0;
    complete = true;
    notifyAll();
  }

  /**
   * Appends a buffer to the queue, first waiting while the queue already
   * holds {@link #QUEUE_MAX_BUFFERS} buffers, then updates the statement
   * count and wakes waiting consumers.
   *
   * @param buffer  the buffer of triples to queue; never <code>null</code>
   * @throws RuntimeException if the thread is interrupted while waiting
   */
  private synchronized void addBufferToQueue(long[][] buffer) {
    assert buffer != null;

    // Wait for the queue to drain a bit if it's too full
    while (queue.size() >= QUEUE_MAX_BUFFERS) {
      try {
        wait();
      } catch (InterruptedException ex) {
        throw new RuntimeException("Abort", ex);
      }
    }
    queue.addLast(buffer);
    statementCount += buffer.length;

    // Logged here, where the count changes, so that progress is actually
    // reported periodically rather than only at the end of the document.
    logStatementActivity();
    notifyAll();
  }

  /**
   * Stops the thread.
   */
  synchronized void abort() {
    interrupt();

    // Clear the queue and notify in case the parser has become blocked on
    // the queue holding QUEUE_MAX_BUFFERS buffers.
    queue.clear();
    notifyAll();
  }

  /**
   * For very large documents, periodically logs throughput: whenever the
   * statement count is a multiple of {@link #STATEMENT_COUNT_BATCH} and INFO
   * logging was enabled at construction.
   */
  private void logStatementActivity() {
    if (!isInfoEnabled || statementCount % STATEMENT_COUNT_BATCH != 0) {
      return;
    }

    long now = System.currentTimeMillis();

    // Clamp to a millisecond so that a zero interval can't produce an
    // infinite rate.
    double batchMillis = Math.max(1, now - lastStatementLoadTime);
    double totalMillis = Math.max(1, now - startTime);

    logger.info("\tbatch timestamp\t" + now +
                "\tstatements\t" + statementCount +
                "\tper second\t" +
                Math.round((STATEMENT_COUNT_BATCH / batchMillis) * 1000) +
                "\tavg per seconds\t" +
                Math.round((statementCount / totalMillis) * 1000));

    // update the current time for performance logging
    lastStatementLoadTime = now;
  }

  /**
   * Parse the AnonymousID from ARP to get the Id in the form of a long.  We
   * currently make assumptions about the format of the AnonymousID string,
   * namely that the first character of the string is an "A" and that the
   * remaining characters are digits.
   *
   * @param anonIdStr  the AnonymousID string
   * @return the Id as a non-negative long, or -1 if the string is empty or
   *   doesn't have the assumed format
   */
  private long parseAnonId(String anonIdStr) {
    if (anonIdStr.length() == 0 || anonIdStr.charAt(0) != 'A') {
      return -1;
    }

    try {
      long anonId = Long.parseLong(anonIdStr.substring(1));
      // A signed value such as "A-5" isn't a usable index.
      return anonId >= 0 ? anonId : -1;
    } catch (NumberFormatException ex) {
      return -1;
    }
  }

  /**
   * Deletes the blank node maps and closes the input stream.  Every step is
   * attempted even if an earlier one fails; I/O failures are only logged.
   *
   * @param nameMap  the blank node name map, or <code>null</code> if none
   * @param idMap  the blank node ID map, or <code>null</code> if none
   * @param in  the input stream, or <code>null</code> if none
   */
  private static void releaseResources(StringToLongMap nameMap,
                                       IntFile idMap,
                                       InputStream in) {
    try {
      try {
        try {
          if (nameMap != null) {
            nameMap.delete();
          }
        } finally {
          if (idMap != null) {
            idMap.delete();
          }
        }
      } catch (IOException ioex) {
        logger.warn("Unable to clean up blank node maps", ioex);
      }
    } finally {
      if (in != null) {
        try {
          in.close();
        } catch (IOException ioex) {
          logger.warn("Unable to close RDF/XML input stream", ioex);
        }
      }
    }
  }
}