/*
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is the Kowari Metadata Store.
 *
 * The Initial Developer of the Original Code is Plugged In Software Pty
 * Ltd (http://www.pisoftware.com, mailto:info@pisoftware.com). Portions
 * created by Plugged In Software Pty Ltd are Copyright (C) 2001,2002
 * Plugged In Software Pty Ltd. All Rights Reserved.
 *
 * Contributor(s): N/A.
 *
 * [NOTE: The text of this Exhibit A may differ slightly from the text
 * of the notices in the Source Code files of the Original Code. You
 * should use the text of this Exhibit A rather than the text found in the
 * Original Code Source Code for Your Modifications.]
 *
 */

package org.mulgara.resolver.url;

// Java 2 standard packages
import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipInputStream;

import org.xml.sax.*;

// Third party packages
import com.hp.hpl.jena.rdf.arp.ARP;        // ARP (Jena RDF/XML parser)
import com.hp.hpl.jena.rdf.arp.ALiteral;
import com.hp.hpl.jena.rdf.arp.AResource;
import com.hp.hpl.jena.rdf.arp.StatementHandler;
import org.apache.log4j.Logger;             // Apache Log4J
import org.jrdf.graph.*;                    // JRDF

// Locally written packages
import org.mulgara.query.Cursor;
import org.mulgara.query.TuplesException;
import org.mulgara.query.Variable;
import org.mulgara.query.rdf.*;
import org.mulgara.resolver.spi.LocalizeException;
import org.mulgara.resolver.spi.ResolverSession;
import org.mulgara.resolver.spi.Statements;
import org.mulgara.resolver.spi.StatementsWrapperResolution;
import org.mulgara.store.tuples.AbstractTuples;
import org.mulgara.store.tuples.Tuples;

/**
 * Parses the RDF/XML document at a {@link URL} into {@link Statements}.
 *
 * This particular implementation is complicated by the need to adapt the Jena
 * ARP RDF/XML "push" parser to be a "pull" parser instead.
 *
 * @created 2004-04-02
 * @author <a href="http://staff.pisoftware.com/raboczi">Simon Raboczi</a>
 * @version $Revision: 1.8 $
 * @modified $Date: 2005/01/05 04:58:56 $ @maintenanceAuthor $Author: newmana $
 * @company <a href="mailto:info@PIsoftware.com">Plugged In Software</a>
 * @copyright © 2004 <a href="http://www.PIsoftware.com/">Plugged In
 *   Software Pty Ltd</a>
 * @licence <a href="{@docRoot}/../../LICENCE">Mozilla Public License v1.1</a>
 */
public class URLStatements extends AbstractTuples implements Statements {

  /** Logger. */
  @SuppressWarnings("unused")
  private static final Logger logger =
      Logger.getLogger(URLStatements.class.getName());

  /** The session used to localize the RDF nodes from the stream. */
  private ResolverSession resolverSession;

  /** Map from ARP anonymous node IDs to {@link BlankNode}s, shared across parses. */
  private Map<String,BlankNode> blankNodeMap;

  /**
   * The current row.
   *
   * If the cursor is not on a row, this will be <code>null</code>.
   */
  private Triple triple;

  /** The location of the RDF/XML document. */
  private URL url;

  /** The background parsing thread, or <code>null</code> if none is running. */
  private Parser parser = null;

  /** The number of statements in the document, once known. */
  private long rowCount;

  /** Whether {@link #rowCount} holds a valid total. */
  private boolean rowCountIsValid = false;
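
  /*
   * Design note: ARP is a "push" parser that delivers statements through
   * callbacks, while Tuples consumers expect to "pull" rows one at a time
   * via next().  The Parser class at the bottom of this file bridges the
   * two models: it runs ARP on its own thread and pushes parsed triples
   * into a bounded queue, from which next() pulls them on the caller's
   * thread.
   */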

  //
  // Constructors
  //

  /**
   * Construct an RDF/XML stream parser.
   *
   * @param url the location of the RDF/XML formatted document
   * @param resolverSession session against which to localize RDF nodes
   * @param blankNodeMap map of blank node IDs populated by any earlier
   *   parsings of the same document within the scope of the same
   *   <var>resolverSession</var>
   * @throws IllegalArgumentException if <var>url</var>,
   *   <var>resolverSession</var> or <var>blankNodeMap</var> are
   *   <code>null</code>
   * @throws TuplesException if the document at <var>url</var> can't be parsed
   *   as RDF/XML
   */
  URLStatements(URL url, ResolverSession resolverSession,
      Map<String,BlankNode> blankNodeMap) throws TuplesException {
    // Validate "url" parameter
    if (url == null) throw new IllegalArgumentException("Null \"url\" parameter");

    // Validate "resolverSession" parameter
    if (resolverSession == null) throw new IllegalArgumentException("Null \"resolverSession\" parameter");

    // Validate "blankNodeMap" parameter
    if (blankNodeMap == null) throw new IllegalArgumentException("Null \"blankNodeMap\" parameter");

    // Initialize fields
    this.url = url;
    this.resolverSession = resolverSession;
    this.blankNodeMap = blankNodeMap;

    // Fix the magical column names for RDF statements
    setVariables(new Variable[] { new Variable("subject"),
                                  new Variable("predicate"),
                                  new Variable("object") });
  }

  //
  // Methods implementing Statements
  //

  public long getSubject() throws TuplesException {
    return getColumnValue(0);
  }

  public long getPredicate() throws TuplesException {
    return getColumnValue(1);
  }

  public long getObject() throws TuplesException {
    return getColumnValue(2);
  }

  //
  // Methods implementing AbstractTuples
  //

  /**
   * {@inheritDoc}
   *
   * Non-zero length <var>prefix</var> values don't need to be supported by
   * this class because prefix filtration is implemented by the
   * {@link StatementsWrapperResolution} which {@link URLResolver} always
   * applies to this class before returning one.
   *
   * @param prefix {@inheritDoc}; for this particular implementation, non-zero
   *   length prefixes are not supported
   * @throws {@inheritDoc}; also if <var>prefix</var> is non-zero length
   */
  public void beforeFirst(long[] prefix, int suffixTruncation) throws TuplesException {
    // Validate "prefix" parameter
    if (prefix == null) {
      throw new IllegalArgumentException("Null \"prefix\" parameter");
    }
    if (prefix.length != 0) {
      throw new TuplesException(
        getClass() + ".beforeFirst isn't implemented for non-zero length prefix"
      );
    }

    // Validate "suffixTruncation" parameter
    if (suffixTruncation != 0) {
      throw new IllegalArgumentException("Nonzero \"suffixTruncation\" parameter");
    }

    // Shut down any existing parsing thread
    if (parser != null) {
      stopThread();
    }

    // Create the parser and start the parsing thread
    parser = new Parser(url, blankNodeMap);
    parser.start();

    // TODO skip forward to the first triple that matches prefix
  }
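
  /*
   * A minimal sketch of how a caller might pull statements from this class,
   * assuming a hypothetical "statements" instance already constructed with a
   * URL, ResolverSession and blank node map:
   *
   *   statements.beforeFirst(new long[] {}, 0);  // start the parsing thread
   *   while (statements.next()) {                // pull one triple at a time
   *     long s = statements.getSubject();        // localized node IDs
   *     long p = statements.getPredicate();
   *     long o = statements.getObject();
   *     // ... use the localized triple ...
   *   }
   *   statements.close();                        // stop the parsing thread
   */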

  /**
   * The cursor position isn't cloned by this method.
   */
  public Object clone() {
    URLStatements cloned = (URLStatements) super.clone();

    // Copy immutable fields by reference
    cloned.resolverSession = resolverSession;
    cloned.url = url;

    // Even though this is a mutable field, we want to share the blank node IDs
    cloned.blankNodeMap = blankNodeMap;

    // The cursor position is not cloned.
    cloned.triple = null;
    cloned.parser = null;

    return cloned;
  }

  /**
   * Close the RDF/XML formatted input stream.
   */
  public void close() throws TuplesException {
    stopThread();
  }

  /**
   * @param column 0 for the subject, 1 for the predicate, 2 for the object
   */
  public long getColumnValue(int column) throws TuplesException {
    if (triple == null) {
      throw new TuplesException("There is no current row");
    }

    // Pull the appropriate field from the current triple as a JRDF Node
    Node node;
    switch (column) {
      case 0:  node = triple.getSubject();    break;
      case 1:  node = triple.getPredicate();  break;
      case 2:  node = triple.getObject();     break;
      default: throw new TuplesException("No such column " + column);
    }
    assert node != null;

    // Localize the node
    try {
      return resolverSession.localize(node);
    } catch (LocalizeException e) {
      throw new TuplesException("Couldn't get column " + column + " value", e);
    }
  }

  public List<Tuples> getOperands() {
    return Collections.emptyList();
  }

  public int getRowCardinality() throws TuplesException {
    long statementCount;

    if (rowCountIsValid) {
      statementCount = rowCount;
    } else {
      Parser p;
      boolean newParser;
      if (parser != null) {
        // Use the existing parser.
        p = parser;
        newParser = false;
      } else {
        // Create a new parser.
        p = new Parser(url, blankNodeMap);
        p.start();
        newParser = true;
      }

      // Two statements are enough to distinguish ZERO, ONE and MANY, and the
      // queue always has room for at least two triples, so this wait can't
      // deadlock against a full queue.
      try {
        synchronized (p) {
          while (p.getStatementCount() < 2 && !p.isStatementCountTotal()) {
            try {
              // Wait on the parser for changes to the statement count or
              // completion status.
              p.wait();
            } catch (InterruptedException ex) {
              throw new TuplesException("Abort");
            }
          }
          statementCount = p.getStatementCount();
        }
      } catch (TuplesException ex) {
        p.abort();
        if (!newParser) {
          // We just aborted the main parser, so nullify the reference.
          parser = null;
        }
        throw ex; // rethrow.
      } finally {
        if (newParser) {
          // Stop the thread.
          p.abort();
        }
      }
    }

    // Convert the statement count into a cardinality class
    return statementCount == 0 ? Cursor.ZERO :
           statementCount == 1 ? Cursor.ONE :
                                 Cursor.MANY;
  }

  public long getRowCount() throws TuplesException {
    if (!rowCountIsValid) {
      if (parser != null && parser.isStatementCountTotal()) {
        // Get the statement count from the parser.
        rowCount = parser.getStatementCount();
      } else {
        // Create a new parser and consume the entire file.  Note that this
        // makes an exact row count as expensive as a full parse of the
        // document.
        Parser p = new Parser(url, blankNodeMap);
        p.start();
        try {
          rowCount = p.waitForStatementTotal();
        } finally {
          p.abort();
        }
      }
      rowCountIsValid = true;
    }
    return rowCount;
  }

  public long getRowUpperBound() throws TuplesException {
    // If the row count isn't yet available, return an absurdly large value
    return parser != null && parser.isStatementCountTotal()
        ? parser.getStatementCount()
        : Long.MAX_VALUE;
  }

  /** Guess at the number of statements in a large file. */
  private static final long LARGE_FILE_SIZE = 1000000L;

  public long getRowExpectedCount() throws TuplesException {
    // If the row count isn't yet available, return a guess at a large value
    return parser != null && parser.isStatementCountTotal()
        ? parser.getStatementCount()
        : LARGE_FILE_SIZE;
  }

  public boolean hasNoDuplicates() throws TuplesException {
    return false;
  }

  public boolean isColumnEverUnbound(int column) throws TuplesException {
    switch (column) {
      case 0:
      case 1:
      case 2:
        return false;
      default:
        throw new TuplesException("No such column " + column);
    }
  }

  public boolean next() throws TuplesException {
    if (parser == null) {
      // No current row
      return false;
    }

    try {
      triple = parser.getTriple();
    } catch (TuplesException ex) {
      stopThread();
      throw ex; // rethrow
    }

    if (triple == null) {
      // Hit the end of the file.
      assert parser.isStatementCountTotal();
      rowCount = parser.getStatementCount();
      rowCountIsValid = true;
      stopThread();
    }
    return triple != null;
  }

  /**
   * Stops the thread if it is running, and clears the current row.
   */
  private void stopThread() {
    if (parser != null) {
      parser.abort();
      parser = null;
    }
    triple = null;
  }
}

/**
 * This {@link Runnable} parses an RDF/XML document on a separate thread,
 * queueing the resulting triples for consumption by {@link URLStatements}.
 */
class Parser extends Thread implements ErrorHandler, StatementHandler {

  /** Logger. */
  private static final Logger logger = Logger.getLogger(Parser.class.getName());

  /** The number of triples per buffer in {@link #queue}. */
  private final int BUFFER_SIZE = 1000;

  /**
   * Maximum size that {@link #queue} can attain without the
   * parser deliberately blocking and waiting for it to drain.
   */
  private final int QUEUE_MAX_BUFFERS = 10;

  /** The ARP parser instance to use. */
  private final ARP arp = new ARP();

  /** Map ARP anonymous node IDs to {@link BlankNode}s. */
  private final Map<String,BlankNode> blankNodeMap;

  /** The location of the RDF/XML document. */
  private URL url;

  /** The buffer currently being drained by {@link #getTriple}, if any. */
  private Triple[] headBuffer = null;

  /** The index of the next triple to return from {@link #headBuffer}. */
  private int headIndex = 0;

  /** The buffer currently being filled by {@link #addTriple}, if any. */
  private Triple[] tailBuffer = null;

  /** The index of the next free slot in {@link #tailBuffer}. */
  private int tailIndex = 0;

  /** The queue of buffers of triples generated by the RDF/XML parser. */
  private LinkedList<Triple[]> queue = new LinkedList<Triple[]>();

  /**
   * The number of statements parsed so far.
   *
   * When {@link #complete} is <code>true</code>, this will be the number of
   * statements in the RDF/XML document.
   */
  private long statementCount = 0;

  /**
   * true if statementCount is the count of the total number of statements in
   * the entire file because the parser has reached the end of the file without
   * error.
   */
  private boolean statementCountIsTotal = false;

  /** Flag used to indicate that the end of the RDF/XML file has been reached. */
  private boolean complete = false;

  /**
   * The exception which interrupted parsing, or <code>null</code> if parsing
   * is successful.
   */
  private Throwable exception = null;
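
  /*
   * How the buffering works: the ARP callback thread appends triples to
   * tailBuffer until it holds BUFFER_SIZE triples, then enqueues the full
   * buffer.  If the queue already holds QUEUE_MAX_BUFFERS buffers, the
   * producer wait()s until the consumer drains one, so a slow consumer
   * applies backpressure to the parser rather than exhausting memory.
   * The consumer thread drains buffers via getTriple()/getBufferFromQueue().
   */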

  //
  // Constructor
  //

  /**
   * Sole constructor.
   */
  @SuppressWarnings("deprecation")
  Parser(URL url, Map<String,BlankNode> blankNodeMap) {
    // Validate "url" parameter
    if (url == null) throw new IllegalArgumentException("Null \"url\" parameter");

    // Validate "blankNodeMap" parameter
    if (blankNodeMap == null) throw new IllegalArgumentException("Null \"blankNodeMap\" parameter");

    // Initialize fields
    this.url = url;
    this.blankNodeMap = blankNodeMap;

    // Configure the RDF/XML parser
    arp.setEmbedding(true);
    arp.setLaxErrorMode();
    arp.setErrorHandler(this);
    arp.setStatementHandler(this);
  }

  /**
   * @return the number of statements parsed so far
   */
  synchronized long getStatementCount() throws TuplesException {
    checkForException();
    return statementCount;
  }

  /**
   * Blocks until parsing finishes, discarding triples as they arrive.
   *
   * @return the total number of statements in the file
   */
  synchronized long waitForStatementTotal() throws TuplesException {
    while (!complete) {
      checkForException();

      // Keep the LinkedList drained so the producer never blocks.
      queue.clear();
      notifyAll();

      try {
        wait();
      } catch (InterruptedException ex) {
        throw new TuplesException("Abort");
      }
    }
    checkForException();
    assert statementCountIsTotal;
    return statementCount;
  }

  /**
   * Returns true if getStatementCount() would return the total number
   * of statements in the file.
   */
  synchronized boolean isStatementCountTotal() throws TuplesException {
    checkForException();
    return statementCountIsTotal;
  }

  //
  // Method implementing Runnable
  //

  public void run() {
    Throwable t = null;

    // Parse the stream into RDF statements
    try {
      InputStream in = url.openStream();

      // Guess at the transfer encoding (compression scheme) based on the
      // file extension
      if (url.getPath().endsWith(".gz")) {
        // The file name ends with ".gz", so assume it's a gzip'ed file
        in = new GZIPInputStream(in);
      } else if (url.getPath().endsWith(".zip")) {
        // The file name ends with ".zip", so assume it's a zip'ed file and
        // parse only its first entry
        in = new ZipInputStream(in);
        ((ZipInputStream) in).getNextEntry();
      }

      arp.load(in, url.toString());

      if (logger.isDebugEnabled()) {
        logger.debug("Parsed RDF/XML");
      }
      return;
    } catch (Throwable th) {
      t = th;
    } finally {
      flushQueue(t);
    }

    if (logger.isDebugEnabled()) {
      logger.debug("Exception while parsing RDF/XML", exception);
    }
  }

  //
  // Methods implementing StatementHandler
  //

  public void statement(AResource subject, AResource predicate, ALiteral object) {
    if (logger.isDebugEnabled()) {
      logger.debug("Parsed " + subject + " " + predicate + " " + object +
                   " from " + url);
    }

    Triple triple = new TripleImpl((SubjectNode) toNode(subject),
                                   (PredicateNode) toNode(predicate),
                                   (ObjectNode) toNode(object));
    addTriple(triple);
  }

  public void statement(AResource subject, AResource predicate, AResource object) {
    if (logger.isDebugEnabled()) {
      logger.debug("Parsed " + subject + " " + predicate + " " + object +
                   " from " + url);
    }

    Triple triple = new TripleImpl((SubjectNode) toNode(subject),
                                   (PredicateNode) toNode(predicate),
                                   (ObjectNode) toNode(object));
    addTriple(triple);
  }

  //
  // Methods implementing ErrorHandler
  //

  /**
   * Recoverable error.
   *
   * @param e the error reported by the XML parser
   */
  public synchronized void error(SAXParseException e) {
    exception = e;
    logger.error("Error, line " + e.getLineNumber() + ", column " +
                 e.getColumnNumber() + ": " + e.getMessage(), e);
  }

  /**
   * Non-recoverable error.
   *
   * @param e the fatal error reported by the XML parser
   */
  public synchronized void fatalError(SAXParseException e) {
    exception = e;
    logger.error("Fatal error, line " + e.getLineNumber() + ", column " +
                 e.getColumnNumber() + ": " + e.getMessage(), e);
  }

  /**
   * Warning.
   *
   * @param e the warning reported by the XML parser
   */
  public void warning(SAXParseException e) {
    logger.warn("Warning, line " + e.getLineNumber() + ", column " +
                e.getColumnNumber() + ": " + e.getMessage(), e);
  }

  //
  // Internal methods
  //

  /**
   * Create a JRDF {@link Literal} object from an ARP literal object.
   *
   * @param literal the ARP literal
   * @return a JRDF literal
   */
  private Literal toNode(ALiteral literal) {
    URI type = null;
    if (literal.getDatatypeURI() != null) {
      try {
        type = new URI(literal.getDatatypeURI());
      } catch (URISyntaxException e) {
        throw new Error("ARP generated datatype for " + literal +
                        " which isn't a URI", e);
      }
    }

    if (type != null) {
      // A datatyped literal carries no language tag
      return new LiteralImpl(literal.toString(), type);
    } else {
      // A plain literal carries a language tag, defaulting to the empty string
      String lang = literal.getLang();
      if (lang == null) lang = "";
      return new LiteralImpl(literal.toString(), lang);
    }
  }
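
  /*
   * Blank node identity: ARP assigns each anonymous resource a
   * document-scoped ID.  The shared blankNodeMap maps those IDs to JRDF
   * BlankNodes, so repeated parses of the same document within one
   * ResolverSession yield the same blank nodes rather than fresh ones on
   * each pass.
   */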

  /**
   * Create a JRDF {@link Node} from an ARP resource object.
   *
   * @param resource the ARP resource
   * @return the JRDF {@link URIReference} or {@link BlankNode}
   */
  private Node toNode(AResource resource) {
    if (resource.isAnonymous()) {
      // Generate a blank node, reusing any existing node for this ID
      String anonymousID = resource.getAnonymousID();
      BlankNode blankNode = blankNodeMap.get(anonymousID);
      if (blankNode == null) {
        // First occurrence of this ID: create a new blank node and associate
        // it with its anonymous ID
        blankNode = new BlankNodeImpl();
        blankNodeMap.put(anonymousID, blankNode);
      }
      assert blankNode != null;
      return blankNode;
    } else {
      // Generate a URI reference
      try {
        return new URIReferenceImpl(new URI(resource.getURI().toString()));
      } catch (URISyntaxException e) {
        throw new Error("ARP generated a malformed URI", e);
      }
    }
  }

  /**
   * If an exception occurred in the parser, throws a TuplesException that
   * wraps the exception.
   */
  private void checkForException() throws TuplesException {
    if (exception != null) {
      queue.clear();
      headIndex = 0;
      headBuffer = null;
      throw new TuplesException("Exception while reading " + url, exception);
    }
  }

  /**
   * Returns a new triple from the queue or null if there are no more triples.
   */
  Triple getTriple() throws TuplesException {
    if (headBuffer == null || headIndex >= headBuffer.length) {
      // Get another buffer from the queue.
      headIndex = 0;
      headBuffer = null;  // cleared first in case getBufferFromQueue() throws
      headBuffer = getBufferFromQueue();
      if (headBuffer == null) {
        // No more triples.
        return null;
      }
      assert headBuffer.length > 0;
    }

    // Get a triple from the headBuffer.
    Triple triple = headBuffer[headIndex];
    headBuffer[headIndex++] = null;
    assert triple != null;
    return triple;
  }

  private synchronized Triple[] getBufferFromQueue() throws TuplesException {
    while (queue.isEmpty()) {
      checkForException();
      if (complete) {
        // No more buffers in the queue.
        return null;
      }

      // Wait for a buffer.
      try {
        wait();
      } catch (InterruptedException ex) {
        throw new TuplesException("Abort");
      }
    }

    checkForException();
    // Wake the producer in case it is blocked on a full queue.
    notifyAll();
    return queue.removeFirst();
  }

  private void addTriple(Triple triple) {
    assert triple != null;
    if (tailBuffer == null) {
      tailBuffer = new Triple[BUFFER_SIZE];
      tailIndex = 0;
    }
    tailBuffer[tailIndex++] = triple;

    if (tailIndex >= tailBuffer.length) {
      // Add the full buffer to the queue.
      addBufferToQueue(tailBuffer);
      tailBuffer = null;
      tailIndex = 0;
    }
  }

  private synchronized void flushQueue(Throwable t) {
    if (interrupted()) {
      if (t == null) {
        t = new InterruptedException();
      }
    }

    if (t != null) {
      exception = t;
      queue.clear();
    } else if (exception == null) {
      // End of file has been reached without error.
      if (tailBuffer != null) {
        // There is at least one triple in the tailBuffer.
        assert tailIndex > 0;
        Triple[] buf = new Triple[tailIndex];
        System.arraycopy(tailBuffer, 0, buf, 0, tailIndex);
        addBufferToQueue(buf);
      }
      statementCountIsTotal = true;
    } else {
      // An exception has already been reported.
      queue.clear();
    }
    tailBuffer = null;
    tailIndex = 0;
    complete = true;
    notifyAll();
  }

  private synchronized void addBufferToQueue(Triple[] buffer) {
    assert buffer != null;
    // Wait for the queue to drain a bit if it's too full
    while (queue.size() >= QUEUE_MAX_BUFFERS) {
      try {
        wait();
      } catch (InterruptedException ex) {
        throw new RuntimeException("Abort");
      }
    }
    queue.addLast(buffer);
    statementCount += buffer.length;
    notifyAll();
  }

  /**
   * Stops the thread.
   */
  synchronized void abort() {
    interrupt();
    // Clear the queue and notify in case ARP uses an internal thread
    // which has become blocked on the queue being QUEUE_MAX_BUFFERS in size.
    queue.clear();
    notifyAll();
  }
}