/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Mar 17, 2012
 */

package com.bigdata.bop.rdf.update;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Serializable;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.Callable;
import java.util.concurrent.FutureTask;

import org.apache.log4j.Logger;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.rio.RDFFormat;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParseException;
import org.openrdf.rio.RDFParser;
import org.openrdf.rio.RDFParserFactory;
import org.openrdf.rio.RDFParserRegistry;
import org.openrdf.rio.RDFParser.DatatypeHandling;
import org.openrdf.rio.helpers.RDFHandlerBase;

import com.bigdata.bop.BOp;
import com.bigdata.bop.BOpContext;
import com.bigdata.bop.Constant;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IConstant;
import com.bigdata.bop.ILocatableResourceAnnotations;
import com.bigdata.bop.PipelineOp;
import com.bigdata.bop.Var;
import com.bigdata.bop.bindingSet.ListBindingSet;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.internal.VTE;
import com.bigdata.rdf.internal.impl.TermId;
import com.bigdata.rdf.lexicon.LexiconRelation;
import com.bigdata.rdf.model.BigdataValue;
import com.bigdata.rdf.rio.IRDFParserOptions;
import com.bigdata.rdf.rio.PresortRioLoader;
import com.bigdata.rdf.rio.RDFParserOptions;
import com.bigdata.rdf.rio.StatementBuffer;
import com.bigdata.rdf.sail.webapp.client.MiniMime;
import com.bigdata.rdf.sparql.ast.LoadGraph;
import com.bigdata.rdf.store.AbstractTripleStore;
import com.bigdata.rdf.store.DataLoader;
import com.bigdata.rdf.store.DataLoader.ClosureEnum;
import com.bigdata.rdf.store.DataLoader.CommitEnum;
import com.bigdata.rdf.store.DataLoader.Options;
import com.bigdata.relation.accesspath.UnsyncLocalOutputBuffer;
import com.bigdata.util.Bytes;

/**
 * Operator parses an RDF data source, writing bindings which represent
 * statements onto the output sink. This operator is compatible with the
 * {@link ChunkedResolutionOp} and the {@link InsertStatementsOp}.
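 * <p>
 * For illustration only, a minimal sketch of how this operator might be
 * configured using the {@link com.bigdata.bop.NV} helper. The source URI
 * <code>"file:data.ttl"</code>, the namespace <code>"kb"</code>, and the use
 * of the unisolated view are assumptions made for the example, not
 * requirements of this class:
 * 
 * <pre>
 * final ParseOp parseOp = new ParseOp(new BOp[] {}, NV.asMap(new NV[] {//
 *         new NV(ParseOp.Annotations.SOURCE_URI, new URIImpl("file:data.ttl")),//
 *         new NV(ParseOp.Annotations.RELATION_NAME, new String[] { "kb" }),//
 *         new NV(ParseOp.Annotations.TIMESTAMP, ITx.UNISOLATED),//
 *         }));
 * </pre>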
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 * 
 *          TODO Examine the integration point for Truth Maintenance (TM).
 *          <p>
 *          {@link ClosureEnum} and {@link CommitEnum} shape the way in which
 *          the update plan is generated. They are not options on the
 *          {@link ParseOp} itself.
 *          <p>
 *          We need to set up the assertion and retraction buffers such that
 *          they have the appropriate scope or (for database-at-once closure)
 *          we do not set up those buffers but instead recompute the closure
 *          of the database afterwards.
 *          <p>
 *          The assertion buffers might be populated after the IV resolution
 *          step and before we write on the indices. We then compute the fixed
 *          point of the closure over the delta and then write that onto the
 *          database. We should be able to specify that some sources contain
 *          data to be removed (INSERT DATA and REMOVE DATA or UNLOAD src).
 *          The operation should combine assertions and retractions to be
 *          efficient.
 *          <p>
 *          See {@link DataLoader}.
 * 
 *          TODO Add an operator which handles a zip archive, creating a LOAD
 *          for each resource in that archive. Recursive directory processing
 *          is similar. Both should result in multiple ParseOp instances which
 *          can run in parallel. Those ParseOp instances will feed the IV
 *          resolution, optional TM, and statement writer operations.
 *          <p>
 *          If we can make the SOURCE_URI a value expression, then we could
 *          flow solutions into the LOAD operation which would be the bindings
 *          for the source URI. Very nice! Then we could hash partition the
 *          LOAD operator across a cluster and do a parallel load very easily.
 *          If the source for those solutions was the parse of a single RDF
 *          file (or streamed URI) containing the files to be loaded, then we
 *          could also gain the indirection necessary to load large numbers of
 *          files in parallel on a cluster.
 * 
 *          TODO In at least the SIDs mode, we need to do some special
 *          operations when the statement buffer is flushed. That statement
 *          buffer could either be fed directly by the {@link ParseOp} or
 *          indirectly through solutions modeling statements flowing through
 *          the query engine. I am inclined to the latter for better
 *          parallelism. Even though there is more stuff on the heap and more
 *          latency within the stages, I think that we will get more out of
 *          the increased parallelism.
 * 
 *          TODO Any annotation here should be configurable from the
 *          {@link LoadGraph} AST node and (ideally) the SPARQL UPDATE syntax.
 * 
 *          FIXME This does not handle SIDs. The {@link StatementBuffer} logic
 *          needs to get into {@link InsertStatementsOp} for that to work, or
 *          the plan needs to be slightly different and hit a different insert
 *          operator for statements altogether.
 * 
 *          FIXME This does not handle Truth Maintenance.
 * 
 * @see PresortRioLoader
 * @see StatementBuffer
 * @see DataLoader
 * @see DataLoader.Options
 * @see RDFParserOptions
 * @see ClosureEnum
 * @see CommitEnum
 */
public class ParseOp extends PipelineOp {

    private static final transient Logger log = Logger
            .getLogger(ParseOp.class);

    /**
     * 
     */
    private static final long serialVersionUID = 1L;

    /**
     * Note: {@link BOp.Annotations#TIMEOUT} is respected to limit the read
     * time on an HTTP connection.
     * <p>
     * Note: The {@link RDFParserOptions} are initialized based on
     * {@link AbstractTripleStore#getProperties()}. Those defaults are then
     * overridden by any {@link RDFParserOptions.Options} which are specified
     * as annotations. When {@link LexiconRelation#isStoreBlankNodes()} is
     * <code>true</code>, blank node IDs will be preserved unless that option
     * has been explicitly overridden.
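     * <p>
     * For example (an illustrative sketch, using the {@link com.bigdata.bop.NV}
     * helper), blank node IDs could be preserved for a single operation by
     * overriding the database default with an annotation. The code below
     * casts the annotation value to a {@link Boolean}, so a
     * <code>Boolean</code> value is assumed:
     * 
     * <pre>
     * new NV(DataLoader.Options.PRESERVE_BNODE_IDS, Boolean.TRUE)
     * </pre>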
     */
    public interface Annotations extends PipelineOp.Annotations,
            ILocatableResourceAnnotations, RDFParserOptions.Options {

        /**
         * The data source to be parsed.
         */
        String SOURCE_URI = ParseOp.class.getName() + ".sourceUri";

        /**
         * The base URI for that data source (defaults to the
         * {@link #SOURCE_URI}).
         */
        String BASE_URI = ParseOp.class.getName() + ".baseUri";

        /**
         * The target graph (optional).
         * <p>
         * Note: This is ignored unless we are in quads mode. If we are in
         * quads mode and the data is not quads, then it is an error.
         */
        String TARGET_URI = ParseOp.class.getName() + ".targetUri";

        /**
         * When <code>true</code>, a parse error will be ignored.
         */
        String SILENT = ParseOp.class.getName() + ".silent";

        boolean DEFAULT_SILENT = false;

//        /**
//         * Optional property specifying the capacity of the
//         * {@link StatementBuffer} (default is {@value #DEFAULT_BUFFER_CAPACITY}
//         * statements).
//         */
//        String BUFFER_CAPACITY = ParseOp.class.getName() + ".bufferCapacity";
//
//        int DEFAULT_BUFFER_CAPACITY = 100000;

        /**
         * The name of the fallback {@link RDFFormat} (default
         * {@link #DEFAULT_FALLBACK}).
         * <p>
         * Note: {@link RDFFormat} is not {@link Serializable}, so this
         * annotation must be the format name rather than the
         * {@link RDFFormat} instance.
         */
        String FALLBACK = ParseOp.class.getName() + ".fallback";

        String DEFAULT_FALLBACK = RDFFormat.RDFXML.getName();

        /**
         * When <code>true</code>, the context position will be stripped from
         * the statements. This may be used to remove context from quads data
         * when loading into a triples or SIDs mode database. It may also be
         * used to force quads data into a specific context in quads mode.
         */
        String STRIP_CONTEXT = ParseOp.class.getName() + ".stripContext";

        boolean DEFAULT_STRIP_CONTEXT = false;

        /**
         * <code>true</code> iff HTTP requests may use cached responses.
         */
        String USES_CACHE = ParseOp.class.getName() + ".usesCache";

        boolean DEFAULT_USES_CACHE = true;

        /**
         * The {@link BufferedReader}'s internal buffer size (default is
         * {@value #DEFAULT_READ_BUFFER_SIZE}).
         */
        String READ_BUFFER_SIZE = ParseOp.class.getName() + ".readBufferSize";

        /**
         * Note: 8k is the default buffer size for {@link BufferedReader}, so
         * this default is the same as not specifying an override.
         */
        int DEFAULT_READ_BUFFER_SIZE = Bytes.kilobyte32 * 8;

    }

    public ParseOp(final BOp[] args, final Map<String, Object> annotations) {

        super(args, annotations);

        getRequiredProperty(Annotations.SOURCE_URI);

        getRequiredProperty(Annotations.RELATION_NAME);

        getRequiredProperty(Annotations.TIMESTAMP);

    }

    public ParseOp(final ParseOp op) {

        super(op);

    }

    /**
     * The s, p, o, and c variable names.
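     * <p>
     * Each statement visited by the parser is emitted as one binding set
     * over these variables. For example (illustrative data only):
     * 
     * <pre>
     * { s=&lt;http://example.org/a&gt;, p=rdf:type, o=&lt;http://example.org/B&gt;, c=&lt;http://example.org/g&gt; }
     * </pre>
     * 
     * The context variable [c] is bound iff a context is available after
     * {@link Annotations#STRIP_CONTEXT} and {@link Annotations#TARGET_URI}
     * have been applied.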
     */
    static protected final Var<?> s = Var.var("s"), p = Var.var("p"),
            o = Var.var("o"), c = Var.var("c");

    @Override
    public ParserStats newStats() {

        return new ParserStats();

    }

    @Override
    public FutureTask<Void> eval(final BOpContext<IBindingSet> context) {

        return new FutureTask<Void>(new ChunkTask(context, this));

    }

    static private class ChunkTask implements Callable<Void> {

        private final BOpContext<IBindingSet> context;

        private final ParserStats stats;

        private final URI sourceUri;

        private final String uriStr;

        private final String baseUri;

        private final URI targetUri;

        /**
         * When <code>true</code>, attempt to resolve the resource against the
         * CLASSPATH.
         * 
         * TODO Should we permit this?
         */
        private final boolean allowClassPath = true;

        /**
         * The default {@link RDFFormat}.
         */
        private final RDFFormat fallback;

        private final boolean sids, quads, silent;

        private final boolean stripContext;

        /**
         * The connection timeout (ms) -or- ZERO (0) for an infinite timeout.
         */
        private final int timeout;

        /**
         * When <code>true</code>, HTTP caching of the request is allowed.
         */
        private final boolean usesCache;

        /**
         * The {@link BufferedReader}'s internal buffer size (default is 8k).
         */
        private final int readBufferSize;

        /**
         * Output chunk size.
         */
        private final int chunkCapacity;

        /**
         * TODO Javadoc for annotations (which need to be defined) and their
         * interaction with the triple store properties.
         */
        private final IRDFParserOptions parserOptions;

        /**
         * The {@link AbstractTripleStore} on which the statements will
         * eventually be written.
         */
        private final AbstractTripleStore database;

        /*
         * FIXME Both the {@link #buffer} and the {@link #tm} objects need to
         * be part of shared state in order to support truth maintenance. That
         * state needs to be shared across the operations used to add/resolve
         * IVs, compute the closure over the statements to be added and/or
         * removed, and then insert/remove the statements and entailments,
         * updating the proof chains.
         * 
         * This does not belong on the ParseOp, but as something which wraps
         * ANY sequence of UPDATE operations (INSERT DATA, REMOVE DATA, LOAD
         * DATA, DELETE/INSERT, CLEAR, COPY, ADD, etc). TM is only available
         * for triples and SIDs on a Journal.
         */
//
//        /** StatementBuffer capacity. */
//        private final int bufferCapacity;
//
//        /**
//         * The object used to compute entailments for the database.
//         */
//        private final InferenceEngine inferenceEngine;
//
//        /**
//         * The object used to compute entailments for the database.
//         */
//        public InferenceEngine getInferenceEngine() {
//
//            return inferenceEngine;
//
//        }
//
//        /**
//         * Used to buffer writes.
//         * 
//         * @see #getAssertionBuffer()
//         */
//        private StatementBuffer<?> buffer;
//
//        /**
//         * The object used to maintain the closure for the database iff
//         * incremental truth maintenance is enabled.
//         */
//        private final TruthMaintenance tm;
//
//        /**
//         * Return the assertion buffer.
//         * <p>
//         * The assertion buffer is used to buffer statements that are being
//         * asserted so as to maximize the opportunity for batch writes. Truth
//         * maintenance (if enabled) will be performed no later than the
//         * commit of the transaction.
//         * <p>
//         * Note: The same {@link #buffer} is reused by each loader so that we
//         * can on the one hand minimize heap churn and on the other hand
//         * disable auto-flush when loading a series of small documents.
//         * However, we obtain a new buffer each time we perform incremental
//         * truth maintenance.
//         * <p>
//         * Note: When non-<code>null</code> and non-empty, the buffer MUST be
//         * flushed (a) if a transaction completes (otherwise writes will not
//         * be stored on the database); or (b) if there is a read against the
//         * database during a transaction (otherwise reads will not see the
//         * unflushed statements).
//         * <p>
//         * Note: if {@link #truthMaintenance} is enabled then this buffer is
//         * backed by a temporary store which accumulates the {@link SPO}s to
//         * be asserted. Otherwise it will write directly on the database each
//         * time it is flushed, including when it overflows.
//         * 
//         * @todo this should be refactored as an {@link IStatementBufferFactory}
//         *       where the appropriate factory is required for TM vs non-TM
//         *       scenarios (or where the factory is parameterized for TM vs
//         *       non-TM).
//         */
//        private StatementBuffer<?> getAssertionBuffer() {
//
//            if (buffer == null) {
//
//                if (tm != null) {
//
//                    buffer = new StatementBuffer(tm.newTempTripleStore(),
//                            database, bufferCapacity);
//
//                } else {
//
//                    buffer = new StatementBuffer(database, bufferCapacity) {
//                    };
//
//                }
//
//            }
//
//            return buffer;
//
//        }

        public ChunkTask(final BOpContext<IBindingSet> context,
                final ParseOp op) {

            this.context = context;

            this.stats = (ParserStats) context.getStats();

            this.sourceUri = (URI) op
                    .getRequiredProperty(Annotations.SOURCE_URI);

            // String value of that URI.
            this.uriStr = sourceUri.stringValue();

            // The base URI defaults to the source URI.
            this.baseUri = op.getProperty(Annotations.BASE_URI,
                    sourceUri.stringValue());

            final String namespace = ((String[]) op
                    .getRequiredProperty(Annotations.RELATION_NAME))[0];

            final long timestamp = (Long) op
                    .getRequiredProperty(Annotations.TIMESTAMP);

            this.database = (AbstractTripleStore) context.getResource(
                    namespace, timestamp);

            this.sids = database.isStatementIdentifiers();

            this.quads = database.isQuads();

            this.silent = op.getProperty(Annotations.SILENT,
                    Annotations.DEFAULT_SILENT);

            /*
             * Note: This is used as the default context position for the
             * statement if the database mode is quads. It needs to be a
             * BigdataURI from the value factory for the target database - the
             * same factory that is used by the parser.
             */
            this.targetUri = (URI) database.asValue((URI) op
                    .getProperty(Annotations.TARGET_URI));

            this.chunkCapacity = op.getChunkCapacity();

            /*
             * Note: RDFFormat is not Serializable, so this annotation must be
             * the format name rather than the RDFFormat instance.
             */
            this.fallback = RDFFormat.valueOf(op.getProperty(
                    Annotations.FALLBACK, Annotations.DEFAULT_FALLBACK));

            this.stripContext = op.getProperty(Annotations.STRIP_CONTEXT,
                    Annotations.DEFAULT_STRIP_CONTEXT);

//            this.inferenceEngine = database.getInferenceEngine();
//
//            if (database.getAxioms().isNone()) {
//
//                tm = null;
//
//            } else {
//
//                /*
//                 * Truth maintenance: buffer will write on a tempStore.
//                 */
//
//                tm = new TruthMaintenance(inferenceEngine);
//
//            }
//
//            this.bufferCapacity = op.getProperty(Annotations.BUFFER_CAPACITY,
//                    Annotations.DEFAULT_BUFFER_CAPACITY);

            // HTTP connection timeout (ms).
            this.timeout = op.getProperty(Annotations.TIMEOUT, 0/* infinite */);

            // HTTP caching allowed.
            this.usesCache = op.getProperty(Annotations.USES_CACHE,
                    Annotations.DEFAULT_USES_CACHE);

            // BufferedReader buffer size.
            this.readBufferSize = op.getProperty(Annotations.READ_BUFFER_SIZE,
                    Annotations.DEFAULT_READ_BUFFER_SIZE);

            /*
             * Set up the parser options.
             */
            {

                final Properties properties = database.getProperties();

                /*
                 * Initialize the parser options based on the database
                 * properties.
                 */
                this.parserOptions = new RDFParserOptions(properties);

                /*
                 * Now apply explicit annotations, which override anything
                 * picked up above.
                 */

                if (op.getProperty(Options.VERIFY_DATA) != null) {

                    parserOptions.setVerifyData((Boolean) op
                            .getProperty(Options.VERIFY_DATA));

                }

                if (op.getProperty(Options.STOP_AT_FIRST_ERROR) != null) {

                    parserOptions.setStopAtFirstError((Boolean) op
                            .getProperty(Options.STOP_AT_FIRST_ERROR));

                }

                if (op.getProperty(Options.DATATYPE_HANDLING) != null) {

                    parserOptions.setDatatypeHandling((DatatypeHandling) op
                            .getProperty(Options.DATATYPE_HANDLING));

                }

                if (op.getProperty(Options.PRESERVE_BNODE_IDS) != null) {

                    parserOptions.setPreserveBNodeIDs((Boolean) op
                            .getProperty(Options.PRESERVE_BNODE_IDS));

                }

                if ((properties.getProperty(Options.PRESERVE_BNODE_IDS) == null)
                        && op.getProperty(Options.PRESERVE_BNODE_IDS) == null
                        && database.getLexiconRelation().isStoreBlankNodes()) {

                    /*
                     * Note: preserveBNodeIDs is overridden based on whether or
                     * not the target is storing the blank node identifiers
                     * (unless the property was explicitly set - this amounts
                     * to a conditional default).
                     */

                    parserOptions.setPreserveBNodeIDs(true);

                }

            }

        }

        @Override
        public Void call() throws Exception {

            InputStream is = null;
            Reader reader = null;
            RDFFormat fmt = null;

            try {

                /*
                 * The expected format based on the file name component of the
                 * URL.
                 */
                fmt = RDFFormat.forFileName(uriStr, fallback);

                String contentEncoding = null;

                if (allowClassPath) {

                    // Try the classpath.
                    is = getClass().getResourceAsStream(uriStr);

                    if (is == null) {

                        /*
                         * Searching for the resource from the root of the
                         * class returned by getClass() (relative to the
                         * class' package) failed. Next try searching for the
                         * desired resource from the root of the jar; that is,
                         * search the jar file for an exact match of the input
                         * string.
                         */
                        is = getClass().getClassLoader().getResourceAsStream(
                                uriStr);

                    }

                }

                if (is == null) {

                    /*
                     * TODO Refactor to use the Apache HTTP components and
                     * provide for managed connection pools, authentication,
                     * etc. However, make sure that we preserve the ability to
                     * read from the local file system.
                     */

                    final URL url = new URL(uriStr);

                    final URLConnection conn = url.openConnection();

                    final HttpURLConnection conn2 = (conn instanceof HttpURLConnection) ? (HttpURLConnection) conn
                            : null;

                    if (conn2 != null) {

                        /*
                         * Only if using HTTP.
                         */

                        conn2.setRequestMethod("GET");

                        /*
                         * Set the Accept header based on the expected format.
                         */
                        final String acceptHeader;
                        {

                            final StringBuilder sb = new StringBuilder();

                            final List<String> acceptParams = RDFFormat
                                    .getAcceptParams(RDFFormat.values(),
                                            quads/* requireContext */,
                                            fmt/* preferredRDFFormat */);

                            for (String acceptParam : acceptParams) {

                                if (sb.length() > 0) {

                                    sb.append(",");

                                }

                                sb.append(acceptParam);

                            }

                            acceptHeader = sb.toString();

                            if (log.isDebugEnabled())
                                log.debug("Accept: " + acceptHeader);

                        }

                        conn2.setRequestProperty("Accept", acceptHeader);

                        conn.setUseCaches(usesCache);

                        conn.setReadTimeout(timeout);

                    }

                    conn.setDoInput(true);

                    // Connect.
                    conn.connect();

                    if (conn2 != null) {

                        // Extract the MIME type from the Content-Type header.
                        final String mimeType = new MiniMime(
                                conn.getContentType()).getMimeType();

                        // Figure out the RDFFormat from that.
                        fmt = RDFFormat.forMIMEType(mimeType, fallback);

                        contentEncoding = conn2.getContentEncoding();

                    }

                    is = conn.getInputStream();

                }

                /*
                 * Obtain a buffered reader on the input stream.
                 */

                if (contentEncoding == null) {

                    /*
                     * Assume the format's default charset if we have no
                     * better information.
                     */
                    contentEncoding = fmt.getCharset().name();

                }

                reader = new BufferedReader(new InputStreamReader(is,
                        contentEncoding), readBufferSize);

                parse(reader, baseUri, fmt, targetUri);

            } catch (Exception ex) {

                final String msg = "While loading: " + uriStr
                        + (fmt != null ? ", fmt=" + fmt : "");

                if (silent) {

                    log.warn(msg);

                } else {

                    throw new RuntimeException(msg, ex);

                }

            } finally {

                if (reader != null)
                    reader.close();

                if (is != null) {

                    is.close();

                }

            }

            // Done.
            return null;

        }

        /**
         * Run the parser against the source, writing one binding set per
         * statement onto the output sink.
         * 
         * @param source
         *            The reader from which the data will be parsed.
         * @param baseURL
         *            The base URL for the parser.
         * @param fmt
         *            The {@link RDFFormat} to be parsed.
         * @param defaultGraph
         *            The default graph (optional).
         * 
         * @throws IOException
         * @throws RDFHandlerException
         * @throws RDFParseException
         */
        private void parse(final Reader source, final String baseURL,
                final RDFFormat fmt, final URI defaultGraph)
                throws RDFParseException, RDFHandlerException, IOException {

            final RDFParserFactory rdfParserFactory = RDFParserRegistry
                    .getInstance().get(fmt);

            if (rdfParserFactory == null) {

                throw new RuntimeException("Parser factory not found: source="
                        + uriStr + ", fmt=" + fmt);

            }

            final RDFParser rdfParser = rdfParserFactory.getParser();

            rdfParser.setValueFactory(database.getValueFactory());

            rdfParser.setVerifyData(parserOptions.getVerifyData());

            rdfParser.setStopAtFirstError(parserOptions.getStopAtFirstError());

            rdfParser.setDatatypeHandling(parserOptions.getDatatypeHandling());

            rdfParser.setPreserveBNodeIDs(parserOptions.getPreserveBNodeIDs());

            /*
             * Note: The vector size should be pretty big for this. 100k might
             * be good (that is the DataLoader default).
             */
            final UnsyncLocalOutputBuffer<IBindingSet> unsyncBuffer = new UnsyncLocalOutputBuffer<IBindingSet>(
                    chunkCapacity, context.getSink());

            rdfParser.setRDFHandler(new AddStatementHandler(unsyncBuffer));

            /*
             * Run the parser, which will cause binding sets representing the
             * parsed statements to be written onto the output sink.
             */
            rdfParser.parse(source, baseURL);

            unsyncBuffer.flush();

        }

        /**
         * Helper class converts statements visited by the parser into binding
         * sets and adds them to the output buffer.
         */
        private class AddStatementHandler extends RDFHandlerBase {

            private final UnsyncLocalOutputBuffer<IBindingSet> unsyncBuffer;

            public AddStatementHandler(
                    final UnsyncLocalOutputBuffer<IBindingSet> unsyncBuffer) {

                this.unsyncBuffer = unsyncBuffer;

            }

            /**
             * Wrap the parsed {@link Value} as a mock {@link IV} whose cached
             * value is the parsed value. The mock IVs are resolved against the
             * lexicon downstream (e.g., by the {@link ChunkedResolutionOp}).
             */
            @SuppressWarnings({ "rawtypes", "unchecked" })
            private IConstant<IV> asConst(final Value v) {

                final IV iv = TermId.mockIV(VTE.valueOf(v));

                iv.setValue((BigdataValue) v);

                return new Constant<IV>(iv);

            }

            @Override
            public void handleStatement(final Statement stmt)
                    throws RDFHandlerException {

                final ListBindingSet bset = new ListBindingSet();

                bset.set(s, asConst(stmt.getSubject()));

                bset.set(p, asConst(stmt.getPredicate()));

                bset.set(o, asConst(stmt.getObject()));

                Resource context = stmt.getContext();

                if (stripContext) {

                    // Strip off the context position.
                    context = null;

                }

                if (quads && context == null) {

                    // Use the default context.
                    context = targetUri;

                }

                if (quads && context == null) {

                    throw new RuntimeException(
                            "Quads mode, but data are triples and the target graph was not specified: "
                                    + uriStr);

                }

                if (context != null) {

                    /*
                     * Bind [c] if available regardless of the database mode.
                     * This stops people from loading quads data into a triples
                     * or SIDs database in a way which throws away the context
                     * when they are not expecting that.
                     */

                    bset.set(c, asConst(context));

                }

                unsyncBuffer.add(bset);

                stats.toldTriples.increment();

            }

        }

    } // ChunkTask

}