/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jena.tdb.store.bulkloader ;
import java.io.InputStream ;
import java.util.List ;
import org.apache.jena.atlas.event.EventType ;
import org.apache.jena.atlas.lib.DateTimeUtils ;
import org.apache.jena.graph.Node ;
import org.apache.jena.graph.Triple ;
import org.apache.jena.riot.Lang ;
import org.apache.jena.riot.RDFDataMgr ;
import org.apache.jena.riot.RDFLanguages ;
import org.apache.jena.shared.PrefixMapping ;
import org.apache.jena.sparql.core.Quad ;
import org.apache.jena.tdb.TDB ;
import org.apache.jena.tdb.TDBException ;
import org.apache.jena.tdb.solver.stats.Stats ;
import org.apache.jena.tdb.solver.stats.StatsCollector ;
import org.apache.jena.tdb.store.DatasetGraphTDB ;
import org.apache.jena.tdb.store.nodetupletable.NodeTupleTable ;
import org.apache.jena.tdb.store.nodetupletable.NodeTupleTableView ;
import org.apache.jena.tdb.sys.Names ;
import org.slf4j.Logger ;
/** Overall framework for bulk loading */
public class BulkLoader {
// Coordinate the NodeTupleTable loading.
/**
 * Tick point for messages during loading of data.
 * Passed to every {@link LoadMonitor} built by {@code createLoadMonitor}.
 */
public static int DataTickPoint = 50 * 1000 ;
/**
 * Tick point for messages during secondary index creation.
 * Passed to every {@link LoadMonitor} built by {@code createLoadMonitor}.
 */
public static long IndexTickPoint = 100 * 1000 ;
/** Number of ticks per super tick (not read anywhere in this file). */
public static int superTick = 10 ;
// Common URI prefix for the event type names below.
// Must be declared before them: static initializers run in textual order.
private static String baseName = "http://jena.apache.org/TDB/bulkload/event#" ;
// Event types named baseName + "start-bulkload" / "finish-bulkload".
// NOTE(review): presumably mark the overall load; they are not raised in this file - confirm at call sites.
public static EventType evStartBulkload = new EventType(baseName + "start-bulkload") ;
public static EventType evFinishBulkload = new EventType(baseName + "finish-bulkload") ;
// Event types named baseName + "start-bulkload-data" / "finish-bulkload-data".
public static EventType evStartDataBulkload = new EventType(baseName + "start-bulkload-data") ;
public static EventType evFinishDataBulkload = new EventType(baseName + "finish-bulkload-data") ;
// Event types named baseName + "start-bulkload-index" / "finish-bulkload-index".
public static EventType evStartIndexBulkload = new EventType(baseName + "start-bulkload-index") ;
public static EventType evFinishIndexBulkload = new EventType(baseName + "finish-bulkload-index") ;
// Logger used for "Load: ..." messages and handed to load monitors when progress is shown.
static private Logger loadLogger = TDB.logLoader ;
// Event callbacks for the load stages?
// On what object? The dataset.
// /** Load into default graph */
// public static void loadTriples(DatasetGraphTDB dsg, String url, boolean
// showProgress)
// {
// loadTriples(dsg, asList(url) , showProgress) ;
// }
/**
 * Bulk load the named sources into the default graph of a dataset.
 *
 * @param dsg          dataset receiving the triples
 * @param urls         sources to parse, handled in list order
 * @param showProgress when true, the load monitor is given the loader log; otherwise it gets no logger
 * @param collectStats when true, a statistics collector is created for the load
 */
public static void loadDefaultGraph(DatasetGraphTDB dsg, List<String> urls, boolean showProgress, boolean collectStats) {
    loadTriples$(destinationDefaultGraph(dsg, showProgress, collectStats), urls) ;
}
/**
 * Bulk load an input stream into the default graph of a dataset.
 * The stream is parsed as N-Triples.
 *
 * @param dsg          dataset receiving the triples
 * @param input        stream to parse
 * @param showProgress when true, the load monitor is given the loader log; otherwise it gets no logger
 * @param collectStats when true, a statistics collector is created for the load
 */
public static void loadDefaultGraph(DatasetGraphTDB dsg, InputStream input, boolean showProgress, boolean collectStats) {
    loadTriples$(destinationDefaultGraph(dsg, showProgress, collectStats), input) ;
}
/**
 * Build the bulk destination for the default graph.
 * The default graph is expressed as a {@code null} graph node.
 */
private static BulkStreamRDF destinationDefaultGraph(DatasetGraphTDB dsg, boolean showProgress, boolean collectStats) {
    final Node defaultGraph = null ;
    return destinationGraph(dsg, defaultGraph, showProgress, collectStats) ;
}
/**
 * Bulk load the named sources into one graph of a dataset.
 *
 * @param dsg          dataset receiving the triples
 * @param graphNode    name of the target graph; {@code null} selects the default graph
 * @param urls         sources to parse, handled in list order
 * @param showProgress when true, the load monitor is given the loader log; otherwise it gets no logger
 * @param collectStats when true, a statistics collector is created for the load
 */
public static void loadNamedGraph(DatasetGraphTDB dsg, Node graphNode, List<String> urls, boolean showProgress, boolean collectStats) {
    loadTriples$(destinationNamedGraph(dsg, graphNode, showProgress, collectStats), urls) ;
}
/**
 * Bulk load an input stream into one graph of a dataset.
 * The stream is parsed as N-Triples.
 *
 * @param dsg          dataset receiving the triples
 * @param graphNode    name of the target graph; {@code null} selects the default graph
 * @param input        stream to parse
 * @param showProgress when true, the load monitor is given the loader log; otherwise it gets no logger
 * @param collectStats when true, a statistics collector is created for the load
 */
public static void loadNamedGraph(DatasetGraphTDB dsg, Node graphNode, InputStream input, boolean showProgress, boolean collectStats) {
    loadTriples$(destinationNamedGraph(dsg, graphNode, showProgress, collectStats), input) ;
}
/**
 * Bulk load the named sources (triples and/or quads) into a dataset.
 *
 * @param dsg          dataset receiving the data
 * @param urls         sources to parse, handled in list order
 * @param showProgress when true, the load monitors are given the loader log; otherwise they get no logger
 * @param collectStats when true, a statistics collector is created for the load
 */
public static void loadDataset(DatasetGraphTDB dsg, List<String> urls, boolean showProgress, boolean collectStats) {
    loadQuads$(destinationDataset(dsg, showProgress, collectStats), urls) ;
}
/**
 * Bulk load an input stream into a dataset.
 * The stream is parsed as N-Quads.
 *
 * @param dsg          dataset receiving the data
 * @param input        stream to parse
 * @param showProgress when true, the load monitors are given the loader log; otherwise they get no logger
 * @param collectStats when true, a statistics collector is created for the load
 */
public static void loadDataset(DatasetGraphTDB dsg, InputStream input, boolean showProgress, boolean collectStats) {
    loadQuads$(destinationDataset(dsg, showProgress, collectStats), input) ;
}
/**
 * Parse each source in turn into a graph destination, as a single bulk operation.
 * The syntax of a source is derived from its name, with N-Triples supplied as the default.
 * A parse failure propagates to the caller and {@code finishBulk} is then not invoked.
 */
private static void loadTriples$(BulkStreamRDF dest, List<String> urls) {
    dest.startBulk() ;
    for ( String source : urls ) {
        String message = "Load: " + source + " -- " + DateTimeUtils.nowAsString() ;
        loadLogger.info(message) ;
        RDFDataMgr.parse(dest, source, RDFLanguages.filenameToLang(source, Lang.NTRIPLES)) ;
    }
    dest.finishBulk() ;
}
/**
 * Parse an input stream, read as N-Triples, into a graph destination as a single bulk operation.
 * The start of the load is logged before the bulk operation begins.
 */
private static void loadTriples$(BulkStreamRDF dest, InputStream input) {
    String message = "Load: from input stream -- " + DateTimeUtils.nowAsString() ;
    loadLogger.info(message) ;

    dest.startBulk() ;
    RDFDataMgr.parse(dest, input, Lang.NTRIPLES) ;
    dest.finishBulk() ;
}
/**
 * Parse each source in turn into a dataset destination, as a single bulk operation.
 * The syntax of a source is derived from its name, with N-Quads supplied as the default.
 * A parse failure propagates to the caller and {@code finishBulk} is then not invoked.
 */
private static void loadQuads$(BulkStreamRDF dest, List<String> urls) {
    dest.startBulk() ;
    for ( String source : urls ) {
        String message = "Load: " + source + " -- " + DateTimeUtils.nowAsString() ;
        loadLogger.info(message) ;
        RDFDataMgr.parse(dest, source, RDFLanguages.filenameToLang(source, Lang.NQUADS)) ;
    }
    dest.finishBulk() ;
}
/**
 * Parse an input stream, read as N-Quads, into a dataset destination as a single bulk operation.
 * The start of the load is logged before the bulk operation begins.
 */
private static void loadQuads$(BulkStreamRDF dest, InputStream input) {
    String message = "Load: from input stream -- " + DateTimeUtils.nowAsString() ;
    loadLogger.info(message) ;

    dest.startBulk() ;
    RDFDataMgr.parse(dest, input, RDFLanguages.NQUADS) ;
    dest.finishBulk() ;
}
/**
 * Build the bulk destination for a graph given by name.
 * A {@code null} name is routed to the default-graph destination.
 */
private static BulkStreamRDF destinationNamedGraph(DatasetGraphTDB dsg, Node graphName, boolean showProgress, boolean collectStats) {
    return ( graphName == null )
        ? destinationDefaultGraph(dsg, showProgress, collectStats)
        : destinationGraph(dsg, graphName, showProgress, collectStats) ;
}
/**
 * Create a load monitor configured with the current {@code DataTickPoint} and {@code IndexTickPoint}.
 *
 * @param dsg          dataset being loaded
 * @param itemName     label for the items being loaded (this file uses "triples" and "quads")
 * @param showProgress when true the monitor is given the loader log; when false it is given no logger
 * @return a new monitor
 */
public static LoadMonitor createLoadMonitor(DatasetGraphTDB dsg, String itemName, boolean showProgress) {
    // Only the logger differs between the two modes.
    Logger progressLog = showProgress ? loadLogger : null ;
    return new LoadMonitor(dsg, progressLog, itemName, DataTickPoint, IndexTickPoint) ;
}
/** Build the bulk destination that accepts both triples and quads for a whole dataset. */
private static BulkStreamRDF destinationDataset(DatasetGraphTDB dsg, boolean showProgress, boolean collectStats) {
    BulkStreamRDF datasetDestination = new DestinationDSG(dsg, showProgress, collectStats) ;
    return datasetDestination ;
}
/**
 * Build the bulk destination for a single graph.
 * A {@code null} graph node stands for the default graph.
 */
private static BulkStreamRDF destinationGraph(DatasetGraphTDB dsg, Node graphNode, boolean showProgress, boolean collectStats) {
    BulkStreamRDF graphDestination = new DestinationGraph(dsg, graphNode, showProgress, collectStats) ;
    return graphDestination ;
}
/**
 * Bulk destination for a whole dataset: triples (and quads in the default graph)
 * go to the triple table loader, all other quads go to the quad table loader.
 */
private static final class DestinationDSG implements BulkStreamRDF {
    private final DatasetGraphTDB dsg ;
    // Whether the dataset was empty when this destination was created; gates the stats write.
    private final boolean startedEmpty ;
    private final LoadMonitor tripleMonitor ;
    private final LoadMonitor quadMonitor ;
    private final LoaderNodeTupleTable tripleLoader ;
    private final LoaderNodeTupleTable quadLoader ;
    private final boolean showProgress ;
    private final boolean collectStats ;
    // Number of items passed to process().
    private long count = 0 ;
    // Created in startBulk() only when statistics collection was requested.
    private StatsCollector stats = null ;

    DestinationDSG(final DatasetGraphTDB dsg, boolean showProgress, boolean collectStats) {
        this.dsg = dsg ;
        this.startedEmpty = dsg.isEmpty() ;
        this.tripleMonitor = createLoadMonitor(dsg, "triples", showProgress) ;
        this.quadMonitor = createLoadMonitor(dsg, "quads", showProgress) ;
        this.tripleLoader = new LoaderNodeTupleTable(dsg.getTripleTable().getNodeTupleTable(), "triples", tripleMonitor) ;
        this.quadLoader = new LoaderNodeTupleTable(dsg.getQuadTable().getNodeTupleTable(), "quads", quadMonitor) ;
        this.showProgress = showProgress ;
        this.collectStats = collectStats ;
    }

    /** Open both loaders (load start, then data start) and set up statistics if requested. */
    @Override
    final public void startBulk() {
        tripleLoader.loadStart() ;
        quadLoader.loadStart() ;

        tripleLoader.loadDataStart() ;
        quadLoader.loadDataStart() ;

        if ( collectStats ) {
            stats = new StatsCollector() ;
        }
    }

    /** A triple is handled as a quad whose graph is {@code Quad.tripleInQuad}. */
    @Override
    public void triple(Triple triple) {
        process(Quad.tripleInQuad, triple.getSubject(), triple.getPredicate(), triple.getObject()) ;
    }

    /** Quads that are really triples, or that sit in the default graph, are loaded with a null graph. */
    @Override
    public void quad(Quad quad) {
        // Union graph?!
        boolean noGraphName = quad.isTriple() || quad.isDefaultGraph() ;
        Node g = noGraphName ? null : quad.getGraph() ;
        process(g, quad.getSubject(), quad.getPredicate(), quad.getObject()) ;
    }

    // Send one item to the matching loader, count it, and record it in the statistics if collecting.
    private void process(Node g, Node s, Node p, Node o) {
        if ( g != null ) {
            quadLoader.load(g, s, p, o) ;
        } else {
            tripleLoader.load(s, p, o) ;
        }
        count++ ;
        if ( stats != null ) {
            stats.record(g, s, p, o) ;
        }
    }

    /**
     * Close the data phase, run the index phase, finish both loaders,
     * write statistics when applicable, and force a sync of the dataset.
     */
    @Override
    public void finishBulk() {
        tripleLoader.loadDataFinish() ;
        quadLoader.loadDataFinish() ;

        tripleLoader.loadIndexStart() ;
        quadLoader.loadIndexStart() ;

        tripleLoader.loadIndexFinish() ;
        quadLoader.loadIndexFinish() ;

        tripleLoader.loadFinish() ;
        quadLoader.loadFinish() ;

        writeStatsIfApplicable() ;
        forceSync(dsg) ;
    }

    // Statistics are written only for a non-memory location that started empty, and only if collected.
    private void writeStatsIfApplicable() {
        boolean onDisk = !dsg.getLocation().isMem() ;
        if ( onDisk && startedEmpty && stats != null ) {
            Stats.write(dsg.getLocation().getPath(Names.optStats), stats.results()) ;
        }
    }

    @Override
    public void start() {}

    @Override
    public void base(String base) {}

    /** Prefixes are stored in the dataset's default prefix mapping. */
    @Override
    public void prefix(String prefix, String iri) {
        dsg.getPrefixes().getPrefixMapping().setNsPrefix(prefix, iri) ;
    }

    @Override
    public void finish() {}
}
/**
 * Bulk destination for a single graph: triples are loaded into one NodeTupleTable,
 * either the triple table (default graph) or a view of the quad table fixed to the graph name.
 */
private static final class DestinationGraph implements BulkStreamRDF {
    private final DatasetGraphTDB dsg ;
    // Null for the default graph.
    private final Node graphName ;
    private final LoadMonitor monitor ;
    private final LoaderNodeTupleTable tripleLoader ;
    // Whether the dataset was empty when this destination was created; gates the stats write.
    private final boolean startedEmpty ;
    private final boolean collectStats ;
    // Number of triples received.
    private long count = 0 ;
    // Created in startBulk() only when statistics collection was requested.
    private StatsCollector stats = null ;

    // Graph node is null for default graph.
    DestinationGraph(final DatasetGraphTDB dsg, Node graphNode, boolean showProgress, boolean collectStats) {
        this.dsg = dsg ;
        this.graphName = graphNode ;
        this.collectStats = collectStats ;

        // Choose NodeTupleTable.
        boolean intoDefaultGraph = ( graphNode == null ) || Quad.isDefaultGraph(graphNode) ;
        NodeTupleTable target = intoDefaultGraph
            ? dsg.getTripleTable().getNodeTupleTable()
            : new NodeTupleTableView(dsg.getQuadTable().getNodeTupleTable(), graphNode) ;

        this.startedEmpty = dsg.isEmpty() ;
        this.monitor = createLoadMonitor(dsg, "triples", showProgress) ;
        this.tripleLoader = new LoaderNodeTupleTable(target, "triples", monitor) ;
    }

    /** Open the loader (load start, then data start) and set up statistics if requested. */
    @Override
    final public void startBulk() {
        tripleLoader.loadStart() ;
        tripleLoader.loadDataStart() ;
        if ( collectStats ) {
            stats = new StatsCollector() ;
        }
    }

    /** Load one triple, record it in the statistics (with a null graph) if collecting, and count it. */
    @Override
    final public void triple(Triple triple) {
        final Node s = triple.getSubject() ;
        final Node p = triple.getPredicate() ;
        final Node o = triple.getObject() ;

        tripleLoader.load(s, p, o) ;
        if ( stats != null ) {
            stats.record(null, s, p, o) ;
        }
        count++ ;
    }

    /**
     * Close the data phase, run the index phase, finish the loader,
     * write statistics when applicable, and force a sync of the dataset.
     */
    @Override
    final public void finishBulk() {
        tripleLoader.loadDataFinish() ;
        tripleLoader.loadIndexStart() ;
        tripleLoader.loadIndexFinish() ;
        tripleLoader.loadFinish() ;

        writeStatsIfApplicable() ;
        forceSync(dsg) ;
    }

    // Statistics are written only for a non-memory location that started empty, and only if collected.
    private void writeStatsIfApplicable() {
        boolean onDisk = !dsg.getLocation().isMem() ;
        if ( onDisk && startedEmpty && stats != null ) {
            Stats.write(dsg.getLocation().getPath(Names.optStats), stats.results()) ;
        }
    }

    @Override
    public void start() {}

    /** Quads are not accepted by a single-graph destination. */
    @Override
    public void quad(Quad quad) {
        throw new TDBException("Quad encountered while loading a single graph") ;
    }

    @Override
    public void base(String base) {}

    /**
     * Store a prefix: in the default mapping when there is no graph name,
     * otherwise in the mapping for the graph's URI. Blank node graph names
     * are skipped with a warning.
     */
    @Override
    public void prefix(String prefix, String iri) {
        final PrefixMapping pmap ;
        if ( graphName == null ) {
            pmap = dsg.getPrefixes().getPrefixMapping() ;
        } else if ( graphName.isBlank() ) {
            loadLogger.warn("Prefixes for blank node graphs not stored") ;
            return ;
        } else {
            pmap = dsg.getPrefixes().getPrefixMapping(graphName.getURI()) ;
        }
        pmap.setNsPrefix(prefix, iri) ;
    }

    @Override
    public void finish() {}
}
/**
 * Flush the dataset after a bulk load.
 * Syncs the node table reached through the triple table, the quad table and the
 * prefixes, one call each, and then syncs the dataset itself.
 *
 * @param dsg dataset that has just been bulk loaded
 */
static void forceSync(DatasetGraphTDB dsg) {
    // Force sync - we have been bypassing DSG tables.
    // THIS DOES NOT WORK IF modules check for SYNC necessity.
    dsg.getTripleTable().getNodeTupleTable().getNodeTable().sync() ;
    // The quad table's node table was previously synced twice in a row; once is enough.
    dsg.getQuadTable().getNodeTupleTable().getNodeTable().sync() ;
    dsg.getPrefixes().getNodeTupleTable().getNodeTable().sync() ;
    // This is not enough -- modules check whether sync needed.
    dsg.sync() ;
}
}