/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Sep 18, 2009
*/
package com.bigdata.samples;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Properties;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.openrdf.repository.Repository;
import org.openrdf.repository.RepositoryConnection;
import org.openrdf.rio.RDFFormat;
import com.bigdata.journal.Journal;
import com.bigdata.journal.RWStrategy;
import com.bigdata.rdf.load.RDFFilenameFilter;
import com.bigdata.rdf.sail.BigdataSail;
import com.bigdata.rdf.sail.BigdataSailRepository;
import com.bigdata.rdf.store.DataLoader;
/**
* Sample code for loading RDF data using the {@link BigdataSail} and the
* openrdf API.
*
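* Typical invocation (a sketch only: the property file name and the data path
* below are placeholders, not fixed values):
*
* <pre>
* java com.bigdata.samples.LoadNamedGraphs quad.properties /path/to/data.zip
* </pre>
*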
* @see DataLoader
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
public class LoadNamedGraphs extends SampleCode {
/**
* Load all data from a file, zip archive, or directory (non-recursive).
*
* @param properties
*            The configuration properties for the {@link BigdataSail}.
* @param file
*            The file, zip archive, or directory to load.
*
* @throws Exception
*/
public void loadAll(final Properties properties, final File file) throws Exception {
/*
* We are going to use the "quads" mode. Right now, the quads mode does
* not do inference AT ALL.
*/
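/*
* A property file for quads mode would typically contain entries along these
* lines (a sketch only; the exact keys are assumptions and should be verified
* against BigdataSail.Options / AbstractTripleStore.Options):
*
* com.bigdata.rdf.store.AbstractTripleStore.quads=true
* com.bigdata.rdf.sail.truthMaintenance=false
* com.bigdata.rdf.store.AbstractTripleStore.axiomsClass=com.bigdata.rdf.axioms.NoAxioms
* com.bigdata.journal.AbstractJournal.file=bigdata.jnl
*/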
// final File propertyFile = new File(
// "c:/bigdata-data-sets/LoadNamedGraphs.properties");
// // create a backing file
// final File journalFile = new File("c:/bigdata.jnl");
//// final File journalFile = File.createTempFile("bigdata", ".jnl");
//// journalFile.deleteOnExit();
// properties.setProperty(BigdataSail.Options.FILE, journalFile
// .getAbsolutePath());
// You can do the overrides in the property file.
// /*
// * Override the write retention queue (default is 500).
// *
// * This makes a BIG difference in the journal size and throughput if you
// * are bulk loading data and have enough RAM.
// */
// properties.setProperty(
// IndexMetadata.Options.WRITE_RETENTION_QUEUE_CAPACITY, "8000");
//
// properties.setProperty(IndexMetadata.Options.BTREE_BRANCHING_FACTOR,
// "64");
// instantiate a sail
final BigdataSail sail = new BigdataSail(properties);
try {
final Repository repo = new BigdataSailRepository(sail);
repo.initialize();
final RepositoryConnection cxn = repo.getConnection();
cxn.setAutoCommit(false);
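/*
* With auto-commit disabled the entire load runs as one SAIL "transaction":
* the data only becomes restart-safe at cxn.commit() below, and any failure
* causes cxn.rollback().
*/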
try {
// Note: getStatementCount() is a fast range count. cxn.size() would report an
// exact count but may require a full scan of the statement index (see the
// comment in the directory branch below).
final long stmtsBefore = sail.getDatabase().getStatementCount();
final long start = System.currentTimeMillis();
if (file.getName().endsWith(".zip")
|| file.getName().endsWith(".ZIP")) {
// then process the sample data files one at a time
final InputStream is = new FileInputStream(file);
try {
final ZipInputStream zis = new ZipInputStream(
new BufferedInputStream(is));
try {
ZipEntry ze = null;
while ((ze = zis.getNextEntry()) != null) {
if (ze.isDirectory()) {
continue;
}
final String name = ze.getName();
if (log.isInfoEnabled())
log.info(name);
final ByteArrayOutputStream baos = new ByteArrayOutputStream();
final byte[] bytes = new byte[4096];
int count;
while ((count = zis.read(bytes, 0, 4096)) != -1) {
baos.write(bytes, 0, count);
}
baos.close();
final Reader reader = new InputStreamReader(
new ByteArrayInputStream(baos.toByteArray()));
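// Each zip entry is parsed with its own base IRI (the zip file URI plus the
// entry name) and the RDF format is selected from the entry name.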
final String baseIRI = file.toURI() + "/" + name;
cxn.add(reader, baseIRI, RDFFormat.forFileName(name));
// Note: due to buffering, this reports statements flushed to the
// database, not statements added to the cxn.
final long elapsed = System.currentTimeMillis() - start;
// fast range count!
final long stmtsAfter = sail.getDatabase().getStatementCount();
// long stmtsAfter = cxn.size();
final long stmtsAdded = stmtsAfter - stmtsBefore;
final int throughput = (int) ((double) stmtsAdded
/ (double) elapsed * 1000d);
System.err.println("loaded: " + name + " : " + stmtsAdded
+ " stmts in " + elapsed + " millis: " + throughput
+ " stmts/sec");
logCounters(sail);
}
} finally {
zis.close();
}
} finally {
is.close();
}
} else if(file.isDirectory()) {
final File[] files = file.listFiles(new RDFFilenameFilter());
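// RDFFilenameFilter is used to restrict the listing to files whose names look
// like RDF data. Note: the scan is non-recursive, and every file is parsed
// against the directory's URI (not the individual file's URI) as the base IRI.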
if (files != null) {
int nloaded = 0;
for (File f : files) {
// System.err.println("Reading: " + f);
final Reader reader = new InputStreamReader(
(f.getName().endsWith(".gz")
|| f.getName().endsWith(".GZ") ? new GZIPInputStream(
new FileInputStream(f))
: new FileInputStream(f)));
try {
final String baseIRI = file.toURI().toString();
cxn.add(reader, baseIRI, RDFFormat.forFileName(f
.getName()));
/*
* Note: due to buffering, this reports stmts flushed to
* the db not stmts added to the cxn.
*
* Note: cxn.size() will do a FULL SCAN of the statement
* index for many cases in order to report an exact
* range count. This is an issue with the Sesame API
* semantics (exact range count reporting) and with
* delete markers in the bigdata indices. Fast range
* counts are available with two key probes but do not
* satisfy the Sesame semantics. You can get the fast
* range count from the bigdata APIs.
*/
final long elapsed = System.currentTimeMillis() - start;
// fast range count!
final long stmtsAfter = sail.getDatabase().getStatementCount();
// long stmtsAfter = cxn.size();
final long stmtsAdded = stmtsAfter - stmtsBefore;
final int throughput = (int) ((double) stmtsAdded
/ (double) elapsed * 1000d);
nloaded++;
System.err.println("loaded: " + f
+ " : " + stmtsAdded + " stmts"
+" in " + elapsed + " millis" +
" : "+ throughput + " stmts/sec"+
", nloaded="+nloaded);
logCounters(sail);
} finally {
reader.close();
}
}
}
} else if(file.isFile()) {
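// A single regular file is loaded as-is. Unlike the directory branch above,
// no .gz decompression is applied here.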
final Reader reader = new InputStreamReader(
new FileInputStream(file));
try {
final String baseIRI = file.toURI().toString();
cxn.add(reader, baseIRI, RDFFormat.forFileName(file
.getName()));
// Note: due to buffering, this reports statements flushed to the
// database, not statements added to the cxn.
final long elapsed = System.currentTimeMillis() - start;
// long stmtsAfter = cxn.size();
final long stmtsAfter = sail.getDatabase().getStatementCount();
final long stmtsAdded = stmtsAfter - stmtsBefore;
final int throughput = (int) ((double) stmtsAdded
/ (double) elapsed * 1000d);
System.err.println("loaded: " + file + " : " + stmtsAdded
+ " stmts in " + elapsed + " millis: " + throughput
+ " stmts/sec");
logCounters(sail);
} finally {
reader.close();
}
} else {
System.err.println("Can not load: "+file);
}
// Since auto-commit is disabled, we must commit the SAIL "transaction" explicitly.
cxn.commit();
// gather statistics
final long elapsed = System.currentTimeMillis() - start;
// long stmtsAfter = cxn.size();
final long stmtsAfter = sail.getDatabase().getStatementCount();
final long stmtsAdded = stmtsAfter - stmtsBefore;
final int throughput = (int) ((double) stmtsAdded / (double) elapsed * 1000d);
System.err.println("statements after: " + stmtsAfter);
System.err.println("loaded: " + stmtsAdded + " in " + elapsed + " millis: "
+ throughput + " stmts/sec");
logCounters(sail);
} catch (Exception ex) {
cxn.rollback();
throw ex;
} finally {
// close the repository connection
cxn.close();
}
} finally {
sail.shutDown();
}
}
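/**
* Write the RWStore allocator summary to stderr. This is a no-op unless the
* backing store is a {@link Journal} using the {@link RWStrategy}.
*/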
private void logCounters(final BigdataSail sail) {
if (!(sail.getDatabase().getIndexManager() instanceof Journal))
return;
if (!(((Journal) sail.getDatabase().getIndexManager())
.getBufferStrategy() instanceof RWStrategy))
return;
final StringBuilder sb = new StringBuilder();
((RWStrategy) ((Journal) sail.getDatabase()
.getIndexManager()).getBufferStrategy()).getStore()
.showAllocators(sb);
System.err.println(sb.toString());
}
/**
* Loads a bunch of data from a file, zip file, or directory
* (non-recursive). You can use <code>quad.properties</code> as the
* properties file or anything else that you like.
*
* @param args
* The name of the properties file and the name of the file, zip file, or
* directory to load.
*/
public static void main(final String[] args) {
if (args.length < 2) {
System.err.println("usage: LoadNamedGraphs <propertyFile> <fileOrDirectoryOrZip>");
System.exit(1);
}
final Properties properties;
try {
final File propertyFile = new File(args[0]);
if (!propertyFile.exists()) {
throw new FileNotFoundException(propertyFile.toString());
}
properties = new Properties();
final InputStream is = new BufferedInputStream(new FileInputStream(
propertyFile));
try {
properties.load(is);
} finally {
is.close();
}
} catch(IOException ex) {
throw new RuntimeException(ex);
}
try {
final File dataFileOrDirectory = new File(args[1]);
if (!dataFileOrDirectory.exists())
throw new FileNotFoundException(dataFileOrDirectory.toString());
new LoadNamedGraphs().loadAll(properties, dataFileOrDirectory);
} catch (Exception ex) {
ex.printStackTrace(System.err);
}
}
}