/**
* *****************************************************************************
* Copyright 2012 University of Southern California
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*
* This code was developed by the Information Integration Group as part of the
* Karma project at the Information Sciences Institute of the University of
* Southern California. For more information, publications, and related
* projects, please see: http://www.isi.edu/integration
* ****************************************************************************
*/
package edu.isi.karma.rdf;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Modifier;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.reflections.Reflections;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.ResIterator;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.StmtIterator;
import edu.isi.karma.config.ModelingConfiguration;
import edu.isi.karma.config.ModelingConfigurationRegistry;
import edu.isi.karma.controller.update.UpdateContainer;
import edu.isi.karma.er.helper.PythonRepository;
import edu.isi.karma.er.helper.PythonRepositoryRegistry;
import edu.isi.karma.kr2rml.ContextIdentifier;
import edu.isi.karma.kr2rml.URIFormatter;
import edu.isi.karma.kr2rml.mapping.R2RMLMappingIdentifier;
import edu.isi.karma.kr2rml.mapping.WorksheetR2RMLJenaModelParser;
import edu.isi.karma.kr2rml.planning.UserSpecifiedRootStrategy;
import edu.isi.karma.kr2rml.writer.JSONKR2RMLRDFWriter;
import edu.isi.karma.kr2rml.writer.KR2RMLRDFWriter;
import edu.isi.karma.kr2rml.writer.N3KR2RMLRDFWriter;
import edu.isi.karma.metadata.KarmaMetadataManager;
import edu.isi.karma.metadata.PythonTransformationMetadata;
import edu.isi.karma.metadata.UserConfigMetadata;
import edu.isi.karma.metadata.UserPreferencesMetadata;
import edu.isi.karma.modeling.Namespaces;
import edu.isi.karma.modeling.Uris;
import edu.isi.karma.modeling.semantictypes.SemanticTypeUtil;
import edu.isi.karma.rdf.GenericRDFGenerator.InputType;
import edu.isi.karma.util.DBType;
import edu.isi.karma.util.EncodingDetector;
import edu.isi.karma.webserver.ContextParametersRegistry;
import edu.isi.karma.webserver.KarmaException;
import edu.isi.karma.webserver.ServletContextParameterMap;
import edu.isi.karma.webserver.ServletContextParameterMap.ContextParameter;
/**
 * Command-line tool that publishes RDF (and/or JSON-LD) from a database table,
 * a SQL query, or a file-based source (CSV, JSON, XML, AVRO, JL) using a
 * Karma R2RML model. See {@link #createCommandLineOptions()} for the options.
 */
public class OfflineRdfGenerator {
private static Logger logger = LoggerFactory.getLogger(OfflineRdfGenerator.class);
// Raw option values exactly as supplied on the command line; any of these
// may be null when the corresponding option was omitted.
private String inputType;
private String inputEncoding;
private String inputDelimiter;
private String inputTextQualifier;
private String inputHeaderStartIndex;
private String inputDataStartIndex;
private String modelFilePath;
private String modelURLString;
private String baseURI;
private String outputFilePath;
private String outputFileJSONPath;
private String bloomFiltersFilePath;
// Output sinks (N3, JSON-LD, bloom filter); populated by createWriters().
private List<KR2RMLRDFWriter> writers;
// Resolved from modelFilePath or modelURLString in createModelURL().
private URL modelURL;
// Database connection options (only used for DB/SQL source types).
private String dbtypeStr;
private String username;
private String password;
private String hostname;
private String encoding;
private String sourceFilePath;
private String dBorSIDName;
private String tablename;
private String topkrows;
private String queryFile;
private String portnumber;
private String sMaxNumLines;
private String sourceName;
private String selectionName;
// Parsed/derived values filled in by the validate* methods.
private int port;
private DBType dbType;
private File inputFile;
private int maxNumLines;
// Triple-map pruning controls; names get prefixed with Namespaces.KARMA_DEV.
private String rootTripleMap;
private List<String> killTripleMap;
private List<String> stopTripleMap;
private List<String> POMToKill;
// Optional global JSON-LD context, from a local file or a URL.
private String contextFile;
private String contextURLString;
private URL contextURL;
private ServletContextParameterMap contextParameters;
/**
 * Builds a generator configured from the already-parsed command line.
 * Option values are copied into fields; validation happens later in generate().
 */
public OfflineRdfGenerator(CommandLine cl)
{
this.writers = new LinkedList<>();
parseCommandLineOptions(cl);
}
/**
 * Entry point: parses the command line, runs the RDF generation pipeline,
 * and logs the total elapsed time. All failures are logged, not rethrown.
 *
 * @param args command-line arguments; see createCommandLineOptions()
 */
public static void main(String[] args) {
    Options options = createCommandLineOptions();
    CommandLine cl = CommandLineArgumentParser.parse(args, options, OfflineRdfGenerator.class.getSimpleName());
    if (cl == null) {
        // Parsing failed or --help was requested; usage was already printed.
        return;
    }
    try {
        OfflineRdfGenerator generator = new OfflineRdfGenerator(cl);
        long start = System.currentTimeMillis();
        generator.generate();
        long end = System.currentTimeMillis();
        // Parameterized logging instead of string concatenation.
        logger.info("Time to generate RDF: {} mins", (float) (end - start) / (1000 * 60));
    } catch (Exception e) {
        // Fixed typo in the original message ("occured").
        logger.error("Error occurred while generating RDF!", e);
    }
}
/**
 * Runs the full pipeline: validate options, resolve the model URL, set up
 * Karma metadata, generate RDF, and close all writers. Does nothing when
 * the command-line options fail validation.
 */
private void generate() throws Exception {
    if (!validateCommandLineOptions()) {
        return;
    }
    createModelURL();
    setupKarmaMetadata();
    generateRDF();
    closeWriters();
}
/**
 * Dispatches RDF generation based on the source type and logs where the
 * results were published.
 */
private void generateRDF() throws Exception
{
    long startMillis = System.currentTimeMillis();
    // Database-backed inputs (whole table or SQL query) take a different
    // path from file-based worksheets (JSON, XML, CSV, AVRO, JL).
    boolean databaseInput = inputType.equals("DB") || inputType.equals("SQL");
    if (databaseInput) {
        generateRdfFromDatabaseTable();
    } else {
        generateRdfFromFile();
    }
    logger.info("done after {}", System.currentTimeMillis() - startMillis);
    if (outputFilePath != null) {
        logger.info("RDF published at: " + outputFilePath);
    }
    if (outputFileJSONPath != null) {
        logger.info("JSON-LD published at: " + outputFileJSONPath);
    }
}
/**
 * Registers the Karma runtime context and metadata needed for offline
 * generation, and disables the interactive features (semantic type
 * training, model learning) that only make sense in the web UI.
 *
 * @throws KarmaException if metadata registration fails
 */
private void setupKarmaMetadata() throws KarmaException {
ContextParametersRegistry contextParametersRegistry = ContextParametersRegistry.getInstance();
// null => use the default Karma home directory.
contextParameters = contextParametersRegistry.registerByKarmaHome(null);
UpdateContainer uc = new UpdateContainer();
KarmaMetadataManager userMetadataManager = new KarmaMetadataManager(contextParameters);
userMetadataManager.register(new UserPreferencesMetadata(contextParameters), uc);
userMetadataManager.register(new UserConfigMetadata(contextParameters), uc);
userMetadataManager.register(new PythonTransformationMetadata(contextParameters), uc);
PythonRepository pythonRepository = new PythonRepository(false, contextParameters.getParameterValue(ContextParameter.USER_PYTHON_SCRIPTS_DIRECTORY));
PythonRepositoryRegistry.getInstance().register(pythonRepository);
// Offline runs must not train semantic types or learn models.
SemanticTypeUtil.setSemanticTypeTrainingStatus(false);
ModelingConfiguration modelingConfiguration = ModelingConfigurationRegistry.getInstance().register(contextParameters.getId());
modelingConfiguration.setLearnerEnabled(false); // disable automatic learning
}
/**
 * Copies command-line option values into fields and normalizes them:
 * symbolic delimiters ("tab", "space") become literal characters, and the
 * kill/stop/POM triple-map lists are split on commas and prefixed with the
 * Karma dev namespace.
 *
 * @param cl the parsed command line (values may be null when not supplied)
 */
protected void parseCommandLineOptions(CommandLine cl) {
    // Commons-CLI getOptionValue already returns String; the original
    // (String) casts were redundant.
    inputType = cl.getOptionValue("sourcetype");
    inputEncoding = cl.getOptionValue("encoding");
    inputDelimiter = cl.getOptionValue("delimiter");
    if (inputDelimiter != null) {
        // Allow symbolic names for delimiters that are hard to pass on a shell.
        if (inputDelimiter.equalsIgnoreCase("tab"))
            inputDelimiter = "\t";
        else if (inputDelimiter.equalsIgnoreCase("space"))
            inputDelimiter = " ";
    }
    inputTextQualifier = cl.getOptionValue("textqualifier");
    inputHeaderStartIndex = cl.getOptionValue("headerindex");
    inputDataStartIndex = cl.getOptionValue("dataindex");
    modelFilePath = cl.getOptionValue("modelfilepath");
    modelURLString = cl.getOptionValue("modelurl");
    outputFilePath = cl.getOptionValue("outputfile");
    outputFileJSONPath = cl.getOptionValue("jsonoutputfile");
    baseURI = cl.getOptionValue("baseuri");
    bloomFiltersFilePath = cl.getOptionValue("outputbloomfilter");
    selectionName = cl.getOptionValue("selection");
    rootTripleMap = cl.getOptionValue("root");
    contextFile = cl.getOptionValue("contextfile");
    contextURLString = cl.getOptionValue("contexturl");
    if (rootTripleMap == null) {
        rootTripleMap = "";
    }
    // One helper replaces the three copy-pasted remove/add rotation loops.
    this.killTripleMap = parseTripleMapList(cl.getOptionValue("killtriplemap"));
    this.stopTripleMap = parseTripleMapList(cl.getOptionValue("stoptriplemap"));
    this.POMToKill = parseTripleMapList(cl.getOptionValue("pomtokill"));
    parseDatabaseCommandLineOptions(cl);
    parseFileCommandLineOptions(cl);
}

/**
 * Splits a comma-separated list of triple-map names and prefixes each entry
 * with the Karma dev namespace. Returns an empty (mutable) list for null.
 */
private static List<String> parseTripleMapList(String csv) {
    List<String> result = new ArrayList<>();
    if (csv != null) {
        for (String entry : csv.split(",")) {
            result.add(Namespaces.KARMA_DEV + entry);
        }
    }
    return result;
}
/**
 * Reads the database-related options into fields. Any field may end up
 * null when the corresponding option was not supplied; validation happens
 * later in validateDatabaseCommandLineOptions().
 *
 * @param cl the parsed command line
 */
protected void parseDatabaseCommandLineOptions(CommandLine cl)
{
    // getOptionValue already returns a String, so no casts are required.
    dbtypeStr = cl.getOptionValue("dbtype");
    hostname = cl.getOptionValue("hostname");
    username = cl.getOptionValue("username");
    password = cl.getOptionValue("password");
    encoding = cl.getOptionValue("encoding");
    dBorSIDName = cl.getOptionValue("dbname");
    tablename = cl.getOptionValue("tablename");
    topkrows = cl.getOptionValue("topkrows");
    queryFile = cl.getOptionValue("queryfile");
    portnumber = cl.getOptionValue("portnumber");
}
/**
 * Reads the file-source options into fields. Values may be null when the
 * corresponding option was not supplied; validation happens later in
 * validateFileCommandLineOptions().
 *
 * @param cl the parsed command line
 */
protected void parseFileCommandLineOptions(CommandLine cl)
{
    sourceFilePath = cl.getOptionValue("filepath");
    sMaxNumLines = cl.getOptionValue("maxNumLines");
    sourceName = cl.getOptionValue("sourcename");
}
/** Source types this tool can read; keep in sync with the help text. */
private static final String[] VALID_SOURCE_TYPES =
        { "DB", "SQL", "CSV", "XML", "JSON", "AVRO", "JL" };

/**
 * Checks that the mandatory options were supplied and that the source type
 * is one of the supported values (case-insensitive). Logs a descriptive
 * error and returns false on any problem.
 *
 * @return true when the options are usable
 */
protected boolean validateCommandLineOptions() throws IOException
{
    if ((modelURLString == null && modelFilePath == null) || (outputFilePath == null && outputFileJSONPath == null) || inputType == null) {
        logger.error("Mandatory value missing. Please provide argument value "
                + "for sourcetype, (modelfilepath or modelurl) and (outputfile or jsonoutputfile).");
        return false;
    }
    // Data-driven membership test instead of a chain of equalsIgnoreCase calls.
    boolean knownType = Arrays.stream(VALID_SOURCE_TYPES).anyMatch(inputType::equalsIgnoreCase);
    if (!knownType) {
        logger.error("Invalid source type: " + inputType
                + ". Please choose from: DB, SQL, CSV, XML, JSON, AVRO, JL.");
        return false;
    }
    return true;
}
/**
 * Validates the file-source options: the input file must exist, the
 * optional max-line count must be numeric, and a source name is required.
 * Also detects the file encoding when none was supplied.
 *
 * @return true when the file options are usable
 */
private boolean validateFileCommandLineOptions() {
    // Guard against new File(null) throwing NullPointerException when
    // --filepath was omitted.
    if (sourceFilePath == null) {
        logger.error("You need to supply a value for '--filepath'");
        return false;
    }
    inputFile = new File(sourceFilePath);
    if (!inputFile.exists()) {
        logger.error("File not found: " + inputFile.getAbsolutePath());
        return false;
    }
    if (encoding == null) {
        encoding = EncodingDetector.detect(inputFile);
    }
    maxNumLines = -1;
    if (sMaxNumLines != null) {
        try {
            maxNumLines = Integer.parseInt(sMaxNumLines);
        } catch (NumberFormatException e) {
            // Report a friendly error (mirroring the portnumber handling)
            // instead of letting the raw exception escape to the caller.
            logger.error("Error occurred while parsing value for maxNumLines."
                    + " Provided value: " + sMaxNumLines);
            return false;
        }
    }
    if (sourceName == null) {
        logger.error("You need to supply a value for '--sourcename'");
        return false;
    }
    return true;
}
/**
 * Resolves the model URL (from -modelfilepath or -modelurl) and the optional
 * context URL (from -contextfile or -contexturl). When no base URI was given
 * on the command line, makes a best-effort attempt to read one from the
 * model's km-dev:hasBaseURI property.
 *
 * @throws IOException when a supplied model or context file does not exist
 */
private void createModelURL() throws IOException {
    if (modelFilePath != null) {
        File modelFile = new File(modelFilePath);
        if (!modelFile.exists()) {
            throw new IOException("File not found: " + modelFile.getAbsolutePath());
        }
        modelURL = modelFile.toURI().toURL();
    } else {
        modelURL = new URL(modelURLString);
    }
    if (contextFile != null) {
        File tmp = new File(contextFile);
        if (!tmp.exists()) {
            throw new IOException("File not found: " + tmp.getAbsolutePath());
        }
        contextURL = tmp.toURI().toURL();
    } else if (contextURLString != null) {
        contextURL = new URL(contextURLString);
    }
    if (baseURI != null && !baseURI.trim().isEmpty())
        return;
    try {
        R2RMLMappingIdentifier modelIdentifier = new R2RMLMappingIdentifier(modelURL.toString(), modelURL, null);
        Model model = WorksheetR2RMLJenaModelParser.loadSourceModelIntoJenaModel(modelIdentifier);
        Property rdfTypeProp = model.getProperty(Uris.RDF_TYPE_URI);
        Property baseURIProp = model.getProperty(Uris.KM_HAS_BASEURI);
        RDFNode node = model.getResource(Uris.KM_R2RML_MAPPING_URI);
        ResIterator res = model.listResourcesWithProperty(rdfTypeProp, node);
        List<Resource> resList = res.toList();
        for (Resource r : resList) {
            if (r.hasProperty(baseURIProp)) {
                baseURI = r.getProperty(baseURIProp).asTriple().getObject().toString();
                // Strip the quotes Jena includes in the literal's lexical form.
                baseURI = baseURI.replace("\"", "");
            }
        }
    } catch (IOException e) {
        // Best effort only: fall back to running without a base URI, but
        // leave a trace instead of silently swallowing the error.
        logger.warn("Could not extract a base URI from the model at " + modelURL, e);
    }
}
/**
 * Generates RDF from a database source. For sourcetype DB the whole table
 * named by -tablename is exported; for SQL the query loaded from -queryfile
 * is executed. Returns early (after logging) when the database options are
 * invalid.
 *
 * @throws Exception on connection, mapping, or write failures
 */
private void generateRdfFromDatabaseTable() throws Exception {
if(!validateDatabaseCommandLineOptions())
{
logger.error("Unable to generate RDF from database table!");
return;
}
DatabaseTableRDFGenerator dbRdfGen = new DatabaseTableRDFGenerator(dbType,
hostname, port, username, password, dBorSIDName, encoding, selectionName, contextParameters);
ContextIdentifier contextId = null;
if (contextURL != null) {
// NOTE(review): the context is registered under contextURL.getQuery() —
// presumably the query string carries the context name; confirm upstream.
contextId = new ContextIdentifier(contextURL.getQuery(), contextURL, null);
}
if(inputType.equals("DB")) {
// Whole-table export: the mapping is registered under the table name.
R2RMLMappingIdentifier id = new R2RMLMappingIdentifier(tablename, modelURL, null);
createWriters();
dbRdfGen.generateRDFFromTable(tablename, topkrows, writers, id, contextId, baseURI);
} else {
// SQL query export: the mapping is registered under the model URL.
String query = loadQueryFromFile();
R2RMLMappingIdentifier id = new R2RMLMappingIdentifier(modelURL.toString(), modelURL, null);
createWriters();
dbRdfGen.generateRDFFromSQL(query, writers, id, contextId, baseURI);
}
}
/**
 * Validates the database options: the port must be numeric, the connection
 * parameters must all be present (plus tablename for DB or queryfile for
 * SQL), and the database type must be a known DBType. Defaults the
 * encoding to UTF-8 when none was supplied.
 *
 * @return true when the database options are usable
 */
private boolean validateDatabaseCommandLineOptions() {
    if (encoding == null)
        encoding = "UTF-8";
    port = 0;
    try {
        port = Integer.parseInt(portnumber);
    } catch (NumberFormatException e) {
        // Narrowed from Throwable: parseInt throws NumberFormatException
        // for both malformed and null input.
        logger.error("Error occurred while parsing value for portnumber."
                + " Provided value: " + portnumber);
        return false;
    }
    // Validate the arguments
    if (dbtypeStr == null || dbtypeStr.equals("") || hostname == null
            || hostname.equals("") || username == null || username.equals("")
            || password == null || password.equals("") || dBorSIDName == null
            || dBorSIDName.equals("")
            || (inputType.equals("DB") && (tablename == null || tablename.equals("")))
            || (inputType.equals("SQL") && (queryFile == null || queryFile.equals("")))
    ) {
        if (inputType.equals("DB"))
            logger.error("A mandatory value is missing for fetching data from "
                    + "a database. Please provide argument values for dbtype, hostname, "
                    + "username, password, portnumber, dbname and tablename.");
        else
            logger.error("A mandatory value is missing for fetching data from "
                    + "a database. Please provide argument values for dbtype, hostname, "
                    + "username, password, portnumber, dbname and queryfile.");
        return false;
    }
    try {
        dbType = DBType.valueOf(dbtypeStr);
    } catch (IllegalArgumentException e) {
        // Enum.valueOf never returns null — it throws on unknown names —
        // so the original null check was unreachable and invalid dbtype
        // values crashed instead of producing this friendly error.
        logger.error("Unidentified database type. Valid values: "
                + "Oracle, MySQL, SQLServer, PostGIS");
        return false;
    }
    return true;
}
/**
 * Reads the SQL query text from the file named by -queryfile, decoding it
 * with the file's auto-detected character encoding.
 *
 * @return the query text
 * @throws IOException when the file cannot be read
 */
private String loadQueryFromFile() throws IOException {
    File queryFileHandle = new File(queryFile);
    String detectedEncoding = EncodingDetector.detect(queryFileHandle);
    return EncodingDetector.getString(queryFileHandle, detectedEncoding);
}
/**
 * Flushes and closes every registered RDF writer. Call once, after all
 * generation has finished.
 */
protected void closeWriters() {
    writers.forEach(writer -> {
        writer.flush();
        writer.close();
    });
}
/**
 * Creates all configured output writers: N3 and/or JSON-LD (depending on
 * which output paths were supplied) plus the optional bloom filter writer.
 *
 * @throws Exception when a writer cannot be created
 */
protected void createWriters() throws Exception
{
createN3Writer();
createBloomFilterWriter();
}
/**
 * Registers the N3 writer (when -outputfile was given) and/or the JSON-LD
 * writer (when -jsonoutputfile was given). Both write UTF-8.
 *
 * @throws UnsupportedEncodingException never in practice (UTF-8 is mandatory)
 * @throws FileNotFoundException when an output path cannot be opened
 */
protected void createN3Writer()
        throws UnsupportedEncodingException, FileNotFoundException {
    if (outputFilePath != null) {
        OutputStreamWriter fw = new OutputStreamWriter(new FileOutputStream(outputFilePath), "UTF-8");
        BufferedWriter bw = new BufferedWriter(fw);
        PrintWriter pw = new PrintWriter(bw);
        N3KR2RMLRDFWriter n3Writer = new N3KR2RMLRDFWriter(new URIFormatter(), pw);
        if (baseURI != null) {
            n3Writer.setBaseURI(baseURI);
        }
        writers.add(n3Writer);
    }
    if (outputFileJSONPath != null) {
        // Write UTF-8 explicitly, matching the N3 writer above; the
        // single-argument PrintWriter constructor used the platform
        // default charset, which made the JSON-LD output machine-dependent.
        JSONKR2RMLRDFWriter jsonWriter =
                new JSONKR2RMLRDFWriter(new PrintWriter(outputFileJSONPath, "UTF-8"), baseURI);
        writers.add(jsonWriter);
    }
}
/**
 * Registers a bloom filter writer when -outputbloomfilter was supplied.
 * The concrete writer class is located reflectively because bloom filter
 * support is an optional compile-time feature.
 *
 * @throws Exception when the writer class is unavailable or fails to start
 */
protected void createBloomFilterWriter() throws Exception {
if (bloomFiltersFilePath != null && !bloomFiltersFilePath.trim().isEmpty()) {
PrintWriter bloomfilterpw = new PrintWriter(new File(bloomFiltersFilePath));
logger.info(bloomFiltersFilePath);
writers.add(createBloomFilterWriter(bloomfilterpw, true, baseURI));
}
}
/**
 * Reflectively locates and instantiates BloomFilterKR2RMLRDFWriter, which
 * is only present when the project was compiled with -Pbloom. The writer
 * is configured with the given output stream, RDF flag, and base URI.
 *
 * @param bloomfilterpw destination for the bloom filter output; closed on failure
 * @param isRDF whether the bloom filters describe RDF output
 * @param baseURI base URI passed to the writer
 * @return the initialized writer
 * @throws Exception when the class is missing or cannot be instantiated
 */
private KR2RMLRDFWriter createBloomFilterWriter(PrintWriter bloomfilterpw, Boolean isRDF, String baseURI)
        throws Exception {
    Reflections reflections = new Reflections("edu.isi.karma.kr2rml.writer");
    Set<Class<? extends KR2RMLRDFWriter>> subTypes =
            reflections.getSubTypesOf(KR2RMLRDFWriter.class);
    for (Class<? extends KR2RMLRDFWriter> subType : subTypes)
    {
        // Class.getName() returns the fully qualified name
        // ("edu.isi.karma.kr2rml.writer.BloomFilterKR2RMLRDFWriter"), which
        // can never equal the simple name the original compared against —
        // so the writer was never found. Compare the simple name instead.
        if (!Modifier.isAbstract(subType.getModifiers()) && !subType.isInterface()
                && subType.getSimpleName().equals("BloomFilterKR2RMLRDFWriter"))
            try
            {
                KR2RMLRDFWriter writer = subType.newInstance();
                writer.setWriter(bloomfilterpw);
                Properties p = new Properties();
                p.setProperty("is.rdf", isRDF.toString());
                p.setProperty("base.uri", baseURI);
                writer.initialize(p);
                return writer;
            }
            catch (Exception e)
            {
                bloomfilterpw.close();
                throw new Exception("Unable to instantiate bloom filter writer", e);
            }
    }
    bloomfilterpw.close();
    throw new Exception("Bloom filter writing support not enabled. Please recompile with -Pbloom");
}
/**
 * Generates RDF from a file-based source (CSV, JSON, XML, AVRO, JL).
 * Validates the file options, maps the -root option from a node id to its
 * triple map, builds the generation request, and runs the generator.
 *
 * @throws Exception on parse, mapping, or write failures
 */
private void generateRdfFromFile()
throws Exception {
if(!validateFileCommandLineOptions())
{
logger.error("Unable to generate RDF from file because of invalid configuration");
return;
}
R2RMLMappingIdentifier id = new R2RMLMappingIdentifier(sourceName, modelURL, null);
createWriters();
GenericRDFGenerator rdfGenerator = new GenericRDFGenerator(selectionName);
rdfGenerator.addModel(id);
// Map the string source type to the generator's enum; stays null for
// DB/SQL, which never reach this method.
InputType inputType = null;
if(this.inputType.equalsIgnoreCase("CSV"))
inputType = InputType.CSV;
else if(this.inputType.equalsIgnoreCase("JSON"))
inputType = InputType.JSON;
else if(this.inputType.equalsIgnoreCase("XML"))
inputType = InputType.XML;
else if(this.inputType.equalsIgnoreCase("AVRO"))
inputType = InputType.AVRO;
else if(this.inputType.equalsIgnoreCase("JL"))
inputType = InputType.JL;
Model model = rdfGenerator.getModelParser(sourceName).getModel();
// The -root option names a node id; translate it to the URI of the triple
// map whose subject map points at that node. If several match, the last
// one wins (iteration order is Jena's).
if (rootTripleMap != null && !rootTripleMap.isEmpty()) {
StmtIterator itr = model.listStatements(null, model.getProperty(Uris.KM_NODE_ID_URI), rootTripleMap);
Resource subject = null;
while (itr.hasNext()) {
subject = itr.next().getSubject();
}
if (subject != null) {
itr = model.listStatements(null, model.getProperty(Uris.RR_SUBJECTMAP_URI), subject);
while (itr.hasNext()) {
rootTripleMap = itr.next().getSubject().toString();
}
}
}
// Assemble the request; optional CSV-specific settings are only applied
// when the corresponding command-line option was supplied.
RDFGeneratorRequest request = new RDFGeneratorRequest(sourceName, inputFile.getName());
request.setInputFile(inputFile);
request.setDataType(inputType);
if(inputEncoding != null) request.setEncoding(inputEncoding);
if(inputDelimiter != null) request.setDelimiter(this.inputDelimiter);
if(inputTextQualifier != null) request.setTextQualifier(inputTextQualifier);
if(inputHeaderStartIndex != null) request.setHeaderStartIndex(Integer.parseInt(inputHeaderStartIndex));
if(inputDataStartIndex != null) request.setDataStartIndex(Integer.parseInt(inputDataStartIndex));
request.setMaxNumLines(maxNumLines);
request.setAddProvenance(false);
request.addWriters(writers);
request.setPOMToKill(POMToKill);
request.setTripleMapToKill(killTripleMap);
request.setTripleMapToStop(stopTripleMap);
request.setStrategy(new UserSpecifiedRootStrategy(rootTripleMap));
request.setContextParameters(contextParameters);
if (contextURL != null) {
// NOTE(review): the context name is taken from contextURL.getQuery(),
// matching the database path — confirm that is the intended key.
ContextIdentifier contextId = new ContextIdentifier(contextURL.getQuery(), contextURL, null);
rdfGenerator.addContext(contextId);
request.setContextName(contextURL.getQuery());
}
rdfGenerator.generateRDF(request);
}
/**
 * Declares the command-line options this tool understands.
 * Help-text fixes relative to the original: the sourcetype description now
 * lists all accepted values (AVRO, JL were missing), "delimter" and
 * "contextile" typos are corrected, the tablename description no longer
 * says "hostname" (copy-paste error), and the baseuri long option no
 * longer contains a space (which made it unusable as --long-option).
 *
 * @return the populated Options instance
 */
private static Options createCommandLineOptions() {
    Options options = new Options();
    options.addOption(new Option("sourcetype", "sourcetype", true, "type of source. Valid values: DB, SQL, CSV, JSON, XML, AVRO, JL"));
    options.addOption(new Option("delimiter", "delimiter", true, "column delimiter for CSV file"));
    options.addOption(new Option("encoding", "encoding", true, "source encoding"));
    options.addOption(new Option("textqualifier", "textQualifier", true, "text qualifier for CSV file"));
    options.addOption(new Option("headerindex", "headerindex", true, "header index for CSV file"));
    options.addOption(new Option("dataindex", "dataindex", true, "data start index for CSV file"));
    options.addOption(new Option("filepath", "filepath", true, "location of the input file"));
    options.addOption(new Option("modelfilepath", "modelfilepath", true, "location of the model file"));
    options.addOption(new Option("modelurl", "modelurl", true, "location of the model"));
    options.addOption(new Option("sourcename", "sourcename", true, "name of the source in the model to use"));
    options.addOption(new Option("outputfile", "outputfile", true, "location of the output file"));
    options.addOption(new Option("dbtype", "dbtype", true, "database type. Valid values: Oracle, MySQL, SQLServer, PostGIS"));
    options.addOption(new Option("hostname", "hostname", true, "hostname for database connection"));
    options.addOption(new Option("username", "username", true, "username for database connection"));
    options.addOption(new Option("password", "password", true, "password for database connection"));
    options.addOption(new Option("portnumber", "portnumber", true, "portnumber for database connection"));
    options.addOption(new Option("dbname", "dbname", true, "database or SID name for database connection"));
    options.addOption(new Option("tablename", "tablename", true, "table name for database connection"));
    options.addOption(new Option("topkrows", "topkrows", true, "number of top k rows to select from the table"));
    options.addOption(new Option("queryfile", "queryfile", true, "query file for loading data"));
    options.addOption(new Option("outputbloomfilter", "bloomfiltersfile", true, "generate bloom filters"));
    options.addOption(new Option("baseuri", "baseuri", true, "specifies base uri"));
    options.addOption(new Option("selection", "selection", true, "specifies selection name"));
    options.addOption(new Option("root", "root", true, "specifies root"));
    options.addOption(new Option("killtriplemap", "killtriplemap", true, "specifies TripleMap to kill"));
    options.addOption(new Option("stoptriplemap", "stoptriplemap", true, "specifies TripleMap to stop"));
    options.addOption(new Option("pomtokill", "pomtokill", true, "specifies POM to kill"));
    options.addOption(new Option("jsonoutputfile", "jsonoutputfile", true, "specifies JSONOutputFile"));
    options.addOption(new Option("contextfile", "contextfile", true, "specifies global context file"));
    options.addOption(new Option("contexturl", "contexturl", true, "specifies global context url"));
    options.addOption(new Option("help", "help", false, "print this message"));
    return options;
}
}