package edu.isi.karma.rdf;
import edu.isi.karma.controller.command.selection.SuperSelection;
import edu.isi.karma.controller.command.selection.SuperSelectionManager;
import edu.isi.karma.imp.Import;
import edu.isi.karma.imp.avro.AvroImport;
import edu.isi.karma.imp.csv.CSVImport;
import edu.isi.karma.imp.excel.ToCSV;
import edu.isi.karma.imp.json.JsonImport;
import edu.isi.karma.kr2rml.ContextIdentifier;
import edu.isi.karma.kr2rml.ErrorReport;
import edu.isi.karma.kr2rml.KR2RMLWorksheetRDFGenerator;
import edu.isi.karma.kr2rml.mapping.KR2RMLMapping;
import edu.isi.karma.kr2rml.mapping.R2RMLMappingIdentifier;
import edu.isi.karma.kr2rml.mapping.WorksheetR2RMLJenaModelParser;
import edu.isi.karma.kr2rml.planning.RootStrategy;
import edu.isi.karma.kr2rml.planning.SteinerTreeRootStrategy;
import edu.isi.karma.kr2rml.planning.WorksheetDepthRootStrategy;
import edu.isi.karma.kr2rml.writer.JSONKR2RMLRDFWriter;
import edu.isi.karma.kr2rml.writer.KR2RMLRDFWriter;
import edu.isi.karma.rdf.InputProperties.InputProperty;
import edu.isi.karma.rep.Worksheet;
import edu.isi.karma.rep.Workspace;
import edu.isi.karma.util.EncodingDetector;
import edu.isi.karma.util.JSONUtil;
import edu.isi.karma.webserver.ContextParametersRegistry;
import edu.isi.karma.webserver.KarmaException;
import edu.isi.karma.webserver.ServletContextParameterMap;
import org.apache.commons.io.IOUtils;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;
import org.json.JSONException;
import org.json.JSONObject;
import org.json.JSONTokener;
import org.json.XML;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import java.io.*;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
public class GenericRDFGenerator extends RdfGenerator {
/** Class-wide SLF4J logger; never reassigned, hence final. */
private static final Logger logger = LoggerFactory.getLogger(GenericRDFGenerator.class);
/** Mapping identifiers registered through addModel, keyed by the identifier's name. */
protected ConcurrentHashMap<String, R2RMLMappingIdentifier> modelIdentifiers;
/** Model parsers created so far by loadModel, keyed by the name of the identifier they were built from. */
protected ConcurrentHashMap<String, WorksheetR2RMLJenaModelParser> readModelParsers;
/**
 * Context identifiers registered through addContext, keyed by the identifier's name.
 * NOTE(review): plain HashMap, unlike the two concurrent maps above - confirm that contexts
 * are never registered or read from several threads at once.
 */
protected HashMap<String, ContextIdentifier> contextIdentifiers;
/** Context documents already parsed by loadContext, keyed by the context identifier's name. */
protected HashMap<String, JSONObject> contextCache;
/**
 * Input formats that generateWorksheet can dispatch on.
 * The declaration order is kept stable because ordinals are part of the enum's contract.
 */
public enum InputType {
    /** Delimited text, imported through CSVImport. */
    CSV,
    /** A JSON document, imported through JsonImport. */
    JSON,
    /** An XML document; it is converted to JSON before being imported. */
    XML,
    /** Avro data, imported through AvroImport. */
    AVRO,
    /** An Excel workbook; one sheet is converted to CSV before being imported. */
    EXCEL,
    /** JSON Lines; the lines are converted to a JSON array before being imported. */
    JL
}
/**
 * Creates a generator without a selection name (delegates with {@code null}).
 */
public GenericRDFGenerator() {
    this(null);
}

/**
 * Creates a generator with empty model and context registries.
 *
 * @param selectionName value handed unchanged to the superclass constructor; may be null
 */
public GenericRDFGenerator(String selectionName) {
    super(selectionName);
    // Context bookkeeping.
    contextIdentifiers = new HashMap<>();
    contextCache = new HashMap<>();
    // Model bookkeeping.
    readModelParsers = new ConcurrentHashMap<>();
    modelIdentifiers = new ConcurrentHashMap<>();
}
/**
 * Registers a mapping identifier under its name. If an identifier with the same name is
 * already registered, the existing one is kept and this call has no effect.
 *
 * @param modelIdentifier identifier to register; its name is used as the key
 */
public void addModel(R2RMLMappingIdentifier modelIdentifier) {
    // A separate containsKey() + put() lets two threads both pass the check and overwrite each
    // other; putIfAbsent performs the "insert only when missing" step atomically.
    modelIdentifiers.putIfAbsent(modelIdentifier.getName(), modelIdentifier);
}
/**
 * Registers a context identifier under its name, replacing any identifier
 * previously stored under that name.
 *
 * @param id context identifier to register
 */
public void addContext(ContextIdentifier id) {
    final String contextName = id.getName();
    contextIdentifiers.put(contextName, id);
}
/**
 * Returns the parser for a registered model, creating and caching it on first use.
 *
 * @param modelName name under which the model was registered with addModel
 * @return the cached parser, or a newly created one
 * @throws KarmaException if no model is registered under {@code modelName}, or if the parser
 *                        cannot be created
 * @throws JSONException  propagated from parser creation
 */
public WorksheetR2RMLJenaModelParser getModelParser(String modelName) throws JSONException, KarmaException {
    WorksheetR2RMLJenaModelParser modelParser = readModelParsers.get(modelName);
    if (modelParser != null) {
        return modelParser;
    }
    // Only a cache miss needs the identifier. Fail with a clear message instead of letting
    // loadModel dereference a null identifier.
    R2RMLMappingIdentifier id = this.modelIdentifiers.get(modelName);
    if (id == null) {
        throw new KarmaException("Cannot load model parser. Model named " + modelName + " does not exist");
    }
    return loadModel(id);
}
/**
 * Resolves the model and context by name, configures every writer with them, and then
 * delegates to the parser-based generateRDF overload.
 *
 * The context is resolved best-effort: when no context is registered under
 * {@code contextName}, or loading it fails, an empty JSON object is used instead.
 *
 * @throws KarmaException if no model is registered under {@code modelName}, or generation fails
 * @throws IOException    declared by the delegate overload
 */
private void generateRDF(String modelName, String sourceName,String contextName, InputStream data, InputType dataType, InputProperties inputTypeParameters,
        boolean addProvenance, List<KR2RMLRDFWriter> writers, RootStrategy rootStrategy,
        List<String> tripleMapToKill, List<String> tripleMapToStop, List<String> POMToKill, ServletContextParameterMap contextParameters)
        throws KarmaException, IOException {
    R2RMLMappingIdentifier id = this.modelIdentifiers.get(modelName);
    ContextIdentifier contextId = this.contextIdentifiers.get(contextName);
    if (id == null) {
        throw new KarmaException("Cannot generate RDF. Model named " + modelName + " does not exist");
    }

    JSONObject context = null;
    if (contextId != null) {
        context = this.contextCache.get(contextName);
        if (context == null) {
            try {
                context = loadContext(contextId);
            } catch (Exception e) {
                // Still best-effort, but no longer silent: record why the context is missing.
                logger.warn("Unable to load context {}; continuing with an empty context", contextName, e);
            }
        }
    }
    if (context == null) {
        context = new JSONObject();
    }

    for (KR2RMLRDFWriter writer : writers) {
        // Only JSON writers take the global context; every writer gets the mapping identifier.
        if (writer instanceof JSONKR2RMLRDFWriter) {
            JSONKR2RMLRDFWriter jsonWriter = (JSONKR2RMLRDFWriter) writer;
            jsonWriter.setGlobalContext(context, contextId);
        }
        writer.setR2RMLMappingIdentifier(id);
    }

    // Check if the parser for this model exists, else create one
    WorksheetR2RMLJenaModelParser modelParser = getModelParser(modelName);
    generateRDF(modelParser, sourceName, data, dataType, inputTypeParameters, addProvenance, writers, rootStrategy, tripleMapToKill, tripleMapToStop, POMToKill, contextParameters);
}
/**
 * Runs one generation pass: creates a workspace, builds a worksheet from {@code data},
 * parses the mapping, applies the mapping's history to the worksheet and writes the output
 * through {@code writers}. The workspace is always removed afterwards.
 *
 * When {@code contextParameters} is null the registry default is used; when
 * {@code rootStrategy} is null a SteinerTreeRootStrategy wrapping a
 * WorksheetDepthRootStrategy is used. If a selection name is set but no matching selection
 * is found, nothing is generated.
 *
 * @throws KarmaException for any failure during generation; the original exception is
 *                        attached as the cause
 * @throws IOException    declared for callers; failures inside the pass surface as KarmaException
 */
private void generateRDF(WorksheetR2RMLJenaModelParser modelParser, String sourceName, InputStream data, InputType dataType, InputProperties inputTypeParameters,
        boolean addProvenance, List<KR2RMLRDFWriter> writers, RootStrategy rootStrategy,
        List<String> tripleMapToKill, List<String> tripleMapToStop, List<String> POMToKill, ServletContextParameterMap contextParameters) throws KarmaException, IOException {
    logger.debug("Generating rdf for {}", sourceName);
    if (contextParameters == null) {
        contextParameters = ContextParametersRegistry.getInstance().getDefault();
        logger.debug("No context specified. Defaulting to: {}", contextParameters.getKarmaHome());
    }
    logger.debug("Initializing workspace for {}", sourceName);
    Workspace workspace = initializeWorkspace(contextParameters);
    logger.debug("Initialized workspace for {}", sourceName);
    try {
        logger.debug("Generating worksheet for {}", sourceName);
        Worksheet worksheet = generateWorksheet(sourceName, new BufferedInputStream(data), dataType, inputTypeParameters,
                workspace);
        logger.debug("Generated worksheet for {}", sourceName);

        logger.debug("Parsing mapping for {}", sourceName);
        // Generate mapping data for the worksheet using the model parser
        KR2RMLMapping mapping = modelParser.parse();
        logger.debug("Parsed mapping for {}", sourceName);

        applyHistoryToWorksheet(workspace, worksheet, mapping);

        SuperSelection selection = SuperSelectionManager.DEFAULT_SELECTION;
        if (selectionName != null && !selectionName.trim().isEmpty()) {
            selection = worksheet.getSuperSelectionManager().getSuperSelection(selectionName);
        }
        if (selection == null) {
            // Nothing to generate against; say so instead of returning silently.
            logger.warn("No selection available for {}; skipping RDF generation", sourceName);
            return;
        }

        // Generate RDF using the mapping data
        ErrorReport errorReport = new ErrorReport();
        if (rootStrategy == null) {
            rootStrategy = new SteinerTreeRootStrategy(new WorksheetDepthRootStrategy());
        }
        logger.debug("Generating output for {}", sourceName);
        KR2RMLWorksheetRDFGenerator rdfGen = new KR2RMLWorksheetRDFGenerator(worksheet,
                workspace, writers,
                addProvenance, rootStrategy, tripleMapToKill, tripleMapToStop, POMToKill,
                mapping, errorReport, selection);
        rdfGen.generateRDF(true);
        logger.debug("Generated output for {}", sourceName);
    } catch (Exception e) {
        logger.error("Error occurred while generating RDF", e);
        // Same message as before, but keep the original exception reachable via getCause().
        KarmaException wrapped = new KarmaException(e.getMessage());
        wrapped.initCause(e);
        throw wrapped;
    } finally {
        removeWorkspace(workspace);
    }
    logger.debug("Generated rdf for {}", sourceName);
}
/**
 * Generates RDF for a request. The input is taken from the first of these that is set on the
 * request: the input file, the in-memory input data, or the caller-supplied input stream.
 * For in-memory data the request encoding is set to UTF-8, matching how the data is encoded here.
 *
 * A stream opened here for an input file is closed before returning. A stream supplied by the
 * caller is left open, since the caller owns it.
 *
 * @param request describes the model, source, context, input, writers and options
 * @throws KarmaException if the model does not exist or generation fails
 * @throws IOException    if the input file cannot be opened
 */
public void generateRDF(RDFGeneratorRequest request) throws KarmaException, IOException
{
    InputStream inputStream = null;
    boolean openedHere = false;
    if (request.getInputFile() != null) {
        inputStream = new FileInputStream(request.getInputFile());
        openedHere = true;
    } else if (request.getInputData() != null) {
        inputStream = IOUtils.toInputStream(request.getInputData(), Charset.forName("UTF-8"));
        request.setEncoding("UTF-8");
    } else if (request.getInputStream() != null) {
        inputStream = request.getInputStream();
    }

    try {
        generateRDF(request.getModelName(), request.getSourceName(), request.getContextName(),
                inputStream, request.getDataType(), request.getInputTypeProperties(), request.isAddProvenance(),
                request.getWriters(), request.getStrategy(),
                request.getTripleMapToKill(), request.getTripleMapToStop(), request.getPOMToKill(), request.getContextParameters());
    } finally {
        if (openedHere) {
            // Release the file handle; a close failure must not mask the outcome of generation.
            try {
                inputStream.close();
            } catch (IOException e) {
                logger.warn("Unable to close input file {}", request.getInputFile(), e);
            }
        }
    }
}
/**
 * Maps the content type recorded in {@code metadata} to an InputType.
 * Anything after the first ';' (such as a charset parameter) is ignored.
 *
 * @param metadata metadata whose CONTENT_TYPE entry is inspected
 * @return the matching input type, or null when the content type is missing or not one of
 *         the recognised values
 */
private InputType getInputType(Metadata metadata) {
    String contentType = metadata.get(Metadata.CONTENT_TYPE);
    if (contentType == null) {
        // No content type recorded: report "unknown" rather than fail with a NullPointerException.
        return null;
    }
    // indexOf/substring instead of split(";")[0]: split yields an empty array for ";" alone.
    int parametersStart = contentType.indexOf(';');
    String mimeType = (parametersStart >= 0 ? contentType.substring(0, parametersStart) : contentType).trim();
    switch (mimeType) {
        case "application/json":
            return InputType.JSON;
        case "application/xml":
            return InputType.XML;
        case "text/csv":
            return InputType.CSV;
        case "text/excel":
        case "text/x-excel":
            return InputType.EXCEL;
        default:
            return null;
    }
}
/**
 * Builds a worksheet from the given stream.
 *
 * When {@code inputType} is null, the content type and encoding are taken from Tika
 * detection/parsing of the stream. Otherwise the encoding comes from the ENCODING input
 * property if set, or from EncodingDetector. The stream is then rewound, the resolved
 * encoding is stored back into {@code inputParameters}, and the import matching the input
 * type is run.
 *
 * @param sourceName      name of the source; also given to Tika as the resource name
 * @param is              stream to read; it is marked at entry and reset before importing
 * @param inputType       format of the data, or null to detect it
 * @param inputParameters import options; its ENCODING property is overwritten by this method
 * @param workspace       workspace the worksheet is created in
 * @return the generated worksheet, never null
 * @throws KarmaException if the type cannot be determined, the import fails, or no worksheet
 *                        is produced; failures inside the import carry the original exception
 *                        as the cause
 * @throws IOException    declared for overriders; failures here surface as KarmaException
 */
protected Worksheet generateWorksheet(String sourceName, BufferedInputStream is, InputType inputType, InputProperties inputParameters,
        Workspace workspace) throws IOException, KarmaException {
    Worksheet worksheet = null;
    try {
        // Remember the start of the stream so it can be rewound after detection has read from it.
        // NOTE(review): the detector/parser below receive this same stream; if they call mark()
        // on it with a smaller limit, this mark is replaced - confirm reset() still succeeds for
        // large inputs.
        is.mark(Integer.MAX_VALUE);
        String encoding = null;
        if (inputType == null) {
            Metadata metadata = new Metadata();
            metadata.set(Metadata.RESOURCE_NAME_KEY, sourceName);
            DefaultDetector detector = new DefaultDetector();
            MediaType type = detector.detect(is, metadata);

            ContentHandler contenthandler = new BodyContentHandler();
            AutoDetectParser parser = new AutoDetectParser();
            try {
                parser.parse(is, contenthandler, metadata);
            } catch (SAXException | TikaException e) {
                logger.error("Unable to parse stream: {}", e.getMessage());
                KarmaException parseFailure = new KarmaException("Unable to parse stream: "
                        + e.getMessage());
                parseFailure.initCause(e);
                throw parseFailure;
            }
            MediaTypeRegistry registry = MimeTypes.getDefaultMimeTypes()
                    .getMediaTypeRegistry();
            // Declare text/csv as a kind of text/plain so the specialization check below can
            // prefer the detected type over the parsed one.
            registry.addSuperType(new MediaType("text", "csv"), new MediaType(
                    "text", "plain"));
            MediaType parsedType = MediaType.parse(metadata
                    .get(Metadata.CONTENT_TYPE));

            if (registry.isSpecializationOf(registry.normalize(type), registry
                    .normalize(parsedType).getBaseType())) {
                metadata.set(Metadata.CONTENT_TYPE, type.toString());
            }
            logger.info("Detected {}", metadata.get(Metadata.CONTENT_TYPE));
            inputType = getInputType(metadata);
            encoding = metadata.get(Metadata.CONTENT_ENCODING);
        } else if (inputParameters.get(InputProperty.ENCODING) != null) {
            encoding = (String) inputParameters.get(InputProperty.ENCODING);
        } else {
            encoding = EncodingDetector.detect(is);
        }
        is.reset();

        if (inputType == null) {
            // Thrown inside the try on purpose: it is re-wrapped by the catch below, as before.
            throw new KarmaException("Content type unrecognized");
        }

        inputParameters.set(InputProperty.ENCODING, encoding);

        switch (inputType) {
            case JSON: {
                worksheet = generateWorksheetFromJSONStream(sourceName, is, inputParameters,
                        workspace);
                break;
            }
            case XML: {
                worksheet = generateWorksheetFromXMLStream(sourceName, is, inputParameters,
                        workspace);
                break;
            }
            case CSV: {
                worksheet = generateWorksheetFromDelimitedStream(sourceName,
                        is, inputParameters, workspace);
                break;
            }
            case EXCEL: {
                worksheet = generateWorksheetFromExcelStream(sourceName, is, inputParameters, workspace);
                break;
            }
            case AVRO: {
                worksheet = generateWorksheetFromAvroStream(sourceName, is, inputParameters, workspace);
                break;
            }
            case JL: {
                worksheet = generateWorksheetFromJLStream(sourceName, is, inputParameters, workspace);
                // Explicit break so a case added after this one cannot be fallen into.
                break;
            }
        }
    } catch (Exception e) {
        logger.error("Error generating worksheet", e);
        // Same message as before, but keep the original exception reachable via getCause().
        KarmaException wrapped = new KarmaException("Unable to generate worksheet: " + e.getMessage());
        wrapped.initCause(e);
        throw wrapped;
    }
    if (worksheet == null) {
        throw new KarmaException("Content type unrecognized");
    }
    return worksheet;
}
/**
 * Returns the parser cached under the identifier's name, or creates one, caches it and
 * returns it. The method is synchronized on this generator.
 *
 * @param modelIdentifier identifier of the mapping to build a parser for
 * @return the cached or newly created parser
 */
private synchronized WorksheetR2RMLJenaModelParser loadModel(R2RMLMappingIdentifier modelIdentifier) throws JSONException, KarmaException {
    final String modelName = modelIdentifier.getName();
    final WorksheetR2RMLJenaModelParser cached = readModelParsers.get(modelName);
    if (cached != null) {
        return cached;
    }
    final WorksheetR2RMLJenaModelParser created = new WorksheetR2RMLJenaModelParser(modelIdentifier);
    readModelParsers.put(modelName, created);
    return created;
}
/**
 * Returns the JSON context for the given identifier, parsing it on first use and caching
 * the result under the identifier's name.
 *
 * The JSON is read from the identifier's inline content when present, otherwise from the
 * stream opened on its location. It is decoded as UTF-8 in both cases.
 *
 * @param id identifier of the context to load
 * @return the cached or newly parsed context
 * @throws IOException if the context cannot be opened or read
 */
public JSONObject loadContext(ContextIdentifier id) throws IOException {
    JSONObject cached = contextCache.get(id.getName());
    if (cached != null) {
        return cached;
    }

    InputStream jsonStream;
    if (id.getContent() != null) {
        jsonStream = IOUtils.toInputStream(id.getContent(), "utf-8");
    } else {
        jsonStream = id.getLocation().openStream();
    }

    JSONObject obj;
    // Decode explicitly as UTF-8 (the inline content above is encoded that way) instead of the
    // platform default, and close the reader - and with it the stream - even if parsing fails.
    try (Reader reader = new InputStreamReader(jsonStream, Charset.forName("UTF-8"))) {
        obj = new JSONObject(new JSONTokener(reader));
    }
    this.contextCache.put(id.getName(), obj);
    return obj;
}
/**
 * Exposes the registered mapping identifiers, keyed by name, through a wrapper
 * that rejects modification.
 *
 * @return unmodifiable view over the model registry
 */
public Map<String, R2RMLMappingIdentifier> getModels()
{
    final Map<String, R2RMLMappingIdentifier> readOnlyView = Collections.unmodifiableMap(modelIdentifiers);
    return readOnlyView;
}
/**
 * Builds a worksheet from an Excel stream by converting one sheet to CSV text and running
 * the delimited-text import on it.
 *
 * The sheet is chosen by the WORKSHEET_INDEX input property (default 1); the converter is
 * called with that value minus one. The ENCODING input property is set to UTF-8 by this
 * method, because that is how the intermediate CSV text is encoded before it is re-read.
 *
 * @param sourceName      name given to the resulting worksheet's source
 * @param is              Excel data
 * @param inputTypeParams import options; ENCODING is overwritten with "UTF-8"
 * @param workspace       workspace the worksheet is created in
 * @return the generated worksheet
 */
private Worksheet generateWorksheetFromExcelStream(String sourceName, InputStream is, InputProperties inputTypeParams,
        Workspace workspace) throws IOException,
        KarmaException, ClassNotFoundException, InvalidFormatException {
    int worksheetIndex = 1;
    Object requestedIndex = inputTypeParams.get(InputProperty.WORKSHEET_INDEX);
    if (requestedIndex != null) {
        worksheetIndex = (int) requestedIndex;
    }

    // Convert the Excel file to a CSV file.
    ToCSV csvConverter = new ToCSV();
    StringWriter writer = new StringWriter();
    csvConverter.convertWorksheetToCSV(is, worksheetIndex - 1, writer);
    String csv = writer.toString();

    // The CSV text is re-encoded here, so any encoding detected for the original Excel bytes
    // no longer applies. Encode with a fixed charset and tell the CSV import to decode with
    // the same one, rather than pairing the platform default with an unrelated ENCODING value.
    InputStream sheet = IOUtils.toInputStream(csv, Charset.forName("UTF-8"));
    inputTypeParams.set(InputProperty.ENCODING, "UTF-8");
    return this.generateWorksheetFromDelimitedStream(sourceName, sheet, inputTypeParams, workspace);
}
/**
 * Imports delimited text into a worksheet through CSVImport.
 *
 * Options are read from {@code inputTypeParams}, with these fallbacks when a property is
 * not set: header row 1, first data row 2, delimiter ',', text qualifier '"', and -1 for
 * the maximum number of lines. Only the first character of the delimiter and qualifier
 * strings is used. The encoding is passed through as-is.
 */
private Worksheet generateWorksheetFromDelimitedStream(String sourceName, InputStream is, InputProperties inputTypeParams,
        Workspace workspace) throws IOException,
        KarmaException, ClassNotFoundException {
    Object headerSetting = inputTypeParams.get(InputProperty.HEADER_START_INDEX);
    int headerStartIndex = 1;
    if (headerSetting != null) {
        headerStartIndex = (int) headerSetting;
    }

    Object dataSetting = inputTypeParams.get(InputProperty.DATA_START_INDEX);
    int dataStartIndex = 2;
    if (dataSetting != null) {
        dataStartIndex = (int) dataSetting;
    }

    Object delimiterSetting = inputTypeParams.get(InputProperty.DELIMITER);
    char delimiter = ',';
    if (delimiterSetting != null) {
        delimiter = ((String) delimiterSetting).charAt(0);
    }

    Object qualifierSetting = inputTypeParams.get(InputProperty.TEXT_QUALIFIER);
    char qualifier = '\"';
    if (qualifierSetting != null) {
        qualifier = ((String) qualifierSetting).charAt(0);
    }

    String encoding = (String) inputTypeParams.get(InputProperty.ENCODING);

    Object maxLinesSetting = inputTypeParams.get(InputProperty.MAX_NUM_LINES);
    int maxNumLines = -1;
    if (maxLinesSetting != null) {
        maxNumLines = (int) maxLinesSetting;
    }

    Import csvImport = new CSVImport(headerStartIndex, dataStartIndex, delimiter, qualifier, encoding, maxNumLines,
            sourceName, is, workspace, null);
    return csvImport.generateWorksheet();
}
/**
 * Imports XML into a worksheet: the whole stream is read as text using the ENCODING input
 * property, converted to a JSON object, and handed to JsonImport.
 * MAX_NUM_LINES falls back to -1 when not set.
 */
private Worksheet generateWorksheetFromXMLStream(String sourceName, InputStream is, InputProperties inputTypeParams,
        Workspace workspace)
        throws IOException {
    String encoding = (String) inputTypeParams.get(InputProperty.ENCODING);

    Object maxLinesSetting = inputTypeParams.get(InputProperty.MAX_NUM_LINES);
    int maxNumLines = -1;
    if (maxLinesSetting != null) {
        maxNumLines = (int) maxLinesSetting;
    }

    JSONObject xmlAsJson = XML.toJSONObject(IOUtils.toString(is, encoding));
    JsonImport jsonImport = new JsonImport(xmlAsJson, sourceName, workspace, encoding, maxNumLines);
    return jsonImport.generateWorksheet();
}
/**
 * Imports JSON into a worksheet: the stream is wrapped in a reader for the ENCODING input
 * property, parsed with JSONUtil, and handed to JsonImport.
 * MAX_NUM_LINES falls back to -1 when not set.
 */
private Worksheet generateWorksheetFromJSONStream(String sourceName, InputStream is, InputProperties inputTypeParams,
        Workspace workspace)
        throws IOException {
    String encoding = (String) inputTypeParams.get(InputProperty.ENCODING);

    Object maxLinesSetting = inputTypeParams.get(InputProperty.MAX_NUM_LINES);
    int maxNumLines = -1;
    if (maxLinesSetting != null) {
        maxNumLines = (int) maxLinesSetting;
    }

    Reader jsonReader = EncodingDetector.getInputStreamReader(is, encoding);
    Object parsedJson = JSONUtil.createJson(jsonReader);
    JsonImport jsonImport = new JsonImport(parsedJson, sourceName, workspace, encoding, maxNumLines);
    return jsonImport.generateWorksheet();
}
/**
 * Imports Avro data into a worksheet by handing the stream directly to AvroImport, together
 * with the ENCODING input property and MAX_NUM_LINES (-1 when not set).
 */
private Worksheet generateWorksheetFromAvroStream(String sourceName, InputStream is, InputProperties inputTypeParams,
        Workspace workspace)
        throws IOException, JSONException, KarmaException {
    String encoding = (String) inputTypeParams.get(InputProperty.ENCODING);

    Object maxLinesSetting = inputTypeParams.get(InputProperty.MAX_NUM_LINES);
    int maxNumLines = -1;
    if (maxLinesSetting != null) {
        maxNumLines = (int) maxLinesSetting;
    }

    AvroImport avroImport = new AvroImport(is, sourceName, workspace, encoding, maxNumLines);
    return avroImport.generateWorksheet();
}
/**
 * Imports JSON Lines into a worksheet: the stream is converted to a JSON array with
 * JSONUtil using the ENCODING input property, then handed to JsonImport.
 * MAX_NUM_LINES falls back to -1 when not set.
 */
private Worksheet generateWorksheetFromJLStream(String sourceName, InputStream is, InputProperties inputTypeParams,
        Workspace workspace) throws Exception{
    String encoding = (String) inputTypeParams.get(InputProperty.ENCODING);

    Object maxLinesSetting = inputTypeParams.get(InputProperty.MAX_NUM_LINES);
    int maxNumLines = -1;
    if (maxLinesSetting != null) {
        maxNumLines = (int) maxLinesSetting;
    }

    Object linesAsArray = JSONUtil.convertJSONLinesToJSONArray(is, encoding);
    JsonImport jsonImport = new JsonImport(linesAsArray, sourceName, workspace, encoding, maxNumLines);
    return jsonImport.generateWorksheet();
}
}