package edu.isi.karma.research.modeling;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.Locale;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.json.XML;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import edu.isi.karma.config.ModelingConfiguration;
import edu.isi.karma.config.ModelingConfigurationRegistry;
import edu.isi.karma.controller.command.CommandException;
import edu.isi.karma.controller.command.selection.SuperSelection;
import edu.isi.karma.controller.command.selection.SuperSelectionManager;
import edu.isi.karma.controller.history.WorksheetCommandHistoryExecutor;
import edu.isi.karma.controller.update.UpdateContainer;
import edu.isi.karma.imp.Import;
import edu.isi.karma.imp.avro.AvroImport;
import edu.isi.karma.imp.csv.CSVImport;
import edu.isi.karma.imp.excel.ToCSV;
import edu.isi.karma.imp.json.JsonImport;
import edu.isi.karma.kr2rml.mapping.KR2RMLMapping;
import edu.isi.karma.kr2rml.mapping.R2RMLMappingIdentifier;
import edu.isi.karma.kr2rml.mapping.WorksheetR2RMLJenaModelParser;
import edu.isi.karma.metadata.OntologyMetadata;
import edu.isi.karma.metadata.SemanticTypeModelMetadata;
import edu.isi.karma.modeling.alignment.Alignment;
import edu.isi.karma.modeling.alignment.AlignmentManager;
import edu.isi.karma.modeling.alignment.SemanticModel;
import edu.isi.karma.rdf.GenericRDFGenerator.InputType;
import edu.isi.karma.rdf.InputProperties;
import edu.isi.karma.rdf.InputProperties.InputProperty;
import edu.isi.karma.rep.Worksheet;
import edu.isi.karma.rep.Workspace;
import edu.isi.karma.rep.WorkspaceManager;
import edu.isi.karma.rep.metadata.Tag;
import edu.isi.karma.rep.metadata.TagsContainer.Color;
import edu.isi.karma.rep.metadata.TagsContainer.TagName;
import edu.isi.karma.semantictypes.evaluation.EvaluateMRR;
import edu.isi.karma.semantictypes.evaluation.MRRItem;
import edu.isi.karma.util.EncodingDetector;
import edu.isi.karma.util.JSONUtil;
import edu.isi.karma.webserver.ExecutionController;
import edu.isi.karma.webserver.KarmaException;
import edu.isi.karma.webserver.ServletContextParameterMap;
import edu.isi.karma.webserver.ServletContextParameterMap.ContextParameter;
import edu.isi.karma.webserver.WorkspaceKarmaHomeRegistry;
import edu.isi.karma.webserver.WorkspaceRegistry;
public class OfflineTraining {
private static Logger logger = LoggerFactory.getLogger(OfflineTraining.class);
/**
 * Maps the Tika-detected content type in {@code metadata} to a Karma {@link InputType}.
 * Any content-type parameters (e.g. "; charset=UTF-8") are stripped before matching.
 *
 * @param metadata Tika metadata whose CONTENT_TYPE was populated by detection
 * @return the matching input type, or {@code null} when the content type is missing
 *         or not one of the supported formats (callers treat null as "unrecognized")
 */
private InputType getInputType(Metadata metadata) {
    String contentType = metadata.get(Metadata.CONTENT_TYPE);
    if (contentType == null) {
        // FIX: previously this NPE'd on split() when detection produced no content type.
        return null;
    }
    switch (contentType.split(";")[0].trim()) {
        case "application/json":
            return InputType.JSON;
        case "application/xml":
        case "text/xml":
            return InputType.XML;
        case "text/csv":
            return InputType.CSV;
        case "text/excel":
        case "text/x-excel":
        // FIX: also accept the standard MIME types Tika actually reports for .xls/.xlsx.
        case "application/vnd.ms-excel":
        case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
            return InputType.EXCEL;
        default:
            return null;
    }
}
/**
 * Creates a fresh workspace for the given context and wires it into the global
 * registries (execution controller, Karma home, modeling configuration), then loads
 * ontology and semantic-type metadata into it.
 * The setup order matters: the workspace must be registered before the metadata
 * setup calls run against it.
 *
 * @param contextParameters per-context settings; its id keys the workspace and
 *                          modeling configuration, its Karma home is registered
 *                          for the new workspace
 * @return the initialized workspace; release it later with removeWorkspace
 * @throws KarmaException if metadata setup fails
 */
protected Workspace initializeWorkspace(ServletContextParameterMap contextParameters) throws KarmaException {
Workspace workspace = WorkspaceManager.getInstance().createWorkspace(contextParameters.getId());
WorkspaceRegistry.getInstance().register(new ExecutionController(workspace));
WorkspaceKarmaHomeRegistry.getInstance().register(workspace.getId(), contextParameters.getKarmaHome());
ModelingConfiguration modelingConfiguration = ModelingConfigurationRegistry.getInstance().register(contextParameters.getId());
// Manual alignment: models are applied from saved histories, not aligned automatically.
modelingConfiguration.setManualAlignment();
// to load the ontologies in the /preloaded-ontologies folder
OntologyMetadata omd = new OntologyMetadata(contextParameters);
omd.setup(new UpdateContainer(), workspace);
SemanticTypeModelMetadata stmd = new SemanticTypeModelMetadata(contextParameters);
stmd.setup(new UpdateContainer(), workspace);
// Outlier tag mirrors the tag set the interactive Karma UI expects to exist.
Tag outlierTag = new Tag(TagName.Outlier, Color.Red);
workspace.getTagsContainer().addTag(outlierTag);
return workspace;
}
/**
 * Tears down a workspace created by {@link #initializeWorkspace}: drops it from the
 * workspace manager and deregisters it from the execution and Karma-home registries.
 *
 * @param workspace the workspace to release
 */
protected void removeWorkspace(Workspace workspace) {
    String workspaceId = workspace.getId();
    WorkspaceManager.getInstance().removeWorkspace(workspaceId);
    WorkspaceRegistry.getInstance().deregister(workspaceId);
    WorkspaceKarmaHomeRegistry.getInstance().deregister(workspaceId);
}
/**
 * Builds a {@link Worksheet} from the given stream. When {@code inputType} is null the
 * format and encoding are sniffed with Tika; otherwise the caller-supplied type is used
 * together with either the ENCODING input property or a detected encoding. After the
 * worksheet is imported, an empty alignment is created for it.
 *
 * @param sourceName      name used to label the worksheet and to aid Tika's detection
 * @param is              source data; it is marked, consumed by detection, then reset
 *                        and fully re-read by the importer
 * @param inputType       known format, or null to auto-detect
 * @param inputParameters import options; ENCODING is written here for the importers
 * @param workspace       workspace that receives the worksheet
 * @return the generated worksheet, never null
 * @throws KarmaException if the content type is unrecognized or the import fails
 */
protected Worksheet generateWorksheet(String sourceName, BufferedInputStream is, InputType inputType, InputProperties inputParameters,
        Workspace workspace) throws IOException, KarmaException {
    Worksheet worksheet = null;
    try {
        // Remember the stream start so it can be re-read after detection consumes it.
        is.mark(Integer.MAX_VALUE);
        String encoding = null;
        if (inputType == null) {
            Metadata metadata = new Metadata();
            metadata.set(Metadata.RESOURCE_NAME_KEY, sourceName);
            DefaultDetector detector = new DefaultDetector();
            MediaType type = detector.detect(is, metadata);
            ContentHandler contenthandler = new BodyContentHandler();
            AutoDetectParser parser = new AutoDetectParser();
            try {
                parser.parse(is, contenthandler, metadata);
            } catch (SAXException | TikaException e) {
                logger.error("Unable to parse stream: " + e.getMessage());
                throw new KarmaException("Unable to parse stream: "
                        + e.getMessage());
            }
            // Register text/csv as a specialization of text/plain so a detector result
            // of text/csv may override a generic text/plain parse result below.
            MediaTypeRegistry registry = MimeTypes.getDefaultMimeTypes()
                    .getMediaTypeRegistry();
            registry.addSuperType(new MediaType("text", "csv"), new MediaType(
                    "text", "plain"));
            MediaType parsedType = MediaType.parse(metadata
                    .get(Metadata.CONTENT_TYPE));
            if (registry.isSpecializationOf(registry.normalize(type), registry
                    .normalize(parsedType).getBaseType())) {
                metadata.set(Metadata.CONTENT_TYPE, type.toString());
            }
            logger.info("Detected " + metadata.get(Metadata.CONTENT_TYPE));
            inputType = getInputType(metadata);
            encoding = metadata.get(Metadata.CONTENT_ENCODING);
        } else if (inputParameters.get(InputProperty.ENCODING) != null) {
            encoding = (String) inputParameters.get(InputProperty.ENCODING);
        } else {
            encoding = EncodingDetector.detect(is);
        }
        is.reset();
        if (inputType == null) {
            throw new KarmaException("Content type unrecognized");
        }
        inputParameters.set(InputProperty.ENCODING, encoding);
        switch (inputType) {
        case JSON: {
            worksheet = generateWorksheetFromJSONStream(sourceName, is, inputParameters,
                    workspace);
            break;
        }
        case XML: {
            worksheet = generateWorksheetFromXMLStream(sourceName, is, inputParameters,
                    workspace);
            break;
        }
        case CSV: {
            worksheet = generateWorksheetFromDelimitedStream(sourceName,
                    is, inputParameters, workspace);
            break;
        }
        case EXCEL: {
            worksheet = generateWorksheetFromExcelStream(sourceName, is, inputParameters, workspace);
            break;
        }
        case AVRO: {
            worksheet = generateWorksheetFromAvroStream(sourceName, is, inputParameters, workspace);
            // FIX: break was missing; harmless while AVRO is the last case but fragile
            // if more cases are ever added after it.
            break;
        }
        }
        if (worksheet != null)
            AlignmentManager.Instance().createAlignment(workspace.getId(), worksheet.getId(), workspace.getOntologyManager());
    } catch (KarmaException e) {
        // FIX: rethrow KarmaExceptions as-is; previously they were re-wrapped, turning
        // e.g. "Content type unrecognized" into "Unable to generate worksheet: ...".
        throw e;
    } catch (Exception e) {
        logger.error("Error generating worksheet", e);
        throw new KarmaException("Unable to generate worksheet: " + e.getMessage());
    }
    if (worksheet == null) {
        throw new KarmaException("Content type unrecognized");
    }
    return worksheet;
}
/**
 * Imports CSV/delimited data into a worksheet. Layout options — header row, first data
 * row, delimiter, text qualifier, encoding and row limit — come from
 * {@code inputTypeParams}; unset options fall back to the defaults shown below.
 *
 * @param sourceName      worksheet title
 * @param is              delimited data, decoded with the ENCODING property
 * @param inputTypeParams import options
 * @param workspace       workspace that receives the worksheet
 * @return the imported worksheet
 */
private Worksheet generateWorksheetFromDelimitedStream(String sourceName, InputStream is, InputProperties inputTypeParams,
        Workspace workspace) throws IOException,
        KarmaException, ClassNotFoundException {
    Object headerRowParam = inputTypeParams.get(InputProperty.HEADER_START_INDEX);
    int headerStartIndex = (headerRowParam == null) ? 1 : (int) headerRowParam;

    Object dataRowParam = inputTypeParams.get(InputProperty.DATA_START_INDEX);
    int dataStartIndex = (dataRowParam == null) ? 2 : (int) dataRowParam;

    Object delimiterParam = inputTypeParams.get(InputProperty.DELIMITER);
    char delimiter = (delimiterParam == null) ? ',' : ((String) delimiterParam).charAt(0);

    Object qualifierParam = inputTypeParams.get(InputProperty.TEXT_QUALIFIER);
    char qualifier = (qualifierParam == null) ? '\"' : ((String) qualifierParam).charAt(0);

    String encoding = (String) inputTypeParams.get(InputProperty.ENCODING);

    Object limitParam = inputTypeParams.get(InputProperty.MAX_NUM_LINES);
    int maxNumLines = (limitParam == null) ? -1 : (int) limitParam;

    Import fileImport = new CSVImport(headerStartIndex, dataStartIndex, delimiter, qualifier,
            encoding, maxNumLines, sourceName, is, workspace, null);
    return fileImport.generateWorksheet();
}
/**
 * Imports an Excel workbook by converting the selected sheet to CSV text and delegating
 * to the delimited-stream importer.
 *
 * @param sourceName      worksheet title
 * @param is              Excel (.xls/.xlsx) binary stream
 * @param inputTypeParams import options; WORKSHEET_INDEX is 1-based (defaults to the
 *                        first sheet), ENCODING is used to round-trip the CSV text
 * @param workspace       workspace that receives the worksheet
 * @return the imported worksheet
 */
private Worksheet generateWorksheetFromExcelStream(String sourceName, InputStream is, InputProperties inputTypeParams,
        Workspace workspace) throws IOException,
        KarmaException, ClassNotFoundException, InvalidFormatException {
    // 1-based index from the caller; ToCSV expects 0-based.
    int worksheetIndex = (inputTypeParams.get(InputProperty.WORKSHEET_INDEX) != null) ?
            (int) inputTypeParams.get(InputProperty.WORKSHEET_INDEX) : 1;
    // Convert the Excel file to a CSV file.
    ToCSV csvConverter = new ToCSV();
    StringWriter writer = new StringWriter();
    csvConverter.convertWorksheetToCSV(is, worksheetIndex - 1, writer);
    String csv = writer.toString();
    // FIX: the previous IOUtils.toInputStream(csv) (deprecated) encoded with the
    // platform default charset, which could disagree with the ENCODING property the
    // delimited importer decodes with. Encode and decode with the same charset,
    // defaulting to UTF-8 when no encoding was detected.
    String encoding = (String) inputTypeParams.get(InputProperty.ENCODING);
    if (encoding == null) {
        encoding = StandardCharsets.UTF_8.name();
        inputTypeParams.set(InputProperty.ENCODING, encoding);
    }
    InputStream sheet = new ByteArrayInputStream(csv.getBytes(encoding));
    return this.generateWorksheetFromDelimitedStream(sourceName, sheet, inputTypeParams, workspace);
}
/**
 * Imports XML data by converting the whole document to JSON (via {@link XML#toJSONObject})
 * and feeding it to the JSON importer.
 *
 * @param sourceName      worksheet title
 * @param is              XML data, decoded with the ENCODING property
 * @param inputTypeParams import options (ENCODING, optional MAX_NUM_LINES row limit)
 * @param workspace       workspace that receives the worksheet
 * @return the imported worksheet
 */
private Worksheet generateWorksheetFromXMLStream(String sourceName, InputStream is, InputProperties inputTypeParams,
        Workspace workspace)
        throws IOException {
    String encoding = (String) inputTypeParams.get(InputProperty.ENCODING);
    Object limitParam = inputTypeParams.get(InputProperty.MAX_NUM_LINES);
    int maxNumLines = (limitParam == null) ? -1 : (int) limitParam;
    String contents = IOUtils.toString(is, encoding);
    JSONObject json = XML.toJSONObject(contents);
    JsonImport jsonImport = new JsonImport(json, sourceName, workspace, encoding, maxNumLines);
    return jsonImport.generateWorksheet();
}
/**
 * Imports JSON data into a worksheet, decoding the stream with the ENCODING property
 * and honoring the optional MAX_NUM_LINES row limit.
 *
 * @param sourceName      worksheet title
 * @param is              JSON data
 * @param inputTypeParams import options
 * @param workspace       workspace that receives the worksheet
 * @return the imported worksheet
 */
private Worksheet generateWorksheetFromJSONStream(String sourceName, InputStream is, InputProperties inputTypeParams,
        Workspace workspace)
        throws IOException {
    String encoding = (String) inputTypeParams.get(InputProperty.ENCODING);
    Object limitParam = inputTypeParams.get(InputProperty.MAX_NUM_LINES);
    int maxNumLines = (limitParam == null) ? -1 : (int) limitParam;
    Reader reader = EncodingDetector.getInputStreamReader(is, encoding);
    Object json = JSONUtil.createJson(reader);
    JsonImport jsonImport = new JsonImport(json, sourceName, workspace, encoding, maxNumLines);
    return jsonImport.generateWorksheet();
}
/**
 * Imports Avro data into a worksheet, honoring the ENCODING property and the optional
 * MAX_NUM_LINES row limit.
 *
 * @param sourceName      worksheet title
 * @param is              Avro data
 * @param inputTypeParams import options
 * @param workspace       workspace that receives the worksheet
 * @return the imported worksheet
 */
private Worksheet generateWorksheetFromAvroStream(String sourceName, InputStream is, InputProperties inputTypeParams,
        Workspace workspace)
        throws IOException, JSONException, KarmaException {
    String encoding = (String) inputTypeParams.get(InputProperty.ENCODING);
    Object limitParam = inputTypeParams.get(InputProperty.MAX_NUM_LINES);
    int maxNumLines = (limitParam == null) ? -1 : (int) limitParam;
    AvroImport avroImport = new AvroImport(is, sourceName, workspace, encoding, maxNumLines);
    return avroImport.generateWorksheet();
}
/**
 * Replays the command history stored in {@code mapping} against {@code worksheet},
 * re-applying the transformations recorded in the R2RML model. Command execution
 * failures are logged and swallowed (best effort), so a broken transformation does
 * not abort offline processing.
 *
 * @param workspace workspace owning the worksheet
 * @param worksheet worksheet to transform
 * @param mapping   mapping whose serialized worksheet history is replayed
 * @throws JSONException if the stored history is not valid JSON
 */
protected void applyHistoryToWorksheet(Workspace workspace, Worksheet worksheet,
        KR2RMLMapping mapping) throws JSONException {
    WorksheetCommandHistoryExecutor executor =
            new WorksheetCommandHistoryExecutor(worksheet.getId(), workspace);
    JSONArray history = new JSONArray(mapping.getWorksheetHistoryString());
    try {
        executor.executeAllCommands(history);
    } catch (CommandException | KarmaException e) {
        // Best effort: keep going even if a recorded transformation cannot be replayed.
        logger.error("Unable to execute column transformations", e);
    }
}
/**
 * Applies an R2RML model to a data source inside a throwaway workspace. The model's
 * command history is replayed over the imported worksheet; when {@code train} is set the
 * semantic-type learner is trained from the history, and when {@code predict} is set the
 * resulting alignment is captured as a {@link SemanticModel}.
 *
 * @param contextParameters context whose workspace/configuration registries are used
 * @param source            data file to import (null returns null)
 * @param sourceName        label for the generated worksheet
 * @param dataType          input format, or null to auto-detect
 * @param model             R2RML model file to apply (null returns null)
 * @param train             train the semantic-type learner while applying the history
 * @param predict           predict semantic types and build the output model
 * @return the predicted semantic model when {@code predict} is set and an alignment
 *         exists; otherwise null
 * @throws KarmaException on workspace setup or model-application failure
 */
public SemanticModel applyModel(
        ServletContextParameterMap contextParameters,
        File source,
        String sourceName,
        InputType dataType,
        File model,
        boolean train,
        boolean predict) throws
        FileNotFoundException,
        JSONException, KarmaException, MalformedURLException {
    if (source == null || model == null)
        return null;
    InputStream data = new FileInputStream(source);
    try {
        R2RMLMappingIdentifier rmlID = new R2RMLMappingIdentifier(model.getAbsolutePath(), model.toURI().toURL());
        WorksheetR2RMLJenaModelParser modelParser = new WorksheetR2RMLJenaModelParser(rmlID);
        InputProperties inputTypeParameters = new InputProperties();
        ModelingConfiguration mConf = ModelingConfigurationRegistry.getInstance().getModelingConfiguration(contextParameters.getId());
        mConf.setTrainOnApplyHistory(train);
        mConf.setPredictOnApplyHistory(predict);
        logger.debug("Initializing workspace for {}", sourceName);
        Workspace workspace = initializeWorkspace(contextParameters);
        logger.debug("Initialized workspace for {}", sourceName);
        try {
            logger.debug("Generating worksheet for {}", sourceName);
            Worksheet worksheet = generateWorksheet(sourceName, new BufferedInputStream(data), dataType, inputTypeParameters,
                    workspace);
            logger.debug("Generated worksheet for {}", sourceName);
            logger.debug("Parsing mapping for {}", sourceName);
            // Generate mapping data for the worksheet using the model parser.
            KR2RMLMapping mapping = modelParser.parse();
            logger.debug("Parsed mapping for {}", sourceName);
            applyHistoryToWorksheet(workspace, worksheet, mapping);
            if (predict) {
                Alignment alignment = AlignmentManager.Instance().getAlignment(AlignmentManager.
                        Instance().constructAlignmentId(workspace.getId(), worksheet.getId()));
                SuperSelection selection = SuperSelectionManager.DEFAULT_SELECTION;
                if (alignment == null) {
                    logger.error("alignment is null!");
                    return null;
                }
                SemanticModel semanticModel = new SemanticModel(workspace, worksheet, worksheet.getTitle(), alignment.getSteinerTree(), selection);
                semanticModel.setName(worksheet.getTitle());
                return semanticModel;
            }
            return null;
        } catch (Exception e) {
            logger.error("Error occurred while generating RDF", e);
            // NOTE(review): the original exception's cause is dropped here; the full
            // stack trace is preserved in the log line above.
            throw new KarmaException(e.getMessage());
        } finally {
            // Always dispose of the throwaway workspace, even on failure.
            removeWorkspace(workspace);
        }
    } finally {
        // FIX: the FileInputStream was previously never closed, leaking a file handle
        // on every call (and on every early exception).
        IOUtils.closeQuietly(data);
    }
}
/**
 * Infers the Karma input type from a file's extension (case-insensitive).
 *
 * @param fileName file name whose extension is examined
 * @return CSV, XML, JSON or EXCEL, or null for unsupported extensions
 */
private InputType getDataType(String fileName) {
    String ext = FilenameUtils.getExtension(fileName);
    if (ext.equalsIgnoreCase("csv")) return InputType.CSV;
    if (ext.equalsIgnoreCase("xml")) return InputType.XML;
    if (ext.equalsIgnoreCase("json")) return InputType.JSON;
    // FIX: "xsls" was a typo that matched no real Excel extension; recognize the
    // actual .xls/.xlsx extensions, keeping "xsls" for backward compatibility.
    if (ext.equalsIgnoreCase("xls") || ext.equalsIgnoreCase("xlsx") || ext.equalsIgnoreCase("xsls"))
        return InputType.EXCEL;
    return null;
}
/**
 * Trains on each (source, model) training pair, then applies the test model to the test
 * source and evaluates the predicted semantic model (accuracy and MRR over the top
 * {@code numberOfCandidates} candidates). The predicted model and its evaluation are
 * written to the configured JSON/Graphviz/MRR output directories, with file names
 * suffixed by the number of training sources. Alignment/learning configuration flags
 * are disabled for the run and restored afterwards.
 *
 * @param contextParameters context whose configuration and output directories are used
 * @param trainingSources   training data files (entries with unknown extensions skipped)
 * @param trainingModels    R2RML models parallel to {@code trainingSources}
 * @param testSource        data file to predict a model for
 * @param testModel         reference R2RML model for the test source
 * @param numberOfCandidates candidate count used in the MRR computation
 * @return the evaluated semantic model, or null when inputs are missing/unsupported or
 *         prediction fails
 */
public SemanticModel getCorrectModel(ServletContextParameterMap contextParameters,
        File[] trainingSources,
        File[] trainingModels,
        File testSource,
        File testModel,
        int numberOfCandidates) throws JSONException, KarmaException, IOException {
    ModelingConfiguration mConf = ModelingConfigurationRegistry.getInstance().getModelingConfiguration(contextParameters.getId());
    // Save the current flags so they can be restored after the run.
    boolean ontologyAlignment = mConf.getOntologyAlignment();
    boolean knownModelsAlignment = mConf.getKnownModelsAlignment();
    boolean learner = mConf.isLearnerEnabled();
    boolean addOntologyPaths = mConf.getAddOntologyPaths();
    mConf.setAddOntologyPaths(false);
    mConf.setKnownModelsAlignment(false);
    mConf.setLearnerEnabled(false);
    mConf.setOntologyAlignment(false);
    try {
        for (int i = 0; i < trainingSources.length; i++) {
            File f = trainingSources[i];
            InputType trainingDataType = getDataType(f.getName());
            if (trainingDataType == null) continue;
            applyModel(contextParameters, f, f.getName(), trainingDataType, trainingModels[i], true, false);
        }
        // FIX: was "&&", which went on to dereference a null testSource whenever only
        // one of the two test inputs was null.
        if (testSource == null || testModel == null)
            return null;
        InputType testDataType = getDataType(testSource.getName());
        if (testDataType == null) return null;
        SemanticModel sm = applyModel(contextParameters, testSource, testSource.getName(), testDataType, testModel, false, true);
        // FIX: applyModel returns null when prediction fails (e.g. no alignment);
        // previously that caused an NPE below.
        if (sm == null) return null;
        String suffix = sm.getName() + "." + trainingSources.length;
        String modelJson = contextParameters.getParameterValue(ContextParameter.JSON_MODELS_DIR) + suffix + ".model.json";
        String modelGraphviz = contextParameters.getParameterValue(ContextParameter.GRAPHVIZ_MODELS_DIR) + suffix + ".model.dot";
        String evaluateMRR = contextParameters.getParameterValue(ContextParameter.EVALUATE_MRR) + suffix + ".mrr.json";
        try {
            sm.writeJson(modelJson);
        } catch (Exception e) {
            logger.error("error in exporting the model to JSON!", e);
        }
        try {
            sm.writeGraphviz(modelGraphviz, false, false);
        } catch (Exception e) {
            logger.error("error in exporting the model to GRAPHVIZ!", e);
        }
        EvaluateMRR.printEvaluatedJSON(modelJson, evaluateMRR);
        MRRItem mrrItem = EvaluateMRR.calculateMRRValue(modelJson, numberOfCandidates);
        sm.setAccuracy(roundDecimals(mrrItem.getAccuracy(), 2));
        sm.setMrr(roundDecimals(mrrItem.getMrr(), 2));
        return sm;
    } finally {
        // FIX: restore each flag to its own saved value (ontologyAlignment and
        // addOntologyPaths were previously swapped), and restore in a finally block so
        // an exception cannot leave the configuration permanently disabled.
        mConf.setAddOntologyPaths(addOntologyPaths);
        mConf.setKnownModelsAlignment(knownModelsAlignment);
        mConf.setLearnerEnabled(learner);
        mConf.setOntologyAlignment(ontologyAlignment);
    }
}
/**
 * Rounds {@code d} to at most {@code k} decimal places (HALF_EVEN, DecimalFormat's
 * default rounding mode).
 *
 * @param d value to round
 * @param k maximum number of decimal digits to keep
 * @return the rounded value
 */
private static double roundDecimals(double d, int k) {
    StringBuilder pattern = new StringBuilder("#.");
    for (int i = 0; i < k; i++) {
        pattern.append('#');
    }
    // FIX: pin root-locale symbols. With the old default-locale formatter, locales
    // using a comma decimal separator (e.g. German) produced strings like "0,5",
    // making Double.valueOf throw NumberFormatException.
    DecimalFormat format = new DecimalFormat(pattern.toString(),
            DecimalFormatSymbols.getInstance(Locale.ROOT));
    return Double.parseDouble(format.format(d));
}
/**
 * Single-pair variant of {@link #getCorrectModel(ServletContextParameterMap, File[],
 * File[], File, File, int)}: optionally trains on one (source, model) pair, then applies
 * the test model to the test source and evaluates the prediction. Output files are
 * suffixed with {@code index}. Alignment/learning configuration flags are disabled for
 * the run and restored afterwards.
 *
 * @param contextParameters  context whose configuration and output directories are used
 * @param trainingSource     optional training data file (skipped when null)
 * @param trainingModel      optional R2RML model for the training source
 * @param testSource         data file to predict a model for
 * @param testModel          reference R2RML model for the test source
 * @param index              suffix used in the output file names
 * @param numberOfCandidates candidate count used in the MRR computation
 * @return the evaluated semantic model, or null when inputs are missing/unsupported or
 *         prediction fails
 */
public SemanticModel getCorrectModel(ServletContextParameterMap contextParameters,
        File trainingSource,
        File trainingModel,
        File testSource,
        File testModel,
        int index,
        int numberOfCandidates) throws JSONException, KarmaException, IOException {
    ModelingConfiguration mConf = ModelingConfigurationRegistry.getInstance().getModelingConfiguration(contextParameters.getId());
    // Save the current flags so they can be restored after the run.
    boolean ontologyAlignment = mConf.getOntologyAlignment();
    boolean knownModelsAlignment = mConf.getKnownModelsAlignment();
    boolean learner = mConf.isLearnerEnabled();
    boolean addOntologyPaths = mConf.getAddOntologyPaths();
    mConf.setAddOntologyPaths(false);
    mConf.setKnownModelsAlignment(false);
    mConf.setLearnerEnabled(false);
    mConf.setOntologyAlignment(false);
    try {
        if (trainingSource != null && trainingModel != null) {
            InputType trainingDataType = getDataType(trainingSource.getName());
            if (trainingDataType == null) return null;
            applyModel(contextParameters, trainingSource, trainingSource.getName(), trainingDataType, trainingModel, true, false);
        }
        // FIX: was "&&", which went on to dereference a null testSource whenever only
        // one of the two test inputs was null.
        if (testSource == null || testModel == null)
            return null;
        InputType testDataType = getDataType(testSource.getName());
        if (testDataType == null) return null;
        SemanticModel sm = applyModel(contextParameters, testSource, testSource.getName(), testDataType, testModel, false, true);
        // FIX: applyModel returns null when prediction fails (e.g. no alignment);
        // previously that caused an NPE below.
        if (sm == null) return null;
        String suffix = sm.getName() + "." + index;
        String modelJson = contextParameters.getParameterValue(ContextParameter.JSON_MODELS_DIR) + suffix + ".model.json";
        String modelGraphviz = contextParameters.getParameterValue(ContextParameter.GRAPHVIZ_MODELS_DIR) + suffix + ".model.dot";
        String evaluateMRR = contextParameters.getParameterValue(ContextParameter.EVALUATE_MRR) + suffix + ".mrr.json";
        try {
            sm.writeJson(modelJson);
        } catch (Exception e) {
            logger.error("error in exporting the model to JSON!", e);
        }
        try {
            sm.writeGraphviz(modelGraphviz, false, false);
        } catch (Exception e) {
            logger.error("error in exporting the model to GRAPHVIZ!", e);
        }
        EvaluateMRR.printEvaluatedJSON(modelJson, evaluateMRR);
        MRRItem mrrItem = EvaluateMRR.calculateMRRValue(modelJson, numberOfCandidates);
        sm.setAccuracy(roundDecimals(mrrItem.getAccuracy(), 2));
        sm.setMrr(roundDecimals(mrrItem.getMrr(), 2));
        return sm;
    } finally {
        // FIX: restore each flag to its own saved value (ontologyAlignment and
        // addOntologyPaths were previously swapped), and restore in a finally block so
        // an exception cannot leave the configuration permanently disabled.
        mConf.setAddOntologyPaths(addOntologyPaths);
        mConf.setKnownModelsAlignment(knownModelsAlignment);
        mConf.setLearnerEnabled(learner);
        mConf.setOntologyAlignment(ontologyAlignment);
    }
}
}