/*******************************************************************************
 * Copyright 2012 University of Southern California
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * This code was developed by the Information Integration Group as part
 * of the Karma project at the Information Sciences Institute of the
 * University of Southern California. For more information, publications,
 * and related projects, please see: http://www.isi.edu/integration
 ******************************************************************************/
package edu.isi.karma.kr2rml;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.NodeIterator;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.ResIterator;
import com.hp.hpl.jena.rdf.model.Resource;

import edu.isi.karma.controller.command.CommandException;
import edu.isi.karma.controller.command.cleaning.SubmitCleaningCommand;
import edu.isi.karma.controller.command.reconciliation.InvokeRubenReconciliationService;
import edu.isi.karma.controller.command.transformation.SubmitPythonTransformationCommand;
import edu.isi.karma.controller.command.worksheet.RenameColumnCommand;
import edu.isi.karma.controller.command.worksheet.SplitByCommaCommandFactory.Arguments;
import edu.isi.karma.controller.command.worksheet.SplitColumnByDelimiter;
import edu.isi.karma.controller.history.CommandHistoryWriter.HistoryArguments;
import edu.isi.karma.controller.history.HistoryJsonUtil;
import edu.isi.karma.controller.history.WorksheetCommandHistoryReader;
import edu.isi.karma.modeling.Uris;
import edu.isi.karma.rep.HNode;
import edu.isi.karma.rep.HNodePath;
import edu.isi.karma.rep.HTable;
import edu.isi.karma.rep.RepFactory;
import edu.isi.karma.rep.Worksheet;
import edu.isi.karma.view.VWorksheet;
import edu.isi.karma.view.VWorkspace;
import edu.isi.karma.webserver.KarmaException;

/**
 * Parses the R2RML mapping for a worksheet out of a Jena RDF {@link Model}
 * into Karma's in-memory {@link R2RMLMapping} representation.
 *
 * Parsing happens entirely in the constructor, in four phases:
 * <ol>
 *   <li>Replay any transformation commands recorded in the mapping on the
 *       worksheet ({@link #performTransformations}).</li>
 *   <li>Create a {@link SubjectMap} + {@link TriplesMap} per rr:TriplesMap
 *       ({@link #createSubjectMaps}).</li>
 *   <li>Attach predicate-object maps and the links between triples maps
 *       ({@link #createPredicateObjectMaps}).</li>
 *   <li>Record which columns each blank-node subject map covers
 *       ({@link #calculateColumnNodesCoveredByBlankNodes}).</li>
 * </ol>
 * The results are exposed through {@link #getR2rmlMapping()} and
 * {@link #getAuxInfo()}.
 */
public class WorksheetR2RMLJenaModelParser {
    private Model model;
    private String sourceName;
    private VWorksheet vWorksheet;
    private Worksheet worksheet;
    private VWorkspace vWorkspace;
    private RepFactory factory;

    // Internal data structures required
    // Subject maps keyed by the blank node's label string (see addSubjectMapForTripleMap)
    private Map<String, SubjectMap> subjectMapIndex;
    // Triples maps keyed by their resource URI
    private Map<String, TriplesMap> triplesMapIndex;
    private KR2RMLMappingAuxillaryInformation auxInfo;
    private R2RMLMapping r2rmlMapping;
    // Counters used to mint unique ids for predicates and object maps
    private int predicateIdCounter = 1;
    private int objectMapCounter = 1;
    // All subject map blank-node resources, in the order they were parsed
    private List<Resource> subjectMapResources;

    private static Logger logger = LoggerFactory.getLogger(WorksheetR2RMLJenaModelParser.class);

    // Key and value strings used in the serialized transformation-command JSON
    private enum TransformationCommandKeysAndValues {
        commandName, SubmitCleaningCommand, SplitByCommaCommand, examples,
        SubmitPythonTransformationCommand, newColumnName, transformationCode,
        errorDefaultValue, RenameColumnCommand, InvokeRubenReconciliationService,
        alignmentNodeId
    }

    /**
     * Parses the mapping identified by {@code sourceName} from {@code model}
     * and applies it against {@code vWorksheet}.
     *
     * @throws KarmaException if zero or more than one mapping resource carries
     *         the given source name
     */
    public WorksheetR2RMLJenaModelParser(VWorksheet vWorksheet, VWorkspace vWorkspace,
            Model model, String sourceName) throws IOException, JSONException, KarmaException {
        this.model = model;
        this.sourceName = sourceName;
        this.vWorksheet = vWorksheet;
        this.worksheet = vWorksheet.getWorksheet();
        this.vWorkspace = vWorkspace;
        this.factory = vWorkspace.getRepFactory();
        this.r2rmlMapping = new R2RMLMapping();
        this.auxInfo = new KR2RMLMappingAuxillaryInformation();
        this.subjectMapIndex = new HashMap<String, SubjectMap>();
        this.triplesMapIndex = new HashMap<String, TriplesMap>();
        this.subjectMapResources = new ArrayList<Resource>();

        // Capture the main mapping resource that corresponds to the source name
        Resource mappingResource = getMappingResourceFromSourceName();
        if (mappingResource == null) {
            throw new KarmaException("Resource not found in model for the source: " + sourceName);
        }

        // Perform any transformations on the worksheet if required
        performTransformations(mappingResource);

        // Generate TriplesMap for each InternalNode in the tree
        createSubjectMaps(mappingResource);

        // Identify the object property links
        createPredicateObjectMaps(mappingResource);

        // Calculate the nodes covered by each InternalNode
        calculateColumnNodesCoveredByBlankNodes();
    }

    /**
     * Finds the single mapping resource whose km:sourceName literal equals
     * {@link #sourceName}, or {@code null} if none exists.
     *
     * @throws KarmaException if more than one resource matches
     */
    private Resource getMappingResourceFromSourceName() throws KarmaException {
        Property sourceNameProp = model.getProperty(Uris.KM_SOURCE_NAME_URI);
        RDFNode node = model.createLiteral(sourceName);
        ResIterator res = model.listResourcesWithProperty(sourceNameProp, node);
        List<Resource> resList = res.toList();
        if (resList.size() > 1) {
            throw new KarmaException("More than one resource exists with source name: " + sourceName);
        } else if (resList.size() == 1) {
            return resList.get(0);
        } else {
            return null;
        }
    }

    /**
     * Replays the transformation commands recorded in the mapping (column
     * splits, cleaning, Python transforms, renames, reconciliation) on the
     * worksheet so its structure matches what the mapping expects.
     */
    private void performTransformations(Resource mappingResource) throws JSONException {
        List<String> normalizedCommandsJSON = getNormalizedTransformCommandsJSON(mappingResource);
        for (String commJson : normalizedCommandsJSON) {
            JSONObject commObj = new JSONObject(commJson);
            String commandName = commObj.getString("commandName");
            JSONArray inputParams = commObj.getJSONArray(HistoryArguments.inputParameters.name());
            String hNodeId = HistoryJsonUtil.getStringValue(Arguments.hNodeId.name(), inputParams);

            // Check for the type of command to execute
            // TODO: Needs cleanup. Use Command Factories.
            if (commandName.equals(TransformationCommandKeysAndValues.SplitByCommaCommand.name())) {
                String delimiter = HistoryJsonUtil.getStringValue(Arguments.delimiter.name(), inputParams);
                SplitColumnByDelimiter splitByDelim = new SplitColumnByDelimiter(hNodeId,
                        worksheet, delimiter, vWorkspace.getWorkspace());
                splitByDelim.split(null, null);
            } else if (commandName.equals(TransformationCommandKeysAndValues.SubmitCleaningCommand.name())) {
                String examples = HistoryJsonUtil.getStringValue(
                        TransformationCommandKeysAndValues.examples.name(), inputParams);
                SubmitCleaningCommand comm = new SubmitCleaningCommand("", hNodeId,
                        vWorksheet.getId(), examples);
                comm.doIt(vWorkspace);
            } else if (commandName.equals(TransformationCommandKeysAndValues.SubmitPythonTransformationCommand.name())) {
                String newColumnName = HistoryJsonUtil.getStringValue(
                        TransformationCommandKeysAndValues.newColumnName.name(), inputParams);
                String transformationCode = HistoryJsonUtil.getStringValue(
                        TransformationCommandKeysAndValues.transformationCode.name(), inputParams);
                String errorDefaultValue = HistoryJsonUtil.getStringValue(
                        TransformationCommandKeysAndValues.errorDefaultValue.name(), inputParams);
                SubmitPythonTransformationCommand comm = new SubmitPythonTransformationCommand(
                        "", newColumnName, transformationCode, vWorksheet.getId(), hNodeId,
                        "", errorDefaultValue);
                try {
                    comm.doIt(vWorkspace);
                } catch (CommandException e) {
                    // logger.error already records the stack trace via the cause
                    logger.error("Error executing Python Transformation command", e);
                }
            } else if (commandName.equals(TransformationCommandKeysAndValues.RenameColumnCommand.name())) {
                String newColumnName = HistoryJsonUtil.getStringValue(
                        TransformationCommandKeysAndValues.newColumnName.name(), inputParams);
                RenameColumnCommand comm = new RenameColumnCommand("", newColumnName,
                        hNodeId, vWorksheet.getId());
                try {
                    comm.doIt(vWorkspace);
                } catch (CommandException e) {
                    logger.error("Error executing Rename Column command", e);
                }
            } else if (commandName.equals(TransformationCommandKeysAndValues.InvokeRubenReconciliationService.name())) {
                String alignmentNodeId = HistoryJsonUtil.getStringValue(
                        TransformationCommandKeysAndValues.alignmentNodeId.name(), inputParams);
                InvokeRubenReconciliationService comm = new InvokeRubenReconciliationService(
                        "", alignmentNodeId, vWorksheet.getId());
                try {
                    comm.doIt(vWorkspace);
                } catch (CommandException e) {
                    logger.error("Error executing Reconciliation service command", e);
                }
            }
        }
    }

    /**
     * Reads the km:hasTransformation command JSON objects off the mapping
     * resource and normalizes their input parameters (mapping stored column
     * names back to the current worksheet's HNode ids). Commands that fail
     * normalization are logged and skipped.
     */
    private List<String> getNormalizedTransformCommandsJSON(Resource mappingResource) throws JSONException {
        Property hasTransformation = model.getProperty(Uris.KM_HAS_TRANSFORMATION_URI);
        NodeIterator transItr = model.listObjectsOfProperty(mappingResource, hasTransformation);
        List<String> commsJSON = new ArrayList<String>();
        while (transItr.hasNext()) {
            String unnormalizedCommJson = transItr.next().toString();
            JSONObject unnormalizedCommObj = new JSONObject(unnormalizedCommJson);
            JSONArray inputParamArr = (JSONArray) unnormalizedCommObj.get(HistoryArguments.inputParameters.name());
            boolean result = WorksheetCommandHistoryReader.normalizeCommandHistoryJsonInput(
                    vWorkspace, vWorksheet.getId(), inputParamArr);
            if (!result) {
                logger.error("Error occured while normalizing the JSONinput for transformation " +
                        "commands.");
                continue;
            }
            commsJSON.add(unnormalizedCommObj.toString());
        }
        return commsJSON;
    }

    /**
     * Second pass over all triples maps: attaches their predicate-object maps.
     * Must run after {@link #createSubjectMaps} so that parent triples maps of
     * rr:RefObjectMaps can be resolved from {@link #triplesMapIndex}.
     */
    private void createPredicateObjectMaps(Resource mappingResource) throws JSONException {
        Property hasTrMapUri = model.getProperty(Uris.KM_HAS_TRIPLES_MAP_URI);
        // Get all the triple maps
        NodeIterator trMapsResItr = model.listObjectsOfProperty(mappingResource, hasTrMapUri);
        while (trMapsResItr.hasNext()) {
            // Add the predicate object maps
            addPredicateObjectMapsForTripleMap(trMapsResItr.next().asResource());
        }
    }

    /**
     * First pass over all triples maps: builds a {@link SubjectMap} and a
     * {@link TriplesMap} for each and indexes them by triples-map URI.
     */
    private void createSubjectMaps(Resource mappingResource) throws JSONException {
        Property hasTrMapUri = model.getProperty(Uris.KM_HAS_TRIPLES_MAP_URI);
        // Get all the triple maps
        NodeIterator trMapsResItr = model.listObjectsOfProperty(mappingResource, hasTrMapUri);
        while (trMapsResItr.hasNext()) {
            Resource trMapRes = trMapsResItr.next().asResource();
            SubjectMap subjMap = addSubjectMapForTripleMap(trMapRes);

            // Add the Triples map
            TriplesMap trMap = new TriplesMap(trMapRes.getURI(), subjMap);
            this.triplesMapIndex.put(trMapRes.getURI(), trMap);
            this.r2rmlMapping.addTriplesMap(trMap);
        }
    }

    /**
     * Parses all rr:predicateObjectMap blank nodes of the given triples map.
     * Object maps are either rr:RefObjectMaps (links to a parent triples map,
     * recorded as {@link TriplesMapLink}s in the aux info) or column-valued
     * object maps (recorded in the hNodeId-to-POM index).
     */
    private void addPredicateObjectMapsForTripleMap(Resource trMapRes) throws JSONException {
        Property predObjMapProp = model.getProperty(Uris.RR_PRED_OBJ_MAP_URI);
        Property predProp = model.getProperty(Uris.RR_PREDICATE_URI);
        Property objectMapProp = model.getProperty(Uris.RR_OBJECTMAP_URI);
        Property columnProp = model.getProperty(Uris.RR_COLUMN_URI);
        Resource rfObjClassUri = model.getResource(Uris.RR_REF_OBJECT_MAP_URI);
        Property parentTriplesMapProp = model.getProperty(Uris.RR_PARENT_TRIPLE_MAP_URI);
        Property rdfTypeProp = model.getProperty(Uris.RDF_TYPE_URI);

        TriplesMap trMap = this.triplesMapIndex.get(trMapRes.getURI());
        if (trMap == null) {
            logger.error("No Triples Map found for resource: " + trMapRes.getURI());
            return;
        }

        NodeIterator predObjItr = model.listObjectsOfProperty(trMapRes, predObjMapProp);
        while (predObjItr.hasNext()) {
            Resource pomBlankNode = predObjItr.next().asResource();
            // Create the PredicateObjectMap object for current POM
            PredicateObjectMap pom = new PredicateObjectMap(trMap);

            // Get the predicate for the POM
            Predicate pred = null;
            NodeIterator pomPredItr = model.listObjectsOfProperty(pomBlankNode, predProp);
            while (pomPredItr.hasNext()) {
                RDFNode pomPredNode = pomPredItr.next();
                pred = new Predicate(pomPredNode.toString() + "-" + getNewPredicateId());

                // Check if the predicate value is a URI or a literal (such as column name)
                if (pomPredNode instanceof Resource) {
                    pred.getTemplate().addTemplateTermToSet(
                            new StringTemplateTerm(((Resource) pomPredNode).getURI(), true));
                } else {
                    pred.setTemplate(TemplateTermSetBuilder.
                            constructTemplateTermSetFromR2rmlTemplateString(
                                    pomPredNode.toString(), worksheet, factory));
                }
            }
            pom.setPredicate(pred);

            // Get the object for the POM
            ObjectMap objMap = null;
            NodeIterator pomObjItr = model.listObjectsOfProperty(pomBlankNode, objectMapProp);
            while (pomObjItr.hasNext()) {
                Resource objNode = pomObjItr.next().asResource();

                /** Check if objBlankNode is a RefObjectMap or a normal object map with column **/
                if (model.contains(objNode, rdfTypeProp, rfObjClassUri)) {
                    NodeIterator parentTripleMapItr = model.listObjectsOfProperty(objNode,
                            parentTriplesMapProp);
                    while (parentTripleMapItr.hasNext()) {
                        Resource parentTripleRes = parentTripleMapItr.next().asResource();
                        TriplesMap parentTM = this.triplesMapIndex.get(parentTripleRes.getURI());

                        // Create a RefObjectMap
                        RefObjectMap rfMap = new RefObjectMap(objNode.getURI(), parentTM);
                        objMap = new ObjectMap(getNewObjectMapId(), rfMap);

                        // Add the link between triple maps in the auxInfo
                        TriplesMapLink link = new TriplesMapLink(trMap, parentTM, pom);
                        this.auxInfo.getTriplesMapGraph().addLink(link);
                    }
                } else {
                    NodeIterator objMapColStmts = model.listObjectsOfProperty(objNode, columnProp);
                    while (objMapColStmts.hasNext()) {
                        RDFNode colNode = objMapColStmts.next();
                        objMap = new ObjectMap(getNewObjectMapId(),
                                TemplateTermSetBuilder.constructTemplateTermSetFromR2rmlColumnString(
                                        colNode.toString(), worksheet, factory));
                    }
                    // Check if anything needs to be added to the hNodeIdToPredicateObjectMap Map.
                    // objMap stays null when the object map carried no rr:column statement;
                    // guard against that instead of hitting an NPE inside the helper.
                    if (objMap != null) {
                        addHNodeIdToPredObjectMapLink(objMap, pom);
                    }
                }
            }
            pom.setObject(objMap);
            trMap.addPredicateObjectMap(pom);
        }
    }

    /**
     * Indexes the given POM under every worksheet column (HNode id) referenced
     * by the object map's template.
     */
    private void addHNodeIdToPredObjectMapLink(ObjectMap objMap, PredicateObjectMap pom) {
        TemplateTermSet objTermSet = objMap.getTemplate();
        for (TemplateTerm term : objTermSet.getAllTerms()) {
            if (term instanceof ColumnTemplateTerm) {
                String hNodeId = term.getTemplateTermValue();
                List<PredicateObjectMap> existingPomList = this.auxInfo.
                        getHNodeIdToPredObjLinks().get(hNodeId);
                if (existingPomList == null) {
                    existingPomList = new ArrayList<PredicateObjectMap>();
                }
                existingPomList.add(pom);
                this.auxInfo.getHNodeIdToPredObjLinks().put(hNodeId, existingPomList);
            }
        }
    }

    /** Mints the next unique predicate id. */
    private int getNewPredicateId() {
        return predicateIdCounter++;
    }

    /** Mints the next unique object map id, e.g. "ObjectMap1". */
    private String getNewObjectMapId() {
        return "ObjectMap" + objectMapCounter++;
    }

    /**
     * Parses the rr:subjectMap of the given triples map: its URI template, its
     * rr:class types (URI, template, or anonymous template node), and whether
     * it is the Steiner tree root. The subject map is indexed by its blank
     * node label and its resource is remembered in {@link #subjectMapResources}.
     *
     * @return the last subject map parsed (a triples map normally has one)
     */
    private SubjectMap addSubjectMapForTripleMap(Resource trMapRes) throws JSONException {
        SubjectMap subjMap = null;
        Property subjMapProp = model.getProperty(Uris.RR_SUBJECTMAP_URI);
        Property templateProp = model.getProperty(Uris.RR_TEMPLATE_URI);
        Property rdfTypeProp = model.getProperty(Uris.RDF_TYPE_URI);
        Property rrClassProp = model.getProperty(Uris.RR_CLASS_URI);
        Resource steinerTreeRootNodeRes = model.getResource(Uris.KM_STEINER_TREE_ROOT_NODE);

        NodeIterator subjMapsItr = model.listObjectsOfProperty(trMapRes, subjMapProp);
        while (subjMapsItr.hasNext()) {
            Resource subjMapBlankRes = subjMapsItr.next().asResource();
            subjectMapResources.add(subjMapBlankRes);

            String subjMapId = subjMapBlankRes.getId().getLabelString();
            subjMap = new SubjectMap(subjMapId);
            this.subjectMapIndex.put(subjMapId, subjMap);

            // Get the subject template
            NodeIterator templateItr = model.listObjectsOfProperty(subjMapBlankRes, templateProp);
            TemplateTermSet subjTemplTermSet = null;
            while (templateItr.hasNext()) {
                RDFNode templNode = templateItr.next();
                String template = templNode.toString();
                subjTemplTermSet = TemplateTermSetBuilder.constructTemplateTermSetFromR2rmlTemplateString(
                        template, worksheet, factory);
            }
            subjMap.setTemplate(subjTemplTermSet);

            // Get the subject type
            NodeIterator rdfTypesItr = model.listObjectsOfProperty(subjMapBlankRes, rrClassProp);
            while (rdfTypesItr.hasNext()) {
                RDFNode typeNode = rdfTypesItr.next();
                // Anonymous type node: its rr:template carries the type
                if (typeNode.isAnon()) {
                    NodeIterator typeTemplItr = model.listObjectsOfProperty(typeNode.asResource(),
                            templateProp);
                    while (typeTemplItr.hasNext()) {
                        RDFNode templNode = typeTemplItr.next();
                        String template = templNode.toString();
                        TemplateTermSet typeTermSet = TemplateTermSetBuilder.
                                constructTemplateTermSetFromR2rmlTemplateString(
                                        template, worksheet, factory);
                        subjMap.addRdfsType(typeTermSet);
                    }
                    continue;
                }

                if (typeNode instanceof Resource) {
                    // Skip the steiner tree root type
                    if (((Resource) typeNode).getURI().equals(Uris.KM_STEINER_TREE_ROOT_NODE)) {
                        continue;
                    }
                    StringTemplateTerm uriTerm = new StringTemplateTerm(
                            ((Resource) typeNode).getURI(), true);
                    TemplateTermSet typeTermSet = new TemplateTermSet();
                    typeTermSet.addTemplateTermToSet(uriTerm);
                    subjMap.addRdfsType(typeTermSet);
                } else {
                    // Literal type value: treat it as an R2RML template string
                    TemplateTermSet typeTermSet = TemplateTermSetBuilder.
                            constructTemplateTermSetFromR2rmlTemplateString(
                                    typeNode.toString(), worksheet, factory);
                    subjMap.addRdfsType(typeTermSet);
                }
            }

            // Check if it is as the Steiner tree root node
            if (model.contains(subjMapBlankRes, rdfTypeProp, steinerTreeRootNodeRes)) {
                subjMap.setAsSteinerTreeRootNode(true);
            }
        }
        return subjMap;
    }

    /**
     * For every subject map with rr:termType rr:BlankNode, records in the aux
     * info which worksheet columns (HNode ids) the blank node covers and its
     * URI prefix. Covered columns are given either as a JSON array of nested
     * column names (hierarchical) or as a single column name.
     */
    private void calculateColumnNodesCoveredByBlankNodes() throws JSONException {
        Property termTypeProp = model.getProperty(Uris.RR_TERM_TYPE_URI);
        Resource blankNodeRes = model.getResource(Uris.RR_BLANK_NODE_URI);
        Property kmCoverColumnProp = model.getProperty(Uris.KM_BLANK_NODE_COVERS_COLUMN_URI);
        Property kmBnodePrefixProp = model.getProperty(Uris.KM_BLANK_NODE_PREFIX_URI);
        List<HNodePath> allColPaths = worksheet.getHeaders().getAllPaths();

        // NOTE: the previous implementation walked a second, independent
        // iterator (listResourcesWithProperty) in lockstep with
        // subjectMapResources and assumed both had the same order. We now use
        // the subject map resource itself, which model.contains() has already
        // confirmed to be a blank-node subject map.
        for (Resource subjMapRes : subjectMapResources) {
            if (model.contains(subjMapRes, termTypeProp, blankNodeRes)) {
                SubjectMap subjMap = this.subjectMapIndex.get(subjMapRes.getId().getLabelString());
                subjMap.setAsBlankNode(true);

                // Get the column it covers
                NodeIterator coverColItr = model.listObjectsOfProperty(subjMapRes,
                        kmCoverColumnProp);
                List<String> columnsCoveredHnodeIds = new ArrayList<String>();
                while (coverColItr.hasNext()) {
                    RDFNode coveredColNode = coverColItr.next();
                    String coveredColStr = coveredColNode.asLiteral().getString();

                    // If hierarchical column
                    if (coveredColStr.startsWith("[") && coveredColStr.endsWith("]")) {
                        JSONArray strArr = new JSONArray(coveredColStr);
                        HTable hTable = worksheet.getHeaders();
                        for (int i = 0; i < strArr.length(); i++) {
                            String cName = (String) strArr.get(i);
                            logger.debug("Column being normalized: " + cName);
                            // Guard BEFORE dereferencing: a missing nested table or
                            // column used to fall through and NPE on hNode below.
                            HNode hNode = (hTable == null) ? null
                                    : hTable.getHNodeFromColumnName(cName);
                            if (hNode == null) {
                                logger.error("Error retrieving column: " + cName);
                                break; // cannot descend any further in this path
                            }
                            if (i == strArr.length() - 1) {
                                // Found!
                                String hNodeId = hNode.getId();
                                columnsCoveredHnodeIds.add(hNodeId);
                            } else {
                                hTable = hNode.getNestedTable();
                            }
                        }
                    }
                    // Single level column
                    else {
                        for (HNodePath path : allColPaths) {
                            HNode lastNode = path.getLeaf();
                            if (coveredColStr.equals(lastNode.getColumnName())) {
                                columnsCoveredHnodeIds.add(lastNode.getId());
                            }
                        }
                    }
                }
                logger.debug("Adding columns for blank node" + subjMap.getId() +
                        " List: " + columnsCoveredHnodeIds);
                this.auxInfo.getBlankNodesColumnCoverage().put(subjMap.getId(), columnsCoveredHnodeIds);

                // Get the blank node prefix
                NodeIterator bnodePrefixItr = model.listObjectsOfProperty(subjMapRes,
                        kmBnodePrefixProp);
                while (bnodePrefixItr.hasNext()) {
                    this.auxInfo.getBlankNodesUriPrefixMap().put(subjMap.getId(),
                            bnodePrefixItr.next().toString());
                }
            }
        }
    }

    /** @return auxiliary indexes built during parsing (links, coverage, prefixes) */
    public KR2RMLMappingAuxillaryInformation getAuxInfo() {
        return auxInfo;
    }

    /** @return the parsed R2RML mapping */
    public R2RMLMapping getR2rmlMapping() {
        return r2rmlMapping;
    }
}