/**
* This file is part of d:swarm graph extension.
*
* d:swarm graph extension is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* d:swarm graph extension is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with d:swarm graph extension. If not, see <http://www.gnu.org/licenses/>.
*/
package org.dswarm.graph.gdm.parse;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;
import com.google.common.base.Optional;
import org.apache.jena.vocabulary.RDF;
import org.neo4j.graphdb.DynamicLabel;
import org.neo4j.graphdb.DynamicRelationshipType;
import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.graphdb.Label;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.Relationship;
import org.neo4j.graphdb.RelationshipType;
import org.neo4j.graphdb.Transaction;
import org.neo4j.graphdb.index.Index;
import org.neo4j.graphdb.index.IndexHits;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.dswarm.graph.DMPGraphException;
import org.dswarm.graph.GraphIndexStatics;
import org.dswarm.graph.GraphProcessingStatics;
import org.dswarm.graph.NodeType;
import org.dswarm.graph.hash.HashUtils;
import org.dswarm.graph.index.NamespaceIndex;
import org.dswarm.graph.json.LiteralNode;
import org.dswarm.graph.json.ResourceNode;
import org.dswarm.graph.json.Statement;
import org.dswarm.graph.model.GraphStatics;
import org.dswarm.graph.parse.Neo4jHandler;
/**
 * A {@link GDMHandler} that writes GDM statements into a Neo4j database, re-using existing nodes and relationships where they
 * can be found (resource nodes by label + URI, blank nodes by their id within this handler, relationships by statement hash).
 * <p>
 * All writes happen inside a transaction that is opened in the constructor and re-opened after every intermediate commit (see
 * {@link #DELTA_SIZE} and {@link #DELTA_TIME}). Callers finish the work with {@link #closeTransaction()}.
 * <p>
 * The handler keeps mutable counters and maps without any synchronisation.
 * <p>
 * TODO: re-factor usage of CommonHandler
 *
 * @author tgaengler
 */
public class Neo4jDeltaGDMHandler implements GDMHandler {

    private static final Logger LOG = LoggerFactory.getLogger(Neo4jDeltaGDMHandler.class);

    /**
     * Number of handled statements after which the running transaction is committed.
     */
    public static final int DELTA_SIZE = 50000;

    /**
     * Number of seconds after which the running transaction is committed.
     */
    public static final int DELTA_TIME = 30;

    private long totalTriples = 0;
    private int addedNodes = 0;
    // counted in addLabel(..), but not exposed by this class
    private int addedLabels = 0;
    private int addedRelationships = 0;
    private long sinceLastCommit = 0;
    // counts every call of handleStatement(..) (incl. failed ones); not read within this class
    private int i = 0;
    private int literals = 0;
    private long tick = System.currentTimeMillis();

    private final GraphDatabaseService database;
    // blank nodes created by this handler, keyed by the string form of the GDM node id
    private final Map<String, Node> bnodes;
    private final Index<Relationship> statementHashes;
    private final Index<Relationship> statementUUIDs;
    // Neo4j node id -> resource hash that was determined for this node
    private final Map<Long, Long> nodeResourceMap;
    private final NamespaceIndex namespaceIndex;

    private Transaction tx;

    /**
     * Opens a write transaction and loads the statement indices.
     *
     * @param database          the database to write to
     * @param namespaceIndexArg the namespace index that is utilised to create prefixed URIs
     * @throws DMPGraphException if the indices couldn't be loaded; the transaction is marked as failed and closed in this case
     */
    public Neo4jDeltaGDMHandler(final GraphDatabaseService database, final NamespaceIndex namespaceIndexArg) throws DMPGraphException {

        this.database = database;
        namespaceIndex = namespaceIndexArg;
        tx = database.beginTx();

        try {

            LOG.debug("start write TX");

            bnodes = new HashMap<>();
            statementHashes = database.index().forRelationships(GraphIndexStatics.STATEMENT_HASHES_INDEX_NAME);
            statementUUIDs = database.index().forRelationships(GraphIndexStatics.STATEMENT_UUIDS_INDEX_NAME);
            nodeResourceMap = new HashMap<>();
        } catch (final Exception e) {

            tx.failure();
            tx.close();

            final String message = "couldn't load indices successfully";

            Neo4jDeltaGDMHandler.LOG.error(message, e);
            Neo4jDeltaGDMHandler.LOG.debug("couldn't finish write TX successfully");

            throw graphException(message, e);
        }
    }

    /**
     * Writes one statement into the database: determines or creates the subject node, determines or creates the object node and
     * connects both with a relationship (unless a relationship with the same statement hash is already indexed). Afterwards the
     * running transaction is committed and re-opened if {@link #DELTA_SIZE} statements or {@link #DELTA_TIME} seconds have
     * passed since the last commit.
     *
     * @param st           the statement to write
     * @param resourceHash the resource hash that is utilised when the subject is not a resource node and no hash is known for
     *                     the subject node yet
     * @param index        the value that is stored as index property at a newly created relationship
     * @throws DMPGraphException if anything goes wrong; the running transaction is marked as failed and closed in this case
     */
    @Override
    public void handleStatement(final Statement st, final long resourceHash, final long index) throws DMPGraphException {

        i++;

        try {

            final org.dswarm.graph.json.Node subject = st.getSubject();
            final org.dswarm.graph.json.Predicate predicate = st.getPredicate();
            final String predicateName = predicate.getUri();
            final String prefixedPredicateURI = namespaceIndex.createPrefixedURI(predicateName);
            final org.dswarm.graph.json.Node object = st.getObject();
            final Long statementUUID = HashUtils.getUUID(st.getUUID());
            final Long order = st.getOrder();

            // TODO: what should we do, if the subject is a resource type?
            Node subjectNode = determineNode(subject, false);

            if (subjectNode == null) {

                subjectNode = createSubjectNode(subject);
            }

            if (object instanceof LiteralNode) {

                handleLiteralObject(subjectNode, subject, (LiteralNode) object, prefixedPredicateURI, resourceHash, statementUUID, order,
                        index);
            } else {

                handleNonLiteralObject(subjectNode, subject, object, predicateName, prefixedPredicateURI, resourceHash, statementUUID,
                        order, index);
            }

            totalTriples++;

            commitIfDue();
        } catch (final Exception e) {

            final String message = "couldn't finish write TX successfully";

            LOG.error(message, e);

            tx.failure();
            tx.close();

            throw graphException(message, e);
        }
    }

    @Override public NamespaceIndex getNamespaceIndex() {

        return namespaceIndex;
    }

    /**
     * This handler doesn't wrap a {@link Neo4jHandler}.
     *
     * @return always {@code null}
     */
    @Override public Neo4jHandler getHandler() {

        // nothing TODO here ...
        return null;
    }

    @Override public GraphDatabaseService getDatabase() {

        return database;
    }

    /**
     * Marks the running transaction as successful and closes it.
     */
    public void closeTransaction() {

        LOG.debug("close write TX finally");

        tx.success();
        tx.close();
    }

    @Override
    public long getCountedStatements() {

        return totalTriples;
    }

    @Override
    public int getNodesAdded() {

        return addedNodes;
    }

    @Override
    public int getRelationshipsAdded() {

        return addedRelationships;
    }

    @Override
    public int getCountedLiterals() {

        return literals;
    }

    /**
     * Creates the exception that is thrown to callers and attaches the cause to it. {@link Throwable#initCause(Throwable)} is
     * utilised so that only the single-argument constructor of {@link DMPGraphException} is required.
     */
    private static DMPGraphException graphException(final String message, final Throwable cause) {

        final DMPGraphException exception = new DMPGraphException(message);

        try {

            exception.initCause(cause);
        } catch (final IllegalStateException ignored) {

            // the cause of the exception was already fixed at construction time => deliver the exception as it is
        }

        return exception;
    }

    /**
     * Creates the node for a subject that couldn't be found: a resource node (with prefixed URI) or a blank node (which is
     * remembered by its GDM node id).
     */
    private Node createSubjectNode(final org.dswarm.graph.json.Node subject) throws DMPGraphException {

        final Node subjectNode;

        if (subject instanceof ResourceNode) {

            subjectNode = database.createNode(GraphProcessingStatics.RESOURCE_LABEL);

            final String subjectURI = ((ResourceNode) subject).getUri();
            final String prefixedSubjectURI = namespaceIndex.createPrefixedURI(subjectURI);

            subjectNode.setProperty(GraphStatics.URI_PROPERTY, prefixedSubjectURI);
        } else {

            // subject is a blank node
            subjectNode = database.createNode(GraphProcessingStatics.BNODE_LABEL);

            // note: can I expect an id here?
            bnodes.put("" + subject.getId(), subjectNode);
        }

        addedNodes++;

        return subjectNode;
    }

    /**
     * Handles a statement whose object is a literal: a new literal node is always created and connected to the subject node.
     */
    private void handleLiteralObject(final Node subjectNode, final org.dswarm.graph.json.Node subject, final LiteralNode literal,
            final String prefixedPredicateURI, final long resourceHash, final Long statementUUID, final Long order, final long index)
            throws DMPGraphException {

        literals++;

        final String value = literal.getValue();
        final Node objectNode = database.createNode(GraphProcessingStatics.LEAF_LABEL, GraphProcessingStatics.LITERAL_LABEL);

        objectNode.setProperty(GraphStatics.VALUE_PROPERTY, value);
        // not really needed, or? -since label is set
        objectNode.setProperty(GraphProcessingStatics.LEAF_IDENTIFIER, true);

        final long finalResourceHash = addResourceProperty(subjectNode, subject, objectNode, resourceHash);

        addedNodes++;

        addRelationship(subjectNode, prefixedPredicateURI, objectNode, Optional.of(finalResourceHash), subject, resourceHash,
                statementUUID, order, index, subject.getType(), literal.getType());
    }

    /**
     * Handles a statement whose object is a resource or a blank node: the object node is looked up (or created) and connected to
     * the subject node. For rdf:type statements with a resource object the prefixed object URI is additionally set as label at
     * the subject node.
     */
    private void handleNonLiteralObject(final Node subjectNode, final org.dswarm.graph.json.Node subject,
            final org.dswarm.graph.json.Node object, final String predicateName, final String prefixedPredicateURI,
            final long resourceHash, final Long statementUUID, final Long order, final long index) throws DMPGraphException {

        final boolean isType = predicateName.equals(RDF.type.getURI());
        final Optional<String> optionalPrefixedObjectURI;

        // only a resource node delivers a URI that can be utilised as label; a blank node as type is handled below
        if (isType && object instanceof ResourceNode) {

            final String objectURI = ((ResourceNode) object).getUri();

            optionalPrefixedObjectURI = Optional.of(namespaceIndex.createPrefixedURI(objectURI));

            addLabel(subjectNode, optionalPrefixedObjectURI.get());
        } else {

            optionalPrefixedObjectURI = Optional.absent();
        }

        Node objectNode = determineNode(object, isType);
        Optional<Long> optionalResourceHash = Optional.absent();

        if (objectNode == null) {

            if (object instanceof ResourceNode) {

                objectNode = database.createNode(GraphProcessingStatics.LEAF_LABEL, GraphProcessingStatics.RESOURCE_LABEL);

                // not really needed, or? -since label is set
                objectNode.setProperty(GraphProcessingStatics.LEAF_IDENTIFIER, true);

                final String finalPrefixedObjectURI;

                if (optionalPrefixedObjectURI.isPresent()) {

                    finalPrefixedObjectURI = optionalPrefixedObjectURI.get();
                } else {

                    final String objectURI = ((ResourceNode) object).getUri();

                    finalPrefixedObjectURI = namespaceIndex.createPrefixedURI(objectURI);
                }

                objectNode.setProperty(GraphStatics.URI_PROPERTY, finalPrefixedObjectURI);

                if (isType) {

                    addLabel(objectNode, NodeType.TypeResource.toString());
                    addLabel(objectNode, namespaceIndex.getRDFCLASSPrefixedURI());
                }
            } else {

                // object is a blank node
                objectNode = database.createNode(GraphProcessingStatics.BNODE_LABEL);

                bnodes.put("" + object.getId(), objectNode);

                if (!isType) {

                    optionalResourceHash = Optional.of(addResourceProperty(subjectNode, subject, objectNode, resourceHash));
                } else {

                    addLabel(objectNode, NodeType.TypeBNode.toString());
                    addLabel(objectNode, namespaceIndex.getRDFCLASSPrefixedURI());
                }
            }

            addedNodes++;
        }

        // note: rdf:type statements are written as relationships as well
        addRelationship(subjectNode, prefixedPredicateURI, objectNode, optionalResourceHash, subject, resourceHash, statementUUID,
                order, index, subject.getType(), object.getType());
    }

    /**
     * Commits the running transaction and opens a new one, if at least {@link #DELTA_SIZE} statements were handled or at least
     * {@link #DELTA_TIME} seconds have passed since the last commit.
     */
    private void commitIfDue() {

        final long nodeDelta = totalTriples - sinceLastCommit;
        final long elapsedMillis = System.currentTimeMillis() - tick;
        final long timeDelta = elapsedMillis / 1000;

        if (nodeDelta < DELTA_SIZE && timeDelta < DELTA_TIME) {

            return;
        }

        tx.success();
        tx.close();
        tx = database.beginTx();

        sinceLastCommit = totalTriples;

        // calculate on milliseconds (at least one) so that a commit within the first second doesn't report "Infinity"
        final double duration = nodeDelta * 1000.0 / Math.max(elapsedMillis, 1L);

        LOG.debug("{} triples @ ~{} triples/second.", totalTriples, duration);

        tick = System.currentTimeMillis();
    }

    /**
     * Adds the given label to the node, if the node doesn't carry it yet.
     */
    private void addLabel(final Node node, final String labelString) {

        final Label label = DynamicLabel.label(labelString);

        if (node.hasLabel(label)) {

            return;
        }

        node.addLabel(label);
        addedLabels++;
    }

    /**
     * Looks up the relationship of a statement via its hash (built from node types, node identifiers and the predicate) and
     * creates + indexes it, if it doesn't exist yet.
     *
     * @return the existing or the newly created relationship
     */
    private Relationship addRelationship(final Node subjectNode, final String predicateName, final Node objectNode,
            final Optional<Long> optionalResourceHash,
            final org.dswarm.graph.json.Node subject, final long resourceHash, final Long statementUUID, final Long order, final long index,
            final org.dswarm.graph.json.NodeType subjectNodeType, final org.dswarm.graph.json.NodeType objectNodeType) throws DMPGraphException {

        final String subjectIdentifier = getIdentifier(subjectNode, subjectNodeType);
        final String objectIdentifier = getIdentifier(objectNode, objectNodeType);

        final StringBuilder sb = new StringBuilder();

        sb.append(subjectNodeType).append(":").append(subjectIdentifier).append(" ").append(predicateName).append(" ")
                .append(objectNodeType).append(":").append(objectIdentifier).append(" ");

        final long hash = HashUtils.generateHash(sb.toString());

        final IndexHits<Relationship> hits = statementHashes.get(GraphStatics.HASH, hash);

        try {

            if (hits != null && hits.hasNext()) {

                // statement exists already
                return hits.next();
            }

            final RelationshipType relType = DynamicRelationshipType.withName(predicateName);
            final Relationship rel = subjectNode.createRelationshipTo(objectNode, relType);

            final long finalStatementUUID;

            if (statementUUID == null) {

                finalStatementUUID = HashUtils.generateHash(UUID.randomUUID().toString());
            } else {

                finalStatementUUID = statementUUID;
            }

            rel.setProperty(GraphStatics.UUID_PROPERTY, finalStatementUUID);

            if (order != null) {

                rel.setProperty(GraphStatics.ORDER_PROPERTY, order);
            }

            rel.setProperty(GraphStatics.INDEX_PROPERTY, index);

            statementHashes.add(rel, GraphStatics.HASH, hash);
            statementUUIDs.add(rel, GraphStatics.UUID, finalStatementUUID);

            addedRelationships++;

            addResourceProperty(subjectNode, subject, rel, optionalResourceHash, resourceHash);

            return rel;
        } finally {

            // release the index hits also when creating the relationship fails
            if (hits != null) {

                hits.close();
            }
        }
    }

    /**
     * Tries to find the database node for the given GDM node.
     *
     * @param resource the GDM node
     * @param isType   whether a resource node should be looked up via the resource type label instead of the resource label
     * @return the matching database node, or {@code null} if there is none (always {@code null} for literal nodes)
     */
    private Node determineNode(final org.dswarm.graph.json.Node resource, final boolean isType) throws DMPGraphException {

        if (resource instanceof ResourceNode) {

            final String resourceURI = ((ResourceNode) resource).getUri();
            final String prefixedResourceURI = namespaceIndex.createPrefixedURI(resourceURI);
            final Label lookupLabel = isType ? GraphProcessingStatics.RESOURCE_TYPE_LABEL : GraphProcessingStatics.RESOURCE_LABEL;

            return database.findNode(lookupLabel, GraphStatics.URI_PROPERTY, prefixedResourceURI);
        }

        if (resource instanceof LiteralNode) {

            // literal node - should never be the case
            return null;
        }

        // resource must be a blank node
        return bnodes.get("" + resource.getId());
    }

    /**
     * Determines the resource hash of the subject and sets it as resource property at the object node.
     *
     * @return the resource hash that was set
     */
    private long addResourceProperty(final Node subjectNode, final org.dswarm.graph.json.Node subject, final Node objectNode,
            final long resourceHash) throws DMPGraphException {

        final long finalResourceHash = determineResourceHash(subjectNode, subject, resourceHash);

        objectNode.setProperty(GraphStatics.RESOURCE_PROPERTY, finalResourceHash);

        return finalResourceHash;
    }

    /**
     * Sets the resource property at the relationship: the given optional resource hash if it is present, otherwise the resource
     * hash that is determined for the subject.
     *
     * @return the resource hash that was set
     */
    private long addResourceProperty(final Node subjectNode, final org.dswarm.graph.json.Node subject, final Relationship rel,
            final Optional<Long> optionalResourceHash, final long resourceHash) throws DMPGraphException {

        final long finalResourceHash;

        if (optionalResourceHash.isPresent()) {

            finalResourceHash = optionalResourceHash.get();
        } else {

            finalResourceHash = determineResourceHash(subjectNode, subject, resourceHash);
        }

        rel.setProperty(GraphStatics.RESOURCE_PROPERTY, finalResourceHash);

        return finalResourceHash;
    }

    /**
     * Delivers the resource hash for the subject node: the hash that was remembered for this node, otherwise the hash of the
     * prefixed subject URI (resource node) or the given resource hash (any other node). A freshly determined hash is remembered.
     */
    private long determineResourceHash(final Node subjectNode, final org.dswarm.graph.json.Node subject, final long resourceHash)
            throws DMPGraphException {

        final long nodeId = subjectNode.getId();
        final Long knownResourceHash = nodeResourceMap.get(nodeId);

        if (knownResourceHash != null) {

            return knownResourceHash;
        }

        final long finalResourceHash;

        if (subject instanceof ResourceNode) {

            final String subjectURI = ((ResourceNode) subject).getUri();
            final String prefixedSubjectURI = namespaceIndex.createPrefixedURI(subjectURI);

            finalResourceHash = HashUtils.generateHash(prefixedSubjectURI);
        } else {

            finalResourceHash = resourceHash;
        }

        nodeResourceMap.put(nodeId, finalResourceHash);

        return finalResourceHash;
    }

    /**
     * Delivers the identifier of a node that goes into the statement hash: the URI property (resource), the database node id
     * (blank node) or the value property (literal).
     *
     * @return the identifier, or {@code null} if the property is not set or the node type is none of the three above
     */
    private String getIdentifier(final Node node, final org.dswarm.graph.json.NodeType nodeType) {

        switch (nodeType) {

            case Resource:

                return (String) node.getProperty(GraphStatics.URI_PROPERTY, null);
            case BNode:

                return "" + node.getId();
            case Literal:

                return (String) node.getProperty(GraphStatics.VALUE_PROPERTY, null);
            default:

                return null;
        }
    }
}