/*
* Copyright 2014 reto.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package eu.fusepool.datalifecycle.core;
import java.io.PrintWriter;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.concurrent.locks.Lock;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.NonLiteral;
import org.apache.clerezza.rdf.core.Resource;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.access.LockableMGraph;
import org.apache.clerezza.rdf.core.impl.SimpleMGraph;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.clerezza.rdf.ontologies.OWL;
import static org.apache.clerezza.rdf.ontologies.PLATFORM.baseUri;
import org.apache.clerezza.rdf.utils.UnionMGraph;
import org.apache.clerezza.rdf.utils.smushing.SameAsSmusher;
import org.apache.stanbol.commons.indexedgraph.IndexedMGraph;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Provides the functionality to perform a smush job on a dataset. This class
* does not itself extend Task but is typically used from within a task. The
* job also ensures all temporary (urn:x-temp:) URIs are replaced with HTTP URIs.
*
* @author reto
*/
class SmushingJob {
/**
* Smush the union of the digest and enhancements graphs using the
* interlinking graph. More precisely collates URIs coming from different
* equivalent resources in a single one chosen among them. All the triples
* in the union graph are copied in the smush graph that is then smushed
* using the interlinking graph. URIs are canonicalized to http://
*
* @param graphToSmushRef
* @return
*/
static void perform(DataSet dataSet, PrintWriter messageWriter, UriRef baseUri) {
new SmushingJob(dataSet, messageWriter, baseUri).perform();
}
// Scheme of non-http URI used
static final String URN_SCHEME = "urn:x-temp:";
private final DataSet dataSet;
private static final Logger log = LoggerFactory.getLogger(SmushingJob.class);
private final PrintWriter messageWriter;
private final String baseUriString;
private SmushingJob(DataSet dataSet, PrintWriter messageWriter, UriRef baseUri) {
this.dataSet = dataSet;
this.messageWriter = messageWriter;
this.baseUriString = baseUri.getUnicodeString();
}
/**
* Smush the union of the source, digest and enhancements graphs using the
* interlinking graph. More precisely collates URIs coming from different
* equivalent resources in a single one chosen among them. All the triples
* in the union graph are copied in the smush graph that is then smushed
* using the interlinking graph. URIs are canonicalized to http://
*
* @param graphToSmushRef
* @return
*/
void perform() {
messageWriter.println("Smushing task.");
final SameAsSmusher smusher = new SameAsSmusher() {
@Override
protected UriRef getPreferedIri(Set<UriRef> uriRefs
) {
Set<UriRef> httpUri = new HashSet<UriRef>();
for (UriRef uriRef : uriRefs) {
if (uriRef.getUnicodeString().startsWith("http")) {
httpUri.add(uriRef);
}
}
if (httpUri.size() == 1) {
return httpUri.iterator().next();
}
// There is no http URI in the set of equivalent resource. The entity was unknown.
// A new representation of the entity with http URI will be created.
if (httpUri.size() == 0) {
return generateNewHttpUri(dataSet, uriRefs);
}
if (httpUri.size() > 1) {
return chooseBest(httpUri);
}
throw new Error("Negative size set.");
}
};
if (dataSet.getSmushGraph().size() > 0) {
dataSet.getSmushGraph().clear();
}
dataSet.getSmushGraph().addAll(dataSet.getDigestGraph());
dataSet.getSmushGraph().addAll(dataSet.getEnhancementsGraph());
log.info("All triples from the union of digest and enhancements graph are now in the smush graph.");
log.info("Starting smushing.");
smusher.smush(dataSet.getSmushGraph(), dataSet.getInterlinksGraph(), true);
log.info("Smush task completed.");
// Remove from smush graph equivalences between temporary uri (urn:x-temp) and http uri that are added by the clerezza smusher.
// These equivalences must be removed as only equivalences between known entities (http uri) must be maintained and then published
MGraph equivToRemove = new SimpleMGraph();
Lock srl = dataSet.getSmushGraph().getLock().readLock();
srl.lock();
try {
Iterator<Triple> isameas = dataSet.getSmushGraph().filter(null, OWL.sameAs, null);
while (isameas.hasNext()) {
Triple sameas = isameas.next();
NonLiteral subject = sameas.getSubject();
Resource object = sameas.getObject();
if (subject.toString().startsWith("<" + URN_SCHEME) || object.toString().startsWith("<" + URN_SCHEME)) {
equivToRemove.add(sameas);
}
}
} finally {
srl.unlock();
}
dataSet.getSmushGraph().removeAll(equivToRemove);
messageWriter.println("Smushing of " + dataSet.getUri()
+ "Smushed graph size = " + dataSet.getSmushGraph().size());
canonicalizeResources();
}
/**
* Generates a new http URI that will be used as the canonical one in place
* of a set of equivalent non-http URIs. An owl:sameAs statement is added to
* the interlinking graph stating that the canonical http URI is equivalent
* to one of the non-http URI in the set of equivalent URIs.
*
* @param uriRefs
* @return
*/
private UriRef generateNewHttpUri(DataSet dataSet, Set<UriRef> uriRefs) {
UriRef bestNonHttp = chooseBest(uriRefs);
String nonHttpString = bestNonHttp.getUnicodeString();
if (!nonHttpString.startsWith(URN_SCHEME)) {
throw new RuntimeException("Sorry we current assume all non-http "
+ "URIs to be canonicalized to be urn:x-temp, cannot handle: " + nonHttpString);
}
String httpUriString = nonHttpString.replaceFirst(URN_SCHEME, baseUriString);
//TODO check that this URI is in fact new
UriRef httpUriRef = new UriRef(httpUriString);
// add an owl:sameAs statement in the interlinking graph
dataSet.getInterlinksGraph().add(new TripleImpl(bestNonHttp, OWL.sameAs, httpUriRef));
return httpUriRef;
}
private UriRef chooseBest(Set<UriRef> httpUri) {
Iterator<UriRef> iter = httpUri.iterator();
UriRef best = iter.next();
while (iter.hasNext()) {
UriRef next = iter.next();
if (next.getUnicodeString().compareTo(best.getUnicodeString()) < 0) {
best = next;
}
}
return best;
}
/**
* All the resources in the smush graph must be http dereferencable when
* published. All the triples in the smush graph are copied into a temporary
* graph. For each triple the subject and the object that have a non-http
* URI are changed in http uri and an equivalence link is added in the
* interlinking graph for each resource (subject and object) that has been
* changed.
*/
private void canonicalizeResources() {
LockableMGraph graph = dataSet.getSmushGraph();
MGraph graphCopy = new SimpleMGraph();
// graph containing the same triple with the http URI for each subject and object
MGraph canonicGraph = new SimpleMGraph();
Lock rl = graph.getLock().readLock();
rl.lock();
try {
graphCopy.addAll(graph);
} finally {
rl.unlock();
}
Iterator<Triple> ismushTriples = graphCopy.iterator();
while (ismushTriples.hasNext()) {
Triple triple = ismushTriples.next();
UriRef subject = (UriRef) triple.getSubject();
Resource object = triple.getObject();
// generate an http URI for both subject and object and add an equivalence link into the interlinking graph
if (subject.getUnicodeString().startsWith(URN_SCHEME)) {
subject = generateNewHttpUri(dataSet, Collections.singleton(subject));
}
if (object.toString().startsWith("<" + URN_SCHEME)) {
object = generateNewHttpUri(dataSet, Collections.singleton((UriRef) object));
}
// add the triple with the http uris to the canonic graph
canonicGraph.add(new TripleImpl(subject, triple.getPredicate(), object));
}
Lock wl = graph.getLock().writeLock();
wl.lock();
try {
graph.clear();
graph.addAll(canonicGraph);
} finally {
wl.unlock();
}
}
}