package at.ac.univie.mminf.qskos4j.issues.inlinks;
import at.ac.univie.mminf.qskos4j.issues.Issue;
import at.ac.univie.mminf.qskos4j.issues.concepts.AuthoritativeConcepts;
import at.ac.univie.mminf.qskos4j.progress.MonitoredIterator;
import at.ac.univie.mminf.qskos4j.result.ExtrapolatedCollectionResult;
import at.ac.univie.mminf.qskos4j.util.RandomSubSet;
import org.openrdf.OpenRDFException;
import org.openrdf.model.Resource;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.impl.URIImpl;
import org.openrdf.query.QueryEvaluationException;
import org.openrdf.query.QueryLanguage;
import org.openrdf.query.TupleQueryResult;
import org.openrdf.repository.Repository;
import org.openrdf.repository.RepositoryConnection;
import org.openrdf.repository.sparql.SPARQLRepository;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URISyntaxException;
import java.util.*;
/**
* Finds concepts that aren't referred to by other vocabularies on the Web (
* <a href="https://github.com/cmader/qSKOS/wiki/Quality-Issues#wiki-Missing_InLinks">Missing In-Links</a>
* ).
*/
public class MissingInLinks extends Issue<ExtrapolatedCollectionResult<Resource>> {

    private static final Logger logger = LoggerFactory.getLogger(MissingInLinks.class);

    /** Source of the concepts that are checked for in-links. */
    private final AuthoritativeConcepts authoritativeConcepts;

    /** Connections that are queried for resources referencing a concept. */
    private final Collection<RepositoryConnection> connections = new ArrayList<>();

    /** Per checked concept: referencing resources found so far whose host differs from the concept's host. */
    private final Map<Resource, Set<URI>> conceptReferencingResources = new HashMap<>();

    /** Pause before every query, in milliseconds; values &lt;= 0 disable the pause. */
    private int queryDelayMillis = 0;

    /** Size of the random concept subset to check; {@code null} means all concepts are checked. */
    private Float randomSubsetSize_percent;

    public MissingInLinks(AuthoritativeConcepts authoritativeConcepts) {
        super(authoritativeConcepts,
            "mil",
            "Missing In-Links",
            "Uses the sindice index to find concepts that aren't referenced by other datasets on the Web",
            IssueType.ANALYTICAL,
            new URIImpl("https://github.com/cmader/qSKOS/wiki/Quality-Issues#missing-in-links"));
        this.authoritativeConcepts = authoritativeConcepts;
    }

    /**
     * Queries every registered connection for each concept to check and collects the
     * concepts for which no referencing resource on a different host was found.
     *
     * @return the unreferenced concepts together with the configured subset size
     * @throws OpenRDFException if the concepts to check cannot be retrieved
     */
    @Override
    protected ExtrapolatedCollectionResult<Resource> invoke() throws OpenRDFException {
        // start from a clean slate so that a repeated invocation does not report stale entries
        conceptReferencingResources.clear();

        if (connections.isEmpty()) {
            logger.warn("no repository for querying defined");
        }

        Collection<Resource> conceptsToCheck = getConceptsToCheck(randomSubsetSize_percent);
        if (randomSubsetSize_percent != null) {
            logger.info("Using subset of {} concepts for In-Link checking", conceptsToCheck.size());
        }

        Iterator<Resource> conceptIt = new MonitoredIterator<>(
            conceptsToCheck,
            progressMonitor,
            "finding In-Links");

        while (conceptIt.hasNext()) {
            if (Thread.currentThread().isInterrupted()) {
                logger.warn("In-Link checking interrupted, remaining concepts are not checked");
                break;
            }
            rankConcept(conceptIt.next());
        }

        return new ExtrapolatedCollectionResult<>(extractUnreferencedConcepts(), randomSubsetSize_percent);
    }

    /**
     * Returns all authoritative concepts, or a random subset of them if a subset size is given.
     *
     * @param randomSubsetSize_percent subset size, or {@code null} to use all concepts
     */
    private Collection<Resource> getConceptsToCheck(Float randomSubsetSize_percent) throws OpenRDFException
    {
        Collection<Resource> allConcepts = authoritativeConcepts.getResult().getData();
        if (randomSubsetSize_percent == null) {
            return allConcepts;
        }
        return new RandomSubSet<>(allConcepts, randomSubsetSize_percent);
    }

    /** Looks up referencing resources for the given concept on every registered connection. */
    private void rankConcept(Resource concept)
    {
        for (RepositoryConnection connection : connections) {
            rankConceptForConnection(concept, connection);
        }
    }

    /**
     * Queries a single connection for resources that reference the concept and records them.
     * Query failures are logged and otherwise ignored so that one failing endpoint or
     * concept does not abort the whole check.
     */
    private void rankConceptForConnection(Resource concept, RepositoryConnection connection)
    {
        // NOTE(review): the concept IRI is concatenated into the query without escaping
        String query = "SELECT distinct ?resource WHERE " +
            "{?resource ?p <"+concept.toString()+"> " +
            "FILTER isIRI(?resource) "+
            "FILTER(regex(str(?resource), \"^http.*\"))}";

        if (!delayQuery()) {
            // interrupted while waiting: do not send the query
            return;
        }

        try {
            TupleQueryResult result = connection.prepareTupleQuery(QueryLanguage.SPARQL, query).evaluate();
            try {
                addToConceptsRankMap(concept, result);
            }
            finally {
                // release the resources held by the query result
                result.close();
            }
        }
        catch (Exception e) {
            // deliberately broad: any failure of this query must not stop the remaining checks
            logger.error("Error evaluating query '" + query + "'", e);
        }
    }

    /**
     * Waits for the configured delay to avoid flooding the SPARQL endpoint.
     *
     * @return {@code false} if the thread was interrupted while waiting (the interrupt
     *         flag is restored in that case), {@code true} otherwise
     */
    private boolean delayQuery() {
        if (queryDelayMillis <= 0) {
            return true;
        }

        try {
            Thread.sleep(queryDelayMillis);
            return true;
        }
        catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /**
     * Adds the referencing resources of the query result that are located on a host other
     * than the concept's host to the set recorded for the concept. The concept gets an
     * entry even if no such resource was found.
     */
    private void addToConceptsRankMap(Resource concept, TupleQueryResult result)
        throws QueryEvaluationException
    {
        Set<URI> referencingResourcesOnOtherHost =
            getReferencingResourcesOnOtherHost(concept, result);

        Set<URI> allReferencingResources = conceptReferencingResources.get(concept);
        if (allReferencingResources == null) {
            allReferencingResources = new HashSet<>();
            conceptReferencingResources.put(concept, allReferencingResources);
        }
        allReferencingResources.addAll(referencingResourcesOnOtherHost);
    }

    /**
     * Collects the values bound to {@code resource} in the query result that are URIs on
     * a host different from the concept's host. Nothing is collected if the concept itself
     * is not a URI.
     */
    private Set<URI> getReferencingResourcesOnOtherHost(
        Value concept,
        TupleQueryResult result) throws QueryEvaluationException
    {
        Set<URI> referencingResourcesOnOtherHost = new HashSet<>();

        while (result.hasNext()) {
            Value referencingResource = result.next().getValue("resource");

            try {
                if (referencingResource instanceof URI &&
                    concept instanceof URI &&
                    isDistinctHost((URI) concept, (URI) referencingResource))
                {
                    referencingResourcesOnOtherHost.add((URI) referencingResource);
                }
            }
            catch (URISyntaxException e) {
                // a URI that cannot be parsed cannot be compared by host => don't add to list
            }
        }

        return referencingResourcesOnOtherHost;
    }

    /**
     * Compares the host parts of two URIs, ignoring case.
     *
     * @return {@code true} if the hosts differ; if {@code resource} has no host, the
     *         result is {@code true} exactly when {@code otherResource} has one
     * @throws URISyntaxException if one of the URIs cannot be parsed
     */
    private boolean isDistinctHost(URI resource, URI otherResource)
        throws URISyntaxException
    {
        String host = new java.net.URI(resource.toString()).getHost();
        String otherHost = new java.net.URI(otherResource.toString()).getHost();

        // getHost() returns null if the host is undefined (e.g. for URNs)
        if (host == null) {
            return otherHost != null;
        }
        return !host.equalsIgnoreCase(otherHost);
    }

    /** Returns the checked concepts for which no referencing resource has been recorded. */
    private Collection<Resource> extractUnreferencedConcepts() {
        Collection<Resource> unrefConcepts = new HashSet<>();

        for (Map.Entry<Resource, Set<URI>> entry : conceptReferencingResources.entrySet()) {
            if (entry.getValue().isEmpty()) {
                unrefConcepts.add(entry.getKey());
            }
        }

        return unrefConcepts;
    }

    /**
     * Sets the pause that is inserted before every query.
     *
     * @param delayMillis delay in milliseconds; values &lt;= 0 disable the delay
     */
    public void setQueryDelayMillis(int delayMillis) {
        queryDelayMillis = delayMillis;
    }

    /**
     * Restricts the check to a random subset of the concepts.
     *
     * @param subsetSizePercent size of the subset, or {@code null} to check all concepts
     */
    public void setSubsetSize(Float subsetSizePercent) {
        randomSubsetSize_percent = subsetSizePercent;
    }

    /**
     * Adds the repository containing the vocabulary that is being tested to the list of
     * other repositories. This is only useful for in-link testing purposes.
     */
    public void addRepositoryLoopback() throws OpenRDFException {
        connections.add(repCon);
    }

    /**
     * Adds a SPARQL endpoint for estimation of in-links.
     *
     * @param endpointUrl SPARQL endpoint URL
     */
    public void addSparqlEndPoint(String endpointUrl) throws OpenRDFException {
        Repository repo = new SPARQLRepository(endpointUrl);
        repo.initialize();
        connections.add(repo.getConnection());
    }

}