/*
 * Copyright 2013 Fusepool Project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package eu.fusepool.datalifecycle.core;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.security.AccessController;
import java.security.AllPermission;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Dictionary;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.locks.Lock;

import javax.ws.rs.DefaultValue;
import javax.ws.rs.FormParam;
import javax.ws.rs.GET;
import javax.ws.rs.HeaderParam;
import javax.ws.rs.POST;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
import javax.ws.rs.QueryParam;
import javax.ws.rs.WebApplicationException;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.Response;
import javax.ws.rs.core.UriInfo;

import org.apache.clerezza.jaxrs.utils.RedirectUtil;
import org.apache.clerezza.jaxrs.utils.TrailingSlash;
import org.apache.clerezza.rdf.core.BNode;
import org.apache.clerezza.rdf.core.Literal;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.NonLiteral;
import org.apache.clerezza.rdf.core.Resource;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.TripleCollection;
import org.apache.clerezza.rdf.core.TypedLiteral;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.access.LockableMGraph;
import org.apache.clerezza.rdf.core.access.LockableMGraphWrapper;
import org.apache.clerezza.rdf.core.access.TcManager;
import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.SimpleMGraph;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.clerezza.rdf.core.serializedform.Parser;
import org.apache.clerezza.rdf.ontologies.DC;
import org.apache.clerezza.rdf.ontologies.RDF;
import org.apache.clerezza.rdf.ontologies.RDFS;
import org.apache.clerezza.rdf.ontologies.SIOC;
import org.apache.clerezza.rdf.utils.GraphNode;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.ConfigurationPolicy;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.ReferenceCardinality;
import org.apache.felix.scr.annotations.ReferencePolicy;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.commons.indexedgraph.IndexedMGraph;
import org.apache.stanbol.commons.web.viewable.RdfViewable;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
import org.apache.stanbol.enhancer.servicesapi.ContentSource;
import org.apache.stanbol.enhancer.servicesapi.EnhancementException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager;
import org.apache.stanbol.enhancer.servicesapi.impl.ByteArraySource;
import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
import org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory;
import org.apache.stanbol.entityhub.servicesapi.model.Entity;
import org.apache.stanbol.entityhub.servicesapi.model.Representation;
import org.apache.stanbol.entityhub.servicesapi.site.SiteManager;
import org.osgi.framework.BundleContext;
import org.osgi.framework.Constants;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.fusepool.datalifecycle.Interlinker;
import eu.fusepool.datalifecycle.RdfDigester;
import eu.fusepool.datalifecycle.Rdfizer;
import eu.fusepool.datalifecycle.ontologies.DLC;
import eu.fusepool.datalifecycle.utils.FileUtil;
import eu.fusepool.datalifecycle.utils.LinksRetriever;

/**
 * This is the controller class of the Fusepool data life cycle component. The
 * main functionalities provided are:
 * 1) XML2RDF transformation
 * 2) Indexing and information extraction
 * 3) Reconciliation/interlinking
 * 4) Smushing
 */
@Component(immediate = true, metatype = true, policy = ConfigurationPolicy.OPTIONAL)
@Properties(value = {
    @Property(name = "javax.ws.rs", boolValue = true),
    @Property(name = Constants.SERVICE_RANKING, intValue = SourcingAdmin.DEFAULT_SERVICE_RANKING)
})
@Service(Object.class)
@Path("sourcing")
public class SourcingAdmin {

    // Service property attributes
    public static final int DEFAULT_SERVICE_RANKING = 101;

    // Base URI property attributes. This property is used to canonicalize URIs of type urn:x-temp.
    // The value of the property is updated at service activation from the service configuration panel.
    public static final String BASE_URI_DESCRIPTION = "Base http URI to be used when publishing data ( e.g. http://mydomain.com )";
    public static final String BASE_URI_LABEL = "Base URI";
    public static final String DEFAULT_BASE_URI = "http://localhost:8080";
    @Property(label = BASE_URI_LABEL, value = DEFAULT_BASE_URI, description = BASE_URI_DESCRIPTION)
    public static final String BASE_URI = "baseUri";

    // base uri updated at service activation from the service property in the osgi console
    private UriRef baseUri;
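
    // Illustrative example (hypothetical URIs): with the base URI set to
    // http://mydomain.com, a temporary resource such as <urn:x-temp:/id/doc-1>
    // is expected to be published as <http://mydomain.com/id/doc-1>. The actual
    // rewriting is carried out by the smushing and publishing tasks, which
    // receive this base URI as a parameter.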

    // Confidence threshold for enhancements. This property is used to set the minimum value
    // of acceptance of computed enhancements
    public static final String CONFIDENCE_THRESHOLD_DESCRIPTION = "Minimum value for acceptance of computed enhancements";
    public static final String CONFIDENCE_THRESHOLD_LABEL = "Confidence threshold";
    public static final String DEFAULT_CONFIDENCE_VALUE = "0.5";
    @Property(label = CONFIDENCE_THRESHOLD_LABEL, value = DEFAULT_CONFIDENCE_VALUE, description = CONFIDENCE_THRESHOLD_DESCRIPTION)
    public static final String CONFIDENCE_THRESHOLD = "confidenceThreshold";

    // confidence threshold value updated at service activation from the service property in the osgi console
    private double confidenceThreshold = 0.5;

    /**
     * Using slf4j for normal logging
     */
    private static final Logger log = LoggerFactory.getLogger(SourcingAdmin.class);

    BundleContext bundleCtx = null;

    @Reference
    private Parser parser;

    @Reference
    private ContentItemFactory contentItemFactory;

    @Reference
    private EnhancementJobManager enhancementJobManager;

    @Reference
    private DataSetFactory dataSetFactory;

    @Reference
    private DlcGraphProvider dlcGraphProvider;

    /**
     * This service allows getting entities from configured sites
     */
    @Reference
    private SiteManager siteManager;

    /**
     * This service allows accessing and creating persistent triple collections
     */
    @Reference
    private TcManager tcManager;

    // Stores bindings to different implementations of RdfDigester
    @Reference(cardinality = ReferenceCardinality.OPTIONAL_MULTIPLE,
            policy = ReferencePolicy.DYNAMIC,
            referenceInterface = eu.fusepool.datalifecycle.RdfDigester.class)
    private final Map<String, RdfDigester> digesters = new HashMap<String, RdfDigester>();

    // Stores bindings to different implementations of Rdfizer
    @Reference(cardinality = ReferenceCardinality.OPTIONAL_MULTIPLE,
            policy = ReferencePolicy.DYNAMIC,
            referenceInterface = eu.fusepool.datalifecycle.Rdfizer.class)
    private final Map<String, Rdfizer> rdfizers = new HashMap<String, Rdfizer>();

    // Stores bindings to different instances of Interlinker
    @Reference(cardinality = ReferenceCardinality.OPTIONAL_MULTIPLE,
            policy = ReferencePolicy.DYNAMIC,
            referenceInterface = eu.fusepool.datalifecycle.Interlinker.class)
    private final Map<String, Interlinker> interlinkers = new HashMap<String, Interlinker>();

    /**
     * This is the name of the graph in which we "log" the requests
     */
    //private UriRef REQUEST_LOG_GRAPH_NAME = new UriRef("http://example.org/resource-resolver-log.graph");

    /**
     * Register graph referencing graphs for life cycle monitoring;
     */
    public static final String CONTENT_GRAPH_NAME = "urn:x-localinstance:/content.graph";

    private UriRef CONTENT_GRAPH_REF = new UriRef(CONTENT_GRAPH_NAME);

    // data upload codes
    private final int UPLOAD_XML = 1;
    private final int UPLOAD_RDF = 2;

    // task codes
    private final int TEXT_EXTRACTION = 1;
    private final int COMPUTE_ENHANCEMENTS = 2;
    private final int RECONCILE_GRAPH_OPERATION = 3;
    private final int SMUSH_GRAPH_OPERATION = 4;
    private final int PUBLISH_DATA = 5;

    // message to show when base URI is invalid
    private final String INVALID_BASE_URI_ALERT = "A valid base URI has not been set. "
            + "It can be set in the framework configuration panel (eu.fusepool.datalifecycle.SourcingAdmin)";

    // Validity of base URI (enables interlinking, smushing and publishing tasks)
    private boolean isValidBaseUri = false;

    // all active and some other tasks
    final private Set<Task> tasks = Collections.synchronizedSet(new HashSet<Task>());

    @SuppressWarnings("unchecked")
    @Activate
    protected void activate(ComponentContext context) {
        log.info("The Sourcing Admin Service is being activated");
        // Get the value of the base uri from the service property set in the Felix console
        Dictionary<String, Object> dict = context.getProperties();
        Object baseUriObj = dict.get(BASE_URI);
        String baseUriString = baseUriObj.toString();
        if ((!"".equals(baseUriString)) && (baseUriString.startsWith("http://"))) {
            if (baseUriString.endsWith("/")) {
                baseUriString = baseUriString.substring(0, baseUriString.length() - 1);
            }
            isValidBaseUri = true;
            log.info("Base URI: {}", baseUriString);
        } else {
            isValidBaseUri = false;
        }
        baseUri = new UriRef(baseUriString);
        // Get the value of the confidence threshold from the service property set in the Felix console
        Object confidenceObj = dict.get(CONFIDENCE_THRESHOLD);
        if (confidenceObj != null) {
            confidenceThreshold = Double.valueOf(confidenceObj.toString());
        }
    }

    @Deactivate
    protected void deactivate(ComponentContext context) {
        log.info("The Sourcing Admin Service is being deactivated");
    }

    /**
     * Binds a digester used by this component, adding it to a hash map.
     *
     * @param digester
     */
    protected void bindDigesters(RdfDigester digester) {
        log.info("Binding digester " + digester.getName());
        if (!digesters.containsKey(digester.getName())) {
            digesters.put(digester.getName(), digester);
            log.info("Digester " + digester.getName() + " bound");
        } else {
            log.info("Digester " + digester.getName() + " already bound.");
        }
    }

    /**
     * Unbinds a digester used by this component, removing it from the hash map.
     *
     * @param digester
     */
    protected void unbindDigesters(RdfDigester digester) {
        if (digesters.containsKey(digester.getName())) {
            digesters.remove(digester.getName());
            log.info("Digester " + digester.getName() + " unbound.");
        }
    }

    /**
     * Binds an interlinker used by this component
     */
    protected void bindInterlinkers(Interlinker interlinker) {
        log.info("Binding interlinker " + interlinker.getName());
        if (!interlinkers.containsKey(interlinker.getName())) {
            interlinkers.put(interlinker.getName(), interlinker);
            log.info("Interlinker " + interlinker.getName() + " bound");
        } else {
            log.info("Interlinker " + interlinker.getName() + " already bound.");
        }
    }

    /**
     * Unbinds an interlinker
     */
    protected void unbindInterlinkers(Interlinker interlinker) {
        if (interlinkers.containsKey(interlinker.getName())) {
            interlinkers.remove(interlinker.getName());
            log.info("Interlinker " + interlinker.getName() + " unbound.");
        }
    }

    /**
     * Binds an rdfizer used by this component
     */
    protected void bindRdfizers(Rdfizer rdfizer) {
        log.info("Binding rdfizer " + rdfizer.getName());
        if (!rdfizers.containsKey(rdfizer.getName())) {
            rdfizers.put(rdfizer.getName(), rdfizer);
            log.info("Rdfizer " + rdfizer.getName() + " bound");
        } else {
            log.info("Rdfizer " + rdfizer.getName() + " already bound.");
        }
    }

    /**
     * Unbinds an rdfizer
     */
    protected void unbindRdfizers(Rdfizer rdfizer) {
        if (rdfizers.containsKey(rdfizer.getName())) {
            rdfizers.remove(rdfizer.getName());
            log.info("Rdfizer " + rdfizer.getName() + " unbound.");
        }
    }
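
    /*
     * Example configuration (hypothetical values) as set for this component in
     * the OSGi/Felix configuration panel:
     *
     *   baseUri             = http://data.example.org
     *   confidenceThreshold = 0.7
     *
     * A trailing slash in the base URI is stripped at activation; a value that
     * does not start with "http://" leaves isValidBaseUri false, which disables
     * the interlinking, smushing and publishing tasks.
     */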

    /**
     * This method returns an RdfViewable, this is an RDF serviceUri with
     * associated presentational information.
     */
    @GET
    public RdfViewable serviceEntry(@Context final UriInfo uriInfo,
            @QueryParam("url") final UriRef url,
            @HeaderParam("user-agent") String userAgent) {
        // this makes sure this service is not invoked with a trailing slash which would affect
        // relative resolution of links (e.g. css)
        TrailingSlash.enforcePresent(uriInfo);

        final String resourcePath = uriInfo.getAbsolutePath().toString();
        if (url != null) {
            String query = url.toString();
            log.info(query);
        }

        // The URI at which this service was accessed, this will be the
        // central serviceUri in the response
        final UriRef serviceUri = new UriRef(resourcePath);

        // the in memory graph to which the triples for the response are added
        final MGraph responseGraph = new IndexedMGraph();
        {
            final LockableMGraph dlcGraph = dlcGraphProvider.getDlcGraph();
            Lock rl = dlcGraph.getLock().readLock();
            rl.lock();
            try {
                responseGraph.addAll(dlcGraph);
            } finally {
                rl.unlock();
            }
        }

        // add available digesters
        Iterator<String> digestersNames = digesters.keySet().iterator();
        while (digestersNames.hasNext()) {
            String digesterName = digestersNames.next();
            responseGraph.add(new TripleImpl(DlcGraphProvider.DATA_LIFECYCLE_GRAPH_REFERENCE, DLC.enhanceService, new UriRef("urn:x-temp:/" + digesterName)));
            responseGraph.add(new TripleImpl(new UriRef("urn:x-temp:/" + digesterName), RDFS.label, new PlainLiteralImpl(digesterName)));
        }

        // add available rdfizers
        Iterator<String> rdfizersNames = rdfizers.keySet().iterator();
        while (rdfizersNames.hasNext()) {
            String rdfizerName = rdfizersNames.next();
            responseGraph.add(new TripleImpl(DlcGraphProvider.DATA_LIFECYCLE_GRAPH_REFERENCE, DLC.rdfizeService, new UriRef("urn:x-temp:/" + rdfizerName)));
            responseGraph.add(new TripleImpl(new UriRef("urn:x-temp:/" + rdfizerName), RDFS.label, new PlainLiteralImpl(rdfizerName)));
        }

        // add available interlinkers
        Iterator<String> interlinkersNames = interlinkers.keySet().iterator();
        while (interlinkersNames.hasNext()) {
            String interlinkerName = interlinkersNames.next();
            NonLiteral interlinkerNode = new BNode();
            responseGraph.add(new TripleImpl(DlcGraphProvider.DATA_LIFECYCLE_GRAPH_REFERENCE, DLC.interlinkService, interlinkerNode));
            responseGraph.add(new TripleImpl(interlinkerNode, RDFS.label, new PlainLiteralImpl(interlinkerName)));
        }

        // This GraphNode represents the service within our result graph
        final GraphNode node = new GraphNode(DlcGraphProvider.DATA_LIFECYCLE_GRAPH_REFERENCE, responseGraph);

        // Adds information about base uri configuration
        if (!isValidBaseUri) {
            responseGraph.add(new TripleImpl(DlcGraphProvider.DATA_LIFECYCLE_GRAPH_REFERENCE, RDFS.comment, new PlainLiteralImpl(INVALID_BASE_URI_ALERT)));
        }

        // The DLC service uri (set in component config panel) should be the same as the base uri
        // (otherwise there might be a base uri config error)
        String platformPort = (uriInfo.getBaseUri().getPort() > 0)
                ? ":" + String.valueOf(uriInfo.getBaseUri().getPort())
                : "";
        String platformBaseUri = uriInfo.getBaseUri().getScheme() + "://" + uriInfo.getBaseUri().getHost() + platformPort;
        if (!platformBaseUri.equals(baseUri.getUnicodeString())) {
            String message = "The DLC service URI " + platformBaseUri + " is different from the base URI "
                    + baseUri + " set in the component configuration.";
            responseGraph.add(new TripleImpl(DlcGraphProvider.DATA_LIFECYCLE_GRAPH_REFERENCE, RDFS.comment, new PlainLiteralImpl(message)));
        }

        for (Task task : tasks) {
            if (task.isActive()) {
                node.addProperty(DLC.activeTask, task.getUri());
                responseGraph.addAll(task.getNode().getGraph());
            }
        }

        // What we return is the GraphNode we created with a template path
        return new RdfViewable("SourcingAdmin", node, SourcingAdmin.class);
    }

    private LockableMGraph getContentGraph() {
        return tcManager.getMGraph(CONTENT_GRAPH_REF);
    }

    /**
     * Creates a new dataset with task and product graphs and adds its URI and
     * a label to the data life cycle graph. A graph will contain the RDF data
     * uploaded or sent by a transformation task that has to be processed (text
     * extraction, NLP processing, reconciliation, smushing). The following
     * graphs are created to store the results of the processing tasks:
     * enhance.graph, interlink.graph, smush.graph, publish.graph. These graphs
     * will be empty at the beginning.
     *
     * @param uriInfo
     * @param graphName
     * @return
     */
    @POST
    @Path("create_pipe")
    @Produces("text/plain")
    public Response createPipeRequest(@Context final UriInfo uriInfo,
            @FormParam("pipe_label") final String pipeLabel) {
        AccessController.checkPermission(new AllPermission());
        // use dataset label as name after validation
        String datasetName = getValidDatasetName(pipeLabel);
        dataSetFactory.createDataSet(datasetName);
        return RedirectUtil.createSeeOtherResponse("./", uriInfo);
    }

    /**
     * Checks whether a label can be used as a dataset name. To be a valid name a
     * label must be: 1) not null and at least one character long, 2) without
     * white spaces, 3) unique (no two datasets can have the same name).
     *
     * @return String
     */
    private String getValidDatasetName(String label) {
        String newDatasetName = null;
        // check validity
        if (label == null || "".equals(label)) {
            return null;
        }
        // replace white space if present
        newDatasetName = label.replace(' ', '-');
        // check uniqueness of name
        Lock rl = dlcGraphProvider.getDlcGraph().getLock().readLock();
        rl.lock();
        try {
            Iterator<Triple> idatasets = dlcGraphProvider.getDlcGraph().filter(null, RDF.type, DLC.Pipe);
            while (idatasets.hasNext()) {
                GraphNode datasetNode = new GraphNode((UriRef) idatasets.next().getSubject(), dlcGraphProvider.getDlcGraph());
                String datasetName = datasetNode.getLiterals(RDFS.label).next().getLexicalForm();
                if (newDatasetName.equals(datasetName)) {
                    return null;
                }
            }
        } finally {
            rl.unlock();
        }
        return newDatasetName;
    }
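
    /*
     * Example request for the create_pipe endpoint above (illustrative only;
     * host, port and credentials depend on the platform setup, and the caller
     * must hold AllPermission):
     *
     *   POST /sourcing/create_pipe
     *   Content-Type: application/x-www-form-urlencoded
     *
     *   pipe_label=patent-sample
     *
     * A label with spaces, e.g. "patent sample", is turned into "patent-sample"
     * by getValidDatasetName; on success the service answers with a 303 redirect
     * back to the sourcing page.
     */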

    /**
     * Uploads data into the source graph of a dataset. The operation codes are:
     * upload XML (operation code: 1), upload RDF (operation code: 2).
     */
    @POST
    @Path("dataUpload")
    @Produces("text/plain")
    public Response dataUpload(@Context final UriInfo uriInfo,
            @FormParam("pipe") final UriRef pipeRef,
            @FormParam("operation_code") final int operationCode,
            @FormParam("data_url") final URL dataUrl,
            @FormParam("rdfizer") final String rdfizer) throws IOException {
        AccessController.checkPermission(new AllPermission());
        // validate arguments and handle all the connection exceptions
        StringWriter stringWriter = new StringWriter();
        PrintWriter messageWriter = new PrintWriter(stringWriter);
        if (pipeExists(pipeRef)) {
            DataSet dataSet = dataSetFactory.getDataSet(pipeRef);
            switch (operationCode) {
                case UPLOAD_RDF:
                    uploadRdf(dataSet, dataUrl, messageWriter);
                    break;
                case UPLOAD_XML:
                    uploadXml(dataSet, dataUrl, rdfizer, messageWriter);
                    break;
            }
        } else {
            messageWriter.println("The dataset does not exist.");
        }
        //return stringWriter.toString();
        return RedirectUtil.createSeeOtherResponse("./", uriInfo);
    }

    /**
     * Uploads RDF files. Each file name must end with .rdf, .ttl, .nt or .n3. A
     * URL that does not end with one of the mentioned extensions, or that ends
     * with a slash, is assumed to refer to a folder in a local file system or on
     * a remote (HTTP) server.
     *
     * @param dataSet
     * @param dataUrl
     * @param messageWriter
     * @throws IOException
     */
    private void uploadRdf(DataSet dataSet, URL dataUrl, PrintWriter messageWriter) throws IOException {
        String[] fileNameExtensions = {".rdf", ".ttl", ".nt", ".n3"};
        // retrieves the list of files to be uploaded
        ArrayList<String> fileList = FileUtil.getFileList(dataUrl, fileNameExtensions);
        Iterator<String> ifile = fileList.iterator();
        while (ifile.hasNext()) {
            URL fileUrl = new URL(ifile.next());
            URLConnection connection = fileUrl.openConnection();
            String mediaType = guessContentTypeFromUri(fileUrl);
            InputStream stream = connection.getInputStream();
            if (stream != null) {
                parser.parse(dataSet.getSourceGraph(), stream, mediaType);
            }
        }
    }

    /**
     * Uploads XML files. Each file name must end with .xml or .nxml. A URL that
     * does not end with one of the mentioned extensions, or that ends with a
     * slash, is assumed to refer to a folder in a local file system or on a
     * remote (HTTP) server.
     *
     * @param dataSet
     * @param dataUrl
     * @param rdfizerName
     * @param messageWriter
     */
    private void uploadXml(DataSet dataSet, URL dataUrl, String rdfizerName, PrintWriter messageWriter) throws IOException {
        Rdfizer rdfizer = rdfizers.get(rdfizerName);
        String[] fileNameExtensions = {".xml", ".nxml"};
        // retrieves the list of files to be uploaded
        ArrayList<String> fileList = FileUtil.getFileList(dataUrl, fileNameExtensions);
        Iterator<String> ifile = fileList.iterator();
        while (ifile.hasNext()) {
            URL fileUrl = new URL(ifile.next());
            URLConnection connection = fileUrl.openConnection();
            InputStream stream = connection.getInputStream();
            if (stream != null) {
                dataSet.getSourceGraph().addAll(rdfizer.transform(stream));
            }
        }
    }
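
    /*
     * Example request for the dataUpload endpoint above (illustrative values;
     * operation_code 1 = upload XML, 2 = upload RDF; the rdfizer field is only
     * used for XML uploads):
     *
     *   POST /sourcing/dataUpload
     *   Content-Type: application/x-www-form-urlencoded
     *
     *   pipe=<dataset-uri>&operation_code=2&data_url=http://example.org/data/sample.ttl&rdfizer=none
     *
     * The dataset URI can be read from the sourcing page after the dataset has
     * been created; the data URL is hypothetical.
     */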

    /**
     * Performs one of the following tasks on the graphs of a dataset: text
     * extraction (task code: 1), enhancement computation (task code: 2),
     * reconciliation/interlinking (task code: 3), smushing (task code: 4),
     * publishing (task code: 5).
     */
    @POST
    @Path("performTask")
    @Produces("text/plain")
    public Response performTaskRequest(@Context final UriInfo uriInfo,
            @FormParam("pipe") final UriRef pipeRef,
            @FormParam("task_code") final int taskCode,
            @FormParam("rdfdigester") final String rdfdigester,
            @FormParam("interlinker") final String interlinker) throws IOException {
        AccessController.checkPermission(new AllPermission());
        // validate arguments and handle all the connection exceptions
        StringWriter stringWriter = new StringWriter();
        PrintWriter messageWriter = new PrintWriter(stringWriter);
        performTask(pipeRef, taskCode, rdfdigester, interlinker, messageWriter);
        //return stringWriter.toString();
        return RedirectUtil.createSeeOtherResponse("./", uriInfo);
    }

    @POST
    @Path("processBatch")
    public Response processBatch(@Context final UriInfo uriInfo,
            @FormParam("dataSet") final UriRef dataSetRef,
            @FormParam("url") final URL url,
            @FormParam("rdfizer") final String rdfizerName,
            @FormParam("digester") final String digester,
            @FormParam("interlinker") final String interlinker,
            @FormParam("maxFiles") @DefaultValue("10") final int maxFiles,
            @FormParam("skipPreviouslyAdded") final String skipPreviouslyAddedValue,
            @FormParam("recurse") final String recurseValue,
            @FormParam("smushAndPublish") final String smushAndPublishValue) throws Exception {
        final boolean skipPreviouslyAdded = "on".equals(skipPreviouslyAddedValue);
        final boolean recurse = "on".equals(recurseValue);
        final boolean smushAndPublish = "on".equals(smushAndPublishValue);
        if (dataSetRef == null) {
            throw new WebApplicationException("Param dataSet must be specified", Response.Status.BAD_REQUEST);
        }
        AccessController.checkPermission(new DlcPermission());
        final DataSet dataSet = dataSetFactory.getDataSet(dataSetRef);
        final Rdfizer rdfizer = rdfizerName.equals("none") ? null : rdfizers.get(rdfizerName);
        Task task = new Task(uriInfo) {

            @Override
            public void execute() {
                try {
                    final int[] count = {0};
                    LinksRetriever.processLinks(url, recurse, new LinksRetriever.LinkProcessor() {

                        public boolean process(URL dataUrl) {
                            if (skipPreviouslyAdded) {
                                Lock lock = dataSet.getLogGraph().getLock().readLock();
                                lock.lock();
                                try {
                                    if (dataSet.getLogGraph().filter(null, DLC.retrievedURI, new UriRef(dataUrl.toString())).hasNext()) {
                                        return true;
                                    }
                                } finally {
                                    lock.unlock();
                                }
                            }
                            if (isTerminationRequested()) {
                                return false;
                            }
                            if (++count[0] > maxFiles) {
                                return false;
                            }
                            try {
                                rdfUploadPublish(dataSet, dataUrl, rdfizer, digester, interlinker, smushAndPublish, log);
                            } catch (Exception e) {
                                log.println("Exception processing " + dataUrl);
                                e.printStackTrace(log);
                            }
                            return true;
                        }
                    });
                } catch (Exception ex) {
                    ex.printStackTrace(log);
                }
            }
        };
        tasks.add(task);
        task.start();
        return Response.seeOther(new URI(task.getUri().getUnicodeString())).build();
    }

    @POST
    @Path("reprocess")
    public Response reprocess(@Context final UriInfo uriInfo,
            @FormParam("dataSet") final UriRef dataSetRef,
            @FormParam("interlinker") final String interlinkerName) throws Exception {
        if (dataSetRef == null) {
            throw new WebApplicationException("Param dataSet must be specified", Response.Status.BAD_REQUEST);
        }
        AccessController.checkPermission(new DlcPermission());
        final DataSet dataSet = dataSetFactory.getDataSet(dataSetRef);
        final Interlinker interlinker = interlinkerName.equals("none") ? null : interlinkers.get(interlinkerName);
        Task task = new Task(uriInfo) {

            @Override
            public void execute() {
                try {
                    if (interlinker != null) {
                        log.println("Interlinking with: " + interlinker);
                        final TripleCollection dataSetInterlinks = interlinker.interlink(dataSet.getDigestGraph(), dataSet.getDigestGraph());
                        dataSet.getInterlinksGraph().addAll(dataSetInterlinks);
                        log.println("Added " + dataSetInterlinks.size() + " data-set interlinks to "
                                + dataSet.getInterlinksGraphRef().getUnicodeString());
                    } else {
                        log.println("No interlinker selected, proceeding.");
                    }
                    // Smush
                    SmushingJob.perform(dataSet, log, baseUri);
                    // Publish
                    publishData(dataSet, log);
                } catch (Exception ex) {
                    ex.printStackTrace(log);
                }
            }
        };
        tasks.add(task);
        task.start();
        return Response.seeOther(new URI(task.getUri().getUnicodeString())).build();
    }

    @GET
    @Path("task/{id}")
    public RdfViewable describeTask(@Context final UriInfo uriInfo) {
        final String resourcePath = uriInfo.getAbsolutePath().toString();
        final UriRef taskUri = new UriRef(resourcePath);
        for (Task task : tasks) {
            if (task.getUri().equals(taskUri)) {
                return new RdfViewable("task", task.getNode(), SourcingAdmin.class);
            }
        }
        throw new WebApplicationException(Response.Status.NOT_FOUND);
    }

    @POST
    @Path("task/{id}")
    public Response actOnTask(@Context final UriInfo uriInfo,
            @FormParam("action") String action) throws URISyntaxException {
        final String resourcePath = uriInfo.getAbsolutePath().toString();
        final UriRef taskUri = new UriRef(resourcePath);
        for (Task task : tasks) {
            if (task.getUri().equals(taskUri)) {
                if ("TERMINATE".equalsIgnoreCase(action)) {
                    task.requestTermination();
                    return Response.seeOther(new URI(task.getUri().getUnicodeString())).build();
                }
                throw new WebApplicationException(Response.Status.BAD_REQUEST);
            }
        }
        throw new WebApplicationException(Response.Status.NOT_FOUND);
    }
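
    /*
     * Example requests for the endpoints above (illustrative values only; the
     * dataset URI and the rdfizer, digester and interlinker names are
     * hypothetical and must match services actually bound to this component):
     *
     *   POST /sourcing/performTask
     *   pipe=<dataset-uri>&task_code=3&rdfdigester=patent&interlinker=silk
     *   (task_code: 1 = text extraction, 2 = enhancements, 3 = reconciliation,
     *    4 = smushing, 5 = publishing)
     *
     *   POST /sourcing/processBatch
     *   dataSet=<dataset-uri>&url=http://example.org/dumps/&rdfizer=none&digester=patent&interlinker=none&maxFiles=5&recurse=on&smushAndPublish=on
     *
     * processBatch answers with a 303 redirect to a task resource
     * (/sourcing/task/{id}); the task can be inspected with GET and stopped by
     * posting action=TERMINATE to the same URI.
     */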

    /**
     * Performs a task on a dataset: digest, interlink, smush, publish.
     *
     * @param pipeRef
     * @param taskCode
     * @param rdfdigester
     * @param interlinker
     * @param messageWriter
     * @throws IOException
     */
    private void performTask(UriRef pipeRef, int taskCode, String rdfdigester, String interlinker,
            PrintWriter messageWriter) throws IOException {
        AccessController.checkPermission(new AllPermission());
        if (pipeExists(pipeRef)) {
            DataSet dataSet = dataSetFactory.getDataSet(pipeRef);
            switch (taskCode) {
                case TEXT_EXTRACTION:
                    extractTextFromRdf(dataSet, rdfdigester, messageWriter);
                    break;
                case COMPUTE_ENHANCEMENTS:
                    computeEnhancements(dataSet, messageWriter);
                    break;
                case RECONCILE_GRAPH_OPERATION:
                    reconcile(dataSet, interlinker, messageWriter);
                    break;
                case SMUSH_GRAPH_OPERATION:
                    SmushingJob.perform(dataSet, messageWriter, baseUri);
                    break;
                case PUBLISH_DATA:
                    publishData(dataSet, messageWriter);
                    break;
            }
        } else {
            messageWriter.println("The pipe does not exist.");
        }
    }

    @POST
    @Path("runsequence")
    @Produces("text/plain")
    public String runSequence(@Context final UriInfo uriInfo,
            @FormParam("pipe") final UriRef pipeRef,
            @FormParam("sequence_code") final int sequenceCode,
            @FormParam("digester") final String digester,
            @FormParam("interlinker") final String interlinker) throws IOException {
        AccessController.checkPermission(new AllPermission());
        StringWriter stringWriter = new StringWriter();
        PrintWriter messageWriter = new PrintWriter(stringWriter);
        messageWriter.println("Pipe: " + pipeRef.getUnicodeString()
                + " Digester: " + digester
                + " Interlinker: " + interlinker);
        if (pipeExists(pipeRef)) {
            DataSet dataSet = dataSetFactory.getDataSet(pipeRef);
            performAllTasks(dataSet, digester, interlinker, messageWriter);
        } else {
            messageWriter.println("The dataset does not exist.");
        }
        return stringWriter.toString();
    }

    /**
     * Uploads and transforms Patent or PubMed XML data into RDF.
     *
     * @param dataUrl
     * @param rdfizer
     * @return
     */
    private MGraph transformXml(DataSet dataSet, URL dataUrl, Rdfizer rdfizer, PrintWriter messageWriter) throws IOException {
        AccessController.checkPermission(new AllPermission());
        // create a graph to store the data after the document transformation
        MGraph documentGraph = null;
        InputStream xmldata = null;
        if (isValidUrl(dataUrl)) {
            try {
                URLConnection connection = dataUrl.openConnection();
                connection.addRequestProperty("Accept", "application/xml; q=1");
                xmldata = connection.getInputStream();
            } catch (FileNotFoundException ex) {
                messageWriter.println("The file " + dataUrl.toString() + " has not been found.");
                throw ex;
            }
        } else {
            messageWriter.println("The URL " + dataUrl.toString() + " is not a valid one.\n");
        }
        int numberOfTriples = 0;
        if (xmldata != null) {
            documentGraph = rdfizer.transform(xmldata);
            numberOfTriples = documentGraph.size();
        }
        if (documentGraph != null && numberOfTriples > 0) {
            // add the triples of the document graph to the source graph of the selected dataset
            Lock wl = dataSet.getSourceGraph().getLock().writeLock();
            wl.lock();
            try {
                dataSet.getSourceGraph().addAll(documentGraph);
            } finally {
                wl.unlock();
            }
            messageWriter.println("Added " + numberOfTriples + " triples from " + dataUrl
                    + " to " + dataSet.getSourceGraphRef().getUnicodeString());
        }
        return documentGraph;
    }

    /**
     * Loads RDF data into an existing graph from a URL (schemes: "file://" or
     * "http://"). The arguments to be passed are: 1) the graph in which the RDF
     * data must be stored, 2) the URL of the dataset. After the upload the input
     * graph is sent to a digester to extract text for indexing, adding entities
     * found by NLP components (in the default chain) as subjects.
     *
     * @return the added triples
     */
    private MGraph addTriples(DataSet dataSet, URL dataUrl, PrintWriter messageWriter) throws IOException {
        AccessController.checkPermission(new AllPermission());
        // add the triples of the temporary graph into the graph selected by the user
        if (isValidUrl(dataUrl)) {
            MGraph updatedGraph = addTriplesCommand(dataSet.getSourceGraph(), dataUrl);
            messageWriter.println("Added " + updatedGraph.size() + " triples from " + dataUrl
                    + " to " + dataSet.getSourceGraphRef().getUnicodeString());
            return updatedGraph;
        } else {
            messageWriter.println("The URL of the data is not a valid one.");
            throw new RuntimeException("Invalid URL: " + dataUrl);
        }
    }

    /**
     * Adds triples to a graph
     */
    private MGraph addTriplesCommand(LockableMGraph targetGraph, URL dataUrl) throws IOException {
        AccessController.checkPermission(new AllPermission());
        URLConnection connection = dataUrl.openConnection();
        connection.addRequestProperty("Accept", "application/rdf+xml; q=.9, text/turtle;q=1");
        // create a temporary graph to store the data
        SimpleMGraph tempGraph = new SimpleMGraph();
        String mediaType = connection.getHeaderField("Content-type");
        if ((mediaType == null) || mediaType.equals("application/octet-stream")) {
            mediaType = guessContentTypeFromUri(dataUrl);
        }
        InputStream data = connection.getInputStream();
        if (data != null) {
            parser.parse(tempGraph, data, mediaType);
            targetGraph.addAll(tempGraph);
        }
        return tempGraph;
    }

    /**
     * Reconciles a source graph against itself and against the content graph.
     * The result of the reconciliation is an equivalence set stored in the
     * interlink graph of the pipe.
     *
     * @param sourceGraphRef the URI of the referenced graph, i.e. the graph for
     * which the reconciliation should be performed.
     * @return String
     */
    private void reconcile(DataSet dataSet, String selectedInterlinker, PrintWriter messageWriter) {
        if (dataSet.getSourceGraph().size() > 0) {
            // size of interlink graph before reconciliations
            int interlinkGraphInitSize = dataSet.getInterlinksGraph().size();
            // reconcile the source graph against itself
            reconcileCommand(dataSet, dataSet.getSourceGraphRef(), dataSet.getSourceGraphRef(), selectedInterlinker);
            // size of interlink graph after reconciliation of source graph against itself
            int interlinkSourceGraphSize = dataSet.getInterlinksGraph().size();
            // new interlinks within source graph
            int numSourceInterlinks = interlinkSourceGraphSize - interlinkGraphInitSize;
            if (numSourceInterlinks > 0) {
                messageWriter.println("A reconciliation task has been done on "
                        + dataSet.getSourceGraphRef().getUnicodeString() + "\n"
                        + numSourceInterlinks + " owl:sameAs statements have been created.");
            } else {
                messageWriter.println("A reconciliation task has been done on "
                        + dataSet.getSourceGraphRef().getUnicodeString()
                        + ". No equivalent entities have been found.");
            }
            // reconcile the source graph against the content graph
            if (getContentGraph().size() > 0) {
                reconcileCommand(dataSet, dataSet.getSourceGraphRef(), CONTENT_GRAPH_REF, selectedInterlinker);
                // size of interlink graph after reconciliation of source graph against content graph
                int interlinkContentGraphSize = dataSet.getInterlinksGraph().size();
                // new interlinks with content graph
                int numContentInterlinks = interlinkContentGraphSize - interlinkSourceGraphSize;
                if (numContentInterlinks > 0) {
                    messageWriter.println("A reconciliation task has been done between "
                            + dataSet.getSourceGraphRef().getUnicodeString() + " and " + CONTENT_GRAPH_NAME + "\n"
                            + numContentInterlinks + " owl:sameAs statements have been created.");
                } else {
                    messageWriter.println("A reconciliation task has been done between "
                            + dataSet.getSourceGraphRef().getUnicodeString() + " and " + CONTENT_GRAPH_NAME
                            + ". No equivalent entities have been found.");
                }
            }
        } else {
            messageWriter.println("The source graph does not exist or is empty.");
        }
    }

    /**
     * Reconciles a source graph with a target graph. The result of the
     * reconciliation is an equivalence set stored in the interlink graph of the
     * pipe. The graph used as source is the source rdf graph.
     */
    private void reconcileCommand(DataSet dataSet, UriRef sourceGraphRef, UriRef targetGraphRef, String selectedInterlinker) {
        if (graphExists(sourceGraphRef)) {
            // Get the source graph from the triple store
            LockableMGraph sourceGraph = dataSet.getSourceGraph();
            // reconcile the source graph with the target graph
            Interlinker interlinker = interlinkers.get(selectedInterlinker);
            TripleCollection owlSameAs = interlinker.interlink(sourceGraph, targetGraphRef);
            if (owlSameAs.size() > 0) {
                LockableMGraph sameAsGraph = dataSet.getInterlinksGraph();
                sameAsGraph.addAll(owlSameAs);
                // add a reference of the equivalence set to the source graph
                dlcGraphProvider.getDlcGraph().add(new TripleImpl(dataSet.getInterlinksGraphRef(), DLC.subjectsTarget, sourceGraphRef));
                // add a reference of the equivalence set to the target graph
                dlcGraphProvider.getDlcGraph().add(new TripleImpl(dataSet.getInterlinksGraphRef(), DLC.objectsTarget, targetGraphRef));
            }
        }
    }
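
    /*
     * Illustrative example (hypothetical URIs) of the outcome of reconcileCommand:
     * the interlinker returns owl:sameAs statements that are added to the
     * dataset's interlink graph, e.g.
     *
     *   <urn:x-temp:/res/doc-1> owl:sameAs <http://example.org/resource/doc-1> .
     *
     * and the DLC meta graph records which graphs were compared, via
     * dlc:subjectsTarget and dlc:objectsTarget statements on the interlink
     * graph resource.
     */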

    /**
     * Extracts text from dcterms:title and dcterms:abstract fields in the source
     * graph and adds a sioc:content property with that text in the enhance
     * graph. The text is used by the ECS for indexing. The keywords will be
     * related to the resource so that it can be retrieved.
     *
     * @return
     */
    private void extractTextFromRdf(DataSet dataSet, String selectedDigester, PrintWriter messageWriter) {
        RdfDigester digester = digesters.get(selectedDigester);
        MGraph tempGraph = new IndexedMGraph();
        LockableMGraph sourceGraph = dataSet.getSourceGraph();
        Lock rl = sourceGraph.getLock().readLock();
        rl.lock();
        try {
            tempGraph.addAll(sourceGraph);
        } finally {
            rl.unlock();
        }
        digester.extractText(tempGraph);
        tempGraph.removeAll(sourceGraph);
        dataSet.getDigestGraph().addAll(tempGraph);
        messageWriter.println("Extracted text from " + dataSet.getDigestGraphRef().getUnicodeString()
                + " by " + selectedDigester + " digester");
    }

    /**
     * Sends the digested content to the default chain to compute enhancements,
     * then stores them in the dataset's enhancements graph.
     *
     * @param dataSet
     * @param messageWriter
     */
    private void computeEnhancements(DataSet dataSet, PrintWriter messageWriter) {
        LockableMGraph digestGraph = dataSet.getDigestGraph();
        computeEnhancements(digestGraph, dataSet.getEnhancementsGraph(), messageWriter);
    }

    private void computeEnhancements(LockableMGraph sourceGraph, MGraph targetGraph, PrintWriter messageWriter) {
        Lock digestLock = sourceGraph.getLock().readLock();
        digestLock.lock();
        try {
            Iterator<Triple> isiocStmt = sourceGraph.filter(null, SIOC.content, null);
            while (isiocStmt.hasNext()) {
                Triple stmt = isiocStmt.next();
                UriRef itemRef = (UriRef) stmt.getSubject();
                String content = ((PlainLiteralImpl) stmt.getObject()).getLexicalForm();
                if (content != null && !"".equals(content)) {
                    try {
                        enhance(targetGraph, content, itemRef);
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    } catch (EnhancementException e) {
                        e.printStackTrace();
                    }
                }
            }
        } finally {
            digestLock.unlock();
        }
    }

    /**
     * Adds dc:subject properties to an item pointing to entities which are
     * assumed to be related to the content item. This method uses the
     * enhancementJobManager to extract related entities using NLP engines
     * available in the default chain. The node URI is also the URI of the
     * content item, so that the enhancements will refer to that node. Each
     * enhancement found with a confidence value above a threshold is then added
     * as a dc:subject to the node.
     */
    private void enhance(MGraph targetGraph, String content, UriRef itemRef) throws IOException, EnhancementException {
        final ContentSource contentSource = new ByteArraySource(
                content.getBytes(), "text/plain");
        final ContentItem contentItem = contentItemFactory.createContentItem(
                itemRef, contentSource);
        enhancementJobManager.enhanceContent(contentItem);
        // this contains the enhancement results
        final MGraph contentMetadata = contentItem.getMetadata();
        addSubjects(targetGraph, itemRef, contentMetadata);
    }

    /**
     * Adds dc:subject properties to an item pointing to entities extracted by NLP
     * engines in the default chain. Given a node and a TripleCollection
     * containing fise:Enhancements about that node, dc:subject properties are
     * added to the item pointing to entities referenced by those enhancements if
     * the enhancement confidence value is above a threshold.
     *
     * @param node
     * @param metadata
     */
    private void addSubjects(MGraph targetGraph, UriRef itemRef, TripleCollection metadata) {
        final GraphNode enhancementType = new GraphNode(TechnicalClasses.ENHANCER_ENHANCEMENT, metadata);
        final Set<UriRef> entities = new HashSet<UriRef>();
        // get all the enhancements
        final Iterator<GraphNode> enhancements = enhancementType.getSubjectNodes(RDF.type);
        while (enhancements.hasNext()) {
            final GraphNode enhancement = enhancements.next();
            final Iterator<Literal> confidenceLiterals = enhancement.getLiterals(org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE);
            // look up the confidence value of each enhancement
            double enhancementConfidence = confidenceLiterals.hasNext()
                    ? LiteralFactory.getInstance().createObject(Double.class, (TypedLiteral) confidenceLiterals.next())
                    : 1;
            if (enhancementConfidence >= confidenceThreshold) {
                // get entities referenced in the enhancement
                final Iterator<Resource> referencedEntities = enhancement.getObjects(org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE);
                while (referencedEntities.hasNext()) {
                    final UriRef entity = (UriRef) referencedEntities.next();
                    // Add dc:subject to the item for each referenced entity
                    targetGraph.add(new TripleImpl(itemRef, DC.subject, entity));
                    entities.add(entity);
                }
            }
        }
        for (UriRef uriRef : entities) {
            // We don't get the entity description directly from metadata
            // as the context there would include
            addResourceDescription(uriRef, targetGraph);
        }
    }

    /**
     * Adds a description of the entities extracted from the text by NLP engines
     * in the default chain
     */
    private void addResourceDescription(UriRef iri, MGraph mGraph) {
        final Entity entity = siteManager.getEntity(iri.getUnicodeString());
        if (entity != null) {
            final RdfValueFactory valueFactory = new RdfValueFactory(mGraph);
            final Representation representation = entity.getRepresentation();
            if (representation != null) {
                valueFactory.toRdfRepresentation(representation);
            }
        }
    }
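
    /*
     * Illustrative example (hypothetical URIs) of the triples produced for an
     * item once an enhancement passes the confidence threshold:
     *
     *   <urn:x-temp:/res/doc-1> dc:subject <http://dbpedia.org/resource/Geneva> .
     *
     * followed, via addResourceDescription, by whatever description of the
     * referenced entity the configured Entityhub sites return.
     */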

    /**
     * Moves data from smush.graph to content.graph. The triples (facts) in the
     * two graphs must be coherent, i.e. the same. Before publishing, the current
     * smushed data must be compared with the last published data. New triples
     * in the smushed graph not in the published graph must be added, while
     * triples in the published graph absent from the smushed graph must be
     * removed. The algorithm is as follows:
     * 1) make all URIs in smush.graph HTTP dereferenceable (URI canonicalization)
     * 2) find triples in smush.graph not in publish.graph (new triples)
     * 3) find triples in publish.graph not in smush.graph (old triples)
     * 4) add new triples to content.graph
     * 5) remove old triples from content.graph
     * 6) delete all triples in publish.graph
     * 7) copy triples from smush.graph to publish.graph
     */
    private void publishData(DataSet dataSet, PrintWriter messageWriter) {
        // add these triples to the content.graph
        MGraph triplesToAdd = new SimpleMGraph();
        // remove these triples from the content.graph
        MGraph triplesToRemove = new SimpleMGraph();

        // triples to add to the content.graph
        Lock ls = dataSet.getSmushGraph().getLock().readLock();
        ls.lock();
        try {
            Iterator<Triple> ismush = dataSet.getSmushGraph().iterator();
            while (ismush.hasNext()) {
                Triple smushTriple = ismush.next();
                if (!dataSet.getPublishGraph().contains(smushTriple)) {
                    triplesToAdd.add(smushTriple);
                }
            }
        } finally {
            ls.unlock();
        }

        // triples to remove from the content.graph
        Lock lp = dataSet.getPublishGraph().getLock().readLock();
        lp.lock();
        try {
            Iterator<Triple> ipublish = dataSet.getPublishGraph().iterator();
            while (ipublish.hasNext()) {
                Triple publishTriple = ipublish.next();
                if (!dataSet.getSmushGraph().contains(publishTriple)) {
                    triplesToRemove.add(publishTriple);
                }
            }
        } finally {
            lp.unlock();
        }

        if (triplesToRemove.size() > 0) {
            getContentGraph().removeAll(triplesToRemove);
            log.info("Removed " + triplesToRemove.size() + " triples from " + CONTENT_GRAPH_REF.getUnicodeString());
        } else {
            log.info("No triples to remove from " + CONTENT_GRAPH_REF.getUnicodeString());
        }
        if (triplesToAdd.size() > 0) {
            getContentGraph().addAll(triplesToAdd);
            log.info("Added " + triplesToAdd.size() + " triples to " + CONTENT_GRAPH_REF.getUnicodeString());
        } else {
            log.info("No triples to add to " + CONTENT_GRAPH_REF.getUnicodeString());
        }

        dataSet.getPublishGraph().clear();
        Lock rl = dataSet.getSmushGraph().getLock().readLock();
        rl.lock();
        try {
            dataSet.getPublishGraph().addAll(dataSet.getSmushGraph());
        } finally {
            rl.unlock();
        }

        // update the dataset status to published in the dlc meta graph
        updateDatasetStatus(dataSet);
        messageWriter.println("Copied " + triplesToAdd.size() + " triples from " + dataSet.getUri() + " to content-graph");
    }

    /**
     * Updates the dataset status to published in the dlc meta graph
     *
     * @param datasetName
     */
    private void updateDatasetStatus(DataSet dataSet) {
        UriRef statusRef = new UriRef(dataSet.getUri().getUnicodeString() + "/Status");
        dlcGraphProvider.getDlcGraph().remove(new TripleImpl(statusRef, RDF.type, DLC.Unpublished));
        dlcGraphProvider.getDlcGraph().remove(new TripleImpl(statusRef, RDFS.label, new PlainLiteralImpl("Unpublished")));
        dlcGraphProvider.getDlcGraph().add(new TripleImpl(statusRef, RDF.type, DLC.Published));
        dlcGraphProvider.getDlcGraph().add(new TripleImpl(statusRef, RDFS.label, new PlainLiteralImpl("Published")));
    }

    /**
     * Performs the following tasks in sequence: Enhance, Interlink, Smush,
     * Publish.
     *
     * @param pipeRef
     * @param digester
     * @param interlinker
     * @param mediaType
     * @return
     */
    private void performAllTasks(DataSet dataSet, String digesterName, String interlinkerName, PrintWriter messageWriter) throws IOException {
        // Digest RDF data
        extractTextFromRdf(dataSet, digesterName, messageWriter);
        // compute enhancements
        computeEnhancements(dataSet, messageWriter);
        // Interlink (against itself and content.graph)
        reconcile(dataSet, interlinkerName, messageWriter);
        // Smush
        SmushingJob.perform(dataSet, messageWriter, baseUri);
        // Publish
        publishData(dataSet, messageWriter);
    }

    /**
     * Performs the following tasks in sequence: RDF data upload, Enhance,
     * Interlink, Smush, Publish.
     *
     * @param pipeRef
     * @param dataUrl
     * @param digester
     * @param interlinker
     * @param mediaType
     * @return
     */
    private void rdfUploadPublish(DataSet dataSet, URL dataUrl, Rdfizer rdfizer, String digesterName,
            String interlinkerName, boolean smushAndPublish, PrintWriter messageWriter) throws IOException {

        // Transform to RDF
        TripleCollection addedTriples = rdfizer == null
                ? addTriples(dataSet, dataUrl, messageWriter)
                : transformXml(dataSet, dataUrl, rdfizer, messageWriter);

        // Digest. Add sioc:content and dc:subject predicates
        LockableMGraph digestedTriples = new LockableMGraphWrapper(new IndexedMGraph());
        digestedTriples.addAll(addedTriples);
        RdfDigester digester = digesters.get(digesterName);
        digester.extractText(digestedTriples);
        dataSet.getDigestGraph().addAll(digestedTriples);
        messageWriter.println("Added " + digestedTriples.size() + " digested triples to "
                + dataSet.getDigestGraphRef().getUnicodeString());

        MGraph enhancedTriples = new IndexedMGraph();
        computeEnhancements(digestedTriples, enhancedTriples, messageWriter);
        dataSet.getEnhancementsGraph().addAll(enhancedTriples);
        messageWriter.println("Added " + enhancedTriples.size() + " enhanced triples to "
                + dataSet.getEnhancementsGraphRef().getUnicodeString());

        // Interlink (self)
        if (!interlinkerName.equals("none")) {
            Interlinker interlinker = interlinkers.get(interlinkerName);
            final TripleCollection dataSetInterlinks = interlinker.interlink(digestedTriples, dataSet.getDigestGraphRef());
            dataSet.getInterlinksGraph().addAll(dataSetInterlinks);
            messageWriter.println("Added " + dataSetInterlinks.size() + " data-set interlinks to "
                    + dataSet.getInterlinksGraphRef().getUnicodeString());
            // Interlink (content.graph)
            final TripleCollection contentGraphInterlinks = interlinker.interlink(digestedTriples, CONTENT_GRAPH_REF);
            dataSet.getInterlinksGraph().addAll(contentGraphInterlinks);
            messageWriter.println("Added " + contentGraphInterlinks.size() + " content-graph interlinks to "
                    + dataSet.getInterlinksGraphRef().getUnicodeString());
        }

        if (smushAndPublish) {
            // Smush
            SmushingJob.perform(dataSet, messageWriter, baseUri);
            // Publish
            publishData(dataSet, messageWriter);
        }

        GraphNode logEntry = new GraphNode(new BNode(), dataSet.getLogGraph());
        logEntry.addProperty(RDF.type, DLC.LogEntry);
        logEntry.addProperty(DLC.retrievedURI, new UriRef(dataUrl.toString()));
    }

    /**
     * Validates a URL. A valid URL must start with "file:/" or "http://".
     */
    private boolean isValidUrl(URL url) {
        boolean isValidUrl = false;
        if (url != null) {
            if (url.toString().startsWith("http://") || url.toString().startsWith("file:/")) {
                isValidUrl = true;
            }
        }
        return isValidUrl;
    }

    /**
     * Guesses the content type from the file extension
     *
     * @param url
     * @return
     */
    private String guessContentTypeFromUri(URL url) {
        String contentType = null;
        if (url.getFile().endsWith("ttl")) {
            contentType = "text/turtle";
        } else if (url.getFile().endsWith("nt")) {
            contentType = "text/rdf+nt";
        } else if (url.getFile().endsWith("n3")) {
            contentType = "text/rdf+n3";
        } else if (url.getFile().endsWith("rdf")) {
            contentType = "application/rdf+xml";
        } else if (url.getFile().endsWith("xml")) {
            contentType = "application/xml";
        }
        return contentType;
    }

    /**
     * Checks whether a graph exists and returns a boolean value:
     * true if the graph exists, false if it does not.
     *
     * @param graph_ref
     * @return
     */
    private boolean graphExists(UriRef graph_ref) {
        Set<UriRef> graphs = tcManager.listMGraphs();
        Iterator<UriRef> igraphs = graphs.iterator();
        while (igraphs.hasNext()) {
            UriRef graphRef = igraphs.next();
            if (graph_ref.toString().equals(graphRef.toString())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Checks whether a pipe exists, by looking for a resource of type dlc:Pipe
     * with that URI in the data life cycle meta graph.
     */
    private boolean pipeExists(UriRef pipeRef) {
        if (pipeRef == null) {
            return false;
        }
        LockableMGraph dlcGraph = dlcGraphProvider.getDlcGraph();
        Lock rl = dlcGraph.getLock().readLock();
        rl.lock();
        try {
            return dlcGraph.filter(pipeRef, RDF.type, DLC.Pipe).hasNext();
        } finally {
            rl.unlock();
        }
    }

}