/******************************************************************************/
/* Copyright (C) 2010-2011, Sebastian Hellmann */
/* */
/* Licensed under the Apache License, Version 2.0 (the "License"); */
/* you may not use this file except in compliance with the License. */
/* You may obtain a copy of the License at */
/* */
/* http://www.apache.org/licenses/LICENSE-2.0 */
/* */
/* Unless required by applicable law or agreed to in writing, software */
/* distributed under the License is distributed on an "AS IS" BASIS, */
/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
/* See the License for the specific language governing permissions and */
/* limitations under the License. */
/******************************************************************************/
package org.nlp2rdf.implementation.spotlight;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.TreeMap;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.nlp2rdf.core.NIFParameters;
import org.nlp2rdf.core.RLOGSLF4JBinding;
import org.nlp2rdf.core.Span;
import org.nlp2rdf.core.Text2RDF;
import org.nlp2rdf.core.urischemes.URIScheme;
import org.nlp2rdf.core.vocab.NIFDatatypeProperties;
import org.nlp2rdf.core.vocab.RLOGIndividuals;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.hp.hpl.jena.ontology.Individual;
import com.hp.hpl.jena.ontology.OntModel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
/**
 * A wrapper for DBpedia Spotlight: the context string is tokenized with Stanford
 * CoreNLP, turned into a NIF model, and every token covered by a Spotlight candidate
 * is linked to the corresponding DBpedia resource via itsrdf:taIdentRef.
 *
 * This demo uses the work cited here:
 * https://github.com/dbpedia-spotlight/dbpedia-spotlight/wiki/Citation#statistical
 *
 * @author Ciro Baron Neto
 */
public class SpotlightWrapper extends NIFWrapper {
private static Logger log = LoggerFactory.getLogger(SpotlightWrapper.class);
    // intermediate result of querySpotlight(): character offset (as a string) -> candidate DBpedia resource URIs
    public Hashtable<String, List<String>> h = new Hashtable<>();
    String dbpediaResourceLink = "http://dbpedia.org/resource/";
    // Spotlight endpoint and request parameters
    // String spotlightAPI = "http://spotlight.dbpedia.org/";
    String spotlightAPI = "http://spotlight.sztaki.hu:2222/";
    String confidence = "0.1";
    String support = "0";
    String policy = "whitelist";
    String disambiguator = "Default";
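    // With the defaults above, querySpotlight() builds a request URL roughly like:
    // http://spotlight.sztaki.hu:2222/rest/candidates?confidence=0.1&support=0&policy=whitelist&disambiguator=Default&text=<url-encoded text>
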
public void processText(Individual context, OntModel inputModel,
OntModel outputModel, NIFParameters nifParameters) {
String contextString = context
.getPropertyValue(
NIFDatatypeProperties.isString
.getDatatypeProperty(inputModel)).asLiteral()
.getString();
String prefix = nifParameters.getPrefix();
URIScheme urischeme = nifParameters.getUriScheme();
        // the confidence threshold may be overridden via the "confidence" request parameter
        Object confidenceOption = nifParameters.getOptions().valueOf("confidence");
        if (confidenceOption != null) {
            confidence = confidenceOption.toString();
        }
Properties props = new Properties();
props.put("annotators", "tokenize, ssplit");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
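        // NOTE: constructing a StanfordCoreNLP pipeline loads the tokenizer models and is
        // relatively expensive; a long-running service would typically create it once and reuse it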
// create an empty Annotation just with the given text
Annotation document = new Annotation(contextString);
// run all Annotators on this text
pipeline.annotate(document);
// these are all the sentences in this document
// a CoreMap is essentially a Map that uses class objects as keys and
// has values with custom types
List<CoreMap> sentences = document
.get(CoreAnnotations.SentencesAnnotation.class);
// get all the sentences and words and read it in an intermediate
// structure
// NOTE: this can be greatly optimized of course
// for now it is just simple and cheap to implement it like this
int wordCount = 0;
TreeMap<Span, List<Span>> tokenizedText = new TreeMap<Span, List<Span>>();
        for (CoreMap sentence : sentences) {
            Span sentenceSpan = new Span(
                    sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
                    sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
            List<Span> wordSpans = new ArrayList<Span>();
            for (CoreLabel coreLabel : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                wordSpans.add(new Span(
                        coreLabel.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
                        coreLabel.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)));
                wordCount++;
            }
            tokenizedText.put(sentenceSpan, wordSpans);
        }
Text2RDF t = new Text2RDF();
t.generateNIFModel(prefix, context, urischeme, inputModel,
tokenizedText);
outputModel.add(RLOGSLF4JBinding.log(nifParameters.getLogPrefix(),
"Finished creating " + tokenizedText.size()
+ " sentence(s) with " + wordCount + " word(s) ",
RLOGIndividuals.DEBUG, this.getClass().getCanonicalName(),
null, null));
        // query the Spotlight candidates service; results end up in h, keyed by character offset
        querySpotlight(contextString);
// traversing the words in the current sentence
// a CoreLabel is a CoreMap with additional token-specific methods
for (CoreMap sentence : sentences) {
for (CoreLabel token : sentence
.get(CoreAnnotations.TokensAnnotation.class)) {
Span wordSpan = new Span(
token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
// the word should exist already
Individual wordIndividual = outputModel.getIndividual(urischeme
.generate(prefix, contextString, wordSpan));
if (wordIndividual == null) {
log.error("SKIPPING: word was not found in the model: "
+ urischeme.generate(prefix, contextString,
wordSpan));
continue;
}
                // attach every candidate DBpedia resource found at this token's start offset
                List<String> candidates = h.get(String.valueOf(wordSpan.getStart()));
                if (candidates != null) {
                    for (String uri : candidates) {
                        wordIndividual.addProperty(
                                outputModel.getProperty("http://www.w3.org/2005/11/its/rdf#", "taIdentRef"),
                                outputModel.createResource(dbpediaResourceLink + uri));
                    }
                }
            }
        }
        // register the itsrdf namespace prefix once on the output model
        outputModel.setNsPrefix("itsrdf", "http://www.w3.org/2005/11/its/rdf#");
}
    /**
     * Queries the DBpedia Spotlight candidates web service for the given text and fills
     * {@link #h} with the candidate DBpedia resource URIs found per character offset.
     */
public Hashtable<String, List<String>> querySpotlight(String context) {
// String context =
// "Hong Kong (CNN) -- A week into paralyzing pro-democracy protests in Hong Kong, authorities and demonstrators are still at loggerheads. Both sides say they are open to talks, but each wants concessions from the other. A student group said Sunday that it would restart dialogue with the government if police do a better job of handling clashes between pro-democracy protesters and people opposed to the demonstrations. The protesters, many of them students, have blocked major highways in several key districts for the past week, challenging a decision by Beijing about how elections will work in the semiautonomous Chinese territory.";
log.info("Querying API.");
log.info("This service uses the paper: Improving Efficiency and Accuracy in Multilingual Entity Extraction, from Joachim Daiber and Max Jakob and Chris Hokamp and Pablo N. Mendes. More information can be found here: https://github.com/dbpedia-spotlight/dbpedia-spotlight/wiki/Citation#statistical");
HttpClient client = new HttpClient();
try {
GetMethod getMethod = new GetMethod(spotlightAPI
+ "rest/candidates?" + "confidence=" + confidence
+ "&support=" + support + "&policy=" + policy
+ "&disambiguator=" + disambiguator + "&text="
+ URLEncoder.encode(context, "utf-8"));
getMethod
.addRequestHeader(new Header("Accept", "application/json"));
            // execute the request
            int statusCode = client.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                log.error("Spotlight request failed: " + getMethod.getStatusLine());
                getMethod.releaseConnection();
                return h;
            }
            // read the response body and release the connection
            byte[] responseBody = getMethod.getResponseBody();
            getMethod.releaseConnection();
            JSONObject j = new JSONObject(new String(responseBody, "utf-8"));
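            /*
             * Sketch of the response shape this parser expects (illustrative, not an
             * authoritative Spotlight schema): "annotation" holds "surfaceForm", which is
             * either a single JSON object or an array; each surface form carries an
             * "@offset" and a "resource" that is again either one object or an array of
             * candidate objects, each with an "@uri" naming a DBpedia resource, roughly:
             *
             * {"annotation": {"@text": "...",
             *     "surfaceForm": [{"@name": "...", "@offset": "0",
             *         "resource": [{"@uri": "Some_Resource", ...}, ...]}, ...]}}
             */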
            JSONObject annotation = j.getJSONObject("annotation");
            // surfaceForm is a single JSON object when only one surface form was found
            // and a JSON array otherwise; try the single-object case first
            boolean isArray = true;
            try {
                JSONObject surface = annotation.getJSONObject("surfaceForm");
                isArray = false;
                String key = surface.getString("@offset");
                String value = surface.getJSONObject("resource").get("@uri").toString();
                List<String> uris = new ArrayList<String>();
                uris.add(value);
                h.put(key, uris);
            } catch (JSONException e) {
                // not a single object; fall through to the array case below
                log.debug("surfaceForm is not a single object: " + e.getMessage());
            }
            if (isArray) {
                // several surface forms were returned: handle each entry
                JSONArray surfaceForms = annotation.getJSONArray("surfaceForm");
                for (int i = 0; i < surfaceForms.length(); i++) {
                    try {
                        JSONObject entity = surfaceForms.getJSONObject(i);
                        String key = entity.getString("@offset");
                        List<String> uris = new ArrayList<String>();
                        // "resource" is again either an array of candidates or a single object
                        JSONArray resources = entity.optJSONArray("resource");
                        if (resources != null) {
                            for (int k = 0; k < resources.length(); k++) {
                                uris.add(resources.getJSONObject(k).getString("@uri"));
                            }
                        } else {
                            uris.add(entity.getJSONObject("resource").get("@uri").toString());
                        }
                        h.put(key, uris);
                    } catch (JSONException e) {
                        log.error("Could not parse surfaceForm entry: " + e.getMessage());
                    }
                }
            }
        } catch (Exception e) {
            log.error("Spotlight query failed: " + e.getMessage(), e);
        }
        return h;
    }
}