/**
 * This file is part of General Entity Annotator Benchmark.
 *
 * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * General Entity Annotator Benchmark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with General Entity Annotator Benchmark.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.aksw.gerbil.annotator.impl.spotlight;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.aksw.gerbil.annotator.http.AbstractHttpBasedAnnotator;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Span;
import org.aksw.gerbil.transfer.nif.data.SpanImpl;
import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHeaders;
import org.apache.http.StatusLine;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.util.EntityUtils;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.carrotsearch.hppc.ObjectObjectOpenHashMap;

/**
 * Client of DBpedia Spotlight. This implementation is mainly based on the
 * implementations of the BAT-Framework and the HAWK project.
 *
 * <p>
 * The client offers the three Spotlight resources {@code annotate},
 * {@code spot} and {@code disambiguate}. Every operation exists in two
 * flavours: one that throws a {@link GerbilException} and a "Savely" variant
 * that logs the exception and returns {@code null} instead.
 * </p>
 *
 * @author Michael Röder (roeder@informatik.uni-leipzig.de)
 *
 */
public class SpotlightClient {

    private static final Logger LOGGER = LoggerFactory.getLogger(SpotlightClient.class);

    /**
     * Pairs of (lower-cased type prefix, namespace URI) used to expand the
     * prefixed type names of the service response to full URIs.
     */
    private static final String[][] TYPE_PREFIX_URI_MAPPING = new String[][] {
            { "freebase", "http://rdf.freebase.com/ns/" }, { "dbpedia", "http://dbpedia.org/ontology/" } };

    private static final String DEFAULT_REQUEST_URL = "http://spotlight.dbpedia.org:80/rest";
    // private static final double DEFAULT_MIN_CONFIDENCE = -1;
    // private static final int DEFAULT_MIN_SUPPORT = -1;

    private static final String ANNOTATE_RESOURCE = "annotate";
    private static final String SPOT_RESOURCE = "spot";
    private static final String DISAMBIGUATE_RESOURCE = "disambiguate";

    /** Base URL of the service; always ends with a slash. */
    private final String serviceURL;
    // private double minConfidence = 0.2;
    // private int minSupport = 20;
    /** Lookup table built from {@link #TYPE_PREFIX_URI_MAPPING}. */
    private final ObjectObjectOpenHashMap<String, String> typePrefixToUriMapping;
    /** Annotator that creates, executes and closes the HTTP requests. */
    private final SpotlightAnnotator annotator;

    /**
     * Creates a client that uses the default service URL.
     *
     * @param annotator
     *            the annotator used to create and send HTTP requests
     */
    public SpotlightClient(SpotlightAnnotator annotator) {
        this(DEFAULT_REQUEST_URL, annotator);
    }

    /**
     * Creates a client for the given service URL.
     *
     * @param serviceURL
     *            the base URL of the service; a trailing slash is appended if
     *            it is missing
     * @param annotator
     *            the annotator used to create and send HTTP requests
     */
    public SpotlightClient(String serviceURL, SpotlightAnnotator annotator) {
        this.serviceURL = serviceURL.endsWith("/") ? serviceURL : (serviceURL + "/");
        this.annotator = annotator;
        this.typePrefixToUriMapping = new ObjectObjectOpenHashMap<String, String>();
        for (String[] mapping : TYPE_PREFIX_URI_MAPPING) {
            typePrefixToUriMapping.put(mapping[0], mapping[1]);
        }
    }

    /**
     * Sends the given text as form parameter {@code text} to the given URL
     * using HTTP POST and returns the body of the response.
     *
     * @param inputText
     *            the content of the {@code text} parameter (it is URL-encoded
     *            by this method)
     * @param requestUrl
     *            the URL of the requested resource
     * @return the response body decoded as UTF-8
     * @throws GerbilException
     *             if the request can not be created or sent, if the status
     *             code of the response is not 2xx or if the response body can
     *             not be read
     */
    protected String request(String inputText, String requestUrl) throws GerbilException {
        String parameters = "text=" + urlEncode(inputText);

        HttpPost request;
        try {
            request = annotator.createPostRequest(requestUrl);
        } catch (IllegalArgumentException e) {
            throw new GerbilException("Couldn't create HTTP request.", e, ErrorTypes.UNEXPECTED_EXCEPTION);
        }
        request.addHeader(HttpHeaders.CONTENT_TYPE, "application/x-www-form-urlencoded;charset=UTF-8");
        request.addHeader(HttpHeaders.ACCEPT, "application/json");
        request.addHeader(HttpHeaders.ACCEPT_CHARSET, "UTF-8");
        request.setEntity(new StringEntity(parameters, "UTF-8"));

        CloseableHttpResponse response = null;
        HttpEntity responseEntity = null;
        try {
            try {
                response = annotator.getClient().execute(request);
            } catch (java.net.SocketException e) {
                // The message of an exception may be null, so it has to be
                // checked before it is searched for the abort indicator.
                String exceptionMessage = e.getMessage();
                if ((exceptionMessage != null) && exceptionMessage
                        .contains(AbstractHttpBasedAnnotator.CONNECTION_ABORT_INDICATING_EXCPETION_MSG)) {
                    LOGGER.error("It seems like the annotator has needed too much time and has been interrupted.");
                    throw new GerbilException(
                            "It seems like the annotator has needed too much time and has been interrupted.", e,
                            ErrorTypes.ANNOTATOR_NEEDED_TOO_MUCH_TIME);
                } else {
                    LOGGER.error("Exception while sending request.", e);
                    throw new GerbilException("Exception while sending request.", e,
                            ErrorTypes.UNEXPECTED_EXCEPTION);
                }
            } catch (Exception e) {
                LOGGER.error("Exception while sending request.", e);
                throw new GerbilException("Exception while sending request.", e, ErrorTypes.UNEXPECTED_EXCEPTION);
            }

            StatusLine status = response.getStatusLine();
            int statusCode = status.getStatusCode();
            if ((statusCode < 200) || (statusCode >= 300)) {
                String statusMessage = "Response has the wrong status: " + status.toString();
                LOGGER.error(statusMessage);
                throw new GerbilException(statusMessage, ErrorTypes.UNEXPECTED_EXCEPTION);
            }

            responseEntity = response.getEntity();
            if (responseEntity == null) {
                // A response without a body can not be parsed.
                LOGGER.error("Couldn't parse the response. The response has no content.");
                throw new GerbilException("Couldn't parse the response. The response has no content.",
                        ErrorTypes.UNEXPECTED_EXCEPTION);
            }
            try {
                return IOUtils.toString(responseEntity.getContent(), "UTF-8");
            } catch (Exception e) {
                LOGGER.error("Couldn't parse the response.", e);
                throw new GerbilException("Couldn't parse the response.", e, ErrorTypes.UNEXPECTED_EXCEPTION);
            }
        } finally {
            // The clean up is best effort. Problems are only logged so that
            // they do not hide the result or the exception of the request.
            if (responseEntity != null) {
                try {
                    EntityUtils.consume(responseEntity);
                } catch (IOException e) {
                    LOGGER.debug("Couldn't consume the response entity. Ignoring this.", e);
                }
            }
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    LOGGER.debug("Couldn't close the response. Ignoring this.", e);
                }
            }
            annotator.closeRequest(request);
        }
    }

    /**
     * Same as {@link #annotate(Document)} but logs a thrown
     * {@link GerbilException} instead of propagating it.
     *
     * @param document
     *            the document that should be annotated
     * @return the entities returned by {@link #annotate(Document)} or
     *         {@code null} if a {@link GerbilException} occurred
     */
    public List<TypedNamedEntity> annotateSavely(Document document) {
        try {
            return annotate(document);
        } catch (GerbilException e) {
            LOGGER.error("Error while requesting DBpedia Spotlight to annotate text. Returning null.", e);
            return null;
        }
    }

    /**
     * Sends the text of the given document to the {@code annotate} resource and
     * parses the response.
     *
     * @param document
     *            the document that should be annotated
     * @return the parsed entities or {@code null} if the response could not be
     *         parsed
     * @throws GerbilException
     *             if the request fails
     */
    public List<TypedNamedEntity> annotate(Document document) throws GerbilException {
        String response = request(document.getText(), serviceURL + ANNOTATE_RESOURCE);
        return parseAnnotationResponse(response);
    }

    /**
     * Parses a JSON response containing a {@code "Resources"} member. For every
     * resource the attributes {@code @offset}, {@code @surfaceForm},
     * {@code @URI} and {@code @types} are read.
     *
     * @param response
     *            the JSON response as String
     * @return the list of entities (empty if the response contains no
     *         resources) or {@code null} if the response could not be parsed
     */
    protected List<TypedNamedEntity> parseAnnotationResponse(String response) {
        JSONObject jsonObject = parseJsonObject(response);
        if (jsonObject == null) {
            return null;
        }
        List<TypedNamedEntity> markings = new ArrayList<TypedNamedEntity>();
        for (JSONObject resource : toJsonObjects(jsonObject.get("Resources"))) {
            int start = Integer.parseInt((String) resource.get("@offset"));
            int length = ((String) resource.get("@surfaceForm")).length();
            String uri;
            try {
                uri = URLDecoder.decode((String) resource.get("@URI"), "UTF-8");
            } catch (UnsupportedEncodingException e) {
                LOGGER.error("Error while parsing DBpedia Spotlight response. Returning null.", e);
                return null;
            }
            markings.add(new TypedNamedEntity(start, length, uri, parseTypes(resource.get("@types"))));
        }
        return markings;
    }

    /**
     * Same as {@link #spot(Document)} but logs a thrown {@link GerbilException}
     * instead of propagating it.
     *
     * @param document
     *            the document in which entities should be spotted
     * @return the spans returned by {@link #spot(Document)} or {@code null} if
     *         a {@link GerbilException} occurred
     */
    public List<Span> spotSavely(Document document) {
        try {
            return spot(document);
        } catch (GerbilException e) {
            LOGGER.error("Error while requesting DBpedia Spotlight to spot text. Returning null.", e);
            return null;
        }
    }

    /**
     * Sends the text of the given document to the {@code spot} resource and
     * parses the response.
     *
     * @param document
     *            the document in which entities should be spotted
     * @return the parsed spans or {@code null} if the response could not be
     *         parsed
     * @throws GerbilException
     *             if the request fails
     */
    public List<Span> spot(Document document) throws GerbilException {
        String response = request(document.getText(), serviceURL + SPOT_RESOURCE);
        return parseSpottingResponse(response);
    }

    /**
     * Parses a JSON response of the form
     * {@code {"annotation": {"surfaceForm": ...}}}. The {@code "surfaceForm"}
     * member is accepted as array of objects as well as a single object. For
     * every surface form the attributes {@code @offset} and {@code @name} are
     * read.
     *
     * @param response
     *            the JSON response as String
     * @return the list of spans (empty if the response contains no surface
     *         forms) or {@code null} if the response could not be parsed
     */
    protected List<Span> parseSpottingResponse(String response) {
        JSONObject jsonObject = parseJsonObject(response);
        if (jsonObject == null) {
            return null;
        }
        List<Span> markings = new ArrayList<Span>();
        Object annotation = jsonObject.get("annotation");
        if (!(annotation instanceof JSONObject)) {
            LOGGER.warn("The DBpedia Spotlight response has no \"annotation\" object. Returning an empty list.");
            return markings;
        }
        for (JSONObject surfaceForm : toJsonObjects(((JSONObject) annotation).get("surfaceForm"))) {
            int start = Integer.parseInt((String) surfaceForm.get("@offset"));
            int length = ((String) surfaceForm.get("@name")).length();
            markings.add(new SpanImpl(start, length));
        }
        return markings;
    }

    /**
     * Builds an XML {@code annotation} element containing the text of the
     * document and one {@code surfaceForm} element per {@link Span} marking of
     * the document, sends it to the {@code disambiguate} resource and parses
     * the response.
     *
     * @param document
     *            the document containing the text and the spans that should be
     *            disambiguated
     * @return the parsed entities or {@code null} if the response could not be
     *         parsed
     * @throws GerbilException
     *             if the request data can not be encoded or the request fails
     */
    public List<TypedNamedEntity> disambiguate(Document document) throws GerbilException {
        String text = document.getText();
        StringBuilder requestBuilder = new StringBuilder();
        requestBuilder.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?><annotation text=\"");
        // NOTE(review): the attribute values are URL-encoded here and the
        // complete XML is URL-encoded again in request(). XML escaping might
        // have been intended - confirm against the service before changing it.
        requestBuilder.append(urlEncode(text));
        requestBuilder.append("\">");
        for (Span span : document.getMarkings(Span.class)) {
            int start = span.getStartPosition();
            requestBuilder.append("<surfaceForm name=\"");
            requestBuilder.append(urlEncode(text.substring(start, start + span.getLength())));
            requestBuilder.append("\" offset=\"");
            requestBuilder.append(start);
            requestBuilder.append("\" />");
        }
        requestBuilder.append("</annotation>");
        String response = request(requestBuilder.toString(), serviceURL + DISAMBIGUATE_RESOURCE);
        return parseAnnotationResponse(response);
    }

    /**
     * Same as {@link #disambiguate(Document)} but logs a thrown
     * {@link GerbilException} instead of propagating it.
     *
     * @param document
     *            the document containing the text and the spans that should be
     *            disambiguated
     * @return the entities returned by {@link #disambiguate(Document)} or
     *         {@code null} if a {@link GerbilException} occurred
     */
    public List<TypedNamedEntity> disambiguateSavely(Document document) {
        try {
            return disambiguate(document);
        } catch (GerbilException e) {
            LOGGER.error("Error while requesting DBpedia Spotlight to disambiguate text. Returning null.", e);
            return null;
        }
    }

    /**
     * URL-encodes the given value using UTF-8.
     *
     * @throws GerbilException
     *             if the encoding is not supported
     */
    private static String urlEncode(String value) throws GerbilException {
        try {
            return URLEncoder.encode(value, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            LOGGER.error("Exception while encoding request data.", e);
            throw new GerbilException("Exception while encoding request data.", e, ErrorTypes.UNEXPECTED_EXCEPTION);
        }
    }

    /**
     * Parses the given response and returns its root JSON object. Returns
     * {@code null} (after logging an error) if the response is {@code null}, is
     * not valid JSON or has a root element that is not a JSON object.
     */
    private static JSONObject parseJsonObject(String response) {
        if (response == null) {
            LOGGER.error("Error while parsing DBpedia Spotlight response. The response is null. Returning null.");
            return null;
        }
        Object parsed;
        try {
            parsed = new JSONParser().parse(response);
        } catch (ParseException e) {
            LOGGER.error("Error while parsing DBpedia Spotlight response. Returning null.", e);
            return null;
        }
        if (parsed instanceof JSONObject) {
            return (JSONObject) parsed;
        }
        LOGGER.error("Error while parsing DBpedia Spotlight response. The response is not a JSON object. Returning null.");
        return null;
    }

    /**
     * Returns the JSON objects contained in the given value. The value may be a
     * JSON array (elements that are no JSON objects are skipped), a single JSON
     * object or anything else (including {@code null}), which leads to an empty
     * list.
     */
    private static List<JSONObject> toJsonObjects(Object value) {
        List<JSONObject> objects = new ArrayList<JSONObject>();
        if (value instanceof JSONArray) {
            for (Object element : (JSONArray) value) {
                if (element instanceof JSONObject) {
                    objects.add((JSONObject) element);
                }
            }
        } else if (value instanceof JSONObject) {
            objects.add((JSONObject) value);
        }
        return objects;
    }

    /**
     * Creates the set of types from the comma separated {@code @types} value
     * of a resource. A type with a known prefix (compared case-insensitively)
     * and a non-empty local name is expanded to a full URI. All other
     * non-empty types are added unchanged. Empty entries are skipped.
     *
     * @param typesValue
     *            the value of the {@code @types} attribute; anything that is
     *            not a String leads to an empty set
     */
    private Set<String> parseTypes(Object typesValue) {
        Set<String> types = new HashSet<String>();
        if (!(typesValue instanceof String)) {
            return types;
        }
        for (String typeString : ((String) typesValue).split(",")) {
            if (typeString.isEmpty()) {
                // resources without types come with an empty types String
                continue;
            }
            int separatorPos = typeString.indexOf(':');
            String prefix = (separatorPos < 0) ? typeString : typeString.substring(0, separatorPos);
            // Locale.ROOT keeps the lookup independent of the default locale
            prefix = prefix.toLowerCase(java.util.Locale.ROOT);
            String localName = (separatorPos < 0) ? "" : typeString.substring(separatorPos + 1);
            if (!localName.isEmpty() && typePrefixToUriMapping.containsKey(prefix)) {
                types.add(typePrefixToUriMapping.get(prefix) + localName);
            } else {
                types.add(typeString);
            }
        }
        return types;
    }
}