/*
 * Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.thesmartweb.swebrank;

/**
 *
 * @author themis
 */
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.methods.GetMethod;
import org.dbpedia.spotlight.exceptions.AnnotationException;
import org.dbpedia.spotlight.model.DBpediaResource;
import org.dbpedia.spotlight.model.Text;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import scala.actors.threadpool.Arrays;

/**
 * Simple web service-based annotation client for DBpedia Spotlight.
 *
 * <p>Besides returning the annotated resources of a text, the client can annotate the
 * content of a URL and derive statistics from the recognized entities (similarity score,
 * support, difference to the second ranked candidate) and from the overlap between the
 * entities/categories and a search query.
 *
 * <p>Terminology used by the statistics:
 * <ul>
 *   <li>support = resource prominence</li>
 *   <li>similarity score = topical relevance</li>
 *   <li>percentageOfSecondRank = contextual ambiguity</li>
 * </ul>
 *
 * <p>All statistics use {@code -1.0} as the "no value available" marker.
 *
 * @author pablomendes, Joachim Daiber, Themistoklis Mavridis
 */
public class DBpediaSpotlightClient extends AnnotationClient {

    /** The url of the api. */
    private final static String API_URL = "http://spotlight.dbpedia.org/";

    /** Prefix of the resource URIs; the entity name is whatever follows it. */
    private static final String RESOURCE_URI_PREFIX = "http://dbpedia.org/resource/";

    /**
     * Characters/sequences of a URI fragment that are turned into a blank: underscore, comma and
     * the URL-encoded parentheses. The previous character class {@code [\_,\%28,\%29]} also
     * matched the single characters {@code %}, {@code 2}, {@code 8} and {@code 9}, which
     * corrupted every name containing those digits.
     */
    private static final String LABEL_SEPARATOR_REGEX = "[_,]|%28|%29";

    /** Marker stored in a statistic when it cannot be computed. */
    private static final double NO_VALUE = -1.0;

    /** Confidence used for the high precision annotation request. */
    private static final double HIGH_PRECISION_CONFIDENCE = 0.6;

    /** Support used for the high precision annotation request. */
    private static final int HIGH_PRECISION_SUPPORT = 2000;

    /** The confidence value for the DBpedia spotlight API. */
    private final double confidence;
    /** The support value for the DBpedia spotlight API. */
    private final int support;

    /** All the distinct semantic types (categories) of the last annotated url. */
    private List<String> typesDBspot = new ArrayList<>();
    /** All the distinct semantic entities of the last annotated url. */
    private List<String> entitiesString = new ArrayList<>();
    /** Similarity score of every annotation; index-aligned with {@link #allEntities}. */
    private List<Double> similarityScores = new ArrayList<>();
    /** Score difference to the second candidate (-1 if none); index-aligned with {@link #allEntities}. */
    private List<Double> similarityDifference = new ArrayList<>();
    /** Support of every annotation; index-aligned with {@link #allEntities}. */
    private List<Double> supports = new ArrayList<>();
    /** Entity name of every annotation, duplicates included. */
    private List<String> allEntities = new ArrayList<>();

    /** The number of (entity, query term) matches. */
    private int ent_cnt_dbpspot = 0;
    /** The number of (category, query term) matches. */
    private int cat_cnt_dbpspot = 0;
    /** The number of entities that contained the query as a whole. */
    private int ent_cnt_dbpspot_whole = 0;
    /** The number of categories that contained the query as a whole. */
    private int cat_cnt_dbpspot_whole = 0;

    /** Average / max / min / median / standard deviation of the similarity scores. */
    private double ent_avg_score = NO_VALUE;
    private double ent_max_score = NO_VALUE;
    private double ent_min_score = NO_VALUE;
    private double ent_median_score = NO_VALUE;
    private double ent_std_score = NO_VALUE;

    /** Average / max / min / median / standard deviation of the supports. */
    private double ent_avg_support = NO_VALUE;
    private double ent_max_support = NO_VALUE;
    private double ent_min_support = NO_VALUE;
    private double ent_median_support = NO_VALUE;
    private double ent_std_support = NO_VALUE;

    /** Average / max / min / median / standard deviation of the difference to the second ranked entity. */
    private double ent_avg_dif = NO_VALUE;
    private double ent_max_dif = NO_VALUE;
    private double ent_min_dif = NO_VALUE;
    private double ent_median_dif = NO_VALUE;
    private double ent_std_dif = NO_VALUE;

    /** Average difference to the second ranked entity, over the entities that contained a query term. */
    private double ent_dif_cnt_dbpspot = NO_VALUE;
    /** Average similarity score of the entities that contained a query term. */
    private double ent_sim_cnt_dbpspot = NO_VALUE;
    /** Average support of the entities that contained a query term. */
    private double ent_sup_cnt_dbpspot = NO_VALUE;
    /** Fraction of the annotations that don't have a second candidate. */
    private double unique_ent_cnt_dbpspot = NO_VALUE;
    /** Sum of the similarity scores of the annotations that don't have a second candidate. */
    private double unique_ent_scoreSum_dbpspot = NO_VALUE;
    /** Ratio of the annotations found with high precision settings to all annotations. */
    private double high_precision_content = NO_VALUE;

    /**
     * Creates a client that queries the API with the given settings.
     * @param conf the confidence value for the DBpedia spotlight API
     * @param sup the support value for the DBpedia spotlight API
     */
    public DBpediaSpotlightClient(double conf, int sup) {
        this.confidence = conf;
        this.support = sup;
    }

    /**
     * Annotates the given text and returns the recognized resources.
     * @param text the text to be annotated
     * @return the resources recognized in the text; entries that cannot be parsed are logged and skipped
     * @throws AnnotationException if the text cannot be encoded or the response is missing or is not the expected JSON
     */
    @Override
    public List<DBpediaResource> extract(Text text) throws AnnotationException {
        LOG.info("Querying API.");
        String spotlightResponse;
        try {
            GetMethod getMethod = new GetMethod(API_URL + "rest/annotate/?"
                    + "confidence=" + confidence
                    + "&support=" + support
                    + "&text=" + URLEncoder.encode(text.text(), "utf-8"));
            getMethod.addRequestHeader(new Header("Accept", "application/json"));
            spotlightResponse = request(getMethod);
        } catch (UnsupportedEncodingException e) {
            throw new AnnotationException("Could not encode text.", e);
        }
        // An assert is a no-op unless the JVM runs with -ea, so check explicitly.
        if (spotlightResponse == null) {
            throw new AnnotationException("Received no response from DBpedia Spotlight API.");
        }

        JSONArray entities;
        try {
            entities = new JSONObject(spotlightResponse).getJSONArray("Resources");
        } catch (JSONException e) {
            // keep the cause so that the malformed part of the response can be diagnosed
            throw new AnnotationException("Received invalid response from DBpedia Spotlight API.", e);
        }

        List<DBpediaResource> resources = new ArrayList<>(entities.length());
        for (int i = 0; i < entities.length(); i++) {
            try {
                JSONObject entity = entities.getJSONObject(i);
                resources.add(new DBpediaResource(entity.getString("@URI"),
                        Integer.parseInt(entity.getString("@support"))));
            } catch (JSONException | NumberFormatException e) {
                LOG.error("JSON exception "+e);
            }
        }
        return resources;
    }

    /**
     * Method that recognizes the entities through DBpedia spotlight the content of a given URL.
     *
     * <p>The entity/category lists and every statistic derived from them are reset on each call,
     * so a failed or empty annotation never exposes values of a previously annotated url. The
     * high precision ratio is only recomputed when stemming is off and keeps its previous value
     * otherwise. The call sleeps for one second before querying the API.
     *
     * @param url_check the url to be annotated
     * @param StemFlag a flag to determine if we want to use stemming
     * @throws AnnotationException if the request to the API fails
     */
    @Override
    public void extract(String url_check, boolean StemFlag) throws AnnotationException {
        try {
            Thread.sleep(1000);
        } catch (InterruptedException ex) {
            Thread.currentThread().interrupt();
        }
        resetEntityState();
        try {
            JSONArray entities = fetchResourcesForUrl(url_check, confidence, support);
            if (entities == null) {
                return; // no (JSON) response, nothing to analyze
            }
            for (int i = 0; i < entities.length(); i++) {
                try {
                    collectEntity(entities.getJSONObject(i), StemFlag);
                } catch (JSONException | NumberFormatException e) {
                    LOG.error("JSON exception "+e);
                }
            }
            computeStatistics();
            //calculate high precision content
            if (!StemFlag) {
                high_precision_content = allEntities.isEmpty()
                        ? NO_VALUE // avoids 0/0 and a pointless second request
                        : (double) getHighPrecContent(url_check) / allEntities.size();
            }
        } catch (UnsupportedEncodingException | JSONException ex) {
            Logger.getLogger(DBpediaSpotlightClient.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /** Empties the per-url lists and sets the statistics derived from them to "no value". */
    private void resetEntityState() {
        entitiesString = new ArrayList<>();
        typesDBspot = new ArrayList<>();
        similarityScores = new ArrayList<>();
        similarityDifference = new ArrayList<>();
        supports = new ArrayList<>();
        allEntities = new ArrayList<>();
        ent_avg_score = ent_max_score = ent_min_score = ent_median_score = ent_std_score = NO_VALUE;
        ent_avg_support = ent_max_support = ent_min_support = ent_median_support = ent_std_support = NO_VALUE;
        ent_avg_dif = ent_max_dif = ent_min_dif = ent_median_dif = ent_std_dif = NO_VALUE;
        unique_ent_cnt_dbpspot = NO_VALUE;
        unique_ent_scoreSum_dbpspot = NO_VALUE;
    }

    /**
     * Requests the annotation of the content of a url and returns the "Resources" array.
     * @return the annotated resources, or null when the response is missing or does not start with '{'
     */
    private JSONArray fetchResourcesForUrl(String url, double conf, int sup)
            throws AnnotationException, UnsupportedEncodingException, JSONException {
        LOG.info("Querying API.");
        String requestUrl = API_URL + "rest/annotate?"
                + "confidence=" + conf
                + "&support=" + sup
                + "&url=" + URLEncoder.encode(url, "utf-8");
        GetMethod getMethod = new GetMethod(requestUrl);
        getMethod.addRequestHeader(new Header("Accept", "application/json"));
        String spotlightResponse = request(getMethod);
        if (spotlightResponse == null || !spotlightResponse.startsWith("{")) {
            return null;
        }
        return new JSONObject(spotlightResponse).getJSONArray("Resources");
    }

    /**
     * Parses one annotated resource and records it in the per-url lists.
     *
     * <p>Every field is parsed before any list is touched: the score/support/difference lists
     * are read by index together with {@link #allEntities}, so a resource that fails half-way
     * must not leave them with different lengths.
     */
    private void collectEntity(JSONObject entity, boolean stem) throws JSONException {
        String entityString = normalizeLabel(resourceName(entity.getString("@URI")), stem);
        List<String> typeNames = parseTypeNames(entity.getString("@types"), stem);//the semantic types/categories
        double simScore = Double.parseDouble(entity.getString("@similarityScore"));
        double percOfSec = Double.parseDouble(entity.getString("@percentageOfSecondRank"));
        double entitySupport = Double.parseDouble(entity.getString("@support"));

        if (!entitiesString.contains(entityString)) {
            entitiesString.add(entityString);//if we have found a unique entity we include it in the list
        }
        for (String typeName : typeNames) {
            if (!typesDBspot.contains(typeName)) {
                typesDBspot.add(typeName);
            }
        }
        allEntities.add(entityString);
        similarityScores.add(simScore);
        supports.add(entitySupport);
        // -1 means that there is no second candidate
        similarityDifference.add(percOfSec == -1.0 ? NO_VALUE : simScore * (1 - percOfSec));
    }

    /** Gets the entity name, i.e. the last part of the resource URI. */
    private static String resourceName(String uri) {
        if (uri.startsWith(RESOURCE_URI_PREFIX)) {
            return uri.substring(RESOURCE_URI_PREFIX.length());
        }
        // unexpected prefix: fall back to the last path segment instead of a fixed offset
        return uri.substring(uri.lastIndexOf('/') + 1);
    }

    /**
     * Extracts the normalized names of a comma separated list of types.
     * Types without a name are skipped.
     */
    private List<String> parseTypeNames(String typesString, boolean stem) {
        List<String> names = new ArrayList<>();
        for (String type : typesString.split("\\,")) {
            //the delimiter is different according to the type
            String delimiter;
            if (type.contains("Freebase")) {
                delimiter = "/";
            } else if (type.contains("DBpedia") || type.contains("Schema")) {
                delimiter = ":";
            } else {
                delimiter = "[:/]";// unknown vocabulary: accept either separator
            }
            String[] segments = type.split(delimiter);
            if (segments.length == 0) {
                continue;
            }
            String name = normalizeLabel(segments[segments.length - 1], stem);
            if (!name.trim().isEmpty()) {
                names.add(name);
            }
        }
        return names;
    }

    /**
     * Lower-cases a URI fragment, replaces its separators with blanks and optionally stems
     * every word with the Snowball stemmer (the query is stemmed the same way).
     */
    private static String normalizeLabel(String raw, boolean stem) {
        String label = raw.toLowerCase().replaceAll(LABEL_SEPARATOR_REGEX, " ");
        if (!stem) {
            return label;
        }
        StemmerSnow stemmer = new StemmerSnow();
        List<String> stemmedWords = stemmer.stem(java.util.Arrays.asList(label.split(" ")));
        StringBuilder sb = new StringBuilder();
        for (String word : stemmedWords) {
            sb.append(word.trim());
            sb.append(" ");
        }
        return sb.toString().trim();
    }

    /** Computes the statistics of the similarity scores, the supports and the score differences. */
    private void computeStatistics() {
        //calculate statistics - similarity score
        ent_avg_score = getMean(similarityScores);
        ent_max_score = getMax(similarityScores);
        ent_min_score = getMin(similarityScores);
        ent_median_score = getMedian(similarityScores);
        ent_std_score = getStd(similarityScores);
        //calculate statistics - support
        ent_avg_support = getMean(supports);
        ent_max_support = getMax(supports);
        ent_min_support = getMin(supports);
        ent_median_support = getMedian(supports);
        ent_std_support = getStd(supports);
        //calculate statistics - difference in similarity scores between first and second ranked entities
        int uniqueCount = 0;
        double uniqueScoreSum = 0.0;
        List<Double> rankedDifferences = new ArrayList<>();
        for (int i = 0; i < similarityDifference.size(); i++) {
            double difference = similarityDifference.get(i);
            if (difference == NO_VALUE) {
                uniqueCount++;
                uniqueScoreSum += similarityScores.get(i);
            } else {
                rankedDifferences.add(difference);
            }
        }
        int total = allEntities.size();
        unique_ent_cnt_dbpspot = total == 0 ? NO_VALUE : (double) uniqueCount / total;
        unique_ent_scoreSum_dbpspot = uniqueScoreSum == 0 ? NO_VALUE : uniqueScoreSum;
        ent_avg_dif = getMean(rankedDifferences);
        ent_max_dif = getMax(rankedDifferences);
        ent_min_dif = getMin(rankedDifferences);
        ent_median_dif = getMedian(rankedDifferences);
        ent_std_dif = getStd(rankedDifferences);
    }

    /**
     * Method to count the statistics for the entities and categories.
     * Any exception is logged; the counters then keep the values reached so far.
     * @param url_check the url to count the statistics for
     * @param query the query term that which the url was a result of
     * @param StemFlag flag to use stemming or not
     */
    public void countEntCat(String url_check, String query, boolean StemFlag) {
        try {
            ent_cnt_dbpspot = 0;
            cat_cnt_dbpspot = 0;
            ent_cnt_dbpspot_whole = 0;
            cat_cnt_dbpspot_whole = 0;
            ent_dif_cnt_dbpspot = 0.0;
            ent_sim_cnt_dbpspot = 0.0;
            ent_sup_cnt_dbpspot = 0.0;
            extract(url_check, StemFlag);//we get the entities and categories

            //we split the query with + because the queries to the Search APIs have + between the terms
            String[] splitQuery = query.toLowerCase().split("\\+");
            if (StemFlag) {//we stem the query
                List<String> stemmedTerms = new StemmerSnow().stem(java.util.Arrays.asList(splitQuery));
                splitQuery = stemmedTerms.toArray(new String[stemmedTerms.size()]);
            }

            for (String entityName : entitiesString) {
                int matchedTerms = 0;
                for (String term : splitQuery) {
                    if (entityName.contains(term)) {
                        ent_cnt_dbpspot++;
                        matchedTerms++;
                    }
                }
                if (matchedTerms == splitQuery.length) {//all the query terms are included in the entity
                    ent_cnt_dbpspot_whole++;
                }
            }

            int matchCount = 0;
            int matchWithDifCount = 0;
            for (int i = 0; i < allEntities.size(); i++) {
                for (String term : splitQuery) {
                    if (allEntities.get(i).contains(term)) {
                        matchCount++;
                        ent_sup_cnt_dbpspot += supports.get(i);
                        ent_sim_cnt_dbpspot += similarityScores.get(i);
                        double difference = similarityDifference.get(i);
                        if (difference != NO_VALUE) {
                            ent_dif_cnt_dbpspot += difference;
                            matchWithDifCount++;
                        }
                    }
                }
            }
            if (matchCount != 0) {
                ent_sup_cnt_dbpspot /= (double) matchCount;
                ent_sim_cnt_dbpspot /= (double) matchCount;
            } else {
                ent_sup_cnt_dbpspot = NO_VALUE;
                ent_sim_cnt_dbpspot = NO_VALUE;
            }
            if (matchWithDifCount != 0) {
                ent_dif_cnt_dbpspot /= (double) matchWithDifCount;
            } else {
                ent_dif_cnt_dbpspot = NO_VALUE;
            }

            for (String categoryName : typesDBspot) {
                int matchedTerms = 0;
                for (String term : splitQuery) {
                    if (categoryName.contains(term)) {
                        cat_cnt_dbpspot++;
                        matchedTerms++;
                    }
                }
                if (matchedTerms == splitQuery.length) {//all the query terms are included in the category
                    cat_cnt_dbpspot_whole++;
                }
            }
        } catch (Exception ex) {
            Logger.getLogger(DBpediaSpotlightClient.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Method to get the entities counter (partial query match)
     * @return entities counter
     */
    public int getcountEnt(){return ent_cnt_dbpspot;}

    /**
     * Method to get the categories counter (partial query match)
     * @return categories counter that have a partial query match
     */
    public int getcountCat(){return cat_cnt_dbpspot;}

    /**
     * Method to get the entities counter (whole query match)
     * @return entities counter that have whole query match
     */
    public int getcountEntWhole(){return ent_cnt_dbpspot_whole;}

    /**
     * Method to get the categories counter (whole query match)
     * @return categories counter that have whole query match
     */
    public int getcountCatWhole(){return cat_cnt_dbpspot_whole;}

    /**
     * Method to get the entities List
     * @return entities List (the internal list, not a copy)
     */
    public List<String> getEntities(){return entitiesString;}

    /**
     * Method to get the categories List
     * @return categories List (the internal list, not a copy)
     */
    public List<String> getCategories(){return typesDBspot;}

    /**
     * Method to get the entities average score
     * @return entities average score
     */
    public double getEntitiesAvgScore(){return ent_avg_score;}

    /**
     * Method to get the entities max score
     * @return entities max score
     */
    public double getEntitiesMaxScore(){return ent_max_score;}

    /**
     * Method to get the entities min score
     * @return entities min score
     */
    public double getEntitiesMinScore(){return ent_min_score;}

    /**
     * Method to get the entities median score
     * @return entities median score
     */
    public double getEntitiesMedianScore(){return ent_median_score;}

    /**
     * Method to get the standard deviation of entities' score
     * @return standard deviation of entities' score
     */
    public double getEntitiesStdScore(){return ent_std_score;}

    /**
     * Method to get the entities average support
     * @return entities average support
     */
    public double getEntitiesAvgSupport(){return ent_avg_support;}

    /**
     * Method to get the entities max support
     * @return entities max support
     */
    public double getEntitiesMaxSupport(){return ent_max_support;}

    /**
     * Method to get the entities min support
     * @return entities min support
     */
    public double getEntitiesMinSupport(){return ent_min_support;}

    /**
     * Method to get the entities median support
     * @return entities median support
     */
    public double getEntitiesMedianSupport(){return ent_median_support;}

    /**
     * Method to get the standard deviation of entities' support
     * @return standard deviation of entities' support
     */
    public double getEntitiesStdSupport(){return ent_std_support;}

    /**
     * Method to get the average support of the entities that contained a term of the query
     * @return average support of the matching entities, -1 if no entity matched
     */
    public double getcountSupEnt(){return ent_sup_cnt_dbpspot;}

    /**
     * Method to get the average similarity score of the entities that contained a term of the query
     * @return average similarity score of the matching entities, -1 if no entity matched
     */
    public double getcountSimEnt(){return ent_sim_cnt_dbpspot;}

    /**
     * Method to get the average difference from the second candidate of the entities that contained a term of the query
     * @return average difference of the matching entities that have a second candidate, -1 if there is none
     */
    public double getcountDifEnt() {return ent_dif_cnt_dbpspot;}

    /**
     * Method to get the entities average difference from second resource
     * @return entities average difference from second resource
     */
    public double getEntitiesAvgDif(){return ent_avg_dif;}

    /**
     * Method to get the entities max difference from second resource
     * @return entities max difference from second resource
     */
    public double getEntitiesMaxDif(){return ent_max_dif;}

    /**
     * Method to get the entities min difference from second resource
     * @return entities min difference from second resource
     */
    public double getEntitiesMinDif(){return ent_min_dif;}

    /**
     * Method to get the entities median difference from second resource
     * @return entities median difference from second resource
     */
    public double getEntitiesMedianDif(){return ent_median_dif;}

    /**
     * Method to get the entities standard deviation of difference from second resource
     * @return entities standard deviation of difference from second resource
     */
    public double getEntitiesStdDif(){return ent_std_dif;}

    /**
     * Method to get the fraction of entities which are the only candidates
     * @return fraction of the annotations which are the only candidates, -1 if nothing was annotated
     */
    public double getUniqueEntCnt() {return unique_ent_cnt_dbpspot;}

    /**
     * Method to get the total similarity score of entities which are the only candidates
     * @return total similarity score of entities which are the only candidates, -1 if the total is 0
     */
    public double getUniqueEntScoreSum() {return unique_ent_scoreSum_dbpspot;}

    /**
     * Method to get high precision content
     * @return ratio of the annotations found using high precision settings to all annotations
     */
    public double getHighPrecEntities() {return high_precision_content;}

    /**
     * Calculates the mean of the list of doubles
     * @param data list of double values to get the mean from
     * @return the mean from the list given, -1 if the list is empty
     */
    private static double getMean(List<Double> data) {
        if (data.isEmpty()) {
            return NO_VALUE;
        }
        double sum = 0.0;
        for (double d : data) {
            sum += d;
        }
        return sum / data.size();
    }

    /**
     * Calculates the max of the list of doubles
     * @param data list of double values to get the max from
     * @return the max from the list given, -1 if the list is empty
     */
    private static double getMax(List<Double> data) {
        return data.isEmpty() ? NO_VALUE : Collections.max(data);
    }

    /**
     * Calculates the min of the list of doubles
     * @param data list of double values to get the min from
     * @return the min from the list given, -1 if the list is empty
     */
    private static double getMin(List<Double> data) {
        return data.isEmpty() ? NO_VALUE : Collections.min(data);
    }

    /**
     * Calculates the (population) standard deviation of the list of doubles
     * @param data list of double values to get the standard deviation from
     * @return the standard deviation from the list given, -1 if the list is empty
     */
    private static double getStd(List<Double> data) {
        if (data.isEmpty()) {
            return NO_VALUE;
        }
        double mean = getMean(data);
        double squaredDeviations = 0;
        for (double d : data) {
            squaredDeviations += (mean - d) * (mean - d);
        }
        return Math.sqrt(squaredDeviations / data.size());
    }

    /**
     * Calculates the median of the list of doubles
     * @param data list of double values to get the median from; it is not modified
     * @return the median from the list given, -1 if the list is empty
     */
    private static double getMedian(List<Double> data) {
        if (data.isEmpty()) {
            return NO_VALUE;
        }
        // Sort a copy: the callers' lists are index-aligned with each other and must keep their order.
        List<Double> sorted = new ArrayList<>(data);
        Collections.sort(sorted);
        int middle = sorted.size() / 2;
        if (sorted.size() % 2 == 0) {
            return (sorted.get(middle - 1) + sorted.get(middle)) / 2.0;
        }
        return sorted.get(middle);
    }

    /**
     * Method calculate the amount of entities annotated in the content of a url using high precision settings
     * @param url_check the url to be annotated
     * @return total number of entities with high precision settings, 0 if the request or the parsing fails
     */
    private int getHighPrecContent(String url_check) {
        try {
            JSONArray entities = fetchResourcesForUrl(url_check, HIGH_PRECISION_CONFIDENCE, HIGH_PRECISION_SUPPORT);
            return entities == null ? 0 : entities.length();
        } catch (UnsupportedEncodingException | JSONException | AnnotationException ex) {
            Logger.getLogger(DBpediaSpotlightClient.class.getName()).log(Level.SEVERE, null, ex);
            return 0;
        }
    }
}