package doser.webclassify.annotation;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.http.Header;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicNameValuePair;
import org.apache.log4j.Logger;
import org.codehaus.jettison.json.JSONArray;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
import doser.entitydisambiguation.dpo.DisambiguatedEntity;
import doser.entitydisambiguation.properties.Properties;
import doser.entitydisambiguation.table.logic.Type;
import doser.general.HelpfulMethods;
import doser.language.Languages;
import doser.tools.RDFGraphOperations;
import doser.tools.ServiceQueries;
import doser.webclassify.algorithm.EntityRelevanceAlgorithm;
import doser.webclassify.algorithm.EntitySignificanceAlgorithmPR_W2V;
import doser.webclassify.algorithm.EntitySignificanceAlgorithm_Doc2Vec;
import doser.webclassify.dpo.Paragraph;
/**
 * Annotates paragraphs with DBpedia entities returned by a DBpedia Spotlight
 * REST endpoint and derives entity frequency distributions and a topic entity
 * from those annotations.
 */
public class AnnotateEntities {

	private static final Logger LOG = Logger.getRootLogger();

	private static final Charset LATIN_1 = Charset.forName("ISO-8859-1");

	private static final Charset UTF_8 = Charset.forName("UTF-8");

	/** Prefix stripped from a German resource URI to build a fallback label. */
	private static final String GERMAN_RESOURCE_PREFIX = "http://de.dbpedia.org/resource/";

	private static final String PERSON_TYPE_URI = "http://dbpedia.org/ontology/Person";

	private static final String ORGANISATION_TYPE_URI = "http://dbpedia.org/ontology/Organisation";

	private static final String LOCATION_TYPE_URI = "http://www.ontologydesignpatterns.org/ont/d0.owl#Location";

	/** Domain label returned when none of the known type URIs matches. */
	private static final String DEFAULT_DOMAIN = "Misc";

	/**
	 * Orders the entries of an already computed entity/count map by value.
	 *
	 * @param map
	 *            entity to occurrence-count mapping
	 * @return the entries as ordered by {@code HelpfulMethods.sortByValue}
	 */
	public List<Map.Entry<DisambiguatedEntity, Integer>> createEntityDistributionParagraph(
			Map<DisambiguatedEntity, Integer> map) {
		return HelpfulMethods.sortByValue(map);
	}

	/**
	 * Annotates all given paragraphs and orders the resulting entity/count
	 * entries by value.
	 *
	 * @param paragraphs
	 *            paragraphs to annotate
	 * @param lang
	 *            language of the paragraphs
	 * @return the entries as ordered by {@code HelpfulMethods.sortByValue}
	 */
	public List<Map.Entry<DisambiguatedEntity, Integer>> createEntityDistributionDocument(Set<Paragraph> paragraphs,
			Languages lang) {
		Map<DisambiguatedEntity, Integer> map = createEntityMap(paragraphs, lang);
		return HelpfulMethods.sortByValue(map);
	}

	/**
	 * Annotates a single paragraph and returns its topic entity.
	 *
	 * @param p
	 *            paragraph to analyse
	 * @param lang
	 *            language of the paragraph
	 * @return a list with exactly one element, the result of
	 *         {@link #extractTopicEntity(Map, Paragraph, Languages)}; that
	 *         element is {@code null} when no topic entity was determined
	 */
	public List<DisambiguatedEntity> extractSignificantEntitiesInParagraph(Paragraph p, Languages lang) {
		Set<Paragraph> set = new HashSet<Paragraph>();
		set.add(p);
		Map<DisambiguatedEntity, Integer> map = createEntityMap(set, lang);
		List<DisambiguatedEntity> l = new ArrayList<DisambiguatedEntity>();
		// The (possibly null) topic entity is always added so that callers
		// keep seeing a single-element list.
		l.add(extractTopicEntity(map, p, lang));
		return l;
	}

	/**
	 * Runs the Doc2Vec significance algorithm on the annotated entities and
	 * builds a {@link DisambiguatedEntity} (URI, categories, domain type and
	 * label) for the URI it returns.
	 *
	 * @param map
	 *            entity to occurrence-count mapping of the paragraph
	 * @param p
	 *            paragraph the entities were found in
	 * @param lang
	 *            language of the paragraph
	 * @return the topic entity, or {@code null} if the algorithm returned
	 *         {@code null} or an empty string
	 */
	public DisambiguatedEntity extractTopicEntity(Map<DisambiguatedEntity, Integer> map, Paragraph p, Languages lang) {
		EntityRelevanceAlgorithm sig = new EntitySignificanceAlgorithm_Doc2Vec();
		String topicEntityString = sig.process(map, p, lang);
		if (topicEntityString == null || topicEntityString.isEmpty()) {
			return null;
		}
		// Use the re-encoded URI for every lookup, as createEntityMap does;
		// previously categories and types were queried with the raw string.
		String uri = normalizeUri(topicEntityString, lang);
		DisambiguatedEntity topicEntity = new DisambiguatedEntity();
		topicEntity.setEntityUri(uri);
		topicEntity.setCategories(RDFGraphOperations.getDbpediaCategoriesFromEntity(uri));
		topicEntity.setType(filterStandardDomain(RDFGraphOperations.getRDFTypesFromEntity(uri)));
		assignLabel(topicEntity, uri, lang);
		return topicEntity;
	}

	/**
	 * Queries the entity annotation service for every paragraph and counts how
	 * often each entity occurs. Every occurrence contributes its offset to the
	 * entity object that is stored as map key.
	 *
	 * @param p
	 *            paragraphs to annotate; {@code null} yields an empty map
	 * @param lang
	 *            language of the paragraphs
	 * @return mutable map from entity to number of occurrences
	 */
	public Map<DisambiguatedEntity, Integer> createEntityMap(Set<Paragraph> p, Languages lang) {
		Map<DisambiguatedEntity, Integer> map = new HashMap<DisambiguatedEntity, Integer>();
		if (p == null) {
			return map;
		}
		for (Paragraph para : p) {
			JSONArray array = queryEntities(para.getContent(), lang);
			if (array == null) {
				continue;
			}
			for (int i = 0; i < array.length(); i++) {
				try {
					JSONObject obj = array.getJSONObject(i);
					String uri = normalizeUri(obj.getString("@URI"), lang);
					// Parsed once, up front: a malformed offset skips only
					// this annotation instead of aborting the whole run.
					int offset = Integer.parseInt(obj.getString("@offset"));
					DisambiguatedEntity entity = new DisambiguatedEntity();
					entity.setEntityUri(uri);
					assignLabel(entity, uri, lang);
					if (map.containsKey(entity)) {
						// Only the stored key is visible to callers, so the
						// offset goes there; the temporary object is dropped
						// and needs neither offset nor type lookup.
						addOffsetToStoredKey(map, entity, offset);
						map.put(entity, Integer.valueOf(map.get(entity).intValue() + 1));
					} else {
						// Finish populating the entity before it becomes a
						// map key.
						entity.addOffset(offset);
						entity.setType(filterStandardDomain(RDFGraphOperations.getRDFTypesFromEntity(uri)));
						map.put(entity, Integer.valueOf(1));
					}
				} catch (JSONException ex) {
					LOG.error("Error: ", ex);
				} catch (NumberFormatException ex) {
					LOG.error("Error: ", ex);
				}
			}
		}
		return map;
	}

	/**
	 * Adds an offset to the key object already stored in the map that equals
	 * the given entity.
	 */
	private void addOffsetToStoredKey(Map<DisambiguatedEntity, Integer> map, DisambiguatedEntity entity, int offset) {
		for (DisambiguatedEntity stored : map.keySet()) {
			if (stored.equals(entity)) {
				stored.addOffset(offset);
				return;
			}
		}
	}

	/** Tells whether the German services and conventions apply. */
	private boolean isGerman(Languages lang) {
		return Languages.german.equals(lang);
	}

	/**
	 * For German, re-decodes the URI as UTF-8 from its ISO-8859-1 bytes; other
	 * languages get the URI back unchanged.
	 */
	private String normalizeUri(String uri, Languages lang) {
		if (!isGerman(lang)) {
			return uri;
		}
		// NOTE(review): presumably repairs URIs whose UTF-8 bytes were decoded
		// as ISO-8859-1 upstream - confirm against the service client.
		return new String(uri.getBytes(LATIN_1), UTF_8);
	}

	/**
	 * Sets the entity text to the first DBpedia label of the URI. German
	 * entities without a label get one derived from the resource name.
	 */
	private void assignLabel(DisambiguatedEntity entity, String uri, Languages lang) {
		List<String> labels;
		if (isGerman(lang)) {
			labels = RDFGraphOperations.getDbPediaLabel_GER(uri);
			if (labels == null || labels.isEmpty()) {
				// Hack - Return always a label. Built locally so the list
				// returned by RDFGraphOperations is not modified.
				entity.setText(uri.replace(GERMAN_RESOURCE_PREFIX, "").replace('_', ' '));
				return;
			}
		} else {
			labels = RDFGraphOperations.getDbPediaLabel(uri);
		}
		if (labels != null && !labels.isEmpty()) {
			entity.setText(labels.get(0));
		}
	}

	/**
	 * Posts the text to the language-specific Spotlight endpoint and extracts
	 * the "Resources" array from the JSON response.
	 *
	 * @return the array of annotated resources, or {@code null} if the request
	 *         entity could not be built, the response was {@code null}, or the
	 *         response has no readable "Resources" array
	 */
	private JSONArray queryEntities(String text, Languages lang) {
		boolean german = isGerman(lang);
		List<NameValuePair> postParameters = new ArrayList<NameValuePair>();
		postParameters.add(new BasicNameValuePair("text", text));
		postParameters.add(new BasicNameValuePair("confidence", german ? "0.70" : "0.2"));
		postParameters.add(new BasicNameValuePair("support", "20"));
		String serviceUrl = german ? Properties.getInstance().getDBpediaSpotLight_Ger_Rest()
				: Properties.getInstance().getDBpediaSpotLight_En_Rest();
		Header[] headers = { new BasicHeader("Accept", "application/json") };

		UrlEncodedFormEntity ent;
		try {
			// NOTE(review): this constructor uses the HTTP default charset;
			// consider passing "UTF-8" explicitly once the endpoints are
			// confirmed to honour it.
			ent = new UrlEncodedFormEntity(postParameters);
		} catch (UnsupportedEncodingException e1) {
			LOG.error("Error:", e1);
			return null;
		}

		String resStr = ServiceQueries.httpPostRequest(serviceUrl, ent, headers);
		if (resStr == null) {
			// Treated like a response without resources.
			return null;
		}
		try {
			return new JSONObject(resStr).getJSONArray("Resources");
		} catch (JSONException e) {
			LOG.info("No Ressources found");
			return null;
		}
	}

	/**
	 * Maps a set of RDF types to one of the coarse domains "Person",
	 * "Organization" or "Location"; the first type in iteration order that
	 * matches a known URI decides.
	 *
	 * @param set
	 *            RDF types of an entity, may be {@code null}
	 * @return the domain name, or "Misc" if no type matches
	 */
	private String filterStandardDomain(Set<Type> set) {
		if (set == null) {
			return DEFAULT_DOMAIN;
		}
		for (Type t : set) {
			if (t == null) {
				continue;
			}
			String typeUri = t.getUri();
			// Constant-first comparison tolerates a null type URI.
			if (PERSON_TYPE_URI.equalsIgnoreCase(typeUri)) {
				return "Person";
			}
			if (ORGANISATION_TYPE_URI.equalsIgnoreCase(typeUri)) {
				return "Organization";
			}
			if (LOCATION_TYPE_URI.equalsIgnoreCase(typeUri)) {
				return "Location";
			}
		}
		return DEFAULT_DOMAIN;
	}
}