package doser.entitydisambiguation.algorithms.collective.dbpedia;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import doser.entitydisambiguation.algorithms.AbstractDisambiguationAlgorithm;
import doser.entitydisambiguation.algorithms.SurfaceForm;
import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia;
import doser.lucene.query.TermQuery;
/**
 * Chooses a disambiguated entity for surface forms whose candidate list
 * contains index documents of type "Location".
 *
 * <p>
 * The decision is based on the shape of the DBpedia resource name (plain name
 * versus "name,_qualifier"), on abbreviation heuristics for surface forms that
 * end with a dot, and on textual evidence for the qualifier in the surrounding
 * context.
 */
class LocationDisambiguation {

    private static final Logger LOGGER = Logger
            .getLogger(LocationDisambiguation.class);

    /**
     * Doc2Vec similarity above which a non-location candidate is taken to fit
     * the context; if any non-location candidate exceeds it, the location
     * heuristics are not applied.
     */
    private static final float DOC2VECTHRESHOLD = 1.37f;

    /** URI prefix stripped from a main link to obtain the resource name. */
    private static final String RESOURCE_PREFIX = "http://dbpedia.org/resource/";

    /** Separates the main name from its qualifier inside a resource name. */
    private static final String QUALIFIER_SEPARATOR = ",_";

    /** Value of the "Type" field that marks a location document. */
    private static final String LOCATION_TYPE = "Location";

    /** Upper bound on the number of hits fetched by the label query. */
    private static final int MAX_LABEL_HITS = 25000;

    private final EntityCentricKBDBpedia eckb;

    /**
     * @param eckb
     *            knowledge base providing the index searcher and the Doc2Vec
     *            similarity
     */
    public LocationDisambiguation(EntityCentricKBDBpedia eckb) {
        this.eckb = eckb;
    }

    /**
     * Runs the location heuristics on every surface form that has more than
     * one candidate and for which {@code isMatchesInitial()} is false.
     *
     * @param reps
     *            surface forms to process; resolved ones get their
     *            disambiguated entity set in place
     */
    void solve(List<SurfaceForm> reps) {
        for (SurfaceForm c : reps) {
            if (c.getCandidates().size() > 1 && !c.isMatchesInitial()) {
                disambiguate(c);
            }
        }
    }

    /**
     * Tries to resolve a single surface form to a location candidate and, on
     * success, stores the result via {@code setDisambiguatedEntity}.
     */
    private void disambiguate(SurfaceForm c) {
        String context = AbstractDisambiguationAlgorithm.extractContext(
                c.getPosition(), c.getContext(),
                CollectiveAndContextDriver.PREPROCESSINGCONTEXTSIZE);
        List<String> candidates = c.getCandidates();
        String surfaceForm = c.getSurfaceForm();

        Set<Document> sfDocuments = queryLuceneLabel(surfaceForm, candidates);
        removeUnusedDocs(sfDocuments, candidates);
        Set<Document> nonLocations = checkForLocation(sfDocuments);

        // Nothing to do unless at least one remaining document is a location.
        if (nonLocations.size() >= sfDocuments.size()) {
            return;
        }
        if (!isLocation(nonLocations, c)) {
            return;
        }
        String entity = solveLocations(sfDocuments, candidates, surfaceForm,
                context);
        if (entity != null) {
            c.setDisambiguatedEntity(entity);
        }
    }

    /**
     * Filters the location documents whose resource name is compatible with
     * the surface form and picks the final entity among them.
     *
     * @param relevantEntities
     *            documents retrieved for the surface form
     * @param allRelevantEntities
     *            candidate list of the surface form (not used here)
     * @param surfaceForm
     *            surface form text; compared in lower case
     * @param context
     *            context window searched for qualifier evidence
     * @return main link of the chosen entity, or {@code null} if none fits
     */
    private String solveLocations(Set<Document> relevantEntities,
            List<String> allRelevantEntities, String surfaceForm, String context) {
        List<String> matching = new ArrayList<String>();
        surfaceForm = surfaceForm.toLowerCase(Locale.ROOT);
        int spacesInSurfaceForm = countChar(surfaceForm, ' ');
        String surfaceFormNoDots = surfaceForm.replace(".", "");

        for (Document d : relevantEntities) {
            // Document.get returns null for a missing field, so compare
            // constant-first and skip documents without a main link.
            if (!LOCATION_TYPE.equals(d.get("Type"))) {
                continue;
            }
            String mainlink = d.get("Mainlink");
            if (mainlink == null) {
                continue;
            }
            String resource = resourceName(mainlink);
            if (resource.contains(QUALIFIER_SEPARATOR)) {
                String addition = qualifierOf(resource);
                if (addition == null) {
                    // Malformed name such as "foo,_": no usable qualifier.
                    continue;
                }
                String first = resource.split(QUALIFIER_SEPARATOR)[0];
                // Words of the main name are underscore-separated, words of
                // the surface form are space-separated; both must have the
                // same word count.
                if (!addition.equals(surfaceForm)
                        && !checkAdditionAbb(surfaceForm, addition, first)
                        && countChar(first, '_') == spacesInSurfaceForm) {
                    matching.add(mainlink);
                }
            } else {
                String label = resource.replace('_', ' ');
                if (surfaceForm.equals(label)
                        || (surfaceForm.endsWith(".") && label
                                .contains(surfaceFormNoDots))
                        || checkFirstURLPart(surfaceForm, label)) {
                    matching.add(mainlink);
                }
            }
        }
        return solveFinalCandidates(matching, surfaceForm, context);
    }

    /**
     * Tells whether a dotted surface form spells the initials of the given
     * space-separated name (for example "u.s." for "united states").
     */
    private boolean checkFirstURLPart(String sf, String urlpart) {
        if (!sf.contains(".")) {
            return false;
        }
        String abbreviation = initials(urlpart, "");
        return !abbreviation.isEmpty()
                && abbreviation.equals(sf.replace(".", ""));
    }

    /**
     * Picks the final entity: first a qualified candidate whose qualifier is
     * supported by the context, otherwise the first unqualified candidate.
     *
     * @return the chosen main link, or {@code null} if neither pass matches
     */
    private String solveFinalCandidates(List<String> candidates, String sf,
            String context) {
        for (String can : candidates) {
            String resource = resourceName(can);
            if (resource.contains(QUALIFIER_SEPARATOR)) {
                String addition = qualifierOf(resource);
                if (addition != null
                        && searchEvidenceInContext(context, addition, sf)) {
                    return can;
                }
            }
        }
        for (String can : candidates) {
            if (!resourceName(can).contains(QUALIFIER_SEPARATOR)) {
                return can;
            }
        }
        return null;
    }

    /**
     * Decides whether a qualified candidate must be rejected because the
     * surface form is an abbreviation (ends with a dot) that does not belong
     * to the candidate's main name.
     *
     * @param sf
     *            lower-cased surface form
     * @param addition
     *            qualifier of the resource name, words separated by spaces
     * @param first
     *            main name of the resource, as it appears before the
     *            separator
     * @return {@code true} if the candidate should be rejected
     */
    private boolean checkAdditionAbb(String sf, String addition, String first) {
        if (!sf.endsWith(".")) {
            return false;
        }
        String abbreviation = sf.replace(".", "");
        // The abbreviation refers to the qualifier, not to the place itself.
        if (addition.contains(abbreviation)) {
            return true;
        }
        // The abbreviation is contained in the main name: keep the candidate.
        // (The original compared against the literal text "sf" here.)
        if (first.contains(abbreviation) && first.length() > 1) {
            return false;
        }
        return !initials(first, "").equals(abbreviation);
    }

    /**
     * Looks for evidence of a qualifier in the context, either verbatim or as
     * a dotted abbreviation longer than three characters that is a prefix or
     * suffix of the qualifier.
     *
     * @param context
     *            text surrounding the surface form; {@code null} yields
     *            {@code false}
     * @param word
     *            lower-case qualifier to look for
     * @param sf
     *            lower-cased surface form
     * @return {@code true} if the context supports the qualifier
     */
    private boolean searchEvidenceInContext(String context, String word,
            String sf) {
        if (context == null || sf.equals(word)) {
            return false;
        }
        // The surface form itself abbreviates the qualifier, so finding the
        // qualifier in the context would not be independent evidence.
        String sfAbb = sf.replaceAll("[^\\w]", " ");
        if (initials(word, " ").equals(sfAbb)) {
            return false;
        }
        String lowered = context.toLowerCase(Locale.ROOT);
        if (lowered.contains(word)) {
            return true;
        }
        String[] words = lowered.trim().replaceAll(" +", " ").split(" ");
        for (String token : words) {
            String stripped = token.replaceAll("[^\\w\\s]", "");
            if (token.length() > 3
                    && token.equals(stripped + ".")
                    && (word.startsWith(stripped) || word.endsWith(stripped))) {
                LOGGER.debug("Context adaptiert: " + token);
                return true;
            }
        }
        return false;
    }

    /**
     * Checks that no non-location candidate has a Doc2Vec similarity to the
     * surface form's context above {@link #DOC2VECTHRESHOLD}.
     *
     * @return {@code false} as soon as one non-location candidate exceeds the
     *         threshold, {@code true} otherwise
     */
    private boolean isLocation(Set<Document> nonLocationSet, SurfaceForm sf) {
        for (Document doc : nonLocationSet) {
            String mainlink = doc.get("Mainlink");
            float docSim = this.eckb.getDoc2VecSimilarity(sf.getSurfaceForm(),
                    sf.getContext(), mainlink);
            if (docSim > DOC2VECTHRESHOLD) {
                return false;
            }
        }
        return true;
    }

    /**
     * Removes, in place, every document whose main link is not one of the
     * candidates.
     */
    private void removeUnusedDocs(Set<Document> set, List<String> candidates) {
        // Hash lookup instead of a list scan per retrieved document.
        Set<String> wanted = new HashSet<String>(candidates);
        for (Iterator<Document> iterator = set.iterator(); iterator.hasNext();) {
            Document d = iterator.next();
            if (!wanted.contains(d.get("Mainlink"))) {
                iterator.remove();
            }
        }
    }

    /**
     * Collects the documents whose "Type" field is not "Location"; documents
     * without a "Type" field are treated as non-locations.
     */
    private Set<Document> checkForLocation(Set<Document> set) {
        Set<Document> nonLocations = new HashSet<Document>();
        for (Document d : set) {
            if (!LOCATION_TYPE.equals(d.get("Type"))) {
                nonLocations.add(d);
            }
        }
        return nonLocations;
    }

    /**
     * Retrieves the index documents whose "Label" field contains every
     * space-separated token of the lower-cased surface form.
     *
     * @param surfaceForm
     *            surface form text
     * @param candidates
     *            candidate list (not used by the query itself)
     * @return the matching documents; on an I/O error the error is logged and
     *         the documents collected so far are returned
     */
    private Set<Document> queryLuceneLabel(String surfaceForm,
            List<String> candidates) {
        Set<Document> documents = new HashSet<Document>();
        BooleanQuery query = new BooleanQuery();
        boolean hasClause = false;
        for (String token : surfaceForm.toLowerCase(Locale.ROOT).split(" ")) {
            // Leading or doubled spaces yield empty tokens; a MUST clause on
            // an empty term would only make the query unsatisfiable.
            if (!token.isEmpty()) {
                query.add(new TermQuery(new Term("Label", token)), Occur.MUST);
                hasClause = true;
            }
        }
        if (!hasClause) {
            return documents;
        }
        final IndexSearcher searcher = eckb.getSearcher();
        final IndexReader reader = searcher.getIndexReader();
        try {
            final TopDocs top = searcher.search(query, MAX_LABEL_HITS);
            for (ScoreDoc hit : top.scoreDocs) {
                documents.add(reader.document(hit.doc));
            }
        } catch (IOException e) {
            LOGGER.error("Lucene Searcher Error: ", e);
        }
        return documents;
    }

    /** Lower-cases a main link and strips the DBpedia resource prefix. */
    private static String resourceName(String mainlink) {
        return mainlink.toLowerCase(Locale.ROOT).replace(RESOURCE_PREFIX, "");
    }

    /**
     * Returns the qualifier that follows the first separator of a resource
     * name, with underscores turned into spaces, or {@code null} if the name
     * has no non-empty qualifier.
     */
    private static String qualifierOf(String resource) {
        // split drops trailing empty strings, so "foo,_" has a single part.
        String[] parts = resource.split(QUALIFIER_SEPARATOR);
        if (parts.length < 2 || parts[1].isEmpty()) {
            return null;
        }
        return parts[1].replace('_', ' ');
    }

    /**
     * Concatenates the first character of every non-empty space-separated
     * token of {@code phrase}, each followed by {@code suffix}.
     */
    private static String initials(String phrase, String suffix) {
        StringBuilder builder = new StringBuilder();
        for (String token : phrase.split(" ")) {
            if (!token.isEmpty()) {
                builder.append(token.charAt(0)).append(suffix);
            }
        }
        return builder.toString();
    }

    /** Counts the occurrences of {@code wanted} in {@code text}. */
    private static int countChar(String text, char wanted) {
        int count = 0;
        for (int i = 0; i < text.length(); i++) {
            if (text.charAt(i) == wanted) {
                count++;
            }
        }
        return count;
    }
}