package it.acubelab.smaph.linkback;
import it.unipi.di.acube.batframework.data.*;
import it.unipi.di.acube.batframework.utils.WikipediaApiInterface;
import it.acubelab.smaph.SmaphUtils;
import java.io.IOException;
import java.util.*;
import org.apache.commons.lang3.tuple.ImmutableTriple;
import org.apache.commons.lang3.tuple.Triple;
/**
 * Baseline {@link LinkBack} implementation.
 *
 * <p>For every entity it keeps the single bold that is closest (by
 * {@code SmaphUtils.getMinEditDist}) to the query, ranks the resulting
 * &lt;bold, entity&gt; pairs by increasing edit distance and then greedily
 * turns them into annotations until every query token is covered or no
 * candidate is left.
 */
public class BaselineLinkBack implements LinkBack {
    /**
     * Wikipedia API handle supplied by the caller. {@link #linkBack} does not
     * consult it; the field and constructor are kept so existing callers keep
     * compiling.
     */
    WikipediaApiInterface wikiApi;

    /**
     * @param api Wikipedia API handle; stored as-is.
     */
    public BaselineLinkBack(WikipediaApiInterface api) {
        this.wikiApi = api;
    }

    /**
     * Orders triples by their left component (the edit distance), ascending.
     * Uses {@link Double#compare(double, double)} so that the ordering stays a
     * valid total order even if a distance is NaN.
     */
    private static class CompareTripleByScore implements
            Comparator<Triple<Double, String[], Tag>> {
        @Override
        public int compare(Triple<Double, String[], Tag> o1,
                Triple<Double, String[], Tag> o2) {
            return Double.compare(o1.getLeft(), o2.getLeft());
        }
    }

    /**
     * Builds the annotations for a query from the bolds associated with each
     * candidate entity.
     *
     * @param query
     *            the query text.
     * @param boldsToEntities
     *            maps each group of bolds to the entity they point to.
     * @return the selected annotations; each one spans from the first to the
     *         last newly covered query token of its candidate and carries a
     *         score of 1. Never {@code null}.
     */
    @Override
    public HashSet<ScoredAnnotation> linkBack(String query,
            HashMap<String[], Tag> boldsToEntities) {
        // If more than one bold points to the same entity, keep the bold with
        // smallest edit distance (ties go to the bold listed last).
        HashMap<String, Tag> boldToEntities = new HashMap<>();
        for (Map.Entry<String[], Tag> entry : boldsToEntities.entrySet()) {
            String bestBold = null;
            double bestDistance = Double.MAX_VALUE;
            for (String bold : entry.getKey()) {
                double minED = SmaphUtils.getMinEditDist(query, bold);
                if (minED <= bestDistance) {
                    bestBold = bold;
                    bestDistance = minED;
                }
            }
            // An entity with no usable bold has nothing to link back; skipping
            // it avoids storing a null key that would later be passed to
            // SmaphUtils.getMinEditDist.
            if (bestBold != null) {
                boldToEntities.put(bestBold, entry.getValue());
            }
        }

        // Compute, for each <bold, entity> pair, the list of covered query
        // keywords.
        List<Triple<Double, String[], Tag>> edAndCoveredTokensAndEntity = new ArrayList<>();
        for (Map.Entry<String, Tag> entry : boldToEntities.entrySet()) {
            List<String> coveredTokens = new ArrayList<>();
            double minED = SmaphUtils.getMinEditDist(query, entry.getKey(),
                    coveredTokens);
            edAndCoveredTokensAndEntity
                    .add(new ImmutableTriple<Double, String[], Tag>(minED,
                            coveredTokens.toArray(new String[0]),
                            entry.getValue()));
        }

        // Order by increasing edit distance.
        Collections.sort(edAndCoveredTokensAndEntity,
                new CompareTripleByScore());

        // Take the list of query tokens; bind them to their positions in the
        // query.
        List<String> tokens = SmaphUtils.tokenize(query);
        int[] tokenPositions = locateTokens(query, tokens);

        HashSet<Integer> toCover = new HashSet<>();
        for (int i = 0; i < tokens.size(); i++) {
            toCover.add(i);
        }

        // Starting from the candidate with minimum edit distance, select
        // annotations until all query tokens are covered or there are no
        // entities left.
        HashSet<ScoredAnnotation> result = new HashSet<>();
        for (int i = 0; i < edAndCoveredTokensAndEntity.size()
                && !toCover.isEmpty(); i++) {
            Triple<Double, String[], Tag> t = edAndCoveredTokensAndEntity
                    .get(i);
            int minPos = -1;
            int maxPos = -1;
            for (String token : t.getMiddle()) {
                int pos = tokens.indexOf(token);
                // toCover only holds valid indices, so this also rejects
                // tokens that are not part of the query (pos == -1).
                if (!toCover.contains(pos)) {
                    continue;
                }
                if (minPos == -1 || pos < minPos) {
                    minPos = pos;
                }
                if (maxPos == -1 || pos > maxPos) {
                    maxPos = pos;
                }
            }
            if (minPos == -1) {
                // This candidate covers nothing that is still uncovered.
                continue;
            }
            int start = tokenPositions[minPos];
            int lastTokenStart = tokenPositions[maxPos];
            if (start < 0 || lastTokenStart < 0) {
                // The token text could not be located in the query, so no
                // valid character span exists for this candidate.
                continue;
            }
            // The annotation spans every token between the first and the last
            // newly covered one, so all of them are considered covered.
            for (int j = minPos; j <= maxPos; j++) {
                toCover.remove(j);
            }
            int end = lastTokenStart + tokens.get(maxPos).length();
            result.add(new ScoredAnnotation(start, end - start, t
                    .getRight().getConcept(), 1));
        }
        return result;
    }

    /**
     * Finds the character offset of each token in the lower-cased query,
     * scanning left to right.
     *
     * <p>Each search resumes after the end of the previously located token, so
     * a token that repeats (or is a prefix of) the preceding one is bound to
     * its own occurrence rather than to the earlier one.
     *
     * @return one offset per token, or -1 for a token that was not found.
     */
    private static int[] locateTokens(String query, List<String> tokens) {
        String loweredQuery = query.toLowerCase();
        int[] positions = new int[tokens.size()];
        int searchFrom = 0;
        for (int i = 0; i < positions.length; i++) {
            String token = tokens.get(i);
            int found = loweredQuery.indexOf(token, searchFrom);
            positions[i] = found;
            if (found >= 0) {
                searchFrom = found + token.length();
            }
        }
        return positions;
    }
}