package doser.entitydisambiguation.knowledgebases;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.search.similarities.Similarity;
import org.codehaus.jettison.json.JSONArray;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import doser.entitydisambiguation.algorithms.AbstractDisambiguationAlgorithm;
import doser.entitydisambiguation.algorithms.SurfaceForm;
import doser.word2vec.Data;
import doser.word2vec.Doc2VecJsonFormat;
import doser.word2vec.Word2VecJsonFormat;
/**
 * Entity-centric knowledge base that adds cached word2vec and doc2vec
 * similarity lookups on top of {@link EntityCentricKnowledgeBase}.
 *
 * <p>
 * Both caches are static, i.e. shared by all instances of all subclasses.
 * Each holds at most 5000 entries and drops an entry 60 minutes after it was
 * written.
 */
public abstract class AbstractEntityCentricKBGeneral extends EntityCentricKnowledgeBase {

    private static final Logger logger = LoggerFactory.getLogger(AbstractEntityCentricKBGeneral.class);

    /**
     * Separates the parts of a doc2vec cache key so that different
     * (surface form, context, entity) triples cannot concatenate to the same
     * string.
     */
    private static final String KEY_SEPARATOR = "\u001F";

    /** word2vec similarities keyed by the query string; values are stored already shifted by +1. */
    private static final Cache<String, Float> w2vCache = CacheBuilder.newBuilder().maximumSize(5000)
            .expireAfterWrite(60, TimeUnit.MINUTES).build();

    /** Raw doc2vec similarities keyed by {@link #doc2VecKey(String, String, String)}. */
    private static final Cache<String, Float> d2vCache = CacheBuilder.newBuilder().maximumSize(5000)
            .expireAfterWrite(60, TimeUnit.MINUTES).build();

    public AbstractEntityCentricKBGeneral(String uri, boolean dynamic) {
        super(uri, dynamic);
    }

    public AbstractEntityCentricKBGeneral(String uri, boolean dynamic, Similarity sim) {
        super(uri, dynamic, sim);
    }

    /**
     * Given a set of word2vec queries, this method retrieves the
     * corresponding word2vec similarities. Queries that are not cached are
     * sent to the word2vec server in a single request and the answers are
     * cached afterwards.
     *
     * @param set
     *            A set of query strings
     * @return Returns a map containing the word2vec similarities of the given
     *         queries. A query the server did not answer has no entry.
     */
    public Map<String, Float> getWord2VecSimilarities(Set<String> set) {
        Map<String, Float> map = new HashMap<String, Float>();
        Set<String> neededSimilarities = new HashSet<String>();
        for (String s : set) {
            Float val = w2vCache.getIfPresent(s);
            if (val != null) {
                map.put(s, val);
            } else {
                neededSimilarities.add(s);
            }
        }
        if (!neededSimilarities.isEmpty()) {
            Map<String, Float> computedSimilarities = queryWord2VecSimilarities(neededSimilarities);
            w2vCache.putAll(computedSimilarities);
            map.putAll(computedSimilarities);
        }
        return map;
    }

    /**
     * Looks up a doc2vec similarity that was stored earlier by
     * {@link #precomputeDoc2VecSimilarities(List, int)}. This method never
     * queries the server.
     *
     * @param sf
     *            the surface form string
     * @param context
     *            the context of the surface form. Entries are stored under the
     *            value of {@code SurfaceForm.getContext()}, so that same
     *            string has to be passed here to get a hit.
     * @param entity
     *            the candidate entity
     * @return the cached similarity plus 1, or 0 if nothing is cached for the
     *         given triple
     */
    public float getDoc2VecSimilarity(String sf, String context, String entity) {
        Float val = d2vCache.getIfPresent(doc2VecKey(sf, context, entity));
        if (val != null) {
            return val + 1.0f;
        } else {
            return 0;
        }
    }

    /**
     * Retrieves the word2vec similarities of a set of entity pairs
     *
     * @param neededSimilarities
     *            A set of entity pairs whose word2vec similarities are needed.
     *            The strings are already in the appropriate format
     * @return a map that contains the query strings as keys and the
     *         similarities (shifted by +1) as values
     */
    private Map<String, Float> queryWord2VecSimilarities(Set<String> neededSimilarities) {
        Word2VecJsonFormat format = new Word2VecJsonFormat();
        format.setData(neededSimilarities);
        format.setDomain(generateDomainName());
        JSONArray res = Word2VecJsonFormat.performquery(format, "w2vsim");
        Map<String, Float> map = new HashMap<String, Float>();
        // Defensive: how performquery reports a failed request is not visible
        // from this class; a null result is treated as "nothing computed".
        if (res == null) {
            logger.warn("word2vec service returned no result for {} queries", neededSimilarities.size());
            return map;
        }
        for (int i = 0; i < res.length(); i++) {
            try {
                JSONObject obj = res.getJSONObject(i);
                String ents = obj.getString("ents");
                float sim = (float) obj.getDouble("sim") + 1;
                map.put(ents, sim);
            } catch (JSONException e) {
                // A malformed element is logged and skipped; the rest is kept.
                logger.error("JSONException in "+AbstractEntityCentricKBGeneral.class.getName(), e);
            }
        }
        return map;
    }

    /**
     * Queries the doc2vec server for the similarities between every surface
     * form's context and each of its candidates and stores the answers in the
     * doc2vec cache, from where {@link #getDoc2VecSimilarity} reads them.
     *
     * <p>
     * All candidates of all surface forms are sent: the response is matched
     * back by position (i-th answer to i-th surface form, j-th similarity to
     * j-th candidate), so leaving out already cached candidates would shift
     * that mapping.
     *
     * @param rep
     *            the surface forms whose candidates should be scored
     * @param contextSize
     *            passed to
     *            {@code AbstractDisambiguationAlgorithm.extractContext} to cut
     *            the context that is sent to the server
     */
    public void precomputeDoc2VecSimilarities(List<SurfaceForm> rep, int contextSize) {
        Doc2VecJsonFormat format = new Doc2VecJsonFormat();
        for (SurfaceForm sf : rep) {
            String context = AbstractDisambiguationAlgorithm.extractContext(
                    sf.getPosition(), sf.getContext(), contextSize);
            context = context.toLowerCase();
            // Collapse runs of '.', ',', '!', '?' and blanks into one blank.
            context = context.replaceAll("[\\.\\,\\!\\? ]+", " ");
            List<String> candidates = sf.getCandidates();
            Data doc = new Data();
            // toArray returns the filled array; it must not be discarded.
            doc.setCandidates(candidates.toArray(new String[candidates.size()]));
            doc.setContext(context);
            doc.setSurfaceForm(sf.getSurfaceForm());
            // NOTE(review): result unused; call kept in case the getter has a
            // side effect on the query number - confirm and remove otherwise.
            doc.getQryNr();
            format.addData(doc);
        }
        JSONArray res = Word2VecJsonFormat.performquery(format, "d2vsim");
        // Defensive: see queryWord2VecSimilarities.
        if (res == null) {
            logger.warn("doc2vec service returned no result for {} surface forms", rep.size());
            return;
        }
        // We obtain the same order of surface forms. The bounds guard against
        // a response that is longer than the request.
        int answered = Math.min(res.length(), rep.size());
        for (int i = 0; i < answered; i++) {
            SurfaceForm c = rep.get(i);
            try {
                JSONObject obj = res.getJSONObject(i);
                JSONArray simArray = obj.getJSONArray("sim");
                List<String> candidates = c.getCandidates();
                int scored = Math.min(simArray.length(), candidates.size());
                for (int j = 0; j < scored; j++) {
                    float sim = (float) simArray.getDouble(j);
                    String entity = candidates.get(j);
                    // Stored under the surface form's full context, not the
                    // shortened one that was sent to the server.
                    d2vCache.put(doc2VecKey(c.getSurfaceForm(), c.getContext(), entity), sim);
                }
            } catch (JSONException e) {
                logger.error("JSONException in "+AbstractEntityCentricKBGeneral.class.getName(), e);
            }
        }
    }

    /**
     * Builds the doc2vec cache key. Every read and write of the doc2vec cache
     * goes through this method so that both sides always agree on the key.
     */
    private static String doc2VecKey(String surfaceForm, String context, String entity) {
        return surfaceForm + KEY_SEPARATOR + context + KEY_SEPARATOR + entity;
    }

    protected abstract String generateDomainName();

    public abstract String generateWord2VecFormatString(String source, String target);

    public abstract String generateWord2VecFormatString(List<String> source, String target);
}