package doser.entitydisambiguation.knowledgebases;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.similarities.Similarity;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import doser.lucene.features.IEntityCentricExtFeatures;
/**
* Default Entity Centric Knowledge base class. All features are implemented
* default wise. If an implementation of an individual EnCenExtFeatures is
* required, this class can be inherited.
*
* If this class is inherited and the subclass integrates a new EnCenExtFeatures
* class definition, the subclass should also overwrite the
* getFeatureDefinition() method to return the correct instance of the new
* EnCenExtFeatures class.
*
* @author quhfus
*
*/
public class EntityCentricKnowledgeBase extends AbstractKnowledgeBase {
private final static Logger logger = LoggerFactory.getLogger(EntityCentricKnowledgeBase.class);
protected static final String TRIMLABELAMOUNT = ";;;";
protected static final String TRIMOCCOCC = ":::";
protected static final String KBOCCURRENCESFIELD = "Occurrences";
protected static final String KBMAINLINK = "Mainlink";
/**
* This hashmap stores the Prior values of the standard DbPedia, CalbC and
* eHealth index. Key: Lucene intern document id Value: Amount of
* occurrences of this entity
*/
protected Map<String, Integer> indexpriorHashMap;
/**
* This hashmap stores the Sense Prior values of the standard DbPedia, CalbC
* and eHealth index.
* <p>
* <li>- Key: Lucene intern document id <br>
* - Value: HashMap storing the label appearances <br>
*
* HashMap2:<br>
* - Key: Hash value of the appearing label <br>
* - Value: Number of occurrences of this label</li>
*/
protected Map<Integer, HashMap<Integer, Integer>> indexsensePriorHashMap;
protected Map<String, HashMap<Integer, Integer>> indexsensePriorHashMapBlanc;
/**
* This map stores the relations the entities can be associated with. Key:
* The source entity uri Value: HashSet of other entities that form a binary
* relation with the source entity.
*/
protected Map<Integer, HashSet<String>> indexRelation;
protected IEntityCentricExtFeatures externFeatureDef;
public EntityCentricKnowledgeBase(String uri, boolean dynamic, Similarity sim) {
super(uri, dynamic, sim);
this.externFeatureDef = new ECExternFeatures();
}
public EntityCentricKnowledgeBase(String uri, boolean dynamic) {
super(uri, dynamic);
this.externFeatureDef = new ECExternFeatures();
}
/**
* Returns the feature definition class.
*
* @return
*/
public IEntityCentricExtFeatures getFeatureDefinition() {
return this.externFeatureDef;
}
@Override
public void initialize() {
indexpriorHashMap = new HashMap<String, Integer>();
indexsensePriorHashMap = new HashMap<Integer, HashMap<Integer, Integer>>();
indexRelation = new HashMap<Integer, HashSet<String>>();
indexsensePriorHashMapBlanc = new HashMap<String, HashMap<Integer, Integer>>();
try {
final IndexReader iReader = super.getSearcher().getIndexReader();
final int maxDoc = iReader.numDocs();
for (int i = 0; i < maxDoc; i++) {
if ((i % 50000) == 0) {
logger.info(this.kbName()+ " Loaded Entities: " + i);
}
final String val = iReader.document(i).get(KBOCCURRENCESFIELD);
String entity = iReader.document(i).get("Mainlink").replaceAll("http://dbpedia.org/resource/", "");
if ((val != null) && !val.equalsIgnoreCase("")) {
final String[] splitter = val.split(TRIMLABELAMOUNT);
final HashMap<Integer, Integer> hash = new HashMap<Integer, Integer>();
for (final String element : splitter) {
final String[] value = element.split(TRIMOCCOCC);
int check = 1;
try {
check = Integer.valueOf(value[1]);
} catch (final NumberFormatException e) {
logger.warn(this.kbName()+ " Warning NumberFormatException while Initialization:" + val);
}
hash.put(value[0].toLowerCase(Locale.US).hashCode(), check);
}
indexsensePriorHashMapBlanc.put(entity, hash);
}
}
iReader.close();
} catch (final IOException e) {
logger.error("IOException in "+EntityCentricKnowledgeBase.class.getName(), e);
}
}
protected String kbName() {
return "General KB";
}
class ECExternFeatures implements IEntityCentricExtFeatures {
private ECExternFeatures() {
super();
}
@Override
public float getPriorOfDocument(int docId) {
float res = 0.0f;
if (indexpriorHashMap.containsKey(docId)) {
final float prior = (float) Math.log(indexpriorHashMap.get(docId));
res = prior;
}
return res;
}
@Override
public float getSensePriorOfDocument(String keyword, int docId) {
float res = 0.0f;
if (indexsensePriorHashMap.containsKey(docId)) {
final HashMap<Integer, Integer> hash = indexsensePriorHashMap.get(docId);
if (hash.containsKey(keyword.toLowerCase(Locale.US).hashCode())) {
final int value = hash.get(keyword.toLowerCase(Locale.US).hashCode());
res = (float) Math.log(value + 1);
}
}
return res;
}
@Override
public int getOccurrences(String sf, String uri) {
String entity = uri.replaceAll("http://dbpedia.org/resource/", "");
int res = 0;
if (indexsensePriorHashMapBlanc.containsKey(entity)) {
final HashMap<Integer, Integer> hash = indexsensePriorHashMapBlanc.get(entity);
if (hash.containsKey(sf.toLowerCase().hashCode())) {
res = hash.get(sf.toLowerCase().hashCode());
}
}
return (res + 1);
}
@Override
public Set<String> getRelations(String url) {
if (indexRelation.containsKey(url.hashCode())) {
return indexRelation.get(url.hashCode());
}
return new HashSet<String>();
}
}
}