package doser.entitydisambiguation.knowledgebases;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.log4j.Logger;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import doser.entitydisambiguation.properties.Properties;
import doser.lucene.features.IEntityCentricExtFeatures;
public class EnCenKBCStable extends EntityCentricKnowledgeBase {
/**
* This hashmap stores the Sense Prior values of the table computer science
* index
* <p>
* <li>
* - Key: Lucene intern document id <br>
* - Value: HashMap storing the label appearances <br>
*
* HashMap2:<br>
* - Key: Hash value of the appearing label <br>
* - Value: Number of occurrences of this label</li>
*/
private static Map<Integer, HashMap<Integer, Integer>> cstableindexsensePriorHashMap;
public EnCenKBCStable(String uri, boolean dynamic, Similarity sim) {
super(uri, dynamic, sim);
this.externFeatureDef = new ECCSTableExternFeatures();
}
@Override
public void initialize() {
cstableindexsensePriorHashMap = new HashMap<Integer, HashMap<Integer, Integer>>();
// cstableindexrelationContextMap = new HashMap<Integer,
// HashMap<Integer, Integer>>();
final File file = new File(Properties.getInstance().getCSTableIndex());
try {
final Directory dir = FSDirectory.open(file);
final IndexReader iReader = DirectoryReader.open(dir);
final int maxDoc = iReader.numDocs();
for (int i = 0; i < maxDoc; i++) {
String val = iReader.document(i).get("occurrences");
if ((i % 50000) == 0) {
Logger.getRootLogger().info("Loaded Entities: " + i);
}
if ((val != null) && !val.equalsIgnoreCase("")) {
final String[] splitter = val.split(TRIMLABELAMOUNT);
final HashMap<Integer, Integer> hash = new HashMap<Integer, Integer>();
for (final String element : splitter) {
final String[] value = element.split(TRIMOCCOCC);
int check = 1;
try {
check = Integer.valueOf(value[1]);
} catch (final NumberFormatException e) {
Logger.getRootLogger().error(e.getStackTrace());
}
hash.put(value[0].hashCode(), check);
}
cstableindexsensePriorHashMap.put(i, hash);
}
val = iReader.document(i).get("surroundinglabels");
if ((val != null) && !val.equalsIgnoreCase("")) {
final String[] splitter = val.split(TRIMLABELAMOUNT);
final HashMap<Integer, Integer> hash = new HashMap<Integer, Integer>();
for (final String element : splitter) {
final String[] value = element.split(TRIMOCCOCC);
int check = 1;
try {
check = Integer.valueOf(value[1]);
} catch (final NumberFormatException e) {
Logger.getRootLogger().error(e.getStackTrace());
}
hash.put(value[0].hashCode(), check);
}
// cstableindexrelationContextMap.put(i, hash);
}
}
} catch (final IOException e) {
Logger.getRootLogger().error(e.getStackTrace());
}
}
private class ECCSTableExternFeatures implements IEntityCentricExtFeatures {
@Override
public float getPriorOfDocument(int docId) {
return 0;
}
@Override
public float getSensePriorOfDocument(String keyword, int docId) {
float res = 0.0f;
if (cstableindexsensePriorHashMap.containsKey(docId)) {
final HashMap<Integer, Integer> hash = cstableindexsensePriorHashMap
.get(docId);
if (hash.containsKey(keyword.toLowerCase(Locale.US).hashCode())) {
final int value = hash.get(keyword.toLowerCase(Locale.US)
.hashCode());
final float prior = (float) Math.log(value + 1);
res = prior;
}
}
return res;
}
@Override
public int getOccurrences(String sf, String uri) {
// TODO Auto-generated method stub
return 0;
}
@Override
public Set<String> getRelations(String url) {
return new HashSet<String>();
}
}
}