package doser.entitydisambiguation.algorithms.rules;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import doser.entitydisambiguation.algorithms.SurfaceForm;
import doser.entitydisambiguation.knowledgebases.AbstractKnowledgeBase;
import doser.lucene.features.LuceneFeatures;
import doser.lucene.query.LearnToRankClause;
import doser.lucene.query.LearnToRankQuery;
import doser.tools.Inflector;
/**
 * Expansion rule for surface forms that have no candidates.
 * <p>
 * If a surface form without candidates consists of at least three
 * space-separated words, all words with three or fewer characters are removed
 * and the remaining text is queried again. If that query yields nothing, it is
 * repeated after special characters have been removed, and finally with the
 * singularized form. The candidates found this way are set on the surface
 * form.
 *
 * @author quh
 */
class NoCandidatesExpansionRules extends AbstractRule {

    /** A surface form is only expanded if it has more than this many words. */
    private static final int MIN_WORDS_EXCLUSIVE = 2;

    /** Words with at most this many characters are dropped before re-querying. */
    private static final int MAX_SHORT_WORD_LENGTH = 3;

    /** Number of top hits requested from the index searcher. */
    private static final int MAX_REQUESTED_HITS = 150;

    /** A query result is only accepted if it has at most this many hits. */
    private static final int MAX_ACCEPTED_HITS = 5;

    NoCandidatesExpansionRules(AbstractKnowledgeBase eckb) {
        super(eckb);
    }

    /**
     * Sets expanded candidates on every surface form that currently has none.
     *
     * @param rep the surface forms to inspect; forms that already have
     *            candidates are left untouched
     * @return always {@code false}; the meaning of the return value is defined
     *         by {@code AbstractRule}, which is not visible in this file
     */
    @Override
    public boolean applyRule(List<SurfaceForm> rep) {
        for (SurfaceForm c : rep) {
            if (c.getCandidates().size() == 0) {
                c.setCandidates(queryCandidates(c.getSurfaceForm()));
            }
        }
        return false;
    }

    /**
     * Looks up candidates for a relaxed version of the given surface form.
     * <p>
     * The lookup proceeds in stages and stops at the first stage that returns
     * at least one candidate:
     * <ol>
     * <li>the surface form with all short words removed,</li>
     * <li>that text with every character other than ASCII letters and spaces
     * removed,</li>
     * <li>the singularized text, if it differs from the text of stage 2.</li>
     * </ol>
     *
     * @param surfaceForm the surface form text; may be {@code null}
     * @return the candidates found, or an empty list if the surface form is
     *         {@code null}, has fewer than three words, consists only of short
     *         words, or no stage produced a result
     */
    private ArrayList<String> queryCandidates(String surfaceForm) {
        ArrayList<String> lst = new ArrayList<String>();
        // Also terminates the recursion below if singularize() yields null.
        if (surfaceForm == null) {
            return lst;
        }
        String[] words = surfaceForm.split(" ");
        if (words.length <= MIN_WORDS_EXCLUSIVE) {
            return lst;
        }

        // Keep only the longer words, joined by single spaces.
        StringBuilder builder = new StringBuilder();
        for (String word : words) {
            if (word.length() > MAX_SHORT_WORD_LENGTH) {
                if (builder.length() > 0) {
                    builder.append(' ');
                }
                builder.append(word);
            }
        }
        if (builder.length() == 0) {
            return lst;
        }

        // Stage 1: query the filtered text. Previously the unfiltered surface
        // form was queried here, so the filtered text was never queried in
        // its pre-strip form.
        String newSf = builder.toString();
        lst = queryLucene(newSf);
        if (!lst.isEmpty()) {
            return lst;
        }

        // Stage 2: try again without special chars.
        newSf = newSf.replaceAll("[^a-zA-Z ]", "");
        lst = queryLucene(newSf);
        if (!lst.isEmpty()) {
            return lst;
        }

        // Stage 3: if still nothing was found, check plural to singular.
        String singular = Inflector.getInstance().singularize(newSf);
        if (!newSf.equalsIgnoreCase(singular)) {
            // NOTE(review): this re-runs the whole procedure, so it finds
            // nothing when fewer than three words survived the filtering
            // above - confirm whether a direct index query is intended.
            lst = queryCandidates(singular);
        }
        return lst;
    }

    /**
     * Queries the knowledge base index for documents whose
     * {@code "UniqueLabel"} field matches the given text.
     * <p>
     * The result is only used if the search returns at most
     * {@value #MAX_ACCEPTED_HITS} hits; larger result sets are discarded
     * (presumably because they are too ambiguous - TODO confirm).
     *
     * @param surfaceForm the text to search for
     * @return the {@code "Mainlink"} field values of the matching documents,
     *         or an empty list if there are no hits or too many hits
     */
    private ArrayList<String> queryLucene(String surfaceForm) {
        ArrayList<String> list = new ArrayList<String>();
        final IndexSearcher searcher = eckb.getSearcher();
        final IndexReader reader = searcher.getIndexReader();

        LearnToRankQuery query = new LearnToRankQuery();
        DefaultSimilarity defaultSim = new DefaultSimilarity();
        query.add(LuceneFeatures.queryLabelTerm(surfaceForm, "UniqueLabel",
                defaultSim), "Feature1", true);

        try {
            final TopDocs top = searcher.search(query, MAX_REQUESTED_HITS);
            final ScoreDoc[] score = top.scoreDocs;
            if (score.length <= MAX_ACCEPTED_HITS) {
                for (ScoreDoc hit : score) {
                    final Document doc = reader.document(hit.doc);
                    list.add(doc.get("Mainlink"));
                }
            }
        } catch (IOException e) {
            // Best effort: report the failure and return whatever has been
            // collected so far instead of aborting the rule.
            e.printStackTrace();
        }
        return list;
    }
}