package doser.entitydisambiguation.algorithms.rules; import java.io.IOException; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import org.apache.lucene.document.Document; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import doser.entitydisambiguation.algorithms.SurfaceForm; import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia; import doser.lucene.query.TermQuery; /** * Falls eine Surface Form eindeutig ist und weitere Surface Forms eine * Abkürzung darstellen, diese allerdings nicht eindeutig sind, wird dies sofort * aufgelöst. * * Beispiel: 1 Surface Form: Burlington Industries Inc (eindeutig) 2 Surface * Form: Burlington (ambiguous) ... * * * @author quh * */ class UnambiguousToAmbiguousRule extends AbstractRule { UnambiguousToAmbiguousRule(EntityCentricKBDBpedia eckb) { super(eckb); } @Override public boolean applyRule(List<SurfaceForm> rep) { List<SurfaceForm> unambiguous = new LinkedList<SurfaceForm>(); for (SurfaceForm c : rep) { if (c.getCandidates().size() == 1) { String candidate = c.getCandidates().get(0); String type = queryType(candidate); if (type.equalsIgnoreCase("Person") || type.equalsIgnoreCase("Organisation")) { unambiguous.add(c); } } } for (SurfaceForm c : rep) { if (c.getCandidates().size() > 1) { HashMap<String, Integer> map = new HashMap<String, Integer>(); for (SurfaceForm un : unambiguous) { String type = queryType(un.getCandidates().get(0)); if ((isSubString(un.getSurfaceForm(), c.getSurfaceForm()) && c.getCandidates().contains(un.getCandidates().get(0)) && un.getPosition() < c.getPosition()) || (type.equalsIgnoreCase("Person") && isSubString(un.getSurfaceForm(), c.getSurfaceForm()) && un.getPosition() < c.getPosition())) { map.put(un.getCandidates().get(0), c.getPosition() - un.getPosition()); // c.setDisambiguatedEntity(un.getCandidates().get(0)); } } if (!map.isEmpty()) { int distance = Integer.MAX_VALUE; String can = ""; for (Map.Entry<String, Integer> entry : map.entrySet()) { if (entry.getValue() < distance) { distance = entry.getValue(); can = entry.getKey(); } } c.setDisambiguatedEntity(can); } } } return false; } private boolean isSubString(String s1, String s2) { if (s1.toLowerCase().contains(s2.toLowerCase())) { return true; } else return false; } private String queryType(String url) { String type = ""; IndexSearcher searcher = eckb.getSearcher(); Query q = new TermQuery(new Term("Mainlink", url)); try { TopDocs docs = searcher.search(q, 1); ScoreDoc[] scoredocs = docs.scoreDocs; if(scoredocs.length == 0) { type = "Misc"; } else { int nr = scoredocs[0].doc; Document doc = searcher.getIndexReader().document(nr); type = doc.get("Type"); } } catch (IOException e) { e.printStackTrace(); } return type; } }