package doser.entitydisambiguation.algorithms.rules;

import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

import doser.entitydisambiguation.algorithms.SurfaceForm;
import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia;
import doser.lucene.query.TermQuery;

/**
 * Rule that narrows the candidate list of ambiguous surface forms by
 * comparing each candidate against the entities of surface forms that are
 * already unambiguous.
 *
 * <p>
 * The rule only fires when the input holds more than
 * {@link #MINIMUMSURFACEFORMS} surface forms and at least
 * {@link #MINDISAMBIGUATEDSURFACEFORMS} of them are initial surface forms with
 * exactly one candidate. For every remaining ambiguous surface form, a
 * candidate is kept when its word2vec similarity to that context exceeds a
 * threshold, or when its DBpedia resource name is within an edit distance of
 * two of the surface form text. The candidate list is only replaced when at
 * least one kept candidate passed the similarity threshold.
 */
class ContextRule extends AbstractRule {

    private static final int MINDISAMBIGUATEDSURFACEFORMS = 2;

    private static final int MINIMUMSURFACEFORMS = 10;

    private static final float SIMILARITYTHRESHOLD = 1.57f;

    /** Lower similarity threshold applied to candidates whose index type is "Misc". */
    private static final float SIMILARITYTHRESHOLDMISC = 1.53f;

    /** Maximum edit distance at which a candidate name counts as matching the surface form. */
    private static final int MAXLEVENSHTEINDISTANCE = 2;

    private static final String DBPEDIARESOURCEPREFIX = "http://dbpedia.org/resource/";

    private final EntityCentricKBDBpedia eckb;

    ContextRule(EntityCentricKBDBpedia eckb) {
        super(eckb);
        this.eckb = eckb;
    }

    /**
     * Applies the rule to the given surface forms, shrinking the candidate
     * lists of ambiguous surface forms in place via
     * {@code SurfaceForm.setCandidates}.
     *
     * @param rep
     *            the surface forms of the current document
     * @return always {@code false}; the original implementation never reported
     *         a change and that contract is kept for callers
     */
    @Override
    public boolean applyRule(List<SurfaceForm> rep) {
        if (rep.size() <= MINIMUMSURFACEFORMS) {
            return false;
        }
        List<String> context = collectDisambiguatedEntities(rep);
        if (context.size() < MINDISAMBIGUATEDSURFACEFORMS) {
            return false;
        }
        for (SurfaceForm sf : rep) {
            if (sf.getCandidates().size() > 1) {
                filterCandidates(sf, context);
            }
        }
        return false;
    }

    /**
     * Collects the single candidate of every initial surface form that has
     * exactly one candidate.
     */
    private List<String> collectDisambiguatedEntities(List<SurfaceForm> rep) {
        List<String> entities = new LinkedList<String>();
        for (SurfaceForm sf : rep) {
            if (sf.getCandidates().size() == 1 && sf.isInitial()) {
                entities.add(sf.getCandidates().get(0));
            }
        }
        return entities;
    }

    /**
     * Scores all candidates of one ambiguous surface form against the context
     * and replaces its candidate list when at least one candidate passed the
     * similarity threshold.
     */
    private void filterCandidates(SurfaceForm sf, List<String> context) {
        List<String> candidates = sf.getCandidates();

        // Build every query first so the similarity backend is asked once per
        // surface form instead of once per candidate with an ever-growing set.
        // NOTE(review): assumes the score of a query does not depend on the
        // other queries in the same batch - confirm against the backend.
        Map<String, String> queryByCandidate = new HashMap<String, String>();
        Set<String> queries = new HashSet<String>();
        for (String candidate : candidates) {
            String query = this.eckb.generateWord2VecFormatString(context, candidate);
            queryByCandidate.put(candidate, query);
            queries.add(query);
        }
        Map<String, Float> similarities = this.eckb.getWord2VecSimilarities(queries);

        String surfaceForm = sf.getSurfaceForm().toLowerCase();
        List<String> bestCandidates = new LinkedList<String>();
        boolean anySimilarityMatch = false;

        for (String candidate : candidates) {
            // Literal replace: the prefix contains '.' which replaceAll would
            // interpret as a regex wildcard.
            String candidateWithoutUrl = candidate.replace(DBPEDIARESOURCEPREFIX, "").toLowerCase();
            boolean nameMatches = levenshteinDistance(candidateWithoutUrl,
                    surfaceForm) <= MAXLEVENSHTEINDISTANCE;
            if (nameMatches) {
                System.out.println("LEVENSHTEIN DISTANCE ENTITY: " + candidate);
            }

            Float similarity = (similarities == null) ? null
                    : similarities.get(queryByCandidate.get(candidate));
            if (isSimilarEnough(candidate, similarity)) {
                bestCandidates.add(candidate);
                anySimilarityMatch = true;
            } else if (nameMatches) {
                bestCandidates.add(candidate);
            }
        }

        // Candidates kept only because of their name are not enough evidence
        // on their own; require at least one similarity-based match.
        if (anySimilarityMatch) {
            sf.setCandidates(bestCandidates);
            System.out.println("Es bleibt übrig SurfaceForm: " + sf.getSurfaceForm() + " +"
                    + bestCandidates.toString());
        }
    }

    /**
     * Decides whether a candidate passes the similarity threshold. A missing
     * similarity value counts as not similar.
     */
    private boolean isSimilarEnough(String candidate, Float similarity) {
        if (similarity == null) {
            return false;
        }
        float value = similarity.floatValue();
        if (value > SIMILARITYTHRESHOLD) {
            return true;
        }
        // Check the cheap numeric condition before hitting the index, and put
        // the literal first because queryType may return null.
        return value > SIMILARITYTHRESHOLDMISC && "Misc".equalsIgnoreCase(queryType(candidate));
    }

    /**
     * Looks up the stored "Type" field of the index document whose "Mainlink"
     * term equals the given URL.
     *
     * @param url
     *            the entity URL to look up
     * @return "Misc" when no document matches, the stored type otherwise
     *         (possibly {@code null} when the document has no "Type" field),
     *         or an empty string when the index lookup fails with an
     *         {@link IOException}
     */
    private String queryType(String url) {
        String type = "";
        IndexSearcher searcher = eckb.getSearcher();
        Query q = new TermQuery(new Term("Mainlink", url));
        try {
            TopDocs docs = searcher.search(q, 1);
            ScoreDoc[] scoredocs = docs.scoreDocs;
            if (scoredocs.length == 0) {
                type = "Misc";
            } else {
                Document doc = searcher.getIndexReader().document(scoredocs[0].doc);
                type = doc.get("Type");
            }
        } catch (IOException e) {
            // Best effort: a failed lookup leaves the type empty so the caller
            // falls back to the stricter threshold.
            e.printStackTrace();
        }
        return type;
    }

    /**
     * Computes the Levenshtein (edit) distance between two character
     * sequences using two rolling rows.
     *
     * @param lhs
     *            first sequence
     * @param rhs
     *            second sequence
     * @return the minimum number of single-character insertions, deletions and
     *         substitutions needed to turn {@code lhs} into {@code rhs}
     */
    int levenshteinDistance(CharSequence lhs, CharSequence rhs) {
        int len0 = lhs.length() + 1;
        int len1 = rhs.length() + 1;

        // Rolling rows of the distance matrix.
        int[] cost = new int[len0];
        int[] newcost = new int[len0];

        // Cost of deleting each prefix of lhs.
        for (int i = 0; i < len0; i++) {
            cost[i] = i;
        }

        for (int j = 1; j < len1; j++) {
            // Cost of inserting the first j characters of rhs.
            newcost[0] = j;

            for (int i = 1; i < len0; i++) {
                int match = (lhs.charAt(i - 1) == rhs.charAt(j - 1)) ? 0 : 1;

                int costReplace = cost[i - 1] + match;
                int costInsert = cost[i] + 1;
                int costDelete = newcost[i - 1] + 1;

                newcost[i] = Math.min(Math.min(costInsert, costDelete), costReplace);
            }

            // The row just computed becomes the previous row.
            int[] swap = cost;
            cost = newcost;
            newcost = swap;
        }

        // After the final swap, cost holds the last computed row.
        return cost[len0 - 1];
    }
}