package doser.entitydisambiguation.algorithms.collective;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import doser.entitydisambiguation.algorithms.SurfaceForm;
import doser.entitydisambiguation.knowledgebases.AbstractEntityCentricKBGeneral;
import doser.general.HelpfulMethods;

/**
 * Prunes the candidate lists of ambiguous surface forms before collective
 * disambiguation. Surface forms with more than ENTITYTHRESHOLD candidates are
 * reduced to the top candidates according to the sense prior, the Doc2Vec
 * context similarity and, if enough unambiguous surface forms are available as
 * context, the Word2Vec similarity.
 */
public class CandidatePruning {

    /** Maximum number of additional candidates kept via the Word2Vec check. */
    private static final int NUMBEROFADDITIONALW2VENTITIES = 6;

    /** Candidate lists larger than this threshold are pruned. */
    private static final int ENTITYTHRESHOLD = 6;

    /** Minimum number of unambiguous surface forms required for the Word2Vec check. */
    private static final int MINIMUMSURFACEFORMS = 3;

    /** Word2Vec similarity a candidate must exceed to be considered very relevant. */
    private static final float WORD2VECTHRESHOLD = 1.60f;

    private final AbstractEntityCentricKBGeneral eckb;

    public CandidatePruning(AbstractEntityCentricKBGeneral eckb) {
        this.eckb = eckb;
    }

    public void prune(List<SurfaceForm> rep) {
        // Entities of unambiguous, initially assigned surface forms. They serve
        // as disambiguation context for the Word2Vec check below.
        List<String> unambiguousEntities = new LinkedList<>();
        for (SurfaceForm sf : rep) {
            if (rep.size() > 1 && sf.getCandidates().size() == 1 && sf.isInitial()) {
                unambiguousEntities.add(sf.getCandidates().get(0));
            }
        }
        for (SurfaceForm sf : rep) {
            List<String> candidates = sf.getCandidates();
            if (candidates.size() <= ENTITYTHRESHOLD) {
                continue;
            }
            Set<String> prunedCandidates = new HashSet<>();

            // Sense prior: keep the ENTITYTHRESHOLD candidates that occur most
            // frequently with this surface form.
            Map<String, Integer> priorMap = new HashMap<>();
            for (String candidate : candidates) {
                priorMap.put(candidate, eckb.getFeatureDefinition().getOccurrences(sf.getSurfaceForm(), candidate));
            }
            @SuppressWarnings("deprecation")
            List<Map.Entry<String, Integer>> sortedByPrior = HelpfulMethods.sortByValue(priorMap);
            for (int i = 0; i < ENTITYTHRESHOLD; ++i) {
                prunedCandidates.add(sortedByPrior.get(i).getKey());
            }

            // Doc2Vec context similarity: add up to four further candidates that
            // are most similar to the surface form's context and not yet kept.
            Map<String, Float> doc2VecMap = new HashMap<>();
            for (String candidate : candidates) {
                doc2VecMap.put(candidate,
                        eckb.getDoc2VecSimilarity(sf.getSurfaceForm(), sf.getContext(), candidate));
            }
            @SuppressWarnings("deprecation")
            List<Map.Entry<String, Float>> sortedByDoc2Vec = HelpfulMethods.sortByValue(doc2VecMap);
            int added = 0;
            int counter = 0;
            while (counter < sortedByDoc2Vec.size() && added < 4) {
                String key = sortedByDoc2Vec.get(counter).getKey();
                if (!prunedCandidates.contains(key)) {
                    prunedCandidates.add(key);
                    added++;
                }
                counter++;
            }

            // Word2Vec check: if enough unambiguous surface forms provide context,
            // rescue very relevant candidates that were pruned above, preferring
            // those with higher sense-prior occurrence counts.
            if (unambiguousEntities.size() >= MINIMUMSURFACEFORMS) {
                Set<String> w2vFormatStrings = new HashSet<>();
                for (String candidate : candidates) {
                    if (!prunedCandidates.contains(candidate)) {
                        w2vFormatStrings.add(eckb.generateWord2VecFormatString(unambiguousEntities, candidate));
                    }
                }
                Map<String, Float> similarityMap = eckb.getWord2VecSimilarities(w2vFormatStrings);
                Map<String, Integer> occurrenceMap = new HashMap<>();
                for (String candidate : candidates) {
                    if (!prunedCandidates.contains(candidate)) {
                        String query = eckb.generateWord2VecFormatString(unambiguousEntities, candidate);
                        float val = similarityMap.get(query);
                        if (val > WORD2VECTHRESHOLD) {
                            occurrenceMap.put(candidate,
                                    eckb.getFeatureDefinition().getOccurrences(sf.getSurfaceForm(), candidate));
                        }
                    }
                }
                @SuppressWarnings("deprecation")
                List<Map.Entry<String, Integer>> sortedByOccurrence = HelpfulMethods.sortByValue(occurrenceMap);
                for (int i = 0; i < NUMBEROFADDITIONALW2VENTITIES && i < sortedByOccurrence.size(); ++i) {
                    prunedCandidates.add(sortedByOccurrence.get(i).getKey());
                }
            }
            sf.setCandidates(new ArrayList<>(prunedCandidates));
        }
    }
}
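
/*
 * Illustrative usage sketch, not part of the original class: it only shows how
 * the pruning step might be wired into a disambiguation pipeline. It assumes
 * the caller already holds a concrete AbstractEntityCentricKBGeneral instance
 * and a resolved list of SurfaceForm objects; the class and method names below
 * are hypothetical and exist solely for this example.
 */
final class CandidatePruningUsageSketch {

    private CandidatePruningUsageSketch() {
        // static utility holder, never instantiated
    }

    /**
     * Prunes all surface forms in place. Afterwards every surface form carries
     * at most ENTITYTHRESHOLD + 4 (Doc2Vec) + NUMBEROFADDITIONALW2VENTITIES
     * candidates.
     */
    static void pruneAll(AbstractEntityCentricKBGeneral kb, List<SurfaceForm> surfaceForms) {
        CandidatePruning pruning = new CandidatePruning(kb);
        pruning.prune(surfaceForms);
    }
}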