package doser.entitydisambiguation.algorithms.rules; import java.io.IOException; import java.util.LinkedList; import java.util.List; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import doser.entitydisambiguation.algorithms.SurfaceForm; import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia; import doser.lucene.query.TermQuery; class CheckGeneralEntities extends AbstractRule { CheckGeneralEntities(EntityCentricKBDBpedia eckb) { super(eckb); } @Override public boolean applyRule(List<SurfaceForm> rep) { for (SurfaceForm c : rep) { String sf = c.getSurfaceForm().toLowerCase(); List<String> candidates = c.getCandidates(); String checked = null; // Surface Form - Candidate Match i.e. Saturday - // http://dbpedia.org/resource/Saturday for (String s : candidates) { String ent = s.replaceAll("http://dbpedia.org/resource/", "") .toLowerCase(); if (sf.equalsIgnoreCase(ent)) { checked = s; break; } } if (checked != null && !checkSurfaceFormSubset(sf, rep)) { List<String> keepCandidates = new LinkedList<String>(); for (String can : candidates) { String[] labels = null; IndexSearcher searcher = eckb.getSearcher(); IndexReader reader = searcher.getIndexReader(); TermQuery query = new TermQuery(new Term("Mainlink", can)); try { final TopDocs top = searcher.search(query, 1); final ScoreDoc[] score = top.scoreDocs; final Document doc = reader.document(score[0].doc); labels = doc.getValues("Label"); } catch (IOException e) { e.printStackTrace(); } // Check whether the candidate has label of the original // surface form if (labels != null) { boolean isIn = false; for (int i = 0; i < labels.length; ++i) { if (labels[i].toLowerCase().equalsIgnoreCase(sf)) { isIn = true; break; } } // If IN, keep this candidate if (isIn) { keepCandidates.add(can); } } } if (!keepCandidates.isEmpty()) { c.setCandidates(keepCandidates); if(keepCandidates.size() == 1) { System.out.println("**********************************************************************"); System.out.println(keepCandidates.toString()); System.out.println("**********************************************************************"); } } } } return false; } private boolean checkSurfaceFormSubset(String sf, List<SurfaceForm> reps) { boolean isIn = false; for (SurfaceForm c : reps) { String toCheck = c.getSurfaceForm().toLowerCase(); if (!toCheck.equalsIgnoreCase(sf) && toCheck.contains(sf)) { isIn = true; break; } } return isIn; } }