package doser.entitydisambiguation.algorithms.collective.dbpedia; import java.io.IOException; import java.util.ArrayList; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import doser.entitydisambiguation.algorithms.SurfaceForm; import doser.entitydisambiguation.dpo.EntityDisambiguationDPO; import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia; import doser.entitydisambiguation.properties.Properties; import doser.lucene.query.TermQuery; public class AdditionalCandidateQuery { private Pattern punctuationPattern; private Pattern parenthesesPattern; private EntityCentricKBDBpedia eckb; private int taskNumber; AdditionalCandidateQuery(EntityCentricKBDBpedia eckb) { super(); this.eckb = eckb; this.punctuationPattern = Pattern.compile(" ([,!\\?\\.])"); this.parenthesesPattern = Pattern.compile("(.+)[\\(\\[]+.*"); } public SurfaceForm checkAdditionalSurfaceForms(EntityDisambiguationDPO dpo, int taskNumber) { if (Properties.getInstance().getCandidateExpansion()) { this.taskNumber = taskNumber; String mention = dpo.getSelectedText().replaceAll(" +", " "); /* Eliminate e.g. Bill , Gates to Bill, Gates */ Matcher regexMatcher = punctuationPattern.matcher(mention); StringBuffer buffer = new StringBuffer(); while (regexMatcher.find()) { String replacer = regexMatcher.group(1); replacer = Matcher.quoteReplacement(replacer); regexMatcher.appendReplacement(buffer, replacer); } regexMatcher.appendTail(buffer); String newSf = buffer.toString().trim(); if (!dpo.getSelectedText().equalsIgnoreCase(newSf) && !newSf.equalsIgnoreCase("")) { ScoreDoc[] scoredocs = queryIndex(newSf, false); if (scoredocs != null && scoredocs.length > 0) { SurfaceForm sf = prepareSurfaceForm(scoredocs, dpo, newSf); if (sf != null) { return sf; } } } /* Parenteses replacement */ regexMatcher = parenthesesPattern.matcher(mention); buffer = new StringBuffer(); try { if (regexMatcher.find()) { String replacer = regexMatcher.group(1); replacer = Matcher.quoteReplacement(replacer); regexMatcher.appendReplacement(buffer, replacer); } } catch (IllegalArgumentException e) { e.printStackTrace(); } regexMatcher.appendTail(buffer); newSf = buffer.toString().trim(); if (!dpo.getSelectedText().equalsIgnoreCase(newSf) && !newSf.equalsIgnoreCase("")) { ScoreDoc[] scoredocs = queryIndex(newSf, false); if (scoredocs != null && scoredocs.length > 0) { SurfaceForm sf = prepareSurfaceForm(scoredocs, dpo, newSf); if (sf != null) { return sf; } } } /* Replace numerations */ mention = mention.replaceAll("\\d\\.*", ""); mention = mention.replaceAll("\"", ""); mention = mention.replaceAll(" +", " "); mention = mention.trim(); if (!dpo.getSelectedText().equalsIgnoreCase(mention) && !mention.equalsIgnoreCase("")) { ScoreDoc[] scoredocs = queryIndex(mention, false); if (scoredocs != null && scoredocs.length > 0) { SurfaceForm sf = prepareSurfaceForm(scoredocs, dpo, mention); if (sf != null) { return sf; } } } /* Replace all special chars and normalize */ mention = mention.replaceAll("[^a-zA-Z ]", ""); mention = mention.replaceAll(" +", " "); mention = mention.trim(); if (!dpo.getSelectedText().equalsIgnoreCase(mention) && !mention.equalsIgnoreCase("")) { ScoreDoc[] scoredocs = queryIndex(mention, false); if (scoredocs != null && scoredocs.length > 0) { SurfaceForm sf = prepareSurfaceForm(scoredocs, dpo, mention); if (sf != null) { return sf; } } } /* * Perform FuzzyQuery if surface forms provides specific * characteristics */ String originalSf = dpo.getSelectedText(); if (originalSf.length() > 5 && originalSf.length() < 22) { ScoreDoc[] scoredocs = queryIndex(dpo.getSelectedText(), true); if (scoredocs != null && scoredocs.length > 0) { SurfaceForm sf = prepareSurfaceForm(scoredocs, dpo, dpo.getSelectedText()); if (sf != null) { return sf; } } } } /* Create Empty Surface Form */ ArrayList<String> l = new ArrayList<String>(); SurfaceForm sf = new SurfaceForm(dpo.getSelectedText(), dpo.getContext(), l, taskNumber, dpo.getStartPosition()); return sf; } private SurfaceForm prepareSurfaceForm(ScoreDoc[] score, EntityDisambiguationDPO dpo, String newsf) { IndexReader reader = eckb.getSearcher().getIndexReader(); SurfaceForm f = null; try { if (score.length == 1) { final Document doc = reader.document(score[0].doc); ArrayList<String> l = new ArrayList<String>(); l.add(doc.get("Mainlink")); f = new SurfaceForm(newsf, dpo.getContext(), l, taskNumber, dpo.getStartPosition()); } else if (score.length > 1) { ArrayList<String> l = new ArrayList<String>(); for (int j = 0; j < score.length; j++) { final Document doc = reader.document(score[j].doc); l.add(doc.get("Mainlink")); } f = new SurfaceForm(dpo.getSelectedText(), dpo.getContext(), l, taskNumber, dpo.getStartPosition()); } } catch (IOException e) { e.printStackTrace(); } return f; } private ScoreDoc[] queryIndex(String mention, boolean fuzzy) { ScoreDoc[] scoredocs = null; Query query = null; if (!fuzzy) { query = createQuery(mention, eckb); } else { query = new FuzzyQuery(new Term("UniqueLabel", mention.toLowerCase())); } IndexSearcher searcher = eckb.getSearcher(); TopDocs top = null; try { top = searcher.search(query, 1000); } catch (IOException e) { e.printStackTrace(); } if (top != null) { scoredocs = top.scoreDocs; } return scoredocs; } private Query createQuery(String sf, EntityCentricKBDBpedia kb) { String surfaceform = sf.toLowerCase(); TermQuery query = new TermQuery(new Term("UniqueLabel", surfaceform)); return query; } public static void main(String args[]) throws Exception { String test = "\\("; Pattern p = Pattern.compile("(.+)[\\(\\[]+.*"); Matcher regexMatcher = p.matcher(test); StringBuffer builder = new StringBuffer(); if (regexMatcher.find()) { String replacer = regexMatcher.group(1); replacer = Matcher.quoteReplacement(replacer); regexMatcher.appendReplacement(builder, replacer); } regexMatcher.appendTail(builder); System.out.println(builder.toString()); } }