package doser.entitydisambiguation.algorithms; import java.io.IOException; import java.util.LinkedList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.log4j.Logger; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.similarities.BM25Similarity; import org.apache.lucene.search.similarities.DefaultSimilarity; import doser.entitydisambiguation.backend.DisambiguationMainService; import doser.entitydisambiguation.backend.AbstractDisambiguationTask; import doser.entitydisambiguation.backend.DisambiguationTaskSingle; import doser.entitydisambiguation.dpo.DisambiguatedEntity; import doser.entitydisambiguation.dpo.EntityDisambiguationDPO; import doser.entitydisambiguation.dpo.Response; import doser.entitydisambiguation.knowledgebases.EntityCentricKnowledgeBase; import doser.entitydisambiguation.knowledgebases.AbstractKnowledgeBase; import doser.lucene.features.LuceneFeatures; import doser.lucene.query.FuzzyLabelSimilarity; import doser.lucene.query.LearnToRankClause; import doser.lucene.query.LearnToRankQuery; public class EntityCentricAlgorithmDefault extends AbstractDisambiguationAlgorithm { private EntityCentricKnowledgeBase eckb; private DisambiguationTaskSingle task; EntityCentricAlgorithmDefault() { super(); } @Override public boolean checkAndSetInputParameter(AbstractDisambiguationTask task) { AbstractKnowledgeBase kb = task.getKb(); if (!(task instanceof DisambiguationTaskSingle)) { return false; } else if (!(kb instanceof EntityCentricKnowledgeBase)) { return false; } this.eckb = (EntityCentricKnowledgeBase) kb; this.task = (DisambiguationTaskSingle) task; return true; } @Override protected boolean preDisambiguation() { EntityDisambiguationDPO toDis = task.getEntityToDisambiguate(); boolean res = true; final Pattern pattern = Pattern.compile("^\\d*[.,]?\\d*$"); final String surfaceForms = toDis.getSelectedText(); final String str = surfaceForms; final Matcher matcher = pattern.matcher(str); if (matcher.find()) { res = false; } if (!res) { final List<DisambiguatedEntity> disEntityList = new LinkedList<DisambiguatedEntity>(); final DisambiguatedEntity disEntity = new DisambiguatedEntity(); disEntity.setEntityUri("http://dbpedia.org/resource/Number"); disEntity.setText("Number"); disEntity .setDescription("A number is a mathematical object used to count, label, and measure. In mathematics, the definition of number has been extended over the years to include such numbers " + "as zero, negative numbers, rational numbers, irrational numbers, and complex numbers. Mathematical operations are certain procedures that take one or more numbers as input and" + " produce a number as output. Unary operations take a single input number and produce a single output number. For example, the successor operation adds one to an integer, thus " + "the successor of 4 is 5. Binary operations take two input numbers and produce a single output number. Examples of binary operations include addition, subtraction, " + "multiplication, division, and exponentiation. The study of numerical operations is called arithmetic. A notational symbol that represents a number is called a numeral. " + "In addition to their use in counting and measuring, numerals are often used for labels, for ordering, and for codes. In common usage, the word number can mean the abstract " + "object, the symbol, or the word for the number."); disEntity.setConfidence(1); disEntityList.add(disEntity); Response response = new Response(); response.setSelectedText(toDis.getSelectedText()); response.setStartPosition(toDis.getStartPosition()); response.setDisEntities(disEntityList); List<Response> resList = new LinkedList<Response>(); resList.add(response); task.setResponse(resList); } return res; } @Override public void processAlgorithm() { final Query query = createQuery(task.getEntityToDisambiguate(), eckb); final IndexSearcher searcher = eckb.getSearcher(); final IndexReader reader = searcher.getIndexReader(); EntityDisambiguationDPO dpo = task.getEntityToDisambiguate(); try { final TopDocs top = searcher.search(query, task.getReturnNr()); final ScoreDoc[] score = top.scoreDocs; final List<DisambiguatedEntity> disList = new LinkedList<DisambiguatedEntity>(); final String[] entityMainLinks = new String[score.length]; for (int j = 0; j < score.length; j++) { final DisambiguatedEntity entity = new DisambiguatedEntity(); entity.setConfidence(score[j].score); final Document doc = reader.document(score[j].doc); final String mainLink = doc.get("Mainlink"); entity.setEntityUri(mainLink); entityMainLinks[j] = mainLink; entity.setText(doc.get("Label")); entity.setDescription(doc.get("Description")); // if (task.isRetrieveDocClasses()) { // entity.setDoc(doc); // } disList.add(entity); } // if (Properties.getInstance().getHBaseStorage()) { // // task.getOutput().storeQuery(dpo.getDocumentId(), // dpo.getSelectedText(), dpo.getStartPosition(), // entityMainLinks, dpo.getContext()); // } Response response = new Response(); response.setSelectedText(dpo.getSelectedText()); response.setStartPosition(dpo.getStartPosition()); response.setDisEntities(disList); List<Response> resList = new LinkedList<Response>(); resList.add(response); task.setResponse(resList); } catch (final IOException e) { Logger.getRootLogger().error(e.getStackTrace()); } eckb.release(); } private Query createQuery(EntityDisambiguationDPO dpo, EntityCentricKnowledgeBase kb) { LearnToRankQuery query = new LearnToRankQuery(); List<LearnToRankClause> features = new LinkedList<LearnToRankClause>(); FuzzyLabelSimilarity fuzzyLabelSim = new FuzzyLabelSimilarity(); DefaultSimilarity defaultSim = new DefaultSimilarity(); BM25Similarity bm25 = new BM25Similarity(); // Feature 1 features.add(query.add(LuceneFeatures.queryStringFuzzy( dpo.getSelectedText(), "Label", fuzzyLabelSim, Occur.MUST, DisambiguationMainService.MAXCLAUSECOUNT), "Feature1", true)); // Feature 2 features.add(query.add(LuceneFeatures.queryStringTerm( dpo.getSelectedText(), "Description", defaultSim, Occur.SHOULD, DisambiguationMainService.MAXCLAUSECOUNT), "Feature2", false)); // Feature 3 features.add(query.add(LuceneFeatures.queryStringTerm(dpo.getContext(), "Label", defaultSim, Occur.SHOULD, DisambiguationMainService.MAXCLAUSECOUNT), "Feature3", false)); // Feature 4 features.add(query.add(LuceneFeatures.queryStringTerm(dpo.getContext(), "Description", defaultSim, Occur.SHOULD, DisambiguationMainService.MAXCLAUSECOUNT), "Feature4", false)); // Feature 5 features.add(query.add(LuceneFeatures.queryLabelFuzzy( dpo.getSelectedText(), "Label", bm25), "Feature5", false)); // Feature 6 features.add(query.add(LuceneFeatures.queryStringTerm(dpo.getContext(), "Label", bm25, Occur.SHOULD, DisambiguationMainService.MAXCLAUSECOUNT), "Feature6", false)); // Feature 7 features.add(query.add(LuceneFeatures.queryStringTerm(dpo.getContext(), "Description", bm25, Occur.SHOULD, DisambiguationMainService.MAXCLAUSECOUNT), "Feature7", false)); // Feature 8 features.add(query.add( LuceneFeatures.queryPrior(kb.getFeatureDefinition()), "Feature8", false)); // Feature 9 features.add(query.add( LuceneFeatures.querySensePrior(dpo.getSelectedText(), kb.getFeatureDefinition()), "Feature9", false)); features.get(0).setWeight(0.0524974f); features.get(1).setWeight(0.01771f); features.get(2).setWeight(0.0615202f); features.get(3).setWeight(0.0933433f); features.get(4).setWeight(0.0915161f); features.get(5).setWeight(-0.0468604f); features.get(6).setWeight(-0.0947746f); features.get(7).setWeight(0.0423863f); features.get(8).setWeight(0.465053f); return query; } }