package doser.entitydisambiguation.algorithms;
import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import doser.entitydisambiguation.backend.DisambiguationMainService;
import doser.entitydisambiguation.backend.AbstractDisambiguationTask;
import doser.entitydisambiguation.backend.DisambiguationTaskSingle;
import doser.entitydisambiguation.dpo.DisambiguatedEntity;
import doser.entitydisambiguation.dpo.EntityDisambiguationDPO;
import doser.entitydisambiguation.dpo.Response;
import doser.entitydisambiguation.knowledgebases.DocumentCentricKnowledgeBaseDefault;
import doser.entitydisambiguation.knowledgebases.AbstractKnowledgeBase;
import doser.general.HelpfulMethods;
import doser.lucene.features.LuceneFeatures;
import doser.lucene.query.LearnToRankClause;
import doser.lucene.query.LearnToRankQuery;
/**
* Algorithmus verallgemeinern sodass nicht nur Calbc funktioniert.
*
* Allgemein optimieren. Stichwort HashMaps
*
* @author quhfus
*
*/
public class DocumentCentricAlgorithmDefault extends AbstractDisambiguationAlgorithm {
public static int CLASSIFICATIONDOCUMENTS = 101;
public static String CONCEPTFIELD = "concept";
private DocumentCentricKnowledgeBaseDefault dckb;
private DisambiguationTaskSingle task;
DocumentCentricAlgorithmDefault() {
super();
}
@Override
public boolean checkAndSetInputParameter(AbstractDisambiguationTask task) {
AbstractKnowledgeBase kb = task.getKb();
if (!(task instanceof DisambiguationTaskSingle)) {
return false;
} else if (!(kb instanceof DocumentCentricKnowledgeBaseDefault)) {
return false;
}
this.dckb = (DocumentCentricKnowledgeBaseDefault) kb;
this.task = (DisambiguationTaskSingle) task;
return true;
}
@Override
protected boolean preDisambiguation() {
EntityDisambiguationDPO toDis = task.getEntityToDisambiguate();
boolean res = true;
final Pattern pattern = Pattern.compile("^\\d*[.,]?\\d*$");
final String surfaceForms = toDis.getSelectedText();
final String str = surfaceForms;
final Matcher matcher = pattern.matcher(str);
if (matcher.find()) {
res = false;
}
if (!res) {
final List<DisambiguatedEntity> disEntityList = new LinkedList<DisambiguatedEntity>();
final DisambiguatedEntity disEntity = new DisambiguatedEntity();
disEntity.setEntityUri("http://dbpedia.org/resource/Number");
disEntity.setText("Number");
disEntity
.setDescription("A number is a mathematical object used to count, label, and measure. In mathematics, the definition of number has been extended over the years to include such numbers "
+ "as zero, negative numbers, rational numbers, irrational numbers, and complex numbers. Mathematical operations are certain procedures that take one or more numbers as input and"
+ " produce a number as output. Unary operations take a single input number and produce a single output number. For example, the successor operation adds one to an integer, thus "
+ "the successor of 4 is 5. Binary operations take two input numbers and produce a single output number. Examples of binary operations include addition, subtraction, "
+ "multiplication, division, and exponentiation. The study of numerical operations is called arithmetic. A notational symbol that represents a number is called a numeral. "
+ "In addition to their use in counting and measuring, numerals are often used for labels, for ordering, and for codes. In common usage, the word number can mean the abstract "
+ "object, the symbol, or the word for the number.");
disEntity.setConfidence(1);
disEntityList.add(disEntity);
Response response = new Response();
response.setSelectedText(toDis.getSelectedText());
response.setStartPosition(toDis.getStartPosition());
response.setDisEntities(disEntityList);
List<Response> resList = new LinkedList<Response>();
resList.add(response);
task.setResponse(resList);
}
return res;
}
@Override
public void processAlgorithm()
throws IllegalDisambiguationAlgorithmInputException {
Query query = createQuery(task.getEntityToDisambiguate());
final IndexSearcher searcher = dckb.getSearcher();
final IndexReader reader = searcher.getIndexReader();
HashMap<String, Integer> hashSaver = new HashMap<String, Integer>();
EntityDisambiguationDPO dpo = task.getEntityToDisambiguate();
try {
TopDocs top = searcher.search(query, CLASSIFICATIONDOCUMENTS);
ScoreDoc[] score = top.scoreDocs;
for (int i = 0; i < score.length; i++) {
Document doc = reader.document(score[i].doc);
String str = doc.get(CONCEPTFIELD);
String[] arr = createConceptArray(str);
for (int j = 0; j < arr.length; j++) {
if (hashSaver.get(arr[j]) != null) {
Integer val = hashSaver.get(arr[j]);
hashSaver.put(arr[j], ++val);
} else {
hashSaver.put(arr[j], 1);
}
}
}
List<Entry<String, Integer>> vals = HelpfulMethods
.sortByValue(hashSaver);
final List<DisambiguatedEntity> disList = new LinkedList<DisambiguatedEntity>();
for (int i = 0; i < task.getReturnNr(); i++) {
Entry<String, Integer> entry = vals.get(i);
final DisambiguatedEntity entity = new DisambiguatedEntity();
entity.setConfidence(entry.getValue());
entity.setEntityUri(entry.getKey());
entity.setText("Unknown");
entity.setDescription("Unknown");
disList.add(entity);
}
Response response = new Response();
response.setSelectedText(dpo.getSelectedText());
response.setStartPosition(dpo.getStartPosition());
response.setDisEntities(disList);
List<Response> resList = new LinkedList<Response>();
resList.add(response);
task.setResponse(resList);
} catch (IOException e) {
e.printStackTrace();
}
dckb.release();
}
private String[] createConceptArray(String str) {
List<String> lst = new LinkedList<String>();
str = str.trim();
String[] arr = str.split(" ");
for (int i = 0; i < arr.length; i++) {
if (!arr[i].equalsIgnoreCase("") && analyseConcept(arr[i])) {
lst.add(generateID(arr[i].toUpperCase()));
}
}
String[] result = new String[lst.size()];
lst.toArray(result);
return result;
}
private String generateID(String line) {
String[] splitter = line.split(":");
String link = "";
if (splitter[1].equalsIgnoreCase("uniprot")
&& !splitter[2].equalsIgnoreCase("") && splitter[2] != null) {
link = "UN_" + splitter[2];
} else if (splitter[1].equalsIgnoreCase("entrezgene")
&& !splitter[2].equalsIgnoreCase("") && splitter[2] != null) {
link = "NC_" + splitter[2];
} else if (splitter[1].equalsIgnoreCase("umls")
&& !splitter[2].equalsIgnoreCase("") && splitter[2] != null) {
link = "LI_" + splitter[2];
} else if (splitter[1].equalsIgnoreCase("ncbi")
&& !splitter[2].equalsIgnoreCase("") && splitter[2] != null) {
link = "NC_" + splitter[2];
} else if (splitter[1].equalsIgnoreCase("disease")
&& !splitter[2].equalsIgnoreCase("") && splitter[2] != null) {
link = "LI_" + splitter[2];
}
return link;
}
private boolean analyseConcept(String str) {
String[] arr = str.split(":");
if (arr.length < 3) {
return false;
}
if (arr[2] == null || arr[2].equalsIgnoreCase("")) {
return false;
}
return true;
}
private Query createQuery(EntityDisambiguationDPO dpo) {
LearnToRankQuery query = new LearnToRankQuery();
List<LearnToRankClause> features = new LinkedList<LearnToRankClause>();
DefaultSimilarity defaultSim = new DefaultSimilarity();
// Feature 1
features.add(query.add(LuceneFeatures.queryLabelTerm(
dpo.getSelectedText(), "title", defaultSim), "Feature1",
true));
// Feature 2
features.add(query.add(LuceneFeatures.queryLabelTerm(
dpo.getSelectedText(), "abstract", defaultSim), "Feature2",
true));
// Feature 3
features.add(query.add(LuceneFeatures.queryStringTerm(dpo.getContext(),
"title", defaultSim, Occur.SHOULD,
DisambiguationMainService.MAXCLAUSECOUNT), "Feature3", false));
// Feature 4
features.add(query.add(LuceneFeatures.queryStringTerm(dpo.getContext(),
"abstract", defaultSim, Occur.SHOULD,
DisambiguationMainService.MAXCLAUSECOUNT), "Feature4", false));
features.get(0).setWeight(0.0056836f);
features.get(1).setWeight(0.0305069f);
features.get(2).setWeight(0.117543f);
features.get(3).setWeight(0.365259f);
return query;
}
}