/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package org.erasmusmc.ontology.ontologyutilities.evaluationScripts;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.ontology.Ontology;
import org.erasmusmc.ontology.OntologyFileLoader;
import org.erasmusmc.peregrine.ConceptPeregrine;
import org.erasmusmc.peregrine.ReleasedTerm;
import org.erasmusmc.peregrine.ResultTerm;
import org.erasmusmc.peregrine.UMLSGeneChemTokenizer;
import org.erasmusmc.peregrine.ResultConcept;
import org.erasmusmc.peregrine.disambiguator.DisambiguationDetails;
import org.erasmusmc.peregrine.disambiguator.DisambiguatorRuleRegistry;
import org.erasmusmc.peregrine.disambiguator.GeneDisambiguator;
import org.erasmusmc.peregrine.disambiguator.HasSynonymRule;
import org.erasmusmc.peregrine.disambiguator.UMLSDisambiguator;
import org.erasmusmc.peregrine.disambiguator.DisambiguationDetails.EvaluationResult;
import org.erasmusmc.peregrine.disambiguator.DisambiguationDetails.EvaluationResult.ExtraData;
import org.erasmusmc.utilities.ReadTextFile;
import org.erasmusmc.utilities.StringUtilities;
import org.erasmusmc.utilities.TextFileUtilities;
//Gena and Biocreative evaluation sets for evaluating gene name identification in texts
public class GeneNameIdentificationEvaluation {
public static void main(String[] args){new GeneNameIdentificationEvaluation();}
public boolean verbose = true;
public boolean disambiguatorDetails = true;
//Pick evaluation set:
//public String set = "GENAHS"; //Homo Sapiens
//public String set = "BiocreativeMM"; //Mus Musculus
//public String set = "BiocreativeDM"; //Drosophila Melanogaster
//public String set = "BiocreativeSC"; //Saccharomyces Cerevisiae
public String set = "Biocreative2Training"; //Homo Sapiens
//public String set = "Biocreative2Test"; //Homo Sapiens
public GeneDisambiguator geneDisambiguator;
public UMLSDisambiguator umlsDisambiguator;
//Baseline:
//UMLS2006HomologeneV1.6, SBDTokenizer, BCII: Precision=0.744131455399061 Recall=0.8076433121019109 F-measure= 0.7745876603543067
//UMLS2006HomologeneV1.6, UMLSGeneChemTokenizer, BCII: Precision=0.7582017010935601 Recall=0.7949044585987262 F-measure= 0.7761194029850746
//Since Feb 2011:Precision=0.7309941520467836 Recall=0.7961783439490446 F-measure= 0.7621951219512195
//GeneListHumanMouseRatV5_0.ontology, UMLSGeneChemTokenizer, BCII: Precision=0.7676240208877284 Recall=0.7490445859872611 F-measure= 0.7582205029013538
//UMLS2010ABHomologeneJochemToxV1_3.ontology, UMLSGeneChemTokenizer, BCII: Precision=0.786774628879892 Recall=0.7426751592356687 F-measure= 0.764089121887287
public GeneNameIdentificationEvaluation(){
String normaliserCacheFile = "/home/public/Peregrine/standardNormCache2006.bin";
indexer = new ConceptPeregrine();
OntologyFileLoader loader = new OntologyFileLoader();
//Ontology ontology = loader.load("/home/khettne/Projects/UMLS2010ABHomologeneJochemToxV1_1.ontology");
//Ontology ontology = loader.load("/home/khettne/Projects/GeneList/GeneListHumanMouseRatV5_0.ontology");
//Ontology ontology = loader.load("/home/public/thesauri/Homologene_v1_6c.ontology");
Ontology ontology = loader.load("/home/public/thesauri/UMLS2006Homologene_v1_6c.ontology");
//Ontology ontology = OCUMLS2006Homologene.constructOntology();
indexer.setOntology(ontology);
indexer.normaliser.loadCacheBinary(normaliserCacheFile);
System.out.println("Releasing thesaurus");
indexer.tokenizer = new UMLSGeneChemTokenizer();
long start = System.currentTimeMillis();
indexer.release();
System.out.println("Release time: " + (System.currentTimeMillis()-start) + "ms");
geneDisambiguator = new GeneDisambiguator(indexer, 2000000, Integer.MAX_VALUE);
evaluate();
}
private void evaluate(){
getSettings(set);
loadGoldenStandard();
loadValidIDs();
int GlobalFP = 0;
int GlobalTP = 0;
int GlobalFN = 0;
for (String file : new ReadTextFile(fileList)){
Map<String, Occurrence> ids = file2ids.get(file);
if (ids == null)
ids = new HashMap<String, Occurrence>();
List<String> lines = TextFileUtilities.loadFromFile(file);
if (verbose){
System.out.println();
System.out.println(lines.toString());
}
indexer.index(lines.toString());
if (disambiguatorDetails){
DisambiguationDetails details = geneDisambiguator.disambiguateWithDetails(indexer);
outputDetails(details,validIDs, ids, indexer);
} else geneDisambiguator.disambiguate(indexer);
//umlsDisambiguator.disambiguate(indexer);
Set<String> TP = new HashSet<String>();
Set<String> FN = new HashSet<String>();
Set<String> FP = new HashSet<String>();
if (ids != null){
//Evaluate Peregrine output
for (ResultConcept concept : indexer.resultConcepts){
Set<String> geneIDs = ExtractGeneID(geneIDprefix, concept.conceptId, validIDs);
for (String geneID : geneIDs){
Occurrence occurrence = ids.get(geneID);
if (occurrence != null) {
TP.add(geneID);
occurrence.found = true;
if (verbose){
System.out.print("TP: ");
displayTerm(indexer, concept, geneID);
}
} else {
if (verbose){
System.out.print("FP: ");
displayTerm(indexer, concept, geneID);
}
FP.add(geneID);
}
}
}
for (Map.Entry<String, Occurrence> entry : ids.entrySet()){
if (!entry.getValue().found){
FN.add(entry.getKey());
if (verbose){
System.out.print("FN: ("+entry.getKey()+") ");
for (String name : entry.getValue().names) System.out.print(name+";");
System.out.println();
}
}
}
}
GlobalFP += FP.size();
GlobalTP += TP.size();
GlobalFN += FN.size();
if (verbose)
System.out.println(file + " True: "+(TP.size()+FN.size())+" TP:"+TP.size()+" FP:"+FP.size()+" FN:"+FN.size());
}
double P = (double)GlobalTP / (double)(GlobalFP+GlobalTP);
double R = (double)GlobalTP / (double)(GlobalTP+GlobalFN);
double F = (2*P*R) / (P + R);
System.out.println("TP="+GlobalTP+" FP="+GlobalFP+" FN="+GlobalFN);
System.out.println("Precision="+P+" Recall="+R+" F-measure= "+F);
}
private void loadValidIDs() {
if (!validIDsFile.equals("")){
List<String> lines = TextFileUtilities.loadFromFile(validIDsFile);
validIDs = new HashSet<String>();
for (String line : lines){
String[] cols = line.split("_");
DatabaseID id = new DatabaseID(cols[0], cols[1]);
validIDs.add(id.ID);
}
}
}
private void loadGoldenStandard() {
file2ids = new HashMap<String, Map<String, Occurrence>>();
List<String> lines = TextFileUtilities.loadFromFile(goldenStandardFile);
String previousFile = "";
Map<String, Occurrence> ids = null;
for (String line : lines){
String[] cols = line.split("\t");
if (!cols[0].equals(previousFile)){
previousFile = cols[0];
ids = new HashMap<String, Occurrence>();
file2ids.put(cols[0], ids);
}
Occurrence occurrence = new Occurrence();
for (int i = 2; i < cols.length; i++)
occurrence.names.add(cols[i]);
ids.put(cols[1], occurrence);
}
}
private void getSettings(String set) {
if (set.equals("GENAHS")){
fileList = "/home/public/datasets/GENA/GenaFilesHS.txt";
goldenStandardFile = "/home/public/datasets/GENA/GenaGoldenStandardHS.txt";
geneIDprefix = "EG";
validIDsFile = "";
}
if (set.equals("BiocreativeMM")){
System.out.println("Biocreative Mus Musculus test set selected");
fileList = "/home/public/datasets/Biocreative/BiocreativeFilesMM.txt";
goldenStandardFile = "/home/public/datasets/Biocreative/BiocreativeGoldenStandardMM.txt";
geneIDprefix = "MGI";
validIDsFile = "";
}
if (set.equals("BiocreativeDM")){
System.out.println("Biocreative Drosophila Melanogaster test set selected");
fileList = "/home/public/datasets/Biocreative/BiocreativeFilesDM.txt";
goldenStandardFile = "/home/public/datasets/Biocreative/BiocreativeGoldenStandardDM.txt";
geneIDprefix = "FB";
validIDsFile = "";
}
if (set.equals("BiocreativeSC")){
System.out.println("Biocreative Saccharomyces Cerevisiae test set selected");
fileList = "/home/public/datasets/Biocreative/BiocreativeFilesSC.txt";
goldenStandardFile = "/home/public/datasets/Biocreative/BiocreativeGoldenStandardSC.txt";
geneIDprefix = "SGD";
validIDsFile = "";
}
if (set.equals("Biocreative2Training")){
System.out.println("Biocreative 2 Training set selected");
fileList = "/home/public/datasets/Biocreative2/Training/Files.txt";
System.out.println("fl: "+fileList);
goldenStandardFile = "/home/public/datasets/Biocreative2/Training/GoldenStandardHS.txt";
geneIDprefix = "EG";
validIDsFile = "/home/public/datasets/Biocreative2/validIDs.txt";
}
if (set.equals("Biocreative2Test")){
System.out.println("Biocreative 2 Test set selected");
fileList = "/home/public/datasets/Biocreative2/Test/Files.txt";
System.out.println("fl: "+fileList);
goldenStandardFile = "/home/public/datasets/Biocreative2/Test/GoldenStandardHS.txt";
geneIDprefix = "EG";
validIDsFile = "/home/public/datasets/Biocreative2/validIDs.txt";
}
}
private static void displayTerm(ConceptPeregrine indexer, ResultConcept concept, String geneID) {
StringBuffer term = new StringBuffer();
for (Integer word : concept.terms.get(0).words){
term.append(indexer.tokenizer.tokens.get(word));
term.append(" ");
}
StringBuilder termIDs = new StringBuilder();
for (int termID : concept.terms.get(0).term.termId)
termIDs.append(termID + " ");
System.out.println(term.toString()+ " termid:"+termIDs.toString() + "\t("+ geneID+")");
}
private Set<String> ExtractGeneID(String geneIDprefix, int conceptid, Set<String> valid) {
Set<String> result = new HashSet<String>();
List<DatabaseID> databaseIDs = indexer.getOntology().getDatabaseIDsForConcept(conceptid);
if (databaseIDs != null)
for (DatabaseID databaseID : databaseIDs)
if (databaseID.database.equals(geneIDprefix))
result.add(databaseID.ID);
filterValidIDs(result, valid);
return result;
}
private void filterValidIDs(Set<String> result, Set<String> valid) {
if (valid != null){
Iterator<String> iterator = result.iterator();
while (iterator.hasNext()){
if (!valid.contains(iterator.next()))
iterator.remove();
}
}
}
private String fileList = "";
private String goldenStandardFile = "";
private String geneIDprefix = "";
private String validIDsFile = "";
private ConceptPeregrine indexer;
private Set<String> validIDs = null;
private Map<String, Map<String, Occurrence>> file2ids;
private class Occurrence{
boolean found = false;
List<String> names = new ArrayList<String>();
}
private void outputDetails(DisambiguationDetails details, Set<String> validIDs, Map<String, Occurrence> correctIDs, ConceptPeregrine indexer2) {
Set<Integer> removedCIDs = new HashSet<Integer>();
for (ResultConcept concept : details.removedConcepts)
removedCIDs.add(concept.conceptId);
for (Map.Entry<Integer, List<EvaluationResult>> entry : details.conceptID2EvaluationResult.entrySet()){
List<String> ids = new ArrayList<String>();
//Find resultconcept:
ResultConcept resultConcept = null;
for (ResultConcept concept : indexer.resultConcepts)
if (entry.getKey().equals(concept.conceptId)){
resultConcept = concept;
break;
}
for (ResultConcept concept : details.removedConcepts)
if (entry.getKey().equals(concept.conceptId)){
resultConcept = concept;
break;
}
//Find gene IDs:
for (DatabaseID databaseID : indexer.getOntology().getDatabaseIDsForConcept(entry.getKey()))
if (databaseID.database.equals(geneIDprefix) && validIDs.contains(databaseID.ID))
ids.add(databaseID.ID + (correctIDs.containsKey(databaseID.ID)?"+":"-"));
if (ids.size() == 0)
continue;
//Show details:
System.out.println("Evaluating concept: " + buildTerm(indexer, resultConcept.terms.get(0)) + "("+entry.getKey()+")");
for (EvaluationResult evaluationResult : entry.getValue()) {
String ruleName = DisambiguatorRuleRegistry.getRuleName(evaluationResult.ruleID);
System.out.println(ruleName + " (result: " + evaluationResult.result + ")");
if (evaluationResult.extraDatas != null)
for (ExtraData extraData : evaluationResult.extraDatas){
String typeString = ExtraData.typeStrings[extraData.type];
System.out.println("- " + typeString + ": " + extraData.value);
}
if (ruleName.equals(HasSynonymRule.class.getSimpleName())){
Set<ReleasedTerm> uniqueTerms = new HashSet<ReleasedTerm>();
for (ResultTerm term: resultConcept.terms) {
if (uniqueTerms.add(term.term) && uniqueTerms.size() != 1) {
System.out.println("- Synonym: " + buildTerm(indexer, term));
break;
}
}
}
}
if (removedCIDs.contains(entry.getKey()))
System.out.println("Concept "+StringUtilities.join(ids, ", ") + " removed");
else
System.out.println("Concept "+StringUtilities.join(ids, ", ") + " kept");
System.out.println();
}
}
private String buildTerm(ConceptPeregrine indexer, ResultTerm resultTerm){
StringBuffer term = new StringBuffer();
for (Integer word : resultTerm.words){
term.append(indexer.tokenizer.tokens.get(word));
term.append(" ");
}
return term.toString();
}
}