/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package JochemBuilder.EvaluationScripts;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.Map.Entry;
import org.erasmusmc.ontology.DefaultTypes;
import org.erasmusmc.ontology.Ontology;
import org.erasmusmc.ontology.OntologyStore;
import org.erasmusmc.ontology.Relation;
import org.erasmusmc.peregrine.ConceptPeregrine;
import org.erasmusmc.peregrine.ReleasedTerm;
import org.erasmusmc.peregrine.ResultConcept;
import org.erasmusmc.peregrine.ResultTerm;
import org.erasmusmc.peregrine.UMLSGeneChemTokenizer;
import org.erasmusmc.peregrine.disambiguator.DisambiguationDetails;
import org.erasmusmc.peregrine.disambiguator.DisambiguatorRuleRegistry;
import org.erasmusmc.peregrine.disambiguator.GeneDisambiguator;
import org.erasmusmc.peregrine.disambiguator.DisambiguationDetails.EvaluationResult;
import org.erasmusmc.peregrine.disambiguator.DisambiguationDetails.EvaluationResult.ExtraData;
import org.erasmusmc.utilities.ReadTextFile;
import org.erasmusmc.utilities.StringUtilities;
import org.erasmusmc.utilities.TextFileUtilities;
import org.erasmusmc.utilities.WriteTextFile;
/**
 * Indexes a list of text files with a {@link ConceptPeregrine} indexer and scores the
 * spans it finds for chemical concepts against the reference spans read from an IOB
 * corpus file ({@link #corpusFile}).
 *
 * <p>A run produces three position files (true positives, false positives, false
 * negatives), a term frequency file, and prints precision and recall to standard output.
 */
public class IndexChemicalCorpus {
    /** When true, every indexing result is passed through a {@link GeneDisambiguator}. */
    public boolean disambiguate = true;

    private GeneDisambiguator geneDisambiguator;

    /** IOB corpus file holding the reference annotations; it is read when an instance is created. */
    public static String corpusFile = "/home/khettne/Projects/Jochem/Indexing/chemicals-test-corpus-01-05-2008.iob";

    /** Annotation classes that count as an entity when prefixed with {@code |B-} or {@code |I-}. */
    private static final String[] ENTITY_TYPES = {
        "IUPAC", "TRIVIAL", "TRIVIALVAR", "FAMILY", "ABBREVIATION", "PARTIUPAC", "SUM"
    };

    /** Extension stripped from a file name to obtain the document key used in the corpus. */
    private static final String TEXT_EXTENSION = ".txt";

    /** Number of times each released term was found in a chemical (non-gene) concept. */
    private Map<ReleasedTerm, Count> releasedTerm2Count = new TreeMap<ReleasedTerm, Count>(new ReleasedTermComparator());

    private ConceptPeregrine indexer = new ConceptPeregrine();

    /** Reference spans per document key, each span formatted as {@code start<TAB>end}. */
    private Map<String,List<String>> positions = getKolarikPositions(corpusFile);

    /**
     * Indexes every listed file, compares the detected chemical spans with the reference
     * spans and writes the evaluation output.
     *
     * @param home prefix prepended to every file name from the list (plain string concatenation)
     * @param fileWithfilesToIndex file containing one file name to index per line
     * @param indexresultFileName output file for the per-term counts
     * @param falsePositivePositionsFileName output file for detected spans without a reference match
     * @param falseNegativePositionsFileName output file for reference spans that were not detected
     * @param truePositivePositionsFileName output file for detected spans with a reference match
     * @param ontology ontology that is released into the indexer and used to look up vocabularies
     */
    public void run(String home, String fileWithfilesToIndex, String indexresultFileName, String falsePositivePositionsFileName, String falseNegativePositionsFileName, String truePositivePositionsFileName, OntologyStore ontology){
        int truePos = 0;
        int falsePos = 0;
        int falseNeg = 0;
        WriteTextFile falsePosPositionsFile = null;
        WriteTextFile falseNegPositionsFile = null;
        WriteTextFile truePosPositionsFile = null;
        try {
            falsePosPositionsFile = new WriteTextFile(falsePositivePositionsFileName);
            falseNegPositionsFile = new WriteTextFile(falseNegativePositionsFileName);
            truePosPositionsFile = new WriteTextFile(truePositivePositionsFileName);

            indexer.tokenizer = new UMLSGeneChemTokenizer();
            indexer.biggestMatchOnly = true;
            indexer.destroyOntologyDuringRelease = false;
            indexer.setOntology(ontology);
            System.out.println("Releasing thesaurus. "+StringUtilities.now());
            indexer.release();
            if (disambiguate){
                geneDisambiguator = new GeneDisambiguator(indexer, 4000000, Integer.MAX_VALUE);
            }

            System.out.println("Indexing texts. "+StringUtilities.now());
            List<String> files = TextFileUtilities.loadFromFile(fileWithfilesToIndex);
            System.out.println("No of keys: " +positions.size());

            for (String file : files){
                String key = documentKey(file);
                List<String> kolarikPairs = positions.get(key);
                falsePosPositionsFile.writeln("### "+key);
                truePosPositionsFile.writeln("### "+key);
                falseNegPositionsFile.writeln("### "+key);

                indexer.index(loadText(home+file));
                if (disambiguate){
                    geneDisambiguator.disambiguate(indexer);
                }
                List<String> textPairs = collectChemicalSpans(ontology);

                if (kolarikPairs == null){
                    // No reference annotations for this document: every detection is a false positive.
                    for (String textPair: textPairs){
                        falsePos++;
                        falsePosPositionsFile.writeln(textPair);
                    }
                    continue;
                }

                for (String textPair: textPairs){
                    String matched = matchAgainstReference(textPair, kolarikPairs);
                    if (matched != null){
                        truePos++;
                        truePosPositionsFile.writeln(matched);
                    } else {
                        falsePos++;
                        falsePosPositionsFile.writeln(textPair);
                    }
                }
                for (String kolarikPair: kolarikPairs){
                    if (!matchesAny(kolarikPair, textPairs)){
                        falseNeg++;
                        falseNegPositionsFile.writeln(kolarikPair);
                    }
                }
            }

            generateResults(indexresultFileName, ontology);

            // Floating point division: an empty denominator yields NaN instead of an exception.
            double precision = (double) truePos / (double) (truePos + falsePos);
            double recall = (double) truePos / (double) (truePos + falseNeg);
            System.out.println("True positives: "+truePos);
            System.out.println("False positives: "+falsePos);
            System.out.println("False negatives: "+falseNeg);
            System.out.println("Precision: "+precision);
            System.out.println("Recall: "+recall);
        } finally {
            // Close whatever was opened, also when indexing fails half-way.
            if (falsePosPositionsFile != null){
                falsePosPositionsFile.close();
            }
            if (falseNegPositionsFile != null){
                falseNegPositionsFile.close();
            }
            if (truePosPositionsFile != null){
                truePosPositionsFile.close();
            }
        }
    }

    /**
     * Returns the document key for a file name: everything before the first {@code ".txt"},
     * or the complete name when it does not contain that extension.
     */
    private static String documentKey(String file){
        int extensionStart = file.indexOf(TEXT_EXTENSION);
        return extensionStart < 0 ? file : file.substring(0, extensionStart);
    }

    /** Loads a text file and joins its lines with single spaces, trimming the result. */
    private static String loadText(String path){
        StringBuilder buffer = new StringBuilder();
        for (String textline: TextFileUtilities.loadFromFile(path)){
            buffer.append(textline).append(" ");
        }
        return buffer.toString().trim();
    }

    /**
     * Collects the distinct {@code start<TAB>end} spans of all terms of the currently indexed
     * concepts that belong to the CHEMICAL but not to the GENE vocabulary, and counts every
     * such term occurrence in {@link #releasedTerm2Count}.
     */
    private List<String> collectChemicalSpans(OntologyStore ontology){
        List<String> textPairs = new ArrayList<String>();
        List<Integer> start = indexer.tokenizer.startpositions;
        List<Integer> end = indexer.tokenizer.endpositions;
        for (ResultConcept concept: indexer.resultConcepts){
            if (!isChemicalNotGene(concept, ontology)){
                continue;
            }
            for (ResultTerm term: concept.terms){
                int first = term.words[0];
                int last = term.words[term.words.length-1];
                int startPos = start.get(first);
                // NOTE(review): the +1 presumably turns an inclusive end offset into the
                // exclusive one used by the corpus - confirm against the tokenizer.
                int endPos = end.get(last)+1;
                String startAndEnd = startPos+"\t"+endPos;
                if (!textPairs.contains(startAndEnd)){
                    textPairs.add(startAndEnd);
                }
                Count count = releasedTerm2Count.get(term.term);
                if (count == null) {
                    count = new Count();
                    releasedTerm2Count.put(term.term, count);
                }
                count.count++;
            }
        }
        return textPairs;
    }

    /** Tells whether the concept comes from a vocabulary named CHEMICAL and from none named GENE. */
    private static boolean isChemicalNotGene(ResultConcept concept, OntologyStore ontology){
        boolean chemVoc = false;
        boolean geneVoc = false;
        for (Relation relation: ontology.getRelationsForConceptAsSubject(concept.conceptId, DefaultTypes.fromVocabulary)) {
            String vocabulary = ontology.getConcept(relation.object).getName();
            if (vocabulary.equals("CHEMICAL"))
                chemVoc = true;
            if (vocabulary.equals("GENE"))
                geneVoc = true;
        }
        return chemVoc && !geneVoc;
    }

    /**
     * Looks up the reference span for a detected span.
     *
     * <p>All reference spans are visited; once one matches, the reference span replaces the
     * detected span (correcting for irregularities in the corpus) and the remaining reference
     * spans are compared against that replacement.
     *
     * @return the reference span to report, or {@code null} when nothing matched
     */
    private static String matchAgainstReference(String textPair, List<String> kolarikPairs){
        String reported = null;
        String current = textPair;
        for (String kolarikPair: kolarikPairs){
            if (kolairkPairMatches(current, kolarikPair)){
                current = kolarikPair;
                reported = kolarikPair;
            }
        }
        return reported;
    }

    /** Tells whether {@code pair} matches at least one of the candidate spans. */
    private static boolean matchesAny(String pair, List<String> candidates){
        for (String candidate: candidates){
            if (kolairkPairMatches(pair, candidate)){
                return true;
            }
        }
        return false;
    }

    /**
     * Compares two spans given as {@code start<TAB>end}. They match when they are identical
     * or when both offsets of one span are shifted by exactly one position, in the same
     * direction, relative to the other.
     *
     * @param textPair first span
     * @param kolarikPair second span
     * @return {@code true} when the spans match
     * @throws NumberFormatException when an offset is not a valid integer
     */
    public static boolean kolairkPairMatches(String textPair, String kolarikPair){
        String[] textColumns = textPair.split("\t");
        int textStart = Integer.parseInt(textColumns[0]);
        int textEnd = Integer.parseInt(textColumns[1]);
        String[] referenceColumns = kolarikPair.split("\t");
        int referenceStart = Integer.parseInt(referenceColumns[0]);
        int referenceEnd = Integer.parseInt(referenceColumns[1]);

        boolean identical = textStart == referenceStart && textEnd == referenceEnd;
        boolean oneToTheRight = textStart+1 == referenceStart && textEnd+1 == referenceEnd;
        boolean oneToTheLeft = textStart-1 == referenceStart && textEnd-1 == referenceEnd;
        return identical || oneToTheRight || oneToTheLeft;
    }

    /**
     * Writes one line per counted term: the count, the text of the term's first
     * concept/term id pair, and all concept ids of the term separated by {@code ;}.
     * I/O failures are reported on standard error; the writer is closed in every case.
     */
    private void generateResults(String filename, Ontology ontology) {
        BufferedWriter bufferedWrite = null;
        try {
            bufferedWrite = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename)), 1000000);
            for (Entry<ReleasedTerm, Count> entry: releasedTerm2Count.entrySet()) {
                ReleasedTerm term = entry.getKey();
                StringBuilder line = new StringBuilder();
                line.append(entry.getValue().count);
                line.append("\t");
                int id = term.conceptId[0];
                int tid = term.termId[0];
                line.append(ontology.getConcept(id).getTerms().get(tid).text);
                line.append("\t");
                for (int cid: term.conceptId) {
                    line.append(cid);
                    line.append(";");
                }
                bufferedWrite.write(line.toString());
                bufferedWrite.newLine();
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException raised while opening the file.
            e.printStackTrace();
        } finally {
            if (bufferedWrite != null) {
                try {
                    bufferedWrite.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Mutable counter used as map value. */
    private static final class Count {
        int count = 0;
    }

    /** Orders released terms by their first concept id and then by their first term id. */
    protected class ReleasedTermComparator implements Comparator<ReleasedTerm> {
        @Override
        public int compare(ReleasedTerm arg0, ReleasedTerm arg1) {
            // Explicit comparisons instead of subtraction, which can overflow.
            int result = compareInts(arg0.conceptId[0], arg1.conceptId[0]);
            if (result == 0)
                result = compareInts(arg0.termId[0], arg1.termId[0]);
            return result;
        }

        private int compareInts(int left, int right) {
            if (left < right)
                return -1;
            return left > right ? 1 : 0;
        }
    }

    /**
     * Reads the reference entity spans from an IOB corpus file.
     *
     * <p>Lines starting with {@code ###} start a new document; the text from the fifth
     * character on is used as the document key. Every other non-empty line is split on tabs:
     * column 1 is used as start, column 2 as end and column 4 as token class. An entity starts
     * at a {@code |B-} tag of one of the entity classes, is extended by the matching
     * {@code |I-} tags and ends at a {@code |O} or {@code |B-MODIFIER} tag, at the next entity
     * start, at the next document header or at the end of the file.
     *
     * @param corpusFile path of the IOB corpus file
     * @return for every document key the list of its spans as {@code start<TAB>end}
     */
    public static Map<String,List<String>> getKolarikPositions(String corpusFile){
        final int startIndex = 1;
        final int endIndex = 2;
        final int tokenClassIndex = 4;

        Map<String,List<String>> positions = new HashMap<String,List<String>>();
        String start = "";
        String end = "";
        String pmid = "";
        int entities = 0;
        boolean inEntity = false;

        ReadTextFile textFile = new ReadTextFile(corpusFile);
        Iterator<String> fileIterator = textFile.getIterator();
        while (fileIterator.hasNext()) {
            String line = fileIterator.next();
            if (line.startsWith("###")){
                if (inEntity){
                    // Store the pending entity under the document it belongs to
                    // before switching to the next document.
                    addPair(positions, pmid, start, end);
                    entities++;
                    inEntity = false;
                }
                pmid = line.substring(4);
                continue;
            }
            if (line.length() == 0){
                continue;
            }

            String[] columns = line.split("\t");
            String tokenClass = columns[tokenClassIndex];
            if (hasEntityTag(tokenClass, "|B-")){
                if (inEntity){
                    // A new entity directly follows the previous one.
                    addPair(positions, pmid, start, end);
                    entities++;
                }
                start = columns[startIndex];
                end = columns[endIndex];
                inEntity = true;
            }
            else if (inEntity && (tokenClass.contains("|O") || tokenClass.contains("|B-MODIFIER"))){
                addPair(positions, pmid, start, end);
                entities++;
                inEntity = false;
            }
            if (hasEntityTag(tokenClass, "|I-")){
                end = columns[endIndex];
            }
        }
        if (inEntity){
            // The file ended inside an entity.
            addPair(positions, pmid, start, end);
            entities++;
        }
        System.out.println("Number of entities: "+entities);
        return positions;
    }

    /** Tells whether the token class contains {@code prefix} followed by one of the entity classes. */
    private static boolean hasEntityTag(String tokenClass, String prefix){
        for (String type: ENTITY_TYPES){
            if (tokenClass.contains(prefix+type)){
                return true;
            }
        }
        return false;
    }

    /** Appends the span {@code start<TAB>end} to the list stored for the given document key. */
    private static void addPair(Map<String,List<String>> positions, String pmid, String start, String end){
        List<String> pairs = positions.get(pmid);
        if (pairs == null){
            pairs = new ArrayList<String>();
            positions.put(pmid, pairs);
        }
        pairs.add(start+"\t"+end);
    }

    /**
     * Prints, for every evaluated concept, the disambiguation rules that were applied with
     * their result and extra data. Debugging aid; not called from within this class.
     */
    private void outputDetails(DisambiguationDetails details) {
        for (Map.Entry<Integer, List<EvaluationResult>> entry : details.conceptID2EvaluationResult.entrySet()){
            System.out.println("Evaluating concept: " + entry.getKey());
            for (EvaluationResult evaluationResult : entry.getValue()) {
                String ruleName = DisambiguatorRuleRegistry.getRuleName(evaluationResult.ruleID);
                System.out.println(ruleName + " (result: " + evaluationResult.result + ")");
                if (evaluationResult.extraDatas == null) {
                    continue;
                }
                for (ExtraData extraData : evaluationResult.extraDatas){
                    String typeString = ExtraData.typeStrings[extraData.type];
                    System.out.println("- " + typeString + ": " + extraData.value);
                }
            }
        }
    }
}