/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package JochemBuilder.SharedCurationScripts;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.Map.Entry;
import org.erasmusmc.medline.MedlineIterator;
import org.erasmusmc.medline.MedlineListener;
import org.erasmusmc.medline.MedlineRecord;
import org.erasmusmc.ontology.Ontology;
import org.erasmusmc.ontology.OntologyFileLoader;
import org.erasmusmc.peregrine.ConceptPeregrine;
import org.erasmusmc.peregrine.ReleasedTerm;
import org.erasmusmc.peregrine.ResultConcept;
import org.erasmusmc.peregrine.ResultTerm;
import org.erasmusmc.peregrine.UMLSGeneChemTokenizer;
import org.erasmusmc.utilities.StringUtilities;
/**
 * Counts how often each released thesaurus term is found when the Peregrine
 * indexer is run over a set of Medline records, and writes the counts to a
 * tab-separated text file.
 *
 * <p>All work is triggered from the constructor: the ontology is released into
 * the indexer, the Medline iterator calls back into
 * {@link #processMedlineRecords(List)} for each batch of records, and finally
 * the accumulated counts are written to {@link #outputFile}.
 */
public class ChemicalOntologyFrequenceCount implements MedlineListener {
    /** Path handed to the Medline iterator as its PMID list. */
    public static String pmidsFile = "/home/public/PMIDs/Random100.000.PMIDs";
    /** Path of the ontology file loaded by {@link #main(String[])}. */
    public static String ontologyFile = "/home/khettne/Projects/Jochem/Jochem_V1_3.ontology";
    /** Path of the frequency file written once iteration has finished. */
    public static String outputFile = "/home/khettne/Projects/Jochem/freq_Jochem_V1_3.txt";

    /** Size, in characters, of the output writer's buffer. */
    private static final int WRITE_BUFFER_SIZE = 1000000;

    /** Per-term hit counters, ordered by {@link ReleasedTermComparator}. */
    private final Map<ReleasedTerm, Count> releasedTerm2Count =
            new TreeMap<ReleasedTerm, Count>(new ReleasedTermComparator());
    private final MedlineIterator medlineIterator = new MedlineIterator();
    private final ConceptPeregrine indexer = new ConceptPeregrine();

    /**
     * Loads the ontology and runs the frequency count.
     *
     * @param args optional positional overrides: {@code args[0]} replaces
     *             {@link #pmidsFile}, {@code args[1]} replaces
     *             {@link #ontologyFile} and {@code args[2]} replaces
     *             {@link #outputFile}. Missing arguments leave the built-in
     *             defaults untouched.
     */
    public static void main(String[] args) {
        if (args != null) {
            if (args.length > 0) {
                pmidsFile = args[0];
            }
            if (args.length > 1) {
                ontologyFile = args[1];
            }
            if (args.length > 2) {
                outputFile = args[2];
            }
        }
        System.out.println("Starting script. " + StringUtilities.now());
        OntologyFileLoader loader = new OntologyFileLoader();
        Ontology ontology = loader.load(ontologyFile);
        new ChemicalOntologyFrequenceCount(ontology);
    }

    /**
     * Configures the indexer and the Medline iterator, iterates over the
     * records listed in {@link #pmidsFile}, and writes the resulting term
     * counts to {@link #outputFile}.
     *
     * @param ontology the ontology whose terms are matched and counted
     */
    public ChemicalOntologyFrequenceCount(Ontology ontology) {
        indexer.tokenizer = new UMLSGeneChemTokenizer();
        indexer.setOntology(ontology);
        medlineIterator.fetchSubstances = true;
        medlineIterator.pmidsFile = pmidsFile;
        System.out.println("Releasing thesaurus. " + StringUtilities.now());
        // The ontology is read again in generateResults, so it must be kept
        // intact by the release step.
        indexer.destroyOntologyDuringRelease = false;
        indexer.release();
        System.out.println("Starting indexation cycles. " + StringUtilities.now());
        medlineIterator.iterate(this);
        System.out.println("Generating results. " + StringUtilities.now());
        generateResults(outputFile, ontology);
        System.out.println("Done. " + StringUtilities.now());
    }

    /**
     * Writes one line per counted term in the form
     * {@code count<TAB>term text<TAB>conceptId;conceptId;...}.
     *
     * <p>I/O failures are reported on standard error and otherwise ignored
     * (best effort). The writer is always closed, so the file handle is
     * released even when a write fails part-way through.
     *
     * @param filename path of the file to create or overwrite
     * @param ontology ontology used to look up the text of each term
     */
    private void generateResults(String filename, Ontology ontology) {
        BufferedWriter writer = null;
        try {
            // NOTE(review): the platform default charset is used here, as in
            // the original code; confirm whether a fixed encoding is wanted.
            writer = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(filename)), WRITE_BUFFER_SIZE);
            for (Entry<ReleasedTerm, Count> entry : releasedTerm2Count.entrySet()) {
                ReleasedTerm term = entry.getKey();
                StringBuilder line = new StringBuilder();
                line.append(entry.getValue().count);
                line.append("\t");
                // The text is taken from the first concept/term pair of the
                // released term.
                int firstConceptId = term.conceptId[0];
                int firstTermId = term.termId[0];
                line.append(ontology.getConcept(firstConceptId).getTerms().get(firstTermId).text);
                line.append("\t");
                for (int cid : term.conceptId) {
                    line.append(cid);
                    line.append(";");
                }
                writer.write(line.toString());
                writer.newLine();
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (writer != null) {
                try {
                    // close() also flushes whatever is still buffered.
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Indexes each record's text and increments the counter of every term
     * reported by the indexer.
     *
     * @param records the batch of Medline records to process
     */
    @Override
    public void processMedlineRecords(List<MedlineRecord> records) {
        for (MedlineRecord record : records) {
            indexer.index(record.titleAbsMeshSubs());
            for (ResultConcept concept : indexer.resultConcepts) {
                for (ResultTerm hit : concept.terms) {
                    Count count = releasedTerm2Count.get(hit.term);
                    if (count == null) {
                        count = new Count();
                        releasedTerm2Count.put(hit.term, count);
                    }
                    count.count++;
                }
            }
        }
    }

    /**
     * Three-way comparison of two ints that cannot overflow, unlike
     * {@code a - b}.
     */
    private static int compareInts(int a, int b) {
        if (a < b) {
            return -1;
        }
        return (a == b) ? 0 : 1;
    }

    /** Mutable counter used as the map value, so it can be incremented in place. */
    private static class Count {
        int count = 0;
    }

    /**
     * Orders released terms by their first concept id and then by their first
     * term id. Terms that agree on both are treated as equal, and therefore
     * share a single counter in the map.
     */
    protected class ReleasedTermComparator implements Comparator<ReleasedTerm> {
        @Override
        public int compare(ReleasedTerm arg0, ReleasedTerm arg1) {
            int result = compareInts(arg0.conceptId[0], arg1.conceptId[0]);
            if (result == 0) {
                result = compareInts(arg0.termId[0], arg1.termId[0]);
            }
            return result;
        }
    }
}