/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package org.erasmusmc.dataimport.genes;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.DefaultTypes;
import org.erasmusmc.ontology.Ontology;
import org.erasmusmc.ontology.OntologyManager;
import org.erasmusmc.ontology.OntologyPSFLoader;
import org.erasmusmc.ontology.OntologyStore;
import org.erasmusmc.ontology.Relation;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.ontology.ontologyutilities.ConceptMerger;
import org.erasmusmc.ontology.ontologyutilities.GeneTermVariantGenerator;
import org.erasmusmc.ontology.ontologyutilities.OntologyCurator;
import org.erasmusmc.utilities.StringUtilities;
import org.erasmusmc.utilities.TextFileUtilities;
public class HomologeneOntologyMerger {
/**
 * Offset added to a Homologene ID to obtain the ID of the merged concept in the target
 * ontology (see main, AddLLIDsFromHomologene and checkIfIDsMapped).
 */
public static Integer numberAddedToHomologeneID = 3000000;
/**
 * Organism identifiers accepted by loadHomologeneFile: only lines whose second column equals
 * one of these values are kept.
 * NOTE(review): presumably NCBI taxonomy IDs for human, mouse and rat — confirm.
 */
public static String[] organisms = {"9606", "10090", "10116"};
/** Path of the curation file handed to OntologyCurator by CurateOntology. */
public static String curationfile = "/home/public/Thesauri/homologene/GeneThesaurusCurationFile_may2007.txt";
/**
 * Builds the merged gene ontology and stores it.
 *
 * Steps, in order:
 * 1. Delete any existing ontology named {@code dbname} and create an empty target store.
 * 2. Load the Homologene mapping and the mouse, rat and human PSF thesauri.
 * 3. For each source thesaurus, copy concepts without a Homologene mapping into the target and
 *    collect the mapped ones per Homologene ID (checkIfIDsMapped).
 * 4. Create one merged concept per Homologene ID, with ID
 *    {@code homologene + numberAddedToHomologeneID}, carrying the union of the terms of the
 *    grouped concepts and named after the grouped concept with the lowest ID.
 * 5. Add "LL" database IDs, curate, merge identical concepts, generate spelling variants.
 * 6. Dump the result to the database and to a PSF file.
 *
 * All paths and names are hard-coded; {@code args} is not used.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    String dir = "/home/public/Thesauri/homologene/";
    // An earlier configuration used "homologene.data" as input and "Homologene_newRule" as dbname.
    String homologenefile = "homologene_may2007.data";
    String mousePSF = "MGD_may2007.psf";
    String ratPSF = "RGD_may2007.psf";
    String humanPSF = "GenelistHuman_v2.4.0.psf";
    String dbname = "Homologene_curated_may2007";

    System.out.println(StringUtilities.now() + "\tInitializing");
    OntologyManager ontologyManager = new OntologyManager();
    Map<Integer, List<Concept>> conceptsToBeMapped = new TreeMap<Integer, List<Concept>>();
    ontologyManager.deleteOntology(dbname);
    OntologyStore target = new OntologyStore();
    target.setName(dbname);
    intializeTargetOntology(target); // Sets vocabularies and semantic types
    // Loads the mapping for all configured organisms
    Map<Integer, Integer> mapEntrezGene2Homologene = loadHomologeneFile(dir + homologenefile);

    System.out.println(StringUtilities.now() + "\tLoading source thesauri");
    OntologyPSFLoader loader = new OntologyPSFLoader();
    loader.loadDefinitions = true;
    // NOTE(review): assumes each loadFromPSF call assigns a fresh Ontology to loader.ontology;
    // otherwise mouse, rat and human would all refer to the same object — confirm.
    loader.loadFromPSF(dir + mousePSF);
    Ontology mouse = loader.ontology;
    loader.loadFromPSF(dir + ratPSF);
    Ontology rat = loader.ontology;
    loader.loadFromPSF(dir + humanPSF);
    Ontology human = loader.ontology;

    System.out.println(StringUtilities.now() + "\tMapping concepts based on homologene file");
    // For each gene, check whether it is mentioned in homologene. If not, the gene is added to
    // the target ontology; otherwise it is added to conceptsToBeMapped (its DatabaseIDs are
    // already attached to the future merged concept in the target ontology).
    checkIfIDsMapped(mouse, target, mapEntrezGene2Homologene, conceptsToBeMapped, -1002);
    checkIfIDsMapped(human, target, mapEntrezGene2Homologene, conceptsToBeMapped, -1001);
    checkIfIDsMapped(rat, target, mapEntrezGene2Homologene, conceptsToBeMapped, -1003);

    // Merge the concepts collected per Homologene ID and add the result to the target ontology.
    for (Map.Entry<Integer, List<Concept>> group : conceptsToBeMapped.entrySet()) {
        Integer homologene = group.getKey();
        Concept newconcept = new Concept(homologene + numberAddedToHomologeneID);

        Set<String> newterms = new HashSet<String>();
        String name = "";
        // Start from the largest possible value so that the concept with the lowest ID always
        // supplies the name, whatever range the source IDs are in.
        int lowestId = Integer.MAX_VALUE;
        for (Concept concept : group.getValue()) {
            if (concept.getID() < lowestId) {
                lowestId = concept.getID();
                name = concept.getName();
            }
            for (TermStore term : concept.getTerms()) {
                newterms.add(term.text);
            }
        }
        newconcept.setName(name);

        // The chosen name goes first in the term list, followed by every other distinct term.
        List<TermStore> newtermlist = new ArrayList<TermStore>();
        newtermlist.add(new TermStore(name));
        for (String term : newterms) {
            if (!term.equals(name)) {
                newtermlist.add(new TermStore(term));
            }
        }
        newconcept.setTerms(newtermlist);

        target.setRelation(new Relation(newconcept.getID(), DefaultTypes.fromVocabulary, -1000));
        target.setRelation(new Relation(newconcept.getID(), DefaultTypes.fromVocabulary, -1004));
        target.setRelation(new Relation(newconcept.getID(), DefaultTypes.isOfSemanticType, -116));
        DatabaseID databaseID = new DatabaseID("HO", homologene.toString());
        target.setDatabaseIDForConcept(newconcept.getID(), databaseID);
        target.setConcept(newconcept);
    }
    AddLLIDsFromHomologene(target, mapEntrezGene2Homologene);
    System.out.println(StringUtilities.now() + "\tNumber of concepts mapped due to homologene: " +conceptsToBeMapped.size());

    System.out.println(StringUtilities.now() + "\tCurating ontology");
    CurateOntology(target);

    System.out.println(StringUtilities.now() + "\tMapping concepts based on overlapping terms");
    MergeIdenticalConcepts(target);

    System.out.println(StringUtilities.now() + "\tGenerating spelling variants");
    GenerateSpellingVariants(target);

    System.out.println(StringUtilities.now() + "\tSaving ontology to database");
    ontologyManager.dumpStoreInDatabase(target);
    loader.ontology = target;
    loader.saveToPSF("/temp/Homologen.psf");
    System.out.println(StringUtilities.now() + "\tDone.");
}
/**
 * Attaches an "LL" database ID to merged Homologene concepts.
 *
 * For every (key, value) pair of the mapping, the concept with ID
 * {@code value + numberAddedToHomologeneID} is looked up in {@code target}; when it exists, the
 * key is registered on it as an "LL" database ID. Pairs without such a concept are ignored.
 *
 * @param target ontology holding the merged concepts
 * @param mapEntrezGene2Homologene mapping whose keys become "LL" IDs and whose values are Homologene IDs
 */
private static void AddLLIDsFromHomologene(OntologyStore target, Map<Integer, Integer> mapEntrezGene2Homologene) {
    for (Map.Entry<Integer, Integer> pair : mapEntrezGene2Homologene.entrySet()) {
        int mergedConceptId = pair.getValue() + numberAddedToHomologeneID;
        // Only annotate Homologene concepts that were actually created.
        if (target.getConcept(mergedConceptId) == null) {
            continue;
        }
        DatabaseID llId = new DatabaseID("LL", pair.getKey().toString());
        target.setDatabaseIDForConcept(mergedConceptId, llId);
    }
}
/**
 * Reads the tab-separated Homologene file and builds the mapping used by the merger.
 *
 * Each line is split on tabs. Lines whose second column is one of {@link #organisms} contribute
 * an entry mapping the third column (parsed as an integer, used as the Entrez Gene ID) to the
 * first column (parsed as an integer, used as the Homologene ID). Lines with fewer than three
 * columns, such as blank lines, are skipped.
 *
 * @param filename path of the Homologene data file
 * @return sorted map from Entrez Gene ID to Homologene ID
 * @throws NumberFormatException if the first or third column of an accepted line is not an integer
 */
private static Map<Integer, Integer> loadHomologeneFile(String filename) {
    List<String> homologenelines = TextFileUtilities.loadFromFile(filename);
    Set<String> orgs = new TreeSet<String>();
    for (String organism : organisms) {
        orgs.add(organism);
    }
    Map<Integer, Integer> mapEntrezGene2Homologene = new TreeMap<Integer, Integer>();
    for (String line : homologenelines) {
        String[] cells = line.split("\t");
        // A blank or truncated line has no organism/gene columns; indexing it would throw
        // ArrayIndexOutOfBoundsException, so skip it.
        if (cells.length < 3) {
            continue;
        }
        if (orgs.contains(cells[1])) {
            mapEntrezGene2Homologene.put(Integer.parseInt(cells[2]), Integer.parseInt(cells[0]));
        }
    }
    return mapEntrezGene2Homologene;
}
/**
 * Registers the fixed vocabulary and semantic-type concepts in the target ontology.
 *
 * The concepts are created in this order: -1000 "GENE", -1001 "HUMAN", -1002 "MOUSE",
 * -1003 "RAT", -1004 "HOMOLOGENE" and -116 "Amino Acid, Peptide, or Protein".
 *
 * @param target ontology that receives the concepts
 */
public static void intializeTargetOntology(Ontology target) {
    int[] conceptIds = {-1000, -1001, -1002, -1003, -1004, -116};
    String[] conceptNames = {
        "GENE", "HUMAN", "MOUSE", "RAT", "HOMOLOGENE", "Amino Acid, Peptide, or Protein"
    };
    for (int i = 0; i < conceptIds.length; i++) {
        Concept fixedConcept = new Concept(conceptIds[i]);
        fixedConcept.setName(conceptNames[i]);
        target.setConcept(fixedConcept);
    }
}
/**
 * Runs an OntologyCurator, configured with the file named by {@link #curationfile}, over the
 * given ontology via {@code curateAndPrepare}.
 *
 * @param ontology ontology to curate
 */
public static void CurateOntology(OntologyStore ontology) {
    new OntologyCurator(curationfile).curateAndPrepare(ontology);
}
/**
 * Configures the static ConceptMerger settings and then runs its greedy merge on the ontology.
 *
 * Settings applied: mergeIdentical and mergeWhenSubSet enabled, minDiceForMerge 0.4 and
 * minDiceForMergeWhenPreferredTermsMatch 0.35. These are static fields of ConceptMerger, so
 * they remain set after this method returns.
 *
 * @param ontology ontology whose concepts are merged
 */
public static void MergeIdenticalConcepts(OntologyStore ontology) {
    // Thresholds
    ConceptMerger.minDiceForMergeWhenPreferredTermsMatch = 0.35;
    ConceptMerger.minDiceForMerge = 0.4;
    // Merge switches
    ConceptMerger.mergeWhenSubSet = true;
    ConceptMerger.mergeIdentical = true;

    ConceptMerger.greedyConceptMerge(ontology);
}
/**
 * Generates gene term variants for the ontology and then curates it again.
 *
 * @param ontology ontology to extend with variants and re-curate
 */
public static void GenerateSpellingVariants(OntologyStore ontology) {
    GeneTermVariantGenerator.generateVariants(ontology);
    // NOTE(review): this pass uses its own curation file path, which differs from the static
    // curationfile field used by CurateOntology — confirm that this is intended.
    String postVariantCurationFile = "/home/public/Thesauri/homologene/GeneThesaurusCurationFile.txt";
    new OntologyCurator(postVariantCurationFile).curateAndPrepare(ontology);
}
/**
 * Splits the concepts of a source ontology into those that map to a Homologene ID and those
 * that do not.
 *
 * Only concepts with a positive ID are considered. For each such concept, every database ID
 * whose database is "LL" is parsed as an integer and looked up in {@code homologeneMapping}:
 * <ul>
 * <li>On a hit, the concept is appended to the list stored under that Homologene ID in
 *     {@code conceptsToBeMapped} (created on demand), and all database IDs of the concept are
 *     attached in {@code targetOntology} to the concept ID
 *     {@code homologeneID + numberAddedToHomologeneID}. The concept itself is not copied.</li>
 * <li>If no "LL" ID yields a hit, the concept is copied into {@code targetOntology} together
 *     with its database IDs and gets relations to vocabulary -1000, to {@code sourceVoc} and to
 *     semantic type -116.</li>
 * </ul>
 * A concept with several mapped "LL" IDs is added once per hit.
 *
 * @param checkontology source ontology whose concepts are examined
 * @param targetOntology ontology receiving unmapped concepts and database IDs
 * @param homologeneMapping map from Entrez Gene ID to Homologene ID
 * @param conceptsToBeMapped output map from Homologene ID to the concepts that map to it
 * @param sourceVoc vocabulary concept ID identifying the source ontology
 * @throws NumberFormatException if an "LL" database ID is not an integer
 */
public static void checkIfIDsMapped(Ontology checkontology, Ontology targetOntology, Map<Integer, Integer> homologeneMapping, Map<Integer, List<Concept>> conceptsToBeMapped, Integer sourceVoc) {
    Iterator<Concept> iterator = checkontology.getConceptIterator();
    while (iterator.hasNext()) {
        Concept concept = iterator.next();
        if (concept.getID() <= 0) {
            continue; // non-positive IDs are not processed here
        }

        boolean commit = true;
        List<DatabaseID> dbids = checkontology.getDatabaseIDsForConcept(concept.getID());
        if (dbids != null) {
            for (DatabaseID databaseID : dbids) {
                // Compare string contents: == only tests reference identity and misses
                // "LL" values that are not the same interned String object.
                if (!"LL".equals(databaseID.database)) {
                    continue;
                }
                Integer entrezGeneID = Integer.parseInt(databaseID.ID);
                Integer homologeneID = homologeneMapping.get(entrezGeneID);
                if (homologeneID == null) {
                    continue;
                }
                commit = false;
                List<Concept> concepts = conceptsToBeMapped.get(homologeneID);
                if (concepts == null) {
                    concepts = new ArrayList<Concept>();
                    conceptsToBeMapped.put(homologeneID, concepts);
                }
                concepts.add(concept);
                // Carry all database IDs of the source concept over to the merged concept.
                for (DatabaseID dbid : dbids) {
                    targetOntology.setDatabaseIDForConcept(homologeneID + numberAddedToHomologeneID, dbid);
                }
            }
        }

        if (commit) {
            // No Homologene mapping: copy the concept unchanged into the target ontology.
            targetOntology.setConcept(concept);
            if (dbids != null) {
                for (DatabaseID dbid : dbids) {
                    targetOntology.setDatabaseIDForConcept(concept.getID(), dbid);
                }
            }
            targetOntology.setRelation(new Relation(concept.getID(), DefaultTypes.fromVocabulary, -1000));
            targetOntology.setRelation(new Relation(concept.getID(), DefaultTypes.fromVocabulary, sourceVoc));
            targetOntology.setRelation(new Relation(concept.getID(), DefaultTypes.isOfSemanticType, -116));
        }
    }
}
/*public static void addHomologene2LLlinks(Ontology ontology, String linkFile){
List<String> lines = TextFileUtilities.loadFromFile(linkFile);
for (String line: lines){
String[] cells = line.split("\t");
Set<Integer> entries = ontology.getConceptIDs(new DatabaseID("HO",cells[0]));
if (entries != null){
for(Integer id: entries){
ontology.setDatabaseIDForConcept(id,new DatabaseID("LL",cells[1]));
}
}
}
}*/
}