/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package org.erasmusmc.dataimport;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;
import javax.swing.tree.DefaultMutableTreeNode;
//import org.erasmusmc.applications.ontologyviewer.OntologyViewerPanel;
import org.erasmusmc.collections.SortedIntListSet;
import org.erasmusmc.databases.integersetstore.IntegerSetStore;
import org.erasmusmc.databases.mysql.MySQLgenericQuery;
import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.DefaultTypes;
import org.erasmusmc.ontology.Ontology;
import org.erasmusmc.ontology.OntologyManager;
import org.erasmusmc.ontology.OntologyPSFLoader;
import org.erasmusmc.ontology.OntologyStore;
import org.erasmusmc.ontology.Relation;
import org.erasmusmc.ontology.TermStore;
import org.erasmusmc.utilities.ReadTextFile;
import org.erasmusmc.utilities.StringUtilities;
public class GeneOntology {
/** Path of the GO dump file; each parsing method of this class opens it with ReadTextFile. */
private final String goFile;
// public static void main(String[] args){
// GeneOntology go = new GeneOntology("/home/data/gene ontology/go_200804-assocdb.rdf-xml");
// OntologyManager ontologyManager = new OntologyManager();
// Ontology ontology = ontologyManager.fetchClient("UMLS2006Homologene_v1_6");
// go.dumpPMIDs(ontology, "/home/temp/gocid2pmid.txt");
// }
/**
 * Creates an importer bound to a single GO dump file.
 *
 * @param goFile path of the GO file that the parsing methods of this class read
 */
public GeneOntology(String goFile) {
  this.goFile = goFile;
}
/**
 * Runs the default pipeline: builds the ontology from the GO file, writes the PMID store
 * to /tmp/go2pmid.txt and saves the ontology as PSF to /tmp/go.psf.
 */
public void doAll() {
  final OntologyStore goOntology = buildOntology();
  dumpPMIDs(goOntology, "/tmp/go2pmid.txt");
  // Disabled step: dumpInMySQL();
  dumpInPSF(goOntology, "/tmp/go.psf");
  // Disabled step: showTree();
}
/**
 * Creates an OntologyStore containing the GO concepts found in the GO file and then adds
 * the parent/child relations via {@link #buildHierarchy(OntologyStore)}.
 *
 * <p>The file is scanned line by line. Every line starting with {@code <go:term} opens a new
 * concept (IDs are handed out sequentially, starting at 6000000). The following lines fill it:
 * <ul>
 *   <li>{@code <go:accession>}: registered as a "GO" database ID of the concept;</li>
 *   <li>{@code <go:name>}: becomes the first term (only while the concept has no term yet);</li>
 *   <li>{@code <go:synonym>}: values starting with "GO:" are registered as additional "GO"
 *       database IDs, all other values are added as terms;</li>
 *   <li>{@code <go:definition>}: stored as the concept definition.</li>
 * </ul>
 *
 * @return the populated ontology, including the isParentOf relations
 */
public OntologyStore buildOntology() {
  System.out.println("Building ontology");
  OntologyStore ontology = new OntologyStore();
  int conceptID = 6000000;
  Concept concept = null;
  ReadTextFile file = new ReadTextFile(goFile);
  for (String fullLine : file) {
    String line = fullLine.trim();
    if (line.startsWith("<go:term")) {
      concept = new Concept(conceptID);
      concept.setTerms(new ArrayList<TermStore>());
      conceptID++;
      ontology.setConcept(concept);
    }
    if (concept == null) {
      // No <go:term> has been opened yet, so there is no concept to attach attributes to.
      // Skipping here avoids a NullPointerException on stray attribute lines.
      continue;
    }
    if (line.startsWith("<go:accession>")) {
      String goCode = StringUtilities.findBetween(line, "<go:accession>", "</go:accession>");
      ontology.setDatabaseIDForConcept(concept.getID(), new DatabaseID("GO", goCode));
    }
    // Only the first <go:name> of a term is used.
    if (line.startsWith("<go:name>") && concept.getTerms().size() == 0) {
      concept.getTerms().add(new TermStore(StringUtilities.findBetween(line, "<go:name>", "</go:name>")));
    }
    if (line.startsWith("<go:synonym>")) {
      String term = StringUtilities.findBetween(line, "<go:synonym>", "</go:synonym>");
      if (term.startsWith("GO:")) {
        // Alternative ID: register it as a database ID instead of as a term.
        ontology.setDatabaseIDForConcept(concept.getID(), new DatabaseID("GO", term));
      } else {
        concept.getTerms().add(new TermStore(term));
      }
    }
    if (line.startsWith("<go:definition>")) {
      concept.setDefinition(StringUtilities.findBetween(line, "<go:definition>", "</go:definition>"));
    }
  }
  System.out.println("Concepts found: " + ontology.size());
  System.out.println("Creating hierarchy");
  buildHierarchy(ontology);
  System.out.println("Finished building ontology");
  return ontology;
}
/**
 * Adds the GO relationships to an ontology.
 *
 * <p>Scans the GO file a second time, collecting for every accession the codes referenced by
 * its {@code <go:part_of>} and {@code <go:is_a>} lines, and stores each link as an isParentOf
 * relation (parent, isParentOf, child). Codes that cannot be resolved to a concept ID are
 * reported on standard output and the link is dropped.
 *
 * @param ontology ontology that already holds the GO concepts; receives the relations
 */
private void buildHierarchy(OntologyStore ontology) {
  final String accessionOpen = "<go:accession>";
  // Both link kinds are handled identically; only the opening tag differs.
  final String[] parentLinkPrefixes = {
    "<go:part_of rdf:resource=\"http://www.geneontology.org/go#",
    "<go:is_a rdf:resource=\"http://www.geneontology.org/go#"
  };

  Map<String, List<String>> parentCodesByChild = new HashMap<String, List<String>>();
  String currentCode = "";
  for (String rawLine : new ReadTextFile(goFile)) {
    String line = rawLine.trim();
    if (line.startsWith(accessionOpen)) {
      currentCode = StringUtilities.findBetween(line, accessionOpen, "</go:accession>");
    }
    for (String prefix : parentLinkPrefixes) {
      if (!line.startsWith(prefix)) {
        continue;
      }
      String parentCode = StringUtilities.findBetween(line, prefix, "\" />");
      List<String> parentCodes = parentCodesByChild.get(currentCode);
      if (parentCodes == null) {
        parentCodes = new ArrayList<String>();
        parentCodesByChild.put(currentCode, parentCodes);
      }
      parentCodes.add(parentCode);
    }
  }

  Map<String, Integer> cidByCode = getGO2CID(ontology);
  for (Entry<String, List<String>> link : parentCodesByChild.entrySet()) {
    String childCode = link.getKey();
    Integer childCID = cidByCode.get(childCode);
    for (String parentCode : link.getValue()) {
      Integer parentCID = cidByCode.get(parentCode);
      if (parentCID == null) {
        System.out.println("not found: " + parentCode);
        continue;
      }
      if (childCID == null) {
        System.out.println("not found: " + childCode);
        continue;
      }
      ontology.setRelation(new Relation(parentCID, DefaultTypes.isParentOf, childCID));
    }
  }
}
/**
 * Saves an ontology to a PSF file through OntologyPSFLoader.
 *
 * @param ontology ontology to save; it is cast to OntologyStore, so any other implementation
 *                 fails with a ClassCastException
 * @param filename name handed to {@code saveToPSF}
 */
public void dumpInPSF(Ontology ontology, String filename) {
  final OntologyPSFLoader psfLoader = new OntologyPSFLoader();
  final OntologyStore store = (OntologyStore) ontology;
  psfLoader.ontology = store;
  psfLoader.saveToPSF(filename);
}
// commented unused method, causes compilation trouble [RvS]
// public void showTree(Ontology ontology){
// OntologyViewerPanel panel = new OntologyViewerPanel(ontology, DefaultTypes.isParentOf);
// JDialog dialog = panel;
// dialog.setDefaultCloseOperation(JFrame.DISPOSE_ON_CLOSE);
// dialog.pack();
// dialog.setVisible(true);
// }
/**
 * Builds a lookup from GO code to concept ID.
 *
 * <p>Walks over all concepts of the ontology and records every database ID whose database
 * is "GO". When several concepts carry the same GO code, the one visited last wins.
 *
 * @param ontology ontology to index
 * @return map from GO code to the ID of the concept carrying it
 */
private Map<String, Integer> getGO2CID(Ontology ontology) {
  System.out.println(StringUtilities.now() + "\tFetching cids for GO concepts");
  final Map<String, Integer> cidByGoCode = new HashMap<String, Integer>();
  for (Iterator<Concept> concepts = ontology.getConceptIterator(); concepts.hasNext();) {
    final Concept current = concepts.next();
    for (DatabaseID id : ontology.getDatabaseIDsForConcept(current.getID())) {
      if (!id.database.equals("GO")) {
        continue;
      }
      cidByGoCode.put(id.ID, current.getID());
    }
  }
  System.out.println(StringUtilities.now() + "\tCids fetched");
  return cidByGoCode;
}
/**
 * Creates an IntegerSetStore containing the PMIDs for the GO concepts in the ontology.
 *
 * <p>The GO file is scanned line by line. The most recent {@code <go:accession>} determines
 * the current GO code. A line is treated as a PMID reference only when the line directly
 * before it is exactly {@code <go:database_symbol>PMID</go:database_symbol>} and the value
 * between {@code <go:reference>} tags passes {@code StringUtilities.isNumber}. GO codes that
 * are unknown to the ontology are reported on standard error and skipped.
 *
 * @param ontology ontology used to translate GO codes into concept IDs
 * @param filename name of the IntegerSetStore to write (concept ID -> set of PMIDs)
 */
public void dumpPMIDs(Ontology ontology, String filename) {
  System.out.println(StringUtilities.now() + "\tFetching PMIDS");
  String goCode = "";
  boolean isPMID = false;
  Map<Integer, List<Integer>> cid2pmids = new HashMap<Integer, List<Integer>>();
  Map<String, Integer> go2cid = getGO2CID(ontology);
  ReadTextFile file = new ReadTextFile(goFile);
  for (String fullLine : file) {
    String line = fullLine.trim();
    if (line.startsWith("<go:accession>")) {
      goCode = StringUtilities.findBetween(line, "<go:accession>", "</go:accession>");
    }
    if (isPMID) {
      String pmid = StringUtilities.findBetween(line, "<go:reference>", "</go:reference>");
      if (StringUtilities.isNumber(pmid)) {
        Integer cid = go2cid.get(goCode);
        if (cid == null) {
          System.err.println("GO code " + goCode + " not found in thesaurus");
        } else {
          List<Integer> pmids = cid2pmids.get(cid);
          if (pmids == null) {
            pmids = new ArrayList<Integer>();
            cid2pmids.put(cid, pmids);
          }
          pmids.add(Integer.parseInt(pmid));
        }
      }
    }
    // Remember whether THIS line announces a PMID; it is evaluated for the NEXT line.
    isPMID = line.equals("<go:database_symbol>PMID</go:database_symbol>");
  }

  System.out.println(StringUtilities.now() + "\tDumping to IntegerSetStore");
  IntegerSetStore integerSetStore = new IntegerSetStore(filename);
  try {
    for (Entry<Integer, List<Integer>> entry : cid2pmids.entrySet()) {
      SortedIntListSet pmids = new SortedIntListSet();
      for (Integer pmid : entry.getValue()) {
        pmids.add(pmid);
      }
      integerSetStore.set(entry.getKey(), pmids);
    }
  } finally {
    // Close the store even when writing one of the sets fails.
    integerSetStore.close();
  }
  System.out.println(StringUtilities.now() + "\tDone");
}
/**
 * Rewrites the members of the three GO concept sets in the MySQL table {@code set_2_concept}.
 *
 * <p>The isParentOf relations of {@code geneOntology} are turned into a tree. For every child
 * of the tree's top node whose name is one of "cellular_component", "molecular_function" or
 * "biological_process", all concepts in that subtree are translated to concept IDs of
 * {@code umls} through their database IDs. The rows of the matching concept set are then
 * deleted and re-inserted.
 *
 * <p>Categories with any other name are skipped; they are not written under the set ID of
 * another category. The INSERT is omitted when a category yields no concept IDs, because an
 * INSERT with an empty VALUES list is not valid SQL.
 *
 * @param geneOntology ontology holding the GO concepts and their isParentOf relations
 * @param umls         ontology used to map database IDs to the concept IDs that are stored
 * @param server       passed to MySQLgenericQuery
 * @param database     passed to MySQLgenericQuery
 * @param username     passed to MySQLgenericQuery
 * @param password     passed to MySQLgenericQuery
 * @param molfunc      concept set ID used for the "molecular_function" category
 * @param celcomp      concept set ID used for the "cellular_component" category
 * @param bioproc      concept set ID used for the "biological_process" category
 */
public void dumpDatasetsInMySQL(Ontology geneOntology, Ontology umls, String server, String database, String username, String password, int molfunc, int celcomp, int bioproc){
  System.out.println(StringUtilities.now() + "\tInserting GO");
  // The relation bookkeeping lives in instance fields; start from a clean slate so that a
  // previous call (possibly with another ontology) cannot leak into this tree.
  cui2children.clear();
  hasParent.clear();
  inTree.clear();
  fetchRelations(geneOntology, DefaultTypes.isParentOf);
  DefaultMutableTreeNode tree = buildTree(geneOntology);
  MySQLgenericQuery query = new MySQLgenericQuery(server, database, username, password);

  // Index-based access avoids depending on the element type of children(), which differs
  // between JDK versions.
  for (int i = 0; i < tree.getChildCount(); i++) {
    DefaultMutableTreeNode category = (DefaultMutableTreeNode) tree.getChildAt(i);
    String categoryName = category.getUserObject().toString();

    int conceptsetid;
    if (categoryName.equals("cellular_component")) {
      conceptsetid = celcomp;
    } else if (categoryName.equals("molecular_function")) {
      conceptsetid = molfunc;
    } else if (categoryName.equals("biological_process")) {
      conceptsetid = bioproc;
    } else {
      System.out.println("Skipping unknown GO category " + categoryName);
      continue;
    }
    System.out.println("Adding GO category "+categoryName);

    // Collect the target concept IDs of every concept in this category's subtree
    // (the enumeration includes the category node itself).
    Set<Integer> conceptIDs = new HashSet<Integer>();
    Enumeration<?> members = category.breadthFirstEnumeration();
    while (members.hasMoreElements()) {
      DefaultMutableTreeNode member = (DefaultMutableTreeNode) members.nextElement();
      Concept concept = (Concept) member.getUserObject();
      if (concept == null) {
        continue; // relation pointed at a concept the ontology could not supply
      }
      for (DatabaseID databaseID : geneOntology.getDatabaseIDsForConcept(concept.getID())) {
        Set<Integer> cuis = umls.getConceptIDs(databaseID);
        if (cuis != null) {
          conceptIDs.addAll(cuis);
        }
      }
    }

    List<String> stringList = new ArrayList<String>();
    for (Integer cui : conceptIDs) {
      stringList.add("('" + cui + "','" + conceptsetid + "')");
    }

    query.nonThreadedUpdate("DELETE FROM set_2_concept WHERE conceptsetid='" + conceptsetid + "'");
    if (!stringList.isEmpty()) {
      query.nonThreadedUpdate("INSERT INTO set_2_concept (conceptid, conceptsetid) VALUES " + StringUtilities.join(stringList, ","));
    }
  }
}
/**
 * Builds a tree from the relations previously collected by fetchRelations.
 *
 * <p>When exactly one concept has no parent, its subtree is returned directly. Otherwise a
 * node labelled "root" is created and the subtree of every parentless concept is attached
 * to it.
 *
 * @param ontology ontology used to look up the concept stored in each node
 * @return top node of the tree
 */
private DefaultMutableTreeNode buildTree(Ontology ontology) {
  final List<Integer> rootCuis = findParents();
  if (rootCuis.size() == 1) {
    return addNode(ontology, rootCuis.get(0));
  }
  final DefaultMutableTreeNode syntheticRoot = new DefaultMutableTreeNode("root");
  for (Integer rootCui : rootCuis) {
    syntheticRoot.add(addNode(ontology, rootCui));
  }
  return syntheticRoot;
}
// Helpers and state for building the tree:
/**
 * Indexes the relations of one type for tree building.
 *
 * <p>For every relation whose predicate equals {@code relationType}, the object is appended
 * to the child list of the subject in {@code cui2children} and is marked in
 * {@code hasParent}. Existing entries in those fields are kept, not cleared.
 *
 * @param ontology     ontology supplying the relations
 * @param relationType predicate value to select
 */
private void fetchRelations(Ontology ontology, int relationType) {
  for (Relation relation : ontology.getRelations()) {
    if (relation.predicate != relationType) {
      continue;
    }
    List<Integer> childrenOfSubject = cui2children.get(relation.subject);
    if (childrenOfSubject == null) {
      childrenOfSubject = new ArrayList<Integer>();
      cui2children.put(relation.subject, childrenOfSubject);
    }
    childrenOfSubject.add(relation.object);
    hasParent.add(relation.object);
  }
}
/**
 * Recursively creates the subtree rooted at a concept.
 *
 * <p>The node's user object is the concept returned by the ontology for {@code cui}. The ID
 * is also recorded in {@code inTree}. A concept listed under several parents gets a separate
 * node under each of them.
 *
 * @param ontology ontology used to fetch the concept for each node
 * @param cui      concept ID of the subtree root
 * @return node for {@code cui} with all descendants attached
 */
private DefaultMutableTreeNode addNode(Ontology ontology, Integer cui) {
  final DefaultMutableTreeNode subtree = new DefaultMutableTreeNode(ontology.getConcept(cui));
  inTree.add(cui);
  final List<Integer> childCuis = cui2children.get(cui);
  if (childCuis == null) {
    return subtree;
  }
  for (Integer childCui : childCuis) {
    subtree.add(addNode(ontology, childCui));
  }
  return subtree;
}
/**
 * Lists the top-level concepts: those that have children but were never seen as a child.
 *
 * @return concept IDs that are keys of {@code cui2children} and are absent from
 *         {@code hasParent}, in the iteration order of the map's key set
 */
private List<Integer> findParents() {
  // Start from every concept that has children, then drop the ones that have a parent.
  final List<Integer> roots = new ArrayList<Integer>(cui2children.keySet());
  roots.removeAll(hasParent);
  return roots;
}
/** Concept IDs that occur as the object (child) of a relation indexed by fetchRelations. */
private final Set<Integer> hasParent = new HashSet<Integer>();
/** Concept IDs for which addNode has created a tree node. */
private final Set<Integer> inTree = new HashSet<Integer>();
/** Child concept IDs per parent concept ID, filled by fetchRelations. */
private final Map<Integer, List<Integer>> cui2children = new HashMap<Integer, List<Integer>>();
// NOTE(review): this class does not implement Serializable in this file, so this constant
// appears to be unused; kept for compatibility.
private static final long serialVersionUID = 1L;
/*
public void dumpGOA(String filename) {
ReadTextFile file = new ReadTextFile(goFile);
WriteTextFile output = new WriteTextFile(filename);
String evidence = null;
boolean gene = false;
String databaseSymbol = null;
for (String line : file){
String trimLine = line.trim();
if (trimLine.startsWith("<go:accession>")){
goCode = StringUtilities.findBetween(trimLine, "<go:accession>","</go:accession>");
} else if (trimLine.startsWith("<go:evidence evidence_code=")){
evidence = StringUtilities.findBetween(trimLine, "<go:evidence evidence_code=\"","\"");
} else if (trimLine.equals("<go:gene_product rdf:parseType=\"Resource\">")){
gene = true;
} else if (gene && trimLine.startsWith("<go:database_symbol>")){
databaseSymbol = StringUtilities.findBetween(trimLine, "<go:database_symbol>","</go:database_symbol>");
} else if (gene && trimLine.startsWith("<go:reference>")){
String databaseID = StringUtilities.findBetween(trimLine, "<go:reference>","</go:reference>");
DatabaseID id = new DatabaseID(databaseSymbol, databaseID);
output.writeln(goCode + "\t" + id.database + "\t" + id.ID + "\t" + evidence);
gene = false;
} else if (trimLine.equals("</go:gene_product>")){
gene = false;
}
}
output.close();
}
*/
}