/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package org.erasmusmc.dataimport.genes.ontologyBuilder; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; import org.erasmusmc.collections.CountingSet; import org.erasmusmc.ids.DatabaseID; import org.erasmusmc.ontology.Concept; import org.erasmusmc.ontology.DefaultTypes; import org.erasmusmc.ontology.Ontology; import org.erasmusmc.ontology.OntologyStore; import org.erasmusmc.ontology.Relation; import org.erasmusmc.ontology.TermStore; import org.erasmusmc.utilities.ReadTextFile; import org.erasmusmc.utilities.WriteTextFile; public class GeneList extends ArrayList<Gene>{ private static final long serialVersionUID = 2929874217342122759L; private Map<Integer, String> taxonID2name = initTaxonNames(); private Map<Integer, Integer> taxonID2CID = initTaxonCIDs(); public void printStatistics(){ int symbolCount = 0; int nameCount = 0; int idCount = 0; CountingSet<Integer> taxonCounts = new CountingSet<Integer>(); for (Gene gene : this){ symbolCount += gene.symbols.size(); nameCount += gene.names.size(); idCount += gene.ids.size(); taxonCounts.addAll(gene.taxonIDs); } System.out.println("TaxonIDs:"); taxonCounts.printCounts(); System.out.println("genes: " + this.size() + " symbols: " + symbolCount + " names: " + nameCount + " databaseIDs: " + idCount); } private Map<Integer, Integer> initTaxonCIDs() { Map<Integer,Integer> map = new HashMap<Integer, Integer>(); map.put(9606, -1001); map.put(10090, -1002); map.put(10116, -1003); map.put(83333, -1004); map.put(4932, -1005); map.put(7955, -1006); map.put(6239, -1007); map.put(5833, -1008); map.put(9031, -1009); map.put(7227, -1010); return map; } private Map<Integer, String> initTaxonNames() { Map<Integer,String> map = new HashMap<Integer, String>(); map.put(9606, "HSAPIENS"); map.put(10090, "MMUSCULUS"); map.put(10116, "RNORVEGICUS"); map.put(83333, "ECOLI"); map.put(4932, "SCEREVISIAE"); map.put(7955, "DRERIO"); map.put(6239, "CELEGANS"); map.put(5833, "FPLASMODIUM"); map.put(9031, "GGALLUS"); map.put(7227, "DMELANOGASTER"); return map; } public void saveToSimpleFile(String filename){ WriteTextFile out = new WriteTextFile(filename); for (Gene geneInfo : this){ StringBuilder sb = new StringBuilder(); if (geneInfo.preferredSymbol != null) sb.append(geneInfo.preferredSymbol); sb.append("\t"); sb.append(join(geneInfo.symbols)); sb.append("\t"); sb.append(join(geneInfo.names)); sb.append("\t"); sb.append(join(geneInfo.ids)); sb.append("\t"); sb.append(join(geneInfo.taxonIDs)); sb.append("\t"); sb.append(geneInfo.source); out.writeln(sb.toString()); } out.close(); } private static String join(Collection<?> items){ StringBuilder sb = new StringBuilder(); for (Object item : items){ if (sb.length() != 0) sb.append(";"); sb.append(item.toString().replace(";", " ")); } return sb.toString(); } public static GeneList loadFromSimpleFile(String filename){ GeneList geneList = new GeneList(); for (String line : new ReadTextFile(filename)){ String[] cols = line.split("\t"); Gene gene = new Gene(cols[5]); if (cols[0].length() != 0) gene.preferredSymbol = cols[0]; for (String symbol : cols[1].split(";")) gene.symbols.add(symbol); for (String name : cols[2].split(";")) gene.names.add(name); for (String id : cols[3].split(";")) { String[] parts = id.split("_"); gene.ids.add(new DatabaseID(parts[0],parts[1])); } for (String taxon : cols[4].split(";")) gene.taxonIDs.add(Integer.parseInt(taxon)); geneList.add(gene); } return geneList; } public OntologyStore convertToOntologyStore(int startCID){ OntologyStore ontology = new OntologyStore(); Concept semtype = new Concept(-116); semtype.setName("Amino Acid, Peptide, or Protein"); ontology.setConcept(semtype); Concept voc = new Concept(-1000); voc.setName("GENE"); ontology.setConcept(voc); for (Gene gene : this){ Concept concept = new Concept(startCID++); List<TermStore> terms = new ArrayList<TermStore>(); if (gene.preferredSymbol != null) terms.add(new TermStore(gene.preferredSymbol)); for (String symbol : gene.symbols) if (gene.preferredSymbol == null || !gene.preferredSymbol.equals(symbol)) terms.add(new TermStore(symbol)); for (String name : gene.names) terms.add(new TermStore(name)); for (DatabaseID id : gene.ids) ontology.setDatabaseIDForConcept(concept.getID(), id); concept.setTerms(terms); ontology.setConcept(concept); addVocsAndSemTypes(ontology, gene, concept.getID()); } return ontology; } private void addVocsAndSemTypes(Ontology ontology, Gene gene, Integer cid) { ontology.setRelation(new Relation(cid, DefaultTypes.fromVocabulary, -1000)); for (Integer taxonID : gene.taxonIDs){ int vocCID = getVocCID(ontology, taxonID); ontology.setRelation(new Relation(cid, DefaultTypes.fromVocabulary, vocCID)); } ontology.setRelation(new Relation(cid, DefaultTypes.isOfSemanticType, -116)); } private int getVocCID(Ontology ontology, Integer taxonID) { int cid = taxonID2CID.get(taxonID); if (ontology.getConcept(cid) == null){ Concept concept = new Concept(cid); concept.setName(taxonID2name.get(taxonID)); ontology.setConcept(concept); } return cid; } }