/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 *  Rotterdam, The Netherlands
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>
 */
package org.erasmusmc.dataimport.genes;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.ontology.Concept;
import org.erasmusmc.ontology.DefaultTypes;
import org.erasmusmc.ontology.Ontology;
import org.erasmusmc.ontology.OntologyStore;
import org.erasmusmc.ontology.Relation;
import org.erasmusmc.utilities.ReadTextFile;
import org.erasmusmc.utilities.StringUtilities;
import org.erasmusmc.utilities.TextFileUtilities;

/**
 * Loads the EC numbers from the enzyme file of Swiss-Prot and inserts them into a
 * gene/protein ontology.
 *
 * <p>All work happens in the constructor: it adds an "ENZYME" vocabulary concept, one
 * concept per line of the class file and per {@code ID} line of the enzyme file, and then
 * links these concepts to each other and to the concepts that carry an "SP" database ID.
 */
public class ECnumberImport {

  /** Path of the tab-separated enzyme class file (EC number, tab, description). */
  public static String classFile = "/data/Swiss-Prot/enzclass_pimped.txt";

  /** Path of the enzyme file read for {@code ID}, {@code DE} and {@code CC} lines. */
  public static String enzymeFile = "/data/Swiss-Prot/enzyme.dat";

  /** Path of the protein data file read for {@code AC} and {@code DE} lines. */
  public static String dataFile = "/data/Swiss-Prot/uniprot_sprot.dat";

  /** Concept ID of the "ENZYME" vocabulary concept every EC concept is attached to. */
  private static final int ENZYME_VOCABULARY_ID = -1999;

  /** Accession (as found on {@code AC} lines) to the concept IDs of its EC numbers. */
  private Map<String, List<Integer>> SPID2EC = new HashMap<String, List<Integer>>();

  /** EC name (for example "EC 1.1.1.1") to concept ID; holds classes and enzymes. */
  private Map<String, Integer> EC2CID = new HashMap<String, Integer>();

  /** EC class name as given in the class file to concept ID. */
  private Map<String, Integer> ECclass2cid = new HashMap<String, Integer>();

  private Ontology ontology;

  /** Next concept ID to hand out; incremented after every concept that is created. */
  private int conceptid = 50000000;

  /**
   * Runs the import with explicit file locations.
   *
   * <p>The paths are stored in the public static fields of this class before the import
   * starts, so they remain in effect for any import started afterwards.
   *
   * @param ontology   ontology that receives the concepts and relations
   * @param classFile  path of the enzyme class file
   * @param enzymeFile path of the enzyme file
   * @param dataFile   path of the protein data file
   */
  public static void addECnumbers(Ontology ontology, String classFile, String enzymeFile, String dataFile) {
    ECnumberImport.classFile = classFile;
    ECnumberImport.enzymeFile = enzymeFile;
    ECnumberImport.dataFile = dataFile;
    new ECnumberImport(ontology);
  }

  /**
   * Runs the complete import against the files named by the static path fields.
   *
   * @param ontology ontology that receives the concepts and relations
   */
  public ECnumberImport(Ontology ontology) {
    this.ontology = ontology;

    System.out.println(StringUtilities.now() + "\tLoading protein ontology");
    if (ontology instanceof OntologyStore) {
      ((OntologyStore) ontology).createIndexForDatabaseIDs();
    }

    System.out.println(StringUtilities.now() + "\tLoading enzyme files");
    List<String> classLines = TextFileUtilities.loadFromFile(classFile);
    List<String> enzymeLines = TextFileUtilities.loadFromFile(enzymeFile);

    Concept vocabulary = new Concept(ENZYME_VOCABULARY_ID);
    vocabulary.setName("ENZYME");
    ontology.setConcept(vocabulary);

    System.out.println(StringUtilities.now() + "\tProcessing enzyme files");
    processClassLines(classLines);
    processLines(enzymeLines);

    System.out.println(StringUtilities.now() + "\tFetching EC numbers per protein");
    processDataFile(dataFile);

    System.out.println(StringUtilities.now() + "\tGenerating protein - enzyme class relations");
    generateRelations();
  }

  /**
   * Walks over every concept in the ontology and adds the protein-to-EC relations and the
   * EC-to-superclass relation that apply to it, then prints the total relation count.
   */
  private void generateRelations() {
    for (Concept concept : ontology) {
      int conceptID = concept.getID();
      addProteinToEnzymeRelations(conceptID);
      addSuperclassRelation(concept.getName(), conceptID);
    }
    System.out.println("Relations added: " + ontology.getRelations().size());
  }

  /** Makes every EC concept recorded for one of the concept's "SP" IDs a parent of the concept. */
  private void addProteinToEnzymeRelations(int proteinConceptID) {
    List<DatabaseID> databaseIDs = ontology.getDatabaseIDsForConcept(proteinConceptID);
    if (databaseIDs == null) {
      return;
    }
    // A set, so an EC number reached through several accessions yields one relation.
    Set<Integer> ecConceptIDs = new HashSet<Integer>();
    for (DatabaseID databaseID : databaseIDs) {
      if (databaseID.database.equals("SP")) {
        List<Integer> ecs = SPID2EC.get(databaseID.ID);
        if (ecs != null) {
          ecConceptIDs.addAll(ecs);
        }
      }
    }
    for (Integer ecConceptID : ecConceptIDs) {
      ontology.setRelation(new Relation(ecConceptID, DefaultTypes.isParentOf, proteinConceptID));
    }
  }

  /**
   * Links an EC concept to its closest known superclass.
   *
   * <p>The name is split on '.', and the trailing parts are replaced by "-" one at a time
   * (last part first). The first resulting name that is a known class other than the
   * concept itself becomes the parent; no further ancestors are linked.
   */
  private void addSuperclassRelation(String name, int conceptID) {
    if (name == null || !name.startsWith("EC ")) {
      return;
    }
    String[] parts = name.split("\\.");
    if (parts.length != 4) {
      return;
    }
    for (int i = 3; i > 0; i--) {
      parts[i] = "-";
      String className = StringUtilities.join(parts, ".");
      Integer superclassID = ECclass2cid.get(className);
      // A class concept generates its own name first; skip that self-match.
      if (superclassID != null && superclassID.intValue() != conceptID) {
        ontology.setRelation(new Relation(superclassID, DefaultTypes.isParentOf, conceptID));
        break;
      }
    }
  }

  /**
   * Reads the protein data file and records, per accession, the concept IDs of the EC
   * numbers mentioned in parentheses on the entry's {@code DE} lines.
   *
   * <p>Accessions are accumulated over consecutive {@code AC} lines, so entries whose
   * accession list wraps over several lines keep all their accessions.
   *
   * <p>NOTE(review): only the "(EC x.x.x.x)" notation is recognised; confirm that the data
   * file actually uses that notation.
   *
   * @param filename path of the protein data file
   */
  private void processDataFile(String filename) {
    List<String> accessions = new ArrayList<String>();
    boolean previousLineWasAC = false;
    ReadTextFile in = new ReadTextFile(filename);
    for (String line : in) {
      if (line.startsWith("AC")) {
        // The first AC line after any other line starts a new accession list.
        if (!previousLineWasAC) {
          accessions.clear();
        }
        collectAccessions(line, accessions);
        previousLineWasAC = true;
        continue;
      }
      previousLineWasAC = false;

      if (line.startsWith("DE") && line.length() > 6) {
        for (String segment : line.substring(6).split("\\(")) {
          String[] beforeClosingParen = segment.split("\\)");
          if (beforeClosingParen.length > 0 && beforeClosingParen[0].startsWith("EC ")) {
            // This parenthesised synonym is an EC number.
            Integer cid = EC2CID.get(beforeClosingParen[0].trim());
            if (cid != null) {
              registerEC(accessions, cid);
            }
          }
        }
      }
    }
  }

  /** Appends the ';'-separated accessions found after the 5-character line prefix. */
  private static void collectAccessions(String acLine, List<String> accessions) {
    if (acLine.length() <= 5) {
      return;
    }
    for (String accession : acLine.substring(5).split(";")) {
      String trimmed = accession.trim();
      if (trimmed.length() > 0) {
        accessions.add(trimmed);
      }
    }
  }

  /** Records the EC concept ID under each of the given accessions. */
  private void registerEC(List<String> accessions, Integer cid) {
    for (String accession : accessions) {
      List<Integer> cids = SPID2EC.get(accession);
      if (cids == null) {
        cids = new ArrayList<Integer>();
        SPID2EC.put(accession, cids);
      }
      cids.add(cid);
    }
  }

  /**
   * Creates one concept per class line. The first tab-separated column becomes the name,
   * the second the definition. Lines of at most one character, or without a second
   * column, are skipped.
   *
   * @param classLines lines of the enzyme class file
   */
  private void processClassLines(List<String> classLines) {
    for (String line : classLines) {
      if (line.length() <= 1) {
        continue;
      }
      String[] cols = line.split("\t");
      if (cols.length < 2) {
        continue; // no description column, so not a well-formed class line
      }
      String ecNumber = cols[0];
      Concept concept = new Concept(conceptid);
      concept.setName(ecNumber);
      concept.setDefinition(cols[1]);
      EC2CID.put(ecNumber, concept.getID());
      ECclass2cid.put(ecNumber, concept.getID());
      ontology.setConcept(concept);
      setVoc(concept);
      conceptid++;
    }
  }

  /**
   * Creates one concept per {@code ID} line of the enzyme file, named "EC " followed by
   * the identifier. Following {@code DE} lines set the definition (unless they mark a
   * transferred or deleted entry) and {@code CC} lines are appended to it.
   *
   * <p>NOTE(review): each {@code DE} line replaces the definition set by the previous one;
   * confirm whether descriptions spanning several {@code DE} lines should be concatenated.
   *
   * @param enzymeLines lines of the enzyme file
   */
  private void processLines(List<String> enzymeLines) {
    Concept concept = null;
    for (String line : enzymeLines) {
      if (line.startsWith("ID") && line.length() > 3) {
        String ecNumber = "EC " + line.substring(3).trim();
        concept = new Concept(conceptid);
        concept.setName(ecNumber);
        EC2CID.put(ecNumber, concept.getID());
        ontology.setConcept(concept);
        setVoc(concept);
        conceptid++;
      }

      if (concept == null) {
        continue; // DE/CC lines before the first ID line have no concept to attach to
      }

      if (line.startsWith("DE") && line.length() > 3
          && !line.contains("Transferred entry") && !line.contains("Deleted entry")) {
        // The last character of the line is dropped together with the line tag.
        concept.setDefinition(line.substring(3, line.length() - 1).trim() + "\n\n");
      }

      if (line.startsWith("CC") && line.length() > 3) {
        String existing = concept.getDefinition();
        if (existing == null) {
          existing = ""; // avoid concatenating the literal text "null"
        }
        concept.setDefinition(existing + " " + line.substring(3).replace("-!-", "").trim());
      }
    }
  }

  /** Attaches the concept to the "ENZYME" vocabulary concept. */
  private void setVoc(Concept concept) {
    Relation relation = new Relation(concept.getID(), DefaultTypes.fromVocabulary, ENZYME_VOCABULARY_ID);
    ontology.setRelation(relation);
  }
}