/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package org.erasmusmc.dataimport.genes; import java.io.BufferedWriter; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; import org.erasmusmc.ids.DatabaseID; import org.erasmusmc.ontology.Concept; import org.erasmusmc.ontology.OntologyPSFLoader; import org.erasmusmc.ontology.OntologyStore; import org.erasmusmc.ontology.TermStore; import org.erasmusmc.textMining.LVG.LVGNormaliser; import org.erasmusmc.utilities.ReadTextFile; import org.erasmusmc.utilities.StringUtilities; /** * Extracts information from an Entrez-Gene XML file. * * @author Schuemie * */public class EntrezGeneGenes { public boolean filterBadTypes = true; public static void main(String[] args){ EntrezGeneGenes egg = new EntrezGeneGenes(); egg.run(); } public void run(){ System.out.println(StringUtilities.now()+"\tLoading normaliser cache"); normaliser.loadCacheBinary("/home/public/Peregrine/standardNormCache2006.bin"); System.out.println(StringUtilities.now()+"\tProcessing file"); processFile("/home/data/Entrez-gene/Homo_sapiens.xml"); System.out.println(StringUtilities.now()+"\tDumping output"); //dumpOutputToPSFFormat("/home/schuemie/EG_Rat.psf"); dumpOutputToJansFormat("/home/data/geneprotein_EG_norm.txt"); //dumpOutputToSimpleFormat("/home/schuemie/EntrezGeneSimple.txt"); //TextFileUtilities.saveToFile(discontinued, "/home/schuemie/WithdrawnIDs_Fly.txt"); //System.out.println(discontinued.size() + " genes written to " + "/home/schuemie/WithdrawnIDs.txt"); } private void dumpOutputToPSFFormat(String filename) { OntologyStore ontology = new OntologyStore(); for (GeneInfo geneInfo : geneInfos){ Concept concept = new Concept(Integer.parseInt(getID(geneInfo, "LL_"))); List<TermStore> terms = new ArrayList<TermStore>(); for (String symbol : geneInfo.symbols){ terms.add(new TermStore(symbol)); } for (String name : geneInfo.names){ terms.add(new TermStore(name)); } for (String id : geneInfo.ids){ String[] idparts = id.split("_"); if (idparts.length < 2) System.out.println(id); else ontology.setDatabaseIDForConcept(concept.getID(), new DatabaseID(idparts[0], idparts[1])); } concept.setTerms(terms); ontology.setConcept(concept); } OntologyPSFLoader loader = new OntologyPSFLoader(); loader.ontology = ontology; loader.saveToPSF(filename); } private void dumpOutputToSimpleFormat(String filename) { try { FileOutputStream PSFFile = new FileOutputStream(filename); BufferedWriter bufferedWrite = new BufferedWriter( new OutputStreamWriter(PSFFile),1000000); try { for (GeneInfo geneInfo : geneInfos){ bufferedWrite.write(info2simpleline(geneInfo)); bufferedWrite.newLine(); } bufferedWrite.flush(); bufferedWrite.close(); }catch (IOException e) { e.printStackTrace(); } } catch (FileNotFoundException e){ e.printStackTrace(); } } private String info2simpleline(GeneInfo geneInfo) { StringBuffer line = new StringBuffer(); line.append(geneInfo.symbols.get(0)); // Preferred symbol for (int i = 1; i < geneInfo.symbols.size(); i++){ //Other symbols line.append(";"); line.append(geneInfo.symbols.get(i)); } line.append("\t"); boolean first = true; for (String name : geneInfo.names){ //Gene names if (first) first = false; else line.append(";"); line.append(name); } line.append("\t"); line.append(getID(geneInfo, "LL_")); return line.toString(); } private void dumpOutputToJansFormat(String filename) { try { FileOutputStream PSFFile = new FileOutputStream(filename); BufferedWriter bufferedWrite = new BufferedWriter( new OutputStreamWriter(PSFFile),1000000); try { for (GeneInfo geneInfo : geneInfos){ bufferedWrite.write(info2line(geneInfo)); bufferedWrite.newLine(); } bufferedWrite.flush(); bufferedWrite.close(); }catch (IOException e) { e.printStackTrace(); } } catch (FileNotFoundException e){ e.printStackTrace(); } System.out.println(geneInfos.size() + " genes written to " + filename); } private String info2line(GeneInfo geneInfo) { StringBuffer line = new StringBuffer(); line.append(getID (geneInfo, "OM_")); line.append("|"); line.append(getID (geneInfo, "GD_")); line.append("|"); line.append(getID (geneInfo, "LL_")); line.append("|"); line.append(getID (geneInfo, "RQ_")); line.append("|"); line.append(getID (geneInfo, "UG_")); line.append("|"); line.append(getID (geneInfo, "SP_")); line.append("|"); line.append(getID (geneInfo, "HG_")); line.append("|"); line.append(geneInfo.symbols.get(0)); line.append("|"); // Preferred symbol for (int i = 1; i < geneInfo.symbols.size(); i++){ //Other symbols if (i != 1) line.append("\t"); line.append(geneInfo.symbols.get(i)); } line.append("|"); boolean first = true; for (String name : geneInfo.names){ //Normalised gene names if (first) first = false; else line.append("\t"); line.append(normaliser.normalise(name)); } line.append("|"); first = true; for (String name : geneInfo.names){ //Gene names if (first) first = false; else line.append("\t"); line.append(name); } return line.toString(); } private String getID(GeneInfo geneInfo, String prefix){ StringBuffer result = new StringBuffer(); Set<String> unique = new HashSet<String>(); for (String id : geneInfo.ids) if (id.startsWith(prefix)) unique.add(id); for (String id : unique) { if (result.length() != 0) result.append("\t"); result.append(id.replace(prefix,"")); } return result.toString(); } private void processFile(String filename) { ReadTextFile file = new ReadTextFile(filename); Iterator<String> iterator = file.getIterator(); while(iterator.hasNext()){ processLine(iterator.next()); } } private boolean live = false; private boolean genetypeOk = false; private GeneInfo currentGeneInfo = new GeneInfo(); private String dbTag = ""; private String llid = ""; private void processLine(String line) { if (line.toLowerCase().contains("</entrezgene>")) { count++; if (count % 100 == 0) System.out.println(count + " genes analysed."); } String trimline = line.trim(); if (trimline.startsWith("<Gene-track_geneid>")) { llid = getValue(trimline); currentGeneInfo.ids.add("LL_"+getValue(trimline)); } if (trimline.startsWith("<Gene-track_status value=") && trimline.contains(("live"))) live = true; if (trimline.startsWith("<Entrezgene_type value=") && (trimline.contains(("protein-coding")) || trimline.contains(("unknown")) || trimline.contains(("other")))) genetypeOk = true; if (trimline.startsWith("<Gene-ref_locus>")) currentGeneInfo.symbols.add(getValue(trimline)); if (trimline.startsWith("<Gene-ref_desc>")) currentGeneInfo.names.add(getValue(trimline)); if (trimline.startsWith("<Gene-ref_syn_E>")) currentGeneInfo.symbols.add(getValue(trimline)); if (trimline.startsWith("<Prot-ref_name_E>")) currentGeneInfo.names.add(getValue(trimline)); String id = ""; if (trimline.startsWith("<Gene-ref_locus-tag>")) id = getValue(trimline); if (trimline.startsWith("<Dbtag_db>")) dbTag = getValue(trimline); if (trimline.startsWith("<Object-id_id>")) id = dbTag + ":" + getValue(trimline); if (trimline.startsWith("<Object-id_str>")) id = dbTag + ":" + getValue(trimline); if (!id.equals("")){ if (id.startsWith("MIM")) currentGeneInfo.ids.add(id.replace("MIM:", "OM_")); if (id.startsWith("HGNC")) currentGeneInfo.ids.add(id.replace("HGNC:", "HG_")); if (id.startsWith("GDB")) currentGeneInfo.ids.add(id.replace("GDB:GDB:", "GD_")); if (id.startsWith("UniGene")) currentGeneInfo.ids.add(id.replace("UniGene:", "UG_")); if (id.startsWith("MGI")) currentGeneInfo.ids.add(id.replace("MGI:", "MGI_")); if (id.startsWith("SGD")) currentGeneInfo.ids.add(id.replace("SGD:", "SGD_")); if (id.startsWith("FLYBASE")) currentGeneInfo.ids.add(id.replace("FLYBASE:", "FB_")); if (id.startsWith("UniProt:")) currentGeneInfo.ids.add(id.replace("UniProt:", "SP_")); if (id.startsWith("UniProtKB/Swiss-Prot:")) currentGeneInfo.ids.add(id.replace("UniProtKB/Swiss-Prot:", "SP_")); if (id.startsWith("WormBase")) currentGeneInfo.ids.add(id.replace("WormBase:", "WB_")); if (id.startsWith("RATMAP")) currentGeneInfo.ids.add(id.replace("RATMAP:", "RM_")); if (id.startsWith("RGD")) currentGeneInfo.ids.add(id.replace("RGD:", "RGD_")); } if (trimline.startsWith("</Entrezgene>")) { if (filterBadTypes) filterBadTerms(currentGeneInfo); if (live && (genetypeOk || !filterBadTypes) && (currentGeneInfo.symbols.size() != 0)) geneInfos.add(currentGeneInfo); else if (!live) discontinued.add(llid); live = false; genetypeOk = false; currentGeneInfo = new GeneInfo(); } } private void filterBadTerms(GeneInfo geneInfo) { Iterator<String> symbolIterator = geneInfo.symbols.iterator(); boolean hasGoodSymbol = false; while (symbolIterator.hasNext()){ String symbol = symbolIterator.next(); if (!symbol.startsWith("LOC")) hasGoodSymbol = true; } Iterator<String> nameIterator = geneInfo.names.iterator(); while (nameIterator.hasNext()){ String name = nameIterator.next().toLowerCase(); if (name.contains("similar") || name.contains("putative") || name.contains("hypothetical")) nameIterator.remove(); } if (!hasGoodSymbol && geneInfo.names.size() == 0) geneInfo.symbols.clear(); } private int count = 0; private String getValue(String line) { int x = line.indexOf(">"); int y = line.lastIndexOf("<"); return line.substring(x+1,y); } /* //Process line for table file private void processLine(String line){ String[] cols = line.split("\t"); if (cols[0].equals(taxonomyID)){ GeneInfo geneInfo = new GeneInfo(); geneInfo.symbols.add(cols[2]); //Preferred symbol String[] symbols = cols[4].split("[|]"); for (String symbol : symbols) geneInfo.symbols.add(symbol); geneInfo.ids.add("LL_"+cols[1]); String[] ids = cols[5].split("[|]"); for (String id : ids){ if (id.startsWith("MIM")) geneInfo.ids.add(id.replace("MIM:", "OM_")); if (id.startsWith("HGNC")) geneInfo.ids.add(id.replace("HGNC:", "HG_")); if (id.startsWith("GDB")) geneInfo.ids.add(id.replace("GDB:", "GD_")); if (id.startsWith("UNIGENE")) geneInfo.ids.add(id.replace("UNIGENE:", "UG_")); if (id.startsWith("MGI")) geneInfo.ids.add(id.replace("MGI:", "MGI_")); if (id.startsWith("SGD")) geneInfo.ids.add(id.replace("SGD:", "SGD_")); if (id.startsWith("FLYBASE")) geneInfo.ids.add(id.replace("FLYBASE:", "FB_")); } if (!cols[8].equals("") && !geneInfo.symbols.contains(cols[8])) geneInfo.names.add(cols[8]); if (!cols[10].equals("") && !geneInfo.symbols.contains(cols[10])&& !geneInfo.names.contains(cols[10])) geneInfo.names.add(cols[10]); geneInfos.add(geneInfo); } } */ private class GeneInfo{ List<String> ids = new ArrayList<String>(); List<String> symbols = new ArrayList<String>(); Set<String> names = new HashSet<String>(); } private List<GeneInfo> geneInfos = new ArrayList<GeneInfo>(); private List<String> discontinued = new ArrayList<String>(); private LVGNormaliser normaliser = new LVGNormaliser(); }