/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package org.erasmusmc.dataimport.genes.ontologyBuilder; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Set; import org.erasmusmc.ids.DatabaseID; import org.erasmusmc.utilities.ReadTextFile; import org.erasmusmc.utilities.StringUtilities; public class EntrezGeneParser implements DatabaseParser { public static boolean filterBadTypes = true; private boolean live; private boolean genetypeOk; private Gene gene ; private String dbTag; private String llid; private GeneList geneList; private List<String> discontinued; private Set<Integer> allowedTaxonIDs; private boolean geneOrProtein; @Override public GeneList parse(String filename, Set<Integer> allowedTaxonIDs) { this.allowedTaxonIDs = allowedTaxonIDs; geneList = new GeneList(); live = false; genetypeOk = false; gene = new Gene(getTag()); dbTag = ""; llid = ""; discontinued = new ArrayList<String>(); geneOrProtein = false; ReadTextFile file = new ReadTextFile(filename); Iterator<String> iterator = file.getIterator(); while(iterator.hasNext()){ processLine(iterator.next()); } return geneList; } private void processLine(String line) { String trimline = line.trim(); if (trimline.startsWith("<Gene-track_geneid>")) { llid = getValue(trimline); gene.ids.add(new DatabaseID("EG",getValue(trimline))); } if (trimline.startsWith("<Gene-track_status value=") && trimline.contains(("live"))) live = true; if (trimline.startsWith("<Entrezgene_type value=") && (trimline.contains(("protein-coding")) || trimline.contains(("unknown")) || trimline.contains(("other")))) genetypeOk = true; if (dbTag.equals("taxon") && trimline.startsWith("<Object-id_id>")){ dbTag = ""; String taxon = getValue(trimline); gene.taxonIDs.add(Integer.parseInt(taxon)); } if (trimline.startsWith("<Dbtag_db>")) dbTag = getValue(trimline); if (geneOrProtein) { if (trimline.startsWith("</Entrezgene_gene>") || trimline.startsWith("</Entrezgene_prot>")) geneOrProtein = false; else extractGeneOrProteinInfo(trimline); } if (trimline.startsWith("<Entrezgene_gene>") || trimline.startsWith("<Entrezgene_prot>")) geneOrProtein = true; if (trimline.startsWith("</Entrezgene>")) { if (allowedTaxonIDs.contains(gene.taxonIDs.iterator().next())){ if (filterBadTypes) filterBadTerms(gene); if (live && (genetypeOk || !filterBadTypes) && (gene.symbols.size() != 0)) geneList.add(gene); else if (!live) discontinued.add(llid); } live = false; genetypeOk = false; gene = new Gene(getTag()); } } private void extractGeneOrProteinInfo(String trimline) { if (trimline.startsWith("<Gene-ref_locus>")) { gene.preferredSymbol = getValue(trimline); gene.symbols.add(getValue(trimline)); } if (trimline.startsWith("<Gene-ref_desc>")) if (!(gene.taxonIDs.contains(10090) && StringUtilities.mapToWords(getValue(trimline)).size() < 3) && !gene.taxonIDs.contains(6239)) gene.names.add(getValue(trimline)); if (trimline.startsWith("<Gene-ref_syn_E>")) gene.symbols.add(getValue(trimline)); if (trimline.startsWith("<Prot-ref_name_E>")) if (gene.taxonIDs.contains(4932) && StringUtilities.mapToWords(getValue(trimline)).size() > 3) System.out.println("Prot name ignored: " + getValue(trimline)); // In yeast, field is abused for descriptions else gene.names.add(getValue(trimline)); String id = null; //if (trimline.startsWith("<Gene-ref_locus-tag>")) id = getValue(trimline); if (trimline.startsWith("<Object-id_id>")) id = getValue(trimline); if (trimline.startsWith("<Object-id_str>")) id = getValue(trimline); if (id != null){ if (dbTag.equals("MIM")) gene.ids.add(new DatabaseID("OM", id)); if (dbTag.equals("HGNC")) gene.ids.add(new DatabaseID("HG", id)); if (dbTag.equals("GDB")) gene.ids.add(new DatabaseID("GD", id.substring("GDB:".length()))); if (dbTag.equals("UniGene")) gene.ids.add(new DatabaseID("UG", id)); if (dbTag.equals("MGI")) gene.ids.add(new DatabaseID("MGI", id)); if (dbTag.equals("SGD")) gene.ids.add(new DatabaseID("SGD", id)); if (dbTag.equals("FLYBASE")) gene.ids.add(new DatabaseID("FB", id)); if (dbTag.equals("UniProt:")) gene.ids.add(new DatabaseID("UP", id)); if (dbTag.equals("UniProtKB/Swiss-Prot:")) gene.ids.add(new DatabaseID("UP", id)); if (dbTag.equals("WormBase")) gene.ids.add(new DatabaseID("WB", id)); if (dbTag.equals("RATMAP")) gene.ids.add(new DatabaseID("RM", id)); if (dbTag.equals("RGD")) gene.ids.add(new DatabaseID("RGD", id)); if (dbTag.equals("EcoGene")) gene.ids.add(new DatabaseID("ECO", id)); if (dbTag.equals("ZFIN")) gene.ids.add(new DatabaseID("ZFIN", id)); if (dbTag.equals("WormBase")) gene.ids.add(new DatabaseID("WB", id)); } } private void filterBadTerms(Gene geneInfo) { Iterator<String> symbolIterator = geneInfo.symbols.iterator(); boolean hasGoodSymbol = false; while (symbolIterator.hasNext()){ String symbol = symbolIterator.next(); if (!symbol.startsWith("LOC")) hasGoodSymbol = true; } Iterator<String> nameIterator = geneInfo.names.iterator(); while (nameIterator.hasNext()){ String name = nameIterator.next().toLowerCase(); if (name.contains("similar") || name.contains("putative") || name.contains("hypothetical") || name.contains("predicted") || name.contains("uncharacterized") || name.contains("conserved") || name.contains("expressed") || name.contains("deletion") || name.contains("duplication")) nameIterator.remove(); } if (!hasGoodSymbol && geneInfo.names.size() == 0) geneInfo.symbols.clear(); } private String getValue(String line) { int x = line.indexOf(">"); int y = line.lastIndexOf("<"); return line.substring(x+1,y); } @Override public String getTag() { return "EG"; } }