/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package org.erasmusmc.dataimport.genes.ontologyBuilder;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;
import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.utilities.ReadTextFile;
import org.erasmusmc.utilities.StringUtilities;
/**
 * Reads a UniProt-style flat file and collects one {@link Gene} per record into a
 * {@link GeneList}.
 *
 * <p>Each line is interpreted by its two-character line code (columns 1-2); the payload is
 * taken from column 6 onward. The handled codes are {@code ID}, {@code AC}, {@code DE},
 * {@code GN}, {@code OX}, {@code DR} and the record terminator {@code //}. All other lines
 * are ignored.
 *
 * <p>{@link #parse(String, Set)} keeps its working state in instance fields, so a single
 * instance must not be used for concurrent parses.
 */
public class UniProtParser implements DatabaseParser {
  /** Input file used by {@link #main(String[])} when no argument is supplied. */
  private static final String DEFAULT_INPUT = "/home/data/Swiss-Prot/human.txt";

  /** Output file used by {@link #main(String[])} when no second argument is supplied. */
  private static final String DEFAULT_OUTPUT = "/home/temp/uniprot.txt";

  /** Matches terms made up solely of digits, hyphens, backslashes, slashes and dots. */
  private static final Pattern DIGITS_DOTS_SLASHES = Pattern.compile("^[0-9\\-\\\\/\\.]+$");

  /** Substrings that mark a name as too vague to be used as a term. */
  private static final String[] REJECTED_FRAGMENTS = {
    "similar", "putative", "hypothetical", "predicted", "uncharacterized"
  };

  private GeneList geneList;
  private Set<Integer> taxonIDs;

  /**
   * Parses a flat file for taxon 9606 and saves the result as a simple file.
   *
   * @param args optional: {@code args[0]} is the input file, {@code args[1]} the output file;
   *     missing arguments fall back to the built-in default paths
   */
  public static void main(String[] args) {
    String input = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
    String output = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;

    UniProtParser parser = new UniProtParser();
    Set<Integer> taxonIDs = new HashSet<Integer>();
    taxonIDs.add(9606);
    GeneList geneList = parser.parse(input, taxonIDs);
    geneList.saveToSimpleFile(output);
  }

  /**
   * Parses the given file and returns the genes whose first taxon ID is allowed.
   *
   * @param filename path of the flat file to read
   * @param allowedTaxonIDs taxon IDs whose records are kept
   * @return a new list holding the accepted genes
   */
  @Override
  public GeneList parse(String filename, Set<Integer> allowedTaxonIDs) {
    geneList = new GeneList();
    this.taxonIDs = allowedTaxonIDs;
    processFile(filename);
    return geneList;
  }

  /** Returns the tag identifying this parser's source database. */
  @Override
  public String getTag() {
    return "UP";
  }

  /** Walks the file line by line, filling the current gene until a {@code //} line ends it. */
  private void processFile(String filename) {
    Gene gene = new Gene(getTag());
    for (String line : new ReadTextFile(filename)) {
      // Blank or truncated lines have no two-character line code; skip instead of crashing.
      if (line.length() < 2) {
        continue;
      }
      String prefix = line.substring(0, 2);
      String content = line.length() > 5 ? line.substring(5) : "";

      if (prefix.equals("ID")) {
        parseIdLine(gene, content);
      } else if (prefix.equals("AC")) {
        parseAccessionLine(gene, content);
      } else if (prefix.equals("DE")) {
        parseDescriptionLine(gene, content);
      } else if (prefix.equals("GN")) {
        parseGeneNameLine(gene, content);
      } else if (prefix.equals("OX")) {
        parseTaxonLine(gene, content);
      } else if (prefix.equals("DR")) {
        parseCrossReferenceLine(gene, content);
      } else if (prefix.equals("//")) {
        if (hasAllowedTaxon(gene)) {
          geneList.add(gene);
        }
        gene = new Gene(getTag());
      }
    }
  }

  /** Uses the part of the entry name before the first underscore as symbol. */
  private static void parseIdLine(Gene gene, String content) {
    int underscore = content.indexOf('_');
    if (underscore < 0) {
      return; // no underscore: nothing to split off
    }
    String symbol = content.substring(0, underscore).trim();
    if (isValidTerm(symbol)) {
      gene.symbols.add(symbol);
      gene.preferredSymbol = symbol;
    }
  }

  /** Adds every non-empty, semicolon-separated accession as a {@code UP} identifier. */
  private static void parseAccessionLine(Gene gene, String content) {
    for (String accession : content.split(";")) {
      String trimmed = accession.trim();
      if (trimmed.length() != 0) {
        gene.ids.add(new DatabaseID("UP", trimmed));
      }
    }
  }

  /** Picks up the {@code Full=} name and the {@code Short=} symbol, if valid. */
  private static void parseDescriptionLine(Gene gene, String content) {
    String fullName = StringUtilities.findBetween(content, "Full=", ";").trim();
    if (fullName.length() != 0 && isValidTerm(fullName)) {
      gene.names.add(fullName);
    }
    String shortName = StringUtilities.findBetween(content, "Short=", ";").trim();
    if (shortName.length() != 0 && isValidTerm(shortName)) {
      gene.symbols.add(shortName);
    }
  }

  /** Picks up the {@code Name=} symbol (made preferred) and the {@code Synonyms=} list. */
  private static void parseGeneNameLine(Gene gene, String content) {
    String geneSymbol = StringUtilities.findBetween(content, "Name=", ";").trim();
    if (isValidTerm(geneSymbol)) {
      gene.preferredSymbol = geneSymbol; // overrides the symbol taken from the ID line
      gene.symbols.add(geneSymbol);
    }
    String synonyms = StringUtilities.findBetween(content, "Synonyms=", ";");
    for (String synonym : synonyms.split(",")) {
      if (isValidTerm(synonym)) {
        gene.symbols.add(synonym.trim());
      }
    }
  }

  /**
   * Adds the taxon ID found after {@code NCBI_TaxID=}. Only the leading run of ASCII digits
   * is parsed, so a missing value or trailing annotation no longer raises
   * {@link NumberFormatException}; such lines simply contribute no taxon ID.
   */
  private static void parseTaxonLine(Gene gene, String content) {
    String raw = StringUtilities.findBetween(content, "NCBI_TaxID=", ";").trim();
    int end = 0;
    while (end < raw.length() && raw.charAt(end) >= '0' && raw.charAt(end) <= '9') {
      end++;
    }
    // At most nine digits always fits into an int.
    if (end > 0 && end <= 9) {
      gene.taxonIDs.add(Integer.parseInt(raw.substring(0, end)));
    }
  }

  /** Extracts the identifiers of the supported cross-referenced databases. */
  private static void parseCrossReferenceLine(Gene gene, String content) {
    addCrossReference(gene, content, "GeneID; ", "EG");
    addCrossReference(gene, content, "HGNC; HGNC:", "HG");
    // MIM identifiers are only taken from lines that also mention "gene".
    if (content.contains("gene")) {
      addCrossReference(gene, content, "MIM; ", "OM");
    }
    addCrossReference(gene, content, "UniGene; ", "UG");
    addCrossReference(gene, content, "EcoGene; ", "ECO");
    addCrossReference(gene, content, "SGD; ", "SGD");
    addCrossReference(gene, content, "MGI; MGI:", "MGI");
    addCrossReference(gene, content, "RGD; ", "RGD");
    addCrossReference(gene, content, "FlyBase; ", "FB");
    addCrossReference(gene, content, "ZFIN; ", "ZFIN");
    addCrossReference(gene, content, "WormBase; ", "WB");
  }

  /** Adds the text between {@code marker} and the next semicolon as an ID, when non-empty. */
  private static void addCrossReference(
      Gene gene, String content, String marker, String databaseTag) {
    String id = StringUtilities.findBetween(content, marker, ";");
    if (id.length() != 0) {
      gene.ids.add(new DatabaseID(databaseTag, id));
    }
  }

  /**
   * Tells whether the first taxon ID of the gene is in the allowed set. A gene without any
   * taxon ID is rejected rather than raising {@link java.util.NoSuchElementException}.
   */
  private boolean hasAllowedTaxon(Gene gene) {
    return gene.taxonIDs.iterator().hasNext()
        && taxonIDs.contains(gene.taxonIDs.iterator().next());
  }

  /**
   * Decides whether a term is specific enough to keep: at least three characters after
   * trimming, not purely digits/hyphens/slashes/dots, and free of the rejected fragments.
   */
  private static boolean isValidTerm(String term) {
    // Locale.ROOT keeps the lower-casing independent of the platform locale (e.g. Turkish i).
    String normalized = term.trim().toLowerCase(java.util.Locale.ROOT);
    if (normalized.length() < 3 || DIGITS_DOTS_SLASHES.matcher(normalized).find()) {
      return false;
    }
    for (String fragment : REJECTED_FRAGMENTS) {
      if (normalized.contains(fragment)) {
        return false;
      }
    }
    return true;
  }
}