/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package org.erasmusmc.dataimport.genes.ontologyBuilder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.utilities.ReadTextFile;
/**
 * {@link DatabaseParser} that reads a tab-separated gene table (tagged "HG"; the identifiers in
 * the first column carry an "HGNC:" prefix) and turns every approved row into a {@code Gene}.
 *
 * <p>Column layout relied upon by this parser (0-based):
 * 0 = identifier, 1 = preferred symbol, 2 = preferred name, 3 = status,
 * 7 = comma-separated symbols, 8 = comma-separated names,
 * 31 = "GDB:"-prefixed id, 32..34 = cross-reference ids stored under the tags
 * "EG", "OM" and "UP" respectively.
 *
 * <p>The parser keeps the result of the current run in instance fields, so a single instance
 * should not be used for two parses at the same time.
 */
public class HGNCParser implements DatabaseParser {

  /** Terms consisting solely of digits, '-', '\', '/' and '.' are not useful as names. */
  private static final Pattern DIGITS_DOTS_SLASHES_PATTERN = Pattern.compile("^[0-9\\-\\\\/\\.]+$");

  /** Purely numeric identifier. */
  private static final Pattern NUMBER_PATTERN = Pattern.compile("^[0-9]+$");

  /** Taxon assigned to every parsed gene (presumably the NCBI taxonomy id for human - confirm). */
  private static final int DEFAULT_TAXON_ID = 9606;

  /** A row must provide at least this many columns because column 34 is the last one read. */
  private static final int MIN_FIELD_COUNT = 35;

  private static final int STATUS_COLUMN = 3;

  private GeneList geneList;
  private Set<Integer> taxonIDs;

  /**
   * Command-line entry point: parses the input table, prints statistics and saves the result.
   *
   * @param args optional; {@code args[0]} overrides the input file and {@code args[1]} the
   *     output file. Without arguments the historical default paths are used.
   */
  public static void main(String[] args) {
    String inputFile = args.length > 0 ? args[0] : "/data/HUGO/alldata.txt";
    String outputFile = args.length > 1 ? args[1] : "/home/temp/HGNC.txt";

    Set<Integer> taxonIDs = new HashSet<Integer>();
    taxonIDs.add(DEFAULT_TAXON_ID);

    HGNCParser parser = new HGNCParser();
    GeneList geneList = parser.parse(inputFile, taxonIDs);
    geneList.printStatistics();
    geneList.saveToSimpleFile(outputFile);
  }

  /** @return the tag given to genes created by this parser, {@code "HG"} */
  @Override
  public String getTag() {
    return "HG";
  }

  /**
   * Parses the given file into a fresh {@code GeneList}.
   *
   * @param filename path of the tab-separated input file; its first line is treated as a header
   * @param allowedTaxonIDs genes are only kept when their taxon id is contained in this set
   * @return the list of genes that were accepted
   */
  @Override
  public GeneList parse(String filename, Set<Integer> allowedTaxonIDs) {
    geneList = new GeneList();
    this.taxonIDs = allowedTaxonIDs;
    processFile(filename);
    return geneList;
  }

  /**
   * Reads the file line by line, skipping the header, rows that are too short and rows whose
   * status is not "approved", and adds one gene per remaining row to {@link #geneList}.
   */
  private void processFile(String filename) {
    boolean firstLine = true;
    for (String line : new ReadTextFile(filename)) {
      if (firstLine) {
        firstLine = false;
        continue; // skip header line
      }

      Gene gene = new Gene(getTag());
      gene.taxonIDs.add(DEFAULT_TAXON_ID);

      // NOTE(review): String.split drops trailing empty fields, so a row whose last columns are
      // all empty ends up shorter than MIN_FIELD_COUNT and is skipped. Confirm this is intended.
      String[] fields = line.split("\t");

      // The length must be checked BEFORE any column is read; otherwise a blank or truncated
      // line aborts the whole import with an ArrayIndexOutOfBoundsException.
      if (fields.length < MIN_FIELD_COUNT) {
        continue;
      }
      if (!fields[STATUS_COLUMN].equalsIgnoreCase("approved")) {
        continue;
      }

      String hgid = fields[0].trim();
      String preferredSymbol = fields[1].trim();
      String preferredName = fields[2].trim();
      String symbols = fields[7].trim();
      String names = fields[8].trim();
      String gdbid = fields[31].trim();
      String egid = fields[32].trim();
      String omid = fields[33].trim();
      String upid = fields[34].trim();

      if (isValidTerm(preferredSymbol)) {
        gene.preferredSymbol = preferredSymbol;
        gene.symbols.add(preferredSymbol);
      }
      for (String symbol : safeSplit(symbols)) {
        // Symbols starting with "LOC" are deliberately left out of the symbol list.
        if (isValidTerm(symbol) && !symbol.startsWith("LOC")) {
          gene.symbols.add(symbol);
        }
      }

      if (isValidTerm(preferredName)) {
        gene.names.add(preferredName);
      }
      for (String name : safeSplit(names)) {
        if (isValidTerm(name)) {
          gene.names.add(name);
        }
      }

      // Identifiers are only stored when, after removing a known prefix, they are purely numeric.
      addNumericId(gene, "HG", hgid.replaceFirst("HGNC:", ""));
      addNumericId(gene, "EG", egid);
      addNumericId(gene, "GD", gdbid.replaceFirst("GDB:", ""));
      addNumericId(gene, "OM", omid);
      if (upid.length() != 0) {
        gene.ids.add(new DatabaseID("UP", upid)); // not numeric, so any non-empty value is kept
      }

      if (taxonIDs.contains(gene.taxonIDs.iterator().next())) {
        geneList.add(gene);
      }
    }
  }

  /** Adds {@code id} to the gene under {@code database} if it consists of digits only. */
  private static void addNumericId(Gene gene, String database, String id) {
    if (NUMBER_PATTERN.matcher(id).matches()) {
      gene.ids.add(new DatabaseID(database, id));
    }
  }

  /**
   * Splits a comma-separated list while treating commas inside double quotes as ordinary text.
   * The quote characters themselves are dropped and every piece is trimmed.
   *
   * <p>The previous implementation tried to track the quote state with
   * {@code open = (open == 0 ? open++ : open--)}, which assigns the old value back and therefore
   * never changed the state; quoted commas were split like any other comma.
   *
   * @param line the text to split
   * @return the pieces in order; always at least one element (an empty string for empty input)
   */
  private static String[] safeSplit(String line) {
    List<String> result = new ArrayList<String>();
    StringBuilder current = new StringBuilder();
    boolean insideQuotes = false;

    for (int i = 0; i < line.length(); i++) {
      char ch = line.charAt(i);
      if (ch == '"') {
        insideQuotes = !insideQuotes;
      } else if (ch == ',' && !insideQuotes) {
        result.add(current.toString().trim());
        current.setLength(0);
      } else {
        current.append(ch);
      }
    }
    result.add(current.toString().trim());
    return result.toArray(new String[result.size()]);
  }

  /**
   * Decides whether a symbol or name is worth keeping.
   *
   * @param term candidate term
   * @return {@code false} when the trimmed term has fewer than three characters or consists only
   *     of digits, '-', '\', '/' and '.'; {@code true} otherwise
   */
  private static boolean isValidTerm(String term) {
    // A keyword filter ("similar", "putative", "hypothetical", "predicted", "uncharacterized")
    // used to follow these checks and has been disabled; the lower-casing is kept from it.
    String normalized = term.trim().toLowerCase();
    if (normalized.length() < 3) {
      return false;
    }
    return !DIGITS_DOTS_SLASHES_PATTERN.matcher(normalized).find();
  }
}