/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package org.erasmusmc.dataimport.genes.ontologyBuilder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import org.erasmusmc.ids.DatabaseID;
import org.erasmusmc.utilities.ReadTextFile;
/**
 * {@link DatabaseParser} that reads a pipe-delimited OMIM gene map file and
 * turns every usable line into a {@link Gene} tagged with {@code "OM"}.
 *
 * <p>A line is used when it has at least {@value #MIN_FIELD_COUNT} fields and
 * the identifier field consists of digits only. Each resulting gene receives
 * the identifier, the full name (when it passes {@link #isValidTerm(String)})
 * and every symbol that passes the same check.
 *
 * <p>The parser keeps the list being built and the allowed taxon IDs in
 * instance fields for the duration of {@link #parse(String, Set)}, so a single
 * instance must not run two parses at the same time.
 */
public class OMIMParser implements DatabaseParser {

	/** Tag reported by {@link #getTag()} and used as the database name of every ID. */
	private static final String TAG = "OM";

	/** Taxon assigned to every parsed gene (9606 is the NCBI Taxonomy ID of Homo sapiens). */
	private static final int TAXON_ID = 9606;

	/** Zero-based index of the field holding the comma/period separated symbols. */
	private static final int SYMBOLS_FIELD = 5;

	/** Zero-based index of the field holding the full gene name. */
	private static final int NAME_FIELD = 7;

	/** Zero-based index of the field holding the numeric identifier. */
	private static final int ID_FIELD = 9;

	/** Lines with fewer fields than this are skipped. */
	private static final int MIN_FIELD_COUNT = ID_FIELD + 1;

	/** Shortest term (after trimming) that is accepted as a name or symbol. */
	private static final int MIN_TERM_LENGTH = 3;

	/** Input file used by {@link #main(String[])} when no argument is given. */
	private static final String DEFAULT_INPUT = "/data/OMIM/genemap.txt";

	/** Output file used by {@link #main(String[])} when no second argument is given. */
	private static final String DEFAULT_OUTPUT = "/home/temp/OMIM.txt";

	/** Matches terms made up solely of digits, hyphens, backslashes, slashes and dots. */
	private static final Pattern digitsDotsSlashesPattern = Pattern.compile("^[0-9\\-\\\\/\\.]+$");

	/** Matches a non-empty run of decimal digits. */
	private static final Pattern numberPattern = Pattern.compile("^[0-9]+$");

	/** Gene list filled by the parse that is currently running. */
	private GeneList geneList;

	/** Taxon IDs accepted by the parse that is currently running. */
	private Set<Integer> taxonIDs;

	/**
	 * Parses a gene map file, prints statistics and saves the result.
	 *
	 * @param args optional: {@code args[0]} is the input file and
	 *             {@code args[1]} the output file; the built-in default
	 *             paths are used for whichever is missing
	 */
	public static void main(String[] args) {
		String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
		String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

		Set<Integer> taxonIDs = new HashSet<Integer>();
		taxonIDs.add(TAXON_ID);

		OMIMParser parser = new OMIMParser();
		GeneList geneList = parser.parse(input, taxonIDs);
		geneList.printStatistics();
		geneList.saveToSimpleFile(output);
	}

	/**
	 * @return the tag {@code "OM"} given to every gene created by this parser
	 */
	@Override
	public String getTag() {
		return TAG;
	}

	/**
	 * Reads the given file and returns the genes found in it.
	 *
	 * @param filename         path of the pipe-delimited gene map file
	 * @param allowedTaxonIDs  taxon IDs to keep; a gene is added only when its
	 *                         taxon ID is contained in this set
	 * @return a new {@link GeneList} holding the accepted genes
	 */
	@Override
	public GeneList parse(String filename, Set<Integer> allowedTaxonIDs) {
		geneList = new GeneList();
		this.taxonIDs = allowedTaxonIDs;
		processFile(filename);
		return geneList;
	}

	/**
	 * Converts every usable line of the file into a gene and adds it to
	 * {@link #geneList}.
	 *
	 * @param filename path of the file to read
	 */
	private void processFile(String filename) {
		for (String line : new ReadTextFile(filename)) {
			Gene gene = new Gene(getTag());
			gene.taxonIDs.add(TAXON_ID);

			String[] fields = safeSplit(line);
			if (fields.length < MIN_FIELD_COUNT) {
				continue;
			}

			String omid = fields[ID_FIELD].trim();
			if (!numberPattern.matcher(omid).matches()) {
				continue;
			}
			gene.ids.add(new DatabaseID(TAG, omid));

			String fullname = fields[NAME_FIELD].trim();
			if (isValidTerm(fullname)) {
				gene.names.add(fullname);
			}

			for (String rawSymbol : fields[SYMBOLS_FIELD].split(" *[,.] +")) {
				// The symbols field itself is not trimmed, so the first and last
				// token may carry padding; store the trimmed form that was validated.
				String symbol = rawSymbol.trim();
				if (isValidTerm(symbol)) {
					gene.symbols.add(symbol);
				}
			}

			if (taxonIDs.contains(gene.taxonIDs.iterator().next())) {
				geneList.add(gene);
			}
		}
	}

	/**
	 * Splits a line on {@code '|'} while respecting parentheses: a pipe that
	 * occurs inside a parenthesised group does not end the field and is
	 * replaced by a single space instead.
	 *
	 * <p>A closing parenthesis without a matching opening one is ignored for
	 * nesting purposes, so it cannot prevent later pipes from splitting.
	 *
	 * @param line the raw line
	 * @return the fields in order; always at least one element
	 */
	private String[] safeSplit(String line) {
		List<String> result = new ArrayList<String>();
		StringBuilder sb = new StringBuilder();
		int open = 0;
		for (int i = 0; i < line.length(); i++) {
			char ch = line.charAt(i);
			if (ch == '(') {
				open++;
			} else if (ch == ')' && open > 0) {
				// Never let the depth drop below zero on an unbalanced ')'.
				open--;
			}

			if (ch != '|') {
				sb.append(ch);
			} else if (open == 0) {
				result.add(sb.toString());
				sb = new StringBuilder();
			} else {
				sb.append(' ');
			}
		}
		result.add(sb.toString());
		return result.toArray(new String[result.size()]);
	}

	/**
	 * Decides whether a term is usable as a gene name or symbol.
	 *
	 * @param term the candidate term; surrounding whitespace is ignored
	 * @return {@code false} when the trimmed term is shorter than
	 *         {@value #MIN_TERM_LENGTH} characters or consists only of
	 *         digits, hyphens, slashes, backslashes and dots; {@code true}
	 *         otherwise
	 */
	private boolean isValidTerm(String term) {
		String normalized = term.trim().toLowerCase();
		if (normalized.length() < MIN_TERM_LENGTH) {
			return false;
		}
		return !digitsDotsSlashesPattern.matcher(normalized).matches();
	}
}