/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package org.erasmusmc.dataimport.genes; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Set; import org.erasmusmc.utilities.ReadTextFile; import org.erasmusmc.utilities.StringUtilities; import org.erasmusmc.utilities.WriteTextFile; public class MapUnigene2LLID { public static void main(String[] args){ new MapUnigene2LLID(); } public MapUnigene2LLID(){ System.out.println(StringUtilities.now() + "\tLoading dataset"); //loadData("/home/schuemie/leiden/Human_genes.txt"); //loadData("/home/schuemie/leiden/Mouse_genes.txt"); loadData("/home/schuemie/leiden/top1007targets.txt"); System.out.println(StringUtilities.now() + "\tRetrieving LLIDs"); loadMapping("/data/UniGene/Hs.data"); //loadMapping("/data/UniGene/Mm.data"); System.out.println(StringUtilities.now() + "\tMerging"); //mergeData("/home/schuemie/leiden/Human_genes.txt" , "/home/schuemie/leiden/Human_genes_LLIDs.txt"); //mergeData("/home/schuemie/leiden/Mouse_genes.txt" , "/home/schuemie/leiden/Mouse_genes_LLIDs.txt"); mergeData("/home/schuemie/leiden/top1007targets.txt" , "/home/schuemie/leiden/top1007targets_LLIDs.txt"); } private void loadData(String filename) { ReadTextFile file = new ReadTextFile(filename); Iterator<String> iterator = file.getIterator(); while (iterator.hasNext()){ String line = iterator.next(); relevantIDs.add(line.split("\t")[0]); //First column is assumed to be Unigene ID } } private void loadMapping(String filename) { ReadTextFile file = new ReadTextFile(filename); Iterator<String> iterator = file.getIterator(); String llid = ""; while (iterator.hasNext()){ String line = iterator.next(); if (line.equals("//")) llid = ""; else if (line.startsWith("LOCUSLINK")){ llid = line.substring(12); } else if (line.startsWith("SEQUENCE ACC=")){ String acc = StringUtilities.findBetween(line, "ACC=", "."); if (relevantIDs.contains(acc)) acs2llid.put(acc, llid); } } } private void mergeData(String filename, String outfilename) { ReadTextFile file = new ReadTextFile(filename); WriteTextFile outfile = new WriteTextFile(outfilename); Iterator<String> iterator = file.getIterator(); while (iterator.hasNext()){ String line = iterator.next(); String id = line.split("\t")[0]; //First column is assumed to be Unigene ID String llid = acs2llid.get(id); if (llid == null) llid = ""; outfile.writeln(llid + "\t" + line); //LLID appended before original string } outfile.close(); } private Map<String, String> acs2llid = new HashMap<String, String>(); private Set<String> relevantIDs = new HashSet<String>(); }