/* * The MIT License (MIT) * * Copyright (c) 2007-2015 Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package org.broad.igv.feature; import org.broad.igv.Globals; import org.broad.igv.feature.tribble.IGVBEDCodec; import org.broad.igv.util.ParsingUtils; import org.broad.igv.util.ResourceLocator; import htsjdk.tribble.Feature; import org.broadinstitute.gatk.utils.collections.PrimitivePair; import java.io.*; import java.util.*; /** * @author jrobinso */ public class FeatureFileUtils { static Set<String> types = new HashSet(Arrays.asList("SINE", "LINE", "LTR", "DNA", "Simple_repeat", "Low_complexity", "Satellite", "RNA", "Other", "Unknown", "Uncategorized")); /** * Compute feature density in units of # / window. Note: This function could be extended to other file types by * using codecs. * * @param iFile - a bed file * @param windowSize * @param step */ static void computeBedDensity(String iFile, String oFile, int windowSize, int step) throws IOException { //FeatureCodec codec = CodecFactory.getCodec(iFile, null); BufferedReader br = null; PrintWriter pw = null; try { br = ParsingUtils.openBufferedReader(iFile); pw = new PrintWriter(new BufferedWriter(new FileWriter(oFile))); String nextLine; Map<Integer, Window> openWindows = new LinkedHashMap<Integer, Window>(); int lastWindowOutput = 0; String lastChr = null; while ((nextLine = br.readLine()) != null) { if (nextLine.startsWith("track") || nextLine.startsWith("#")) continue; String[] tokens = Globals.tabPattern.split(nextLine); if (tokens.length < 3) continue; String chr = tokens[0]; int start = Integer.parseInt(tokens[1]); int end = Integer.parseInt(tokens[2]); if (!chr.equals(lastChr)) { lastWindowOutput = 0; for (Window window : openWindows.values()) { int w = window.idx; int windowCenter = w * step + (windowSize / 2); int windowStart = windowCenter - step / 2; int windowEnd = windowStart + step; pw.println(lastChr + "\t" + windowStart + "\t" + windowEnd + "\t" + window.count); } openWindows.clear(); lastChr = chr; } int startWindow = start / step; int endWindow = (end + windowSize) / step; for (int w = startWindow; w < endWindow; w++) { Window window = openWindows.get(w); if (window == null) { window = new Window(w); openWindows.put(w, window); } window.increment(); } // File is sorted by start position, will never see windows < startWindow aganin if (startWindow > lastWindowOutput) { for (int w = lastWindowOutput; w < startWindow; w++) { Window window = openWindows.get(w); if (window != null) { int windowCenter = w * step + (windowSize / 2); int windowStart = windowCenter - step / 2; int windowEnd = windowStart + step; pw.println(chr + "\t" + windowStart + "\t" + windowEnd + "\t" + window.count); } } for (int w = lastWindowOutput; w < startWindow; w++) { openWindows.remove(w); } lastWindowOutput = startWindow - 1; } } } finally { if (br != null) { br.close(); } if (pw != null) { pw.close(); } } } /** * Utility to convert a file with the following format to "bed" * ILMN_2087817 chrY:9529429:9529478:+ // "1" based? * ILMN_2204360 chrY:9598604:9598615:- chrY:9590985:9591022:- * * @param iFile * @param oFile * @param includeMultiMappings if false probes with multiple location mappings are filtered out */ public static void probeToBed(String iFile, String oFile, boolean includeMultiMappings) throws IOException { BufferedReader br = null; PrintWriter pw = null; try { br = ParsingUtils.openBufferedReader(iFile); pw = new PrintWriter(new FileWriter(oFile)); String nextLine; br.readLine(); // eat header while ((nextLine = br.readLine()) != null) { String[] tokens = Globals.commaPattern.split(nextLine); String probe = tokens[0]; if (tokens.length < 1 || (tokens.length > 2 && !includeMultiMappings)) continue; for (int i = 1; i < tokens.length; i++) { String loc = tokens[i]; String[] locParts = Globals.colonPattern.split(loc); String chr = locParts[0]; int start = Integer.parseInt(locParts[1]) - 1; int end = Integer.parseInt(locParts[2]); String strand = locParts[3]; pw.println(chr + "\t" + start + "\t" + end + "\t" + probe + "\t1000\t" + strand); } } } finally { if (br != null) br.close(); if (pw != null) pw.close(); } } public static void splitRepeatMasker(String iFile, String outputDirectory, String prefix) throws IOException { BufferedReader br = null; Map<String, PrintWriter> pws = new HashMap(); try { br = new BufferedReader(new FileReader(iFile)); File dir = new File(outputDirectory); for (String type : types) { File f = new File(dir, prefix + type + ".bed"); pws.put(type, new PrintWriter(new BufferedWriter(new FileWriter(f)))); } String nextLine; while ((nextLine = br.readLine()) != null) { if (nextLine.startsWith("#")) continue; String[] tokens = nextLine.split("\t"); String type = getType(tokens[5]); // Rerrange columns for legal bed pws.get(type).println(tokens[0] + "\t" + tokens[1] + "\t" + tokens[2] + "\t" + tokens[4] + "\t" + tokens[3]); } } finally { if (br != null) { br.close(); } for (PrintWriter pw : pws.values()) { pw.close(); } } } public static String getType(String s) { s = s.replace("?", ""); if (s.contains("RNA")) { return "RNA"; } else if (s.equals("RC")) { return "Other"; } else if (s.equals("repClass")) { return "Other"; } else if (types.contains(s)) { return s; } else { return "Uncategorized"; } } /** * Given a GFF File, creates a new GFF file for each type. Any feature type * which is part of a "gene" ( {@link SequenceOntology#geneParts} ) are put in the same file, * others are put in different files. So features of type "gene", "exon", and "mrna" * would go in gene.gff, but features of type "myFeature" would go in myFeature.gff. * * @param gffFile * @param outputDirectory * @throws IOException */ public static void splitGffFileByType(String gffFile, String outputDirectory) throws IOException { BufferedReader br = new BufferedReader(new FileReader(gffFile)); String nextLine; String ext = "." + gffFile.substring(gffFile.length() - 4); Map<String, PrintWriter> writers = new HashMap(); while ((nextLine = br.readLine()) != null) { nextLine = nextLine.trim(); if (!nextLine.startsWith("#")) { String[] tokens = Globals.tabPattern.split(nextLine.trim().replaceAll("\"", ""), -1); String type = tokens[2]; if (SequenceOntology.geneParts.contains(type)) { type = "gene"; } if (!writers.containsKey(type)) { writers.put(type, new PrintWriter(new FileWriter(new File(outputDirectory, type + ext)))); } } } br.close(); br = new BufferedReader(new FileReader(gffFile)); PrintWriter currentWriter = null; while ((nextLine = br.readLine()) != null) { nextLine = nextLine.trim(); if (nextLine.startsWith("#")) { for (PrintWriter pw : writers.values()) { pw.println(nextLine); } } else { String[] tokens = Globals.tabPattern.split(nextLine.trim().replaceAll("\"", ""), -1); String type = tokens[2]; if (SequenceOntology.geneParts.contains(type)) { type = "gene"; } currentWriter = writers.get(type); if (currentWriter != null) { currentWriter.println(nextLine); } else { System.out.println("No writer for: " + type); } } } br.close(); for (PrintWriter pw : writers.values()) { pw.close(); } } static class Window { int idx; int count; Window(int idx) { this.idx = idx; } void increment() { count++; } } /** * Create a "canonical" gene file from an annotation UCSC refseq type annotation file. The "canonical" file * represents all the isoforms of a gene as a single feature containing the union of all exons from the * splice forms for the gene. This might or might not represent an actual transcript (although it usually does). * * @param iFile * @param outputFile */ static void createCanonicalGeneFile(String iFile, String outputFile) throws IOException { BufferedReader br = null; PrintWriter pw = null; try { br = new BufferedReader(new FileReader(iFile)); pw = new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); FeatureParser parser = AbstractFeatureParser.getInstanceFor(new ResourceLocator(iFile), null); List<Feature> features = parser.loadFeatures(br, null); IGVBEDCodec codec = new IGVBEDCodec(); Map<String, List<BasicFeature>> genes = new HashMap<String, List<BasicFeature>>(); for (Feature f : features) { BasicFeature transcript = (BasicFeature) f; String geneName = transcript.getName(); List<BasicFeature> genelist = genes.get(geneName); if (genelist == null) { genelist = new ArrayList<BasicFeature>(); genes.put(geneName, genelist); } // Loop through genes with this name to find one that overlaps boolean foundOverlap = false; for (BasicFeature gene : genelist) { if (gene.overlaps(transcript)) { gene.setThickStart(Math.min(gene.getThickStart(), transcript.getThickStart())); gene.setThickEnd(Math.max(gene.getThickEnd(), transcript.getThickEnd())); mergeExons(gene, transcript.getExons()); foundOverlap = true; break; } } if (!foundOverlap) { genelist.add(transcript); } } for (List<BasicFeature> geneList : genes.values()) { for (BasicFeature gene : geneList) { pw.println(codec.encode(gene)); } } } finally { if (br != null) br.close(); if (pw != null) pw.close(); } } /** * Create a bed file of "TSS regions", define as the 20 bp region downstream of the start of the feature. * <p/> * It is assume that iFile is sorted by start position. * * @param iFile * @param outputFile */ static void createTSSFile(String iFile, String outputFile) throws IOException { BufferedReader br = null; PrintWriter pw = null; try { br = new BufferedReader(new FileReader(iFile)); pw = new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); FeatureParser parser = AbstractFeatureParser.getInstanceFor(new ResourceLocator(iFile), null); List<Feature> features = parser.loadFeatures(br, null); IGVBEDCodec codec = new IGVBEDCodec(); Map<String, List<BasicFeature>> genes = new HashMap<String, List<BasicFeature>>(); int lastTSS = -1; for (Feature f : features) { BasicFeature transcript = (BasicFeature) f; int tss = transcript.getStrand() == Strand.POSITIVE ? f.getStart() : f.getEnd(); if (tss != lastTSS) { int tssEnd = transcript.getStrand() == Strand.POSITIVE ? tss + 20 : tss - 20; pw.println(transcript.getChr() + "\t" + Math.min(tss, tssEnd) + "\t" + Math.max(tss, tssEnd)); lastTSS = tss; } } } finally { if (br != null) br.close(); if (pw != null) pw.close(); } } /* 0 `bin` smallint(5) unsigned NOT NULL, 1 `name` varchar(255) NOT NULL, 2 `chrom` varchar(255) NOT NULL, 3 `strand` char(1) NOT NULL, 4 `txStart` int(10) unsigned NOT NULL, 5 `txEnd` int(10) unsigned NOT NULL, 6 `cdsStart` int(10) unsigned NOT NULL, 7 `cdsEnd` int(10) unsigned NOT NULL, 8 `exonCount` int(10) unsigned NOT NULL, 9 `exonStarts` longblob NOT NULL, 10 `exonEnds` longblob NOT NULL, 11 `score` int(11) DEFAULT NULL, 12 `name2` varchar(255) NOT NULL, 13 `cdsStartStat` enum('none','unk','incmpl','cmpl') NOT NULL, 14 `cdsEndStat` enum('none','unk','incmpl','cmpl') NOT NULL, 15 `exonFrames` longblob NOT NULL, */ static void refgeneToBed(String iFile, String outputFile) throws IOException { BufferedReader br = null; PrintWriter pw = null; try { br = new BufferedReader(new FileReader(iFile)); pw = new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); String nextLine; while ((nextLine = br.readLine()) != null) { if (nextLine.startsWith("#") || nextLine.startsWith("track") || nextLine.startsWith("browser")) { pw.println(nextLine); continue; } String[] tokens = Globals.whitespacePattern.split(nextLine); String chr = tokens[2]; int start = Integer.parseInt(tokens[4]); String end = tokens[5]; String name = tokens[12]; String score = "1000"; String strand = tokens[3]; String thickStart = tokens[6]; String thickEnd = tokens[7]; String itemRGB = "."; int blockCount = Integer.parseInt(tokens[8]); String exonStarts = tokens[9]; String[] stok = Globals.commaPattern.split(exonStarts); String[] etok = Globals.commaPattern.split(tokens[10]); String blockStarts = ""; String blockSizes = ""; for (int i = 0; i < blockCount; i++) { final int bs = Integer.parseInt(stok[i]); blockStarts += String.valueOf(bs - start); blockSizes += String.valueOf(Integer.parseInt(etok[i]) - bs); if (i != blockCount - 1) { blockStarts += ","; blockSizes += ","; } } pw.println(chr + "\t" + start + "\t" + end + "\t" + name + "\t" + score + "\t" + strand + "\t" + thickStart + "\t" + thickEnd + "\t" + itemRGB + "\t" + blockCount + "\t" + blockSizes + "\t" + blockStarts); } } finally { if (br != null) br.close(); if (pw != null) pw.close(); } } private static void mergeExons(BasicFeature gene, List<Exon> exons) { Set<IExon> exonProxies = new HashSet<IExon>(gene.getExons()); for (Exon exon : gene.getExons()) { exonProxies.add(Exon.getExonProxy(exon)); } for (Exon exon : exons) { IExon proxy = Exon.getExonProxy(exon); if (!exonProxies.contains(proxy)) { gene.addExon(exon); } } FeatureUtils.sortFeatureList(gene.getExons()); } static void covertProbeMapToBedFile(String probeMapFile, String bedFile) throws FileNotFoundException, IOException { BufferedReader br = new BufferedReader(new FileReader(probeMapFile)); PrintWriter pw = new PrintWriter(new FileWriter(bedFile)); String nextLine; while ((nextLine = br.readLine()) != null) { String[] tokens = nextLine.split("\t"); Locus locus = Locus.fromString(tokens[1].trim()); pw.println( locus.getChr() + "\t" + locus.getStart() + "\t" + locus.getEnd() + "\t" + tokens[0].trim()); } br.close(); pw.close(); } static void splitEmblFileByType(String emblFile, String outputDirectory) throws IOException { BufferedReader br = new BufferedReader(new FileReader(emblFile)); String nextLine; Set<String> codes = new HashSet(); while ((nextLine = br.readLine()) != null) { if (nextLine.startsWith("FT") && (nextLine.length() > 19)) { String code = nextLine.substring(5, 19).trim(); if (code.length() > 0) { codes.add(code); } } } br.close(); Map<String, PrintWriter> writers = new HashMap(); for (String code : codes) { writers.put(code, new PrintWriter(new FileWriter(new File(outputDirectory, code + ".embl")))); } br = new BufferedReader(new FileReader(emblFile)); PrintWriter currentWriter = null; while ((nextLine = br.readLine()) != null) { if (nextLine.startsWith("ID")) { for (PrintWriter pw : writers.values()) { pw.println(nextLine); } } else if (nextLine.startsWith("FT")) { String code = nextLine.substring(5, 19).trim(); if (code.length() > 0) { currentWriter = writers.get(code); } if (currentWriter != null) { currentWriter.println(nextLine); } } else { currentWriter = null; } } br.close(); for (PrintWriter pw : writers.values()) { pw.close(); } } public static void main(String[] args) throws IOException { refgeneToBed(args[0], args[1]); } }