/* * The MIT License (MIT) * * Copyright (c) 2016 University of California San Diego * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. * * */ package org.broad.igv.util; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.util.CloseableIterator; import htsjdk.tribble.FeatureCodec; import org.broad.igv.Globals; import org.broad.igv.feature.genome.ChromosomeNameComparator; import org.broad.igv.feature.genome.FastaIndexedSequence; import org.broad.igv.feature.genome.GenomeManager; import org.broad.igv.feature.tribble.CodecFactory; import org.broad.igv.sam.PicardAlignment; import org.broad.igv.sam.reader.AlignmentReader; import org.broad.igv.sam.reader.AlignmentReaderFactory; import org.broad.igv.tools.sort.AsciiSorter; import org.broad.igv.tools.sort.SortableRecord; import java.io.*; import java.lang.reflect.Array; import java.util.*; /** * Utilities for creating compact files for tutorials */ public class TutorialUtils { public static void main(String[] args) throws IOException { sliceFasta(args[0], args[1], args[2]); //sampleVCF(args[0], args[1], Integer.parseInt(args[2])); //sliceVCF(args[0], args[1], args[2], Integer.parseInt(args[3]), Integer.parseInt(args[4])); //extractFasta(args[0], args[1], args[2]); // extractAlignments(args[0], args[1], args[2]); //extractFeatures(args[0], args[1], args[2]); } static void extractFasta(String inputFasta, String outputFasta, String regionsFile) throws IOException { FastaIndexedSequence inFasta = new FastaIndexedSequence(inputFasta); List<Region> regions = parseRegions(new File(regionsFile)); PrintWriter outFasta = null; try { outFasta = new PrintWriter(new BufferedWriter(new FileWriter(outputFasta))); for (Region r : regions) { byte[] sequence = inFasta.getSequence(r.chr, r.start, r.end); outFasta.println(">" + r.name); outFasta.println(new String(sequence)); } } finally { if (outFasta != null) outFasta.close(); } } static void extractAlignments(String inputFile, String outputFile, String regionsFile) throws IOException { AlignmentReader reader = AlignmentReaderFactory.getReader(inputFile, true); PrintWriter out = null; List<Region> regions = parseRegions(new File(regionsFile)); Map<String, List<Region>> regionMap = new HashMap<>(); for (Region r : regions) { List<Region> rlist = regionMap.get(r.chr); if (rlist == null) { rlist = new ArrayList<>(); regionMap.put(r.chr, rlist); } rlist.add(r); } try { out = new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); out.println("@HD VN:1.5 SO:coordinate"); for (Region r : regions) { out.println("@SQ\tSN:" + r.name + "\tLN:" + (r.end - r.start)); } for (Region r : regions) { CloseableIterator<PicardAlignment> iter = reader.query(r.chr, r.start, r.end, false); while (iter.hasNext()) { PicardAlignment alignment = iter.next(); SAMRecord record = alignment.getRecord(); record.setReferenceName(r.name); record.setAlignmentStart(record.getAlignmentStart() - r.start); if (record.getReadPairedFlag() && !record.getMateUnmappedFlag()) { if (record.getMateReferenceName().equals(record.getReferenceName())) { // todo -- handle interchr record.setMateReferenceName(r.name); record.setMateAlignmentStart(record.getMateAlignmentStart() - r.start); } else { // Try to find new mate chr String newMateChr = null; List<Region> rlist = regionMap.get(record.getMateReferenceName()); if (rlist != null) { for (Region r2 : rlist) { if (r2.contains(record.getMateAlignmentStart())) { newMateChr = r.name; break; } } } if (newMateChr != null) { record.setMateReferenceName(newMateChr); } else { record.setMateUnmappedFlag(true); } } } out.print(record.getSAMString()); } iter.close(); } } finally { if (out != null) out.close(); } } static void extractFeatures(String inputFile, String outputFile, String regionsFile) throws IOException { PrintWriter out = null; List<Region> regions = parseRegions(new File(regionsFile)); Map<String, IntervalTree<List<Feature>>> featureMap = loadFeatures(inputFile); try { out = new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); for (Region r : regions) { IntervalTree<List<Feature>> featureTree = featureMap.get(r.chr); if (featureTree != null) { List<Interval<List<Feature>>> intervals = featureTree.findOverlapping(r.start, r.end); for (Interval<List<Feature>> interval : intervals) { List<Feature> features = interval.getValue(); for (Feature f : features) { if (f.start >= r.start) { String s = f.tanslate(r.name, r.start); out.println(s); } } } } } } finally { if (out != null) out.close(); } } static List<Region> parseRegions(File file) throws IOException { List<Region> regions = new ArrayList<>(); BufferedReader reader = null; try { reader = new BufferedReader(new FileReader(file)); String nextLine; while ((nextLine = reader.readLine()) != null) { if (nextLine.startsWith("#")) continue; String[] tokens = Globals.whitespacePattern.split(nextLine); if (tokens.length > 3) { regions.add(new Region(tokens[0], Integer.parseInt(tokens[1]), Integer.parseInt(tokens[2]), tokens[3])); } } } finally { if (reader != null) reader.close(); } return regions; } static void sliceVCF(String inputFile, String outputFile, String chr, int start, int end) throws IOException { BufferedReader reader = null; PrintWriter out = null; try { reader = ParsingUtils.openBufferedReader(inputFile); out = new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); String nextLine; while ((nextLine = reader.readLine()) != null) { if (nextLine.startsWith("#")) { out.println(nextLine); } else { String[] tokens = Globals.tabPattern.split(nextLine); String c = tokens[0]; if (c.equals(chr)) { int pos = Integer.parseInt(tokens[1]); if (pos >= start && pos <= end) { out.println(nextLine); } if (pos > end) { break; } } } } } finally { if (out != null) out.close(); if (reader != null) reader.close(); } } // Keep every nth sample (genotype). static void sampleVCF(String inputFile, String outputFile, int n) throws IOException { BufferedReader reader = null; PrintWriter out = null; try { reader = new BufferedReader(new FileReader(inputFile)); out = new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); String nextLine; while ((nextLine = reader.readLine()) != null) { if (nextLine.startsWith("##")) { out.println(nextLine); } else { String[] tokens = Globals.tabPattern.split(nextLine); for(int i=0; i<9; i++) { out.print(tokens[i]); if(i < 8) out.print('\t'); } for(int i=9; i<tokens.length; i+=n) { if(i < tokens.length) { out.print('\t'); out.print(tokens[i]); } } out.println(); } } } finally { if (out != null) out.close(); if (reader != null) reader.close(); } } static void sliceFasta(String inputFasta, String outputFasta, String chr) throws IOException { BufferedReader reader = null; PrintWriter outFasta = null; try { reader = ParsingUtils.openBufferedReader(inputFasta); outFasta = new PrintWriter(new BufferedWriter(new FileWriter(outputFasta))); String nextLine; boolean chrFound = false; while ((nextLine = reader.readLine()) != null) { if(chrFound) { if(nextLine.startsWith(">")) { break; // Done } else { outFasta.println(nextLine); } } else { if(nextLine.startsWith(">" + chr)) { outFasta.println(nextLine); chrFound = true; } } } } finally { if (outFasta != null) outFasta.close(); } } // // // private static List<Region> mergeRegions(List<Region> regions) { // // List<Region> mergedRegions = new ArrayList<>(regions.size()); // // Region lastRegion = regions.get(0); // for (int i = 0; i < regions.size(); i++) { // Region r = regions.get(i); // if (r.chr.equals(lastRegion.chr)) { // if (r.start <= lastRegion.end) { // lastRegion.end = Math.max(lastRegion.end, r.end); // extend region // } else { // mergedRegions.add(lastRegion); // } // } else { // mergedRegions.add(lastRegion); // } // lastRegion = r; // } // return mergedRegions; // // } private static Comparator<Feature> getPositionComparator() { Comparator<Feature> comp = new Comparator<Feature>() { private Comparator<String> nameComparator = ChromosomeNameComparator.get(); public int compare(Feature o1, Feature o2) { int nameComp = nameComparator.compare(o1.chr, o2.chr); if (nameComp == 0) { return o1.start - o2.start; } else { return nameComp; } } }; return comp; } static class Region { String chr; int start; int end; String name; public Region(String chr, int start, int end, String name) { this.end = end; this.chr = chr; this.start = start; this.name = name; } public boolean contains(int p) { return p >= this.start && p <= this.end; } } // Hardcoded for genePred ext format (refGene.txt) static class Feature { String chr; int start; int end; String[] tokens; Feature(String[] tokens) { this.chr = tokens[2]; this.start = Integer.parseInt(tokens[4]); this.end = Integer.parseInt(tokens[5]); this.tokens = tokens; } boolean overlaps(String chr, int start, int end) { return chr.equals(this.chr) && end > this.start && start <= this.end; } String tanslate(String newChr, int offset) { tokens[2] = newChr; tokens[4] = String.valueOf(this.start - offset); tokens[5] = String.valueOf(this.end - offset); tokens[6] = String.valueOf(Integer.parseInt(tokens[6]) - offset); tokens[7] = String.valueOf(Integer.parseInt(tokens[7]) - offset); String newExonStart = ""; String[] exonStarts = tokens[9].split(","); for (String es : exonStarts) { newExonStart += String.valueOf(Integer.parseInt(es) - offset) + ","; } tokens[9] = newExonStart; String newExonEnd = ""; String[] exonEnds = tokens[10].split(","); for (String es : exonEnds) { newExonEnd += String.valueOf(Integer.parseInt(es) - offset) + ","; } tokens[10] = newExonEnd; String record = tokens[0]; for (int i = 1; i < tokens.length; i++) { record += "\t" + tokens[i]; } return record; } } /* ( string name; "Name of gene (usually transcript_id from GTF)" string chrom; "Chromosome name" char[1] strand; "+ or - for strand" uint txStart; "Transcription start position" uint txEnd; "Transcription end position" uint cdsStart; "Coding region start" uint cdsEnd; "Coding region end" uint exonCount; "Number of exons" uint[exonCount] exonStarts; "Exon start positions" uint[exonCount] exonEnds; "Exon end positions" int score; "Score" string name2; "Alternate name (e.g. gene_id from GTF)" string cdsStartStat; "enum('none','unk','incmpl','cmpl')" string cdsEndStat; "enum('none','unk','incmpl','cmpl')" lstring exonFrames; "Exon frame offsets {0,1,2}" ) */ //585 NR_046018 chr1 + 11873 14409 14409 14409 3 11873,12612,13220, 12227,12721,14409, 0 DDX11L1 unk unk -1,-1,-1, //585 NR_024540 chr1 - 14361 29370 29370 29370 11 14361,14969,15795,16606,16857,17232,17605,17914,18267,24737,29320, 14829,15038,15947,16765,17055,17368,17742,18061,18366,24891,29370, 0 WASH7P unk unk -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, // Return a map of interval trees, keyed by chromosome name. static Map<String, IntervalTree<List<Feature>>> loadFeatures(String file) throws IOException { BufferedReader reader = null; reader = ParsingUtils.openBufferedReader(file); String nextLine; String lastChr = null; List<Feature> currentFeatureList = new ArrayList<>(); int currentMin = Integer.MAX_VALUE; int currentMax = 0; Map<String, IntervalTree<List<Feature>>> map = new HashMap<>(); List<Feature> features = new ArrayList<>(); while ((nextLine = reader.readLine()) != null) { if (nextLine.startsWith("#") || nextLine.startsWith("track") || nextLine.startsWith("browser")) continue; String[] tokens = Globals.whitespacePattern.split(nextLine); Feature f = new Feature(tokens); features.add(f); } features.sort(getPositionComparator()); for (Feature f : features) { if (lastChr == null) { currentMin = f.start; currentMax = f.end; currentFeatureList.add(f); IntervalTree<List<Feature>> tree = new IntervalTree<>(); map.put(f.chr, tree); lastChr = f.chr; } else { if (!f.chr.equals(lastChr)) { // New tree IntervalTree<List<Feature>> tree = map.get(lastChr); tree.insert(new Interval(currentMin, currentMax, currentFeatureList)); if (map.containsKey(f.chr)) { System.out.println(); } tree = new IntervalTree<>(); map.put(f.chr, tree); lastChr = f.chr; currentFeatureList = new ArrayList<>(); currentFeatureList.add(f); currentMin = f.start; currentMax = f.end; } else if (currentFeatureList.size() > 10) { // New interval IntervalTree<List<Feature>> tree = map.get(lastChr); tree.insert(new Interval(currentMin, currentMax, currentFeatureList)); currentFeatureList = new ArrayList<>(); currentFeatureList.add(f); currentMin = f.start; currentMax = f.end; } else { // Update interval currentMin = Math.min(currentMin, f.start); currentMax = Math.max(currentMax, f.end); currentFeatureList.add(f); } } } return map; } }