TutorialUtils.java example

Explorer
igv-master
- src
- test
  - src
/*
 *  The MIT License (MIT)
 *
 * Copyright (c) 2016 University of California San Diego
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *
 */

package org.broad.igv.util;

import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.tribble.FeatureCodec;
import org.broad.igv.Globals;
import org.broad.igv.feature.genome.ChromosomeNameComparator;
import org.broad.igv.feature.genome.FastaIndexedSequence;
import org.broad.igv.feature.genome.GenomeManager;
import org.broad.igv.feature.tribble.CodecFactory;
import org.broad.igv.sam.PicardAlignment;
import org.broad.igv.sam.reader.AlignmentReader;
import org.broad.igv.sam.reader.AlignmentReaderFactory;
import org.broad.igv.tools.sort.AsciiSorter;
import org.broad.igv.tools.sort.SortableRecord;

import java.io.*;
import java.lang.reflect.Array;
import java.util.*;

/**
 * Utilities for creating compact files for tutorials
 */

public class TutorialUtils {


    public static void main(String[] args) throws IOException {
        sliceFasta(args[0], args[1], args[2]);
        //sampleVCF(args[0], args[1], Integer.parseInt(args[2]));
        //sliceVCF(args[0], args[1], args[2], Integer.parseInt(args[3]), Integer.parseInt(args[4]));
        //extractFasta(args[0], args[1], args[2]);
        // extractAlignments(args[0], args[1], args[2]);
        //extractFeatures(args[0], args[1], args[2]);
    }

    static void extractFasta(String inputFasta, String outputFasta, String regionsFile) throws IOException {

        FastaIndexedSequence inFasta = new FastaIndexedSequence(inputFasta);

        List<Region> regions = parseRegions(new File(regionsFile));

        PrintWriter outFasta = null;

        try {
            outFasta = new PrintWriter(new BufferedWriter(new FileWriter(outputFasta)));

            for (Region r : regions) {

                byte[] sequence = inFasta.getSequence(r.chr, r.start, r.end);

                outFasta.println(">" + r.name);
                outFasta.println(new String(sequence));
            }
        } finally {
            if (outFasta != null) outFasta.close();
        }
    }

    static void extractAlignments(String inputFile, String outputFile, String regionsFile)
            throws IOException {

        AlignmentReader reader = AlignmentReaderFactory.getReader(inputFile, true);
        PrintWriter out = null;

        List<Region> regions = parseRegions(new File(regionsFile));
        Map<String, List<Region>> regionMap = new HashMap<>();
        for (Region r : regions) {
            List<Region> rlist = regionMap.get(r.chr);
            if (rlist == null) {
                rlist = new ArrayList<>();
                regionMap.put(r.chr, rlist);
            }
            rlist.add(r);
        }

        try {
            out = new PrintWriter(new BufferedWriter(new FileWriter(outputFile)));
            out.println("@HD VN:1.5 SO:coordinate");
            for (Region r : regions) {
                out.println("@SQ\tSN:" + r.name + "\tLN:" + (r.end - r.start));
            }

            for (Region r : regions) {

                CloseableIterator<PicardAlignment> iter = reader.query(r.chr, r.start, r.end, false);
                while (iter.hasNext()) {
                    PicardAlignment alignment = iter.next();
                    SAMRecord record = alignment.getRecord();
                    record.setReferenceName(r.name);
                    record.setAlignmentStart(record.getAlignmentStart() - r.start);

                    if (record.getReadPairedFlag() && !record.getMateUnmappedFlag()) {
                        if (record.getMateReferenceName().equals(record.getReferenceName())) {  // todo -- handle interchr
                            record.setMateReferenceName(r.name);
                            record.setMateAlignmentStart(record.getMateAlignmentStart() - r.start);
                        } else {
                            // Try to find new mate chr
                            String newMateChr = null;
                            List<Region> rlist = regionMap.get(record.getMateReferenceName());
                            if (rlist != null) {
                                for (Region r2 : rlist) {
                                    if (r2.contains(record.getMateAlignmentStart())) {
                                        newMateChr = r.name;
                                        break;
                                    }
                                }
                            }
                            if (newMateChr != null) {
                                record.setMateReferenceName(newMateChr);
                            } else {
                                record.setMateUnmappedFlag(true);
                            }
                        }
                    }
                    out.print(record.getSAMString());
                }
                iter.close();
            }
        } finally {
            if (out != null) out.close();
        }
    }


    static void extractFeatures(String inputFile, String outputFile, String regionsFile)
            throws IOException {

        PrintWriter out = null;

        List<Region> regions = parseRegions(new File(regionsFile));

        Map<String, IntervalTree<List<Feature>>> featureMap = loadFeatures(inputFile);

        try {
            out = new PrintWriter(new BufferedWriter(new FileWriter(outputFile)));
            for (Region r : regions) {

                IntervalTree<List<Feature>> featureTree = featureMap.get(r.chr);

                if (featureTree != null) {

                    List<Interval<List<Feature>>> intervals = featureTree.findOverlapping(r.start, r.end);

                    for (Interval<List<Feature>> interval : intervals) {
                        List<Feature> features = interval.getValue();
                        for (Feature f : features) {
                            if (f.start >= r.start) {
                                String s = f.tanslate(r.name, r.start);
                                out.println(s);
                            }
                        }
                    }
                }
            }
        } finally {
            if (out != null) out.close();
        }
    }


    static List<Region> parseRegions(File file) throws IOException {

        List<Region> regions = new ArrayList<>();

        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            String nextLine;
            while ((nextLine = reader.readLine()) != null) {

                if (nextLine.startsWith("#")) continue;

                String[] tokens = Globals.whitespacePattern.split(nextLine);
                if (tokens.length > 3) {
                    regions.add(new Region(tokens[0], Integer.parseInt(tokens[1]), Integer.parseInt(tokens[2]), tokens[3]));
                }

            }
        } finally {
            if (reader != null) reader.close();
        }

        return regions;
    }

    static void sliceVCF(String inputFile, String outputFile, String chr, int start, int end) throws IOException {

        BufferedReader reader = null;
        PrintWriter out = null;


        try {
            reader =  ParsingUtils.openBufferedReader(inputFile);
            out = new PrintWriter(new BufferedWriter(new FileWriter(outputFile)));

            String nextLine;
            while ((nextLine = reader.readLine()) != null) {
                if (nextLine.startsWith("#")) {
                    out.println(nextLine);
                } else {
                    String[] tokens = Globals.tabPattern.split(nextLine);
                    String c = tokens[0];

                    if (c.equals(chr)) {
                        int pos = Integer.parseInt(tokens[1]);
                        if (pos >= start && pos <= end) {
                            out.println(nextLine);
                        }
                        if (pos > end) {
                            break;
                        }
                    }
                }
            }
        } finally {
            if (out != null) out.close();
            if (reader != null) reader.close();
        }
    }



    // Keep every nth sample (genotype).
    static void sampleVCF(String inputFile, String outputFile, int n) throws IOException {

        BufferedReader reader = null;
        PrintWriter out = null;

        try {
            reader = new BufferedReader(new FileReader(inputFile));
            out = new PrintWriter(new BufferedWriter(new FileWriter(outputFile)));

            String nextLine;
            while ((nextLine = reader.readLine()) != null) {
                if (nextLine.startsWith("##")) {
                    out.println(nextLine);
                } else {
                    String[] tokens = Globals.tabPattern.split(nextLine);
                    for(int i=0; i<9; i++) {
                        out.print(tokens[i]);
                        if(i < 8) out.print('\t');
                    }

                    for(int i=9; i<tokens.length; i+=n) {

                        if(i < tokens.length) {
                            out.print('\t');
                            out.print(tokens[i]);
                        }
                    }

                    out.println();
                }
            }
        } finally {
            if (out != null) out.close();
            if (reader != null) reader.close();
        }
    }

    static void sliceFasta(String inputFasta, String outputFasta, String chr) throws IOException {

        BufferedReader reader = null;
        PrintWriter outFasta = null;

        try {
            reader = ParsingUtils.openBufferedReader(inputFasta);
            outFasta = new PrintWriter(new BufferedWriter(new FileWriter(outputFasta)));

            String nextLine;
            boolean chrFound = false;
            while ((nextLine = reader.readLine()) != null) {

                if(chrFound) {
                    if(nextLine.startsWith(">")) {
                        break;  // Done
                    }
                    else {
                        outFasta.println(nextLine);
                    }
                }

                else {
                    if(nextLine.startsWith(">" + chr)) {
                        outFasta.println(nextLine);
                        chrFound = true;
                    }
                }
            }


        } finally {
            if (outFasta != null) outFasta.close();
        }
    }


//
//
//    private static List<Region> mergeRegions(List<Region> regions) {
//
//        List<Region> mergedRegions = new ArrayList<>(regions.size());
//
//        Region lastRegion = regions.get(0);
//        for (int i = 0; i < regions.size(); i++) {
//            Region r = regions.get(i);
//            if (r.chr.equals(lastRegion.chr)) {
//                if (r.start <= lastRegion.end) {
//                    lastRegion.end = Math.max(lastRegion.end, r.end);   // extend region
//                } else {
//                    mergedRegions.add(lastRegion);
//                }
//            } else {
//                mergedRegions.add(lastRegion);
//            }
//            lastRegion = r;
//        }
//        return mergedRegions;
//
//    }

    private static Comparator<Feature> getPositionComparator() {
        Comparator<Feature> comp = new Comparator<Feature>() {
            private Comparator<String> nameComparator = ChromosomeNameComparator.get();

            public int compare(Feature o1, Feature o2) {

                int nameComp = nameComparator.compare(o1.chr, o2.chr);
                if (nameComp == 0) {
                    return o1.start - o2.start;
                } else {
                    return nameComp;
                }
            }
        };
        return comp;
    }


    static class Region {
        String chr;
        int start;
        int end;
        String name;

        public Region(String chr, int start, int end, String name) {
            this.end = end;
            this.chr = chr;
            this.start = start;
            this.name = name;
        }

        public boolean contains(int p) {
            return p >= this.start && p <= this.end;
        }
    }

    // Hardcoded for genePred ext format (refGene.txt)
    static class Feature {

        String chr;
        int start;
        int end;
        String[] tokens;

        Feature(String[] tokens) {
            this.chr = tokens[2];
            this.start = Integer.parseInt(tokens[4]);
            this.end = Integer.parseInt(tokens[5]);
            this.tokens = tokens;
        }

        boolean overlaps(String chr, int start, int end) {
            return chr.equals(this.chr) && end > this.start && start <= this.end;
        }

        String tanslate(String newChr, int offset) {
            tokens[2] = newChr;
            tokens[4] = String.valueOf(this.start - offset);
            tokens[5] = String.valueOf(this.end - offset);
            tokens[6] = String.valueOf(Integer.parseInt(tokens[6]) - offset);
            tokens[7] = String.valueOf(Integer.parseInt(tokens[7]) - offset);

            String newExonStart = "";
            String[] exonStarts = tokens[9].split(",");
            for (String es : exonStarts) {
                newExonStart += String.valueOf(Integer.parseInt(es) - offset) + ",";
            }
            tokens[9] = newExonStart;

            String newExonEnd = "";
            String[] exonEnds = tokens[10].split(",");
            for (String es : exonEnds) {
                newExonEnd += String.valueOf(Integer.parseInt(es) - offset) + ",";
            }
            tokens[10] = newExonEnd;

            String record = tokens[0];
            for (int i = 1; i < tokens.length; i++) {
                record += "\t" + tokens[i];
            }

            return record;
        }
    }

    /*
        (
    string name;        	"Name of gene (usually transcript_id from GTF)"
    string chrom;       	"Chromosome name"
    char[1] strand;     	"+ or - for strand"
    uint txStart;       	"Transcription start position"
    uint txEnd;         	"Transcription end position"
    uint cdsStart;      	"Coding region start"
    uint cdsEnd;        	"Coding region end"
    uint exonCount;     	"Number of exons"
    uint[exonCount] exonStarts; "Exon start positions"
    uint[exonCount] exonEnds;   "Exon end positions"
    int score;            	"Score"
    string name2;       	"Alternate name (e.g. gene_id from GTF)"
    string cdsStartStat; 	"enum('none','unk','incmpl','cmpl')"
    string cdsEndStat;   	"enum('none','unk','incmpl','cmpl')"
    lstring exonFrames; 	"Exon frame offsets {0,1,2}"
    )
     */
    //585	NR_046018	chr1	+	11873	14409	14409	14409	3	11873,12612,13220,	12227,12721,14409,	0	DDX11L1	unk	unk	-1,-1,-1,
    //585	NR_024540	chr1	-	14361	29370	29370	29370	11	14361,14969,15795,16606,16857,17232,17605,17914,18267,24737,29320,	14829,15038,15947,16765,17055,17368,17742,18061,18366,24891,29370,	0	WASH7P	unk	unk	-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,


    // Return a map of interval trees, keyed by chromosome name.
    static Map<String, IntervalTree<List<Feature>>> loadFeatures(String file) throws IOException {

        BufferedReader reader = null;

        reader = ParsingUtils.openBufferedReader(file);

        String nextLine;
        String lastChr = null;

        List<Feature> currentFeatureList = new ArrayList<>();
        int currentMin = Integer.MAX_VALUE;
        int currentMax = 0;

        Map<String, IntervalTree<List<Feature>>> map = new HashMap<>();

        List<Feature> features = new ArrayList<>();
        while ((nextLine = reader.readLine()) != null) {

            if (nextLine.startsWith("#") || nextLine.startsWith("track") || nextLine.startsWith("browser")) continue;

            String[] tokens = Globals.whitespacePattern.split(nextLine);
            Feature f = new Feature(tokens);
            features.add(f);
        }
        features.sort(getPositionComparator());


        for (Feature f : features) {


            if (lastChr == null) {
                currentMin = f.start;
                currentMax = f.end;
                currentFeatureList.add(f);
                IntervalTree<List<Feature>> tree = new IntervalTree<>();
                map.put(f.chr, tree);
                lastChr = f.chr;
            } else {

                if (!f.chr.equals(lastChr)) {

                    // New tree
                    IntervalTree<List<Feature>> tree = map.get(lastChr);
                    tree.insert(new Interval(currentMin, currentMax, currentFeatureList));

                    if (map.containsKey(f.chr)) {
                        System.out.println();
                    }

                    tree = new IntervalTree<>();
                    map.put(f.chr, tree);
                    lastChr = f.chr;

                    currentFeatureList = new ArrayList<>();
                    currentFeatureList.add(f);
                    currentMin = f.start;
                    currentMax = f.end;

                } else if (currentFeatureList.size() > 10) {

                    // New interval
                    IntervalTree<List<Feature>> tree = map.get(lastChr);
                    tree.insert(new Interval(currentMin, currentMax, currentFeatureList));

                    currentFeatureList = new ArrayList<>();
                    currentFeatureList.add(f);
                    currentMin = f.start;
                    currentMax = f.end;


                } else {

                    // Update interval
                    currentMin = Math.min(currentMin, f.start);
                    currentMax = Math.max(currentMax, f.end);
                    currentFeatureList.add(f);

                }
            }
        }

        return map;
    }

}