FeatureFileUtils.java example

Explorer
igv-master
- src
- test
  - src
/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2007-2015 Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package org.broad.igv.feature;


import org.broad.igv.Globals;
import org.broad.igv.feature.tribble.IGVBEDCodec;
import org.broad.igv.util.ParsingUtils;
import org.broad.igv.util.ResourceLocator;
import htsjdk.tribble.Feature;
import org.broadinstitute.gatk.utils.collections.PrimitivePair;

import java.io.*;
import java.util.*;

/**
 * @author jrobinso
 */
public class FeatureFileUtils {


    static Set<String> types = new HashSet(Arrays.asList("SINE", "LINE", "LTR", "DNA", "Simple_repeat",
            "Low_complexity", "Satellite", "RNA", "Other", "Unknown", "Uncategorized"));

    /**
     * Compute feature density in units of # / window.  Note: This function could be extended to other file types by
     * using codecs.
     *
     * @param iFile      - a bed file
     * @param windowSize
     * @param step
     */
    static void computeBedDensity(String iFile, String oFile, int windowSize, int step) throws IOException {

        //FeatureCodec codec = CodecFactory.getCodec(iFile, null);

        BufferedReader br = null;
        PrintWriter pw = null;

        try {
            br = ParsingUtils.openBufferedReader(iFile);
            pw = new PrintWriter(new BufferedWriter(new FileWriter(oFile)));

            String nextLine;
            Map<Integer, Window> openWindows = new LinkedHashMap<Integer, Window>();
            int lastWindowOutput = 0;
            String lastChr = null;

            while ((nextLine = br.readLine()) != null) {
                if (nextLine.startsWith("track") || nextLine.startsWith("#")) continue;

                String[] tokens = Globals.tabPattern.split(nextLine);
                if (tokens.length < 3) continue;

                String chr = tokens[0];
                int start = Integer.parseInt(tokens[1]);
                int end = Integer.parseInt(tokens[2]);

                if (!chr.equals(lastChr)) {
                    lastWindowOutput = 0;
                    for (Window window : openWindows.values()) {
                        int w = window.idx;
                        int windowCenter = w * step + (windowSize / 2);
                        int windowStart = windowCenter - step / 2;
                        int windowEnd = windowStart + step;
                        pw.println(lastChr + "\t" + windowStart + "\t" + windowEnd + "\t" + window.count);
                    }
                    openWindows.clear();
                    lastChr = chr;
                }

                int startWindow = start / step;
                int endWindow = (end + windowSize) / step;
                for (int w = startWindow; w < endWindow; w++) {
                    Window window = openWindows.get(w);
                    if (window == null) {
                        window = new Window(w);
                        openWindows.put(w, window);
                    }
                    window.increment();
                }

                // File is sorted by start position, will never see windows < startWindow aganin
                if (startWindow > lastWindowOutput) {
                    for (int w = lastWindowOutput; w < startWindow; w++) {
                        Window window = openWindows.get(w);
                        if (window != null) {
                            int windowCenter = w * step + (windowSize / 2);
                            int windowStart = windowCenter - step / 2;
                            int windowEnd = windowStart + step;
                            pw.println(chr + "\t" + windowStart + "\t" + windowEnd + "\t" + window.count);
                        }
                    }
                    for (int w = lastWindowOutput; w < startWindow; w++) {
                        openWindows.remove(w);
                    }
                    lastWindowOutput = startWindow - 1;
                }
            }
        } finally {
            if (br != null) {
                br.close();
            }
            if (pw != null) {
                pw.close();
            }
        }

    }

    /**
     * Utility to convert a file with the following format to "bed"
     * ILMN_2087817	chrY:9529429:9529478:+					        // "1" based?
     * ILMN_2204360	chrY:9598604:9598615:-	chrY:9590985:9591022:-
     *
     * @param iFile
     * @param oFile
     * @param includeMultiMappings if false probes with multiple location mappings are filtered out
     */
    public static void probeToBed(String iFile, String oFile, boolean includeMultiMappings) throws IOException {

        BufferedReader br = null;
        PrintWriter pw = null;

        try {
            br = ParsingUtils.openBufferedReader(iFile);
            pw = new PrintWriter(new FileWriter(oFile));

            String nextLine;
            br.readLine();  // eat header
            while ((nextLine = br.readLine()) != null) {

                String[] tokens = Globals.commaPattern.split(nextLine);
                String probe = tokens[0];

                if (tokens.length < 1 || (tokens.length > 2 && !includeMultiMappings)) continue;

                for (int i = 1; i < tokens.length; i++) {

                    String loc = tokens[i];
                    String[] locParts = Globals.colonPattern.split(loc);
                    String chr = locParts[0];
                    int start = Integer.parseInt(locParts[1]) - 1;
                    int end = Integer.parseInt(locParts[2]);
                    String strand = locParts[3];
                    pw.println(chr + "\t" + start + "\t" + end + "\t" + probe + "\t1000\t" + strand);
                }
            }
        } finally {
            if (br != null) br.close();
            if (pw != null) pw.close();
        }


    }

    public static void splitRepeatMasker(String iFile, String outputDirectory, String prefix) throws IOException {


        BufferedReader br = null;
        Map<String, PrintWriter> pws = new HashMap();

        try {
            br = new BufferedReader(new FileReader(iFile));
            File dir = new File(outputDirectory);
            for (String type : types) {
                File f = new File(dir, prefix + type + ".bed");
                pws.put(type, new PrintWriter(new BufferedWriter(new FileWriter(f))));
            }

            String nextLine;
            while ((nextLine = br.readLine()) != null) {
                if (nextLine.startsWith("#")) continue;
                String[] tokens = nextLine.split("\t");
                String type = getType(tokens[5]);

                // Rerrange columns for legal bed
                pws.get(type).println(tokens[0] + "\t" + tokens[1] + "\t" + tokens[2] + "\t" +
                        tokens[4] + "\t" + tokens[3]);
            }


        } finally {
            if (br != null) {
                br.close();
            }
            for (PrintWriter pw : pws.values()) {
                pw.close();
            }
        }

    }

    public static String getType(String s) {

        s = s.replace("?", "");

        if (s.contains("RNA")) {
            return "RNA";
        } else if (s.equals("RC")) {
            return "Other";
        } else if (s.equals("repClass")) {
            return "Other";
        } else if (types.contains(s)) {
            return s;
        } else {
            return "Uncategorized";
        }

    }

    /**
     * Given a GFF File, creates a new GFF file for each type. Any feature type
     * which is part of a "gene" ( {@link SequenceOntology#geneParts} ) are put in the same file,
     * others are put in different files. So features of type "gene", "exon", and "mrna"
     * would go in gene.gff, but features of type "myFeature" would go in myFeature.gff.
     *
     * @param gffFile
     * @param outputDirectory
     * @throws IOException
     */
    public static void splitGffFileByType(String gffFile, String outputDirectory) throws IOException {

        BufferedReader br = new BufferedReader(new FileReader(gffFile));
        String nextLine;
        String ext = "." + gffFile.substring(gffFile.length() - 4);

        Map<String, PrintWriter> writers = new HashMap();

        while ((nextLine = br.readLine()) != null) {
            nextLine = nextLine.trim();
            if (!nextLine.startsWith("#")) {
                String[] tokens = Globals.tabPattern.split(nextLine.trim().replaceAll("\"", ""), -1);

                String type = tokens[2];
                if (SequenceOntology.geneParts.contains(type)) {
                    type = "gene";
                }
                if (!writers.containsKey(type)) {
                    writers.put(type,
                            new PrintWriter(new FileWriter(new File(outputDirectory, type + ext))));
                }
            }
        }
        br.close();

        br = new BufferedReader(new FileReader(gffFile));
        PrintWriter currentWriter = null;
        while ((nextLine = br.readLine()) != null) {
            nextLine = nextLine.trim();
            if (nextLine.startsWith("#")) {
                for (PrintWriter pw : writers.values()) {
                    pw.println(nextLine);
                }
            } else {
                String[] tokens = Globals.tabPattern.split(nextLine.trim().replaceAll("\"", ""), -1);
                String type = tokens[2];
                if (SequenceOntology.geneParts.contains(type)) {
                    type = "gene";
                }
                currentWriter = writers.get(type);

                if (currentWriter != null) {
                    currentWriter.println(nextLine);
                } else {
                    System.out.println("No writer for: " + type);
                }
            }

        }

        br.close();
        for (PrintWriter pw : writers.values()) {
            pw.close();
        }
    }

    static class Window {
        int idx;
        int count;

        Window(int idx) {
            this.idx = idx;
        }

        void increment() {
            count++;
        }
    }

    /**
     * Create a "canonical" gene file from an annotation UCSC refseq type annotation file.  The "canonical" file
     * represents all the isoforms of a gene as a single feature containing the union of all exons from the
     * splice forms for the gene.  This might or might not represent an actual transcript (although it usually does).
     *
     * @param iFile
     * @param outputFile
     */
    static void createCanonicalGeneFile(String iFile, String outputFile) throws IOException {

        BufferedReader br = null;
        PrintWriter pw = null;

        try {
            br = new BufferedReader(new FileReader(iFile));
            pw = new PrintWriter(new BufferedWriter(new FileWriter(outputFile)));
            FeatureParser parser = AbstractFeatureParser.getInstanceFor(new ResourceLocator(iFile), null);
            List<Feature> features = parser.loadFeatures(br, null);
            IGVBEDCodec codec = new IGVBEDCodec();


            Map<String, List<BasicFeature>> genes = new HashMap<String, List<BasicFeature>>();
            for (Feature f : features) {

                BasicFeature transcript = (BasicFeature) f;
                String geneName = transcript.getName();

                List<BasicFeature> genelist = genes.get(geneName);
                if (genelist == null) {
                    genelist = new ArrayList<BasicFeature>();
                    genes.put(geneName, genelist);
                }

                // Loop through genes with this name to find one that overlaps
                boolean foundOverlap = false;
                for (BasicFeature gene : genelist) {
                    if (gene.overlaps(transcript)) {
                        gene.setThickStart(Math.min(gene.getThickStart(), transcript.getThickStart()));
                        gene.setThickEnd(Math.max(gene.getThickEnd(), transcript.getThickEnd()));
                        mergeExons(gene, transcript.getExons());
                        foundOverlap = true;
                        break;
                    }
                }
                if (!foundOverlap) {
                    genelist.add(transcript);

                }

            }

            for (List<BasicFeature> geneList : genes.values()) {
                for (BasicFeature gene : geneList) {
                    pw.println(codec.encode(gene));
                }
            }
        } finally {
            if (br != null) br.close();
            if (pw != null) pw.close();
        }


    }

    /**
     * Create a bed file of "TSS regions", define as the 20 bp region downstream of the start of the feature.
     * <p/>
     * It is assume that iFile is sorted by start position.
     *
     * @param iFile
     * @param outputFile
     */
    static void createTSSFile(String iFile, String outputFile) throws IOException {

        BufferedReader br = null;
        PrintWriter pw = null;

        try {
            br = new BufferedReader(new FileReader(iFile));
            pw = new PrintWriter(new BufferedWriter(new FileWriter(outputFile)));
            FeatureParser parser = AbstractFeatureParser.getInstanceFor(new ResourceLocator(iFile), null);
            List<Feature> features = parser.loadFeatures(br, null);
            IGVBEDCodec codec = new IGVBEDCodec();


            Map<String, List<BasicFeature>> genes = new HashMap<String, List<BasicFeature>>();

            int lastTSS = -1;
            for (Feature f : features) {

                BasicFeature transcript = (BasicFeature) f;

                int tss = transcript.getStrand() == Strand.POSITIVE ? f.getStart() : f.getEnd();
                if (tss != lastTSS) {
                    int tssEnd = transcript.getStrand() == Strand.POSITIVE ? tss + 20 : tss - 20;
                    pw.println(transcript.getChr() + "\t" + Math.min(tss, tssEnd) + "\t" + Math.max(tss, tssEnd));
                    lastTSS = tss;
                }


            }
        } finally {
            if (br != null) br.close();
            if (pw != null) pw.close();
        }
    }

    /*
 0     `bin` smallint(5) unsigned NOT NULL,
 1 `name` varchar(255) NOT NULL,
 2 `chrom` varchar(255) NOT NULL,
 3 `strand` char(1) NOT NULL,
 4 `txStart` int(10) unsigned NOT NULL,
 5 `txEnd` int(10) unsigned NOT NULL,
 6 `cdsStart` int(10) unsigned NOT NULL,
 7 `cdsEnd` int(10) unsigned NOT NULL,
 8 `exonCount` int(10) unsigned NOT NULL,
 9 `exonStarts` longblob NOT NULL,
 10 `exonEnds` longblob NOT NULL,
 11 `score` int(11) DEFAULT NULL,
 12 `name2` varchar(255) NOT NULL,
 13 `cdsStartStat` enum('none','unk','incmpl','cmpl') NOT NULL,
 14 `cdsEndStat` enum('none','unk','incmpl','cmpl') NOT NULL,
 15 `exonFrames` longblob NOT NULL,
     */
    static void refgeneToBed(String iFile, String outputFile) throws IOException {

        BufferedReader br = null;
        PrintWriter pw = null;

        try {
            br = new BufferedReader(new FileReader(iFile));
            pw = new PrintWriter(new BufferedWriter(new FileWriter(outputFile)));

            String nextLine;
            while ((nextLine = br.readLine()) != null) {

                if (nextLine.startsWith("#") || nextLine.startsWith("track") || nextLine.startsWith("browser")) {
                    pw.println(nextLine);
                    continue;
                }

                String[] tokens = Globals.whitespacePattern.split(nextLine);

                String chr = tokens[2];
                int start = Integer.parseInt(tokens[4]);
                String end = tokens[5];
                String name = tokens[12];
                String score = "1000";
                String strand = tokens[3];
                String thickStart = tokens[6];
                String thickEnd = tokens[7];
                String itemRGB = ".";
                int blockCount = Integer.parseInt(tokens[8]);

                String exonStarts = tokens[9];
                String[] stok = Globals.commaPattern.split(exonStarts);
                String[] etok = Globals.commaPattern.split(tokens[10]);

                String blockStarts = "";
                String blockSizes = "";

                for (int i = 0; i < blockCount; i++) {
                    final int bs = Integer.parseInt(stok[i]);
                    blockStarts += String.valueOf(bs - start);
                    blockSizes += String.valueOf(Integer.parseInt(etok[i]) - bs);
                    if (i != blockCount - 1) {
                        blockStarts += ",";
                        blockSizes += ",";
                    }
                }

                pw.println(chr + "\t" + start + "\t" + end + "\t" + name + "\t" + score + "\t" + strand + "\t" +
                        thickStart + "\t" + thickEnd + "\t" + itemRGB + "\t" + blockCount + "\t" + blockSizes + "\t" + blockStarts);


            }

        } finally {
            if (br != null) br.close();
            if (pw != null) pw.close();
        }
    }


    private static void mergeExons(BasicFeature gene, List<Exon> exons) {
        Set<IExon> exonProxies = new HashSet<IExon>(gene.getExons());
        for (Exon exon : gene.getExons()) {
            exonProxies.add(Exon.getExonProxy(exon));
        }
        for (Exon exon : exons) {
            IExon proxy = Exon.getExonProxy(exon);
            if (!exonProxies.contains(proxy)) {
                gene.addExon(exon);
            }
        }
        FeatureUtils.sortFeatureList(gene.getExons());
    }

    static void covertProbeMapToBedFile(String probeMapFile, String bedFile) throws
            FileNotFoundException, IOException {


        BufferedReader br = new BufferedReader(new FileReader(probeMapFile));
        PrintWriter pw = new PrintWriter(new FileWriter(bedFile));

        String nextLine;
        while ((nextLine = br.readLine()) != null) {
            String[] tokens = nextLine.split("\t");
            Locus locus = Locus.fromString(tokens[1].trim());
            pw.println(
                    locus.getChr() + "\t" + locus.getStart() + "\t" + locus.getEnd() + "\t" + tokens[0].trim());
        }

        br.close();
        pw.close();

    }

    static void splitEmblFileByType(String emblFile, String outputDirectory) throws IOException {

        BufferedReader br = new BufferedReader(new FileReader(emblFile));
        String nextLine;
        Set<String> codes = new HashSet();

        while ((nextLine = br.readLine()) != null) {
            if (nextLine.startsWith("FT") && (nextLine.length() > 19)) {
                String code = nextLine.substring(5, 19).trim();
                if (code.length() > 0) {
                    codes.add(code);
                }
            }
        }
        br.close();

        Map<String, PrintWriter> writers = new HashMap();
        for (String code : codes) {
            writers.put(code,
                    new PrintWriter(new FileWriter(new File(outputDirectory, code + ".embl"))));
        }

        br = new BufferedReader(new FileReader(emblFile));
        PrintWriter currentWriter = null;
        while ((nextLine = br.readLine()) != null) {
            if (nextLine.startsWith("ID")) {
                for (PrintWriter pw : writers.values()) {
                    pw.println(nextLine);
                }
            } else if (nextLine.startsWith("FT")) {
                String code = nextLine.substring(5, 19).trim();
                if (code.length() > 0) {
                    currentWriter = writers.get(code);
                }
                if (currentWriter != null) {
                    currentWriter.println(nextLine);
                }
            } else {
                currentWriter = null;
            }
        }

        br.close();
        for (PrintWriter pw : writers.values()) {
            pw.close();
        }
    }


    public static void main(String[] args) throws IOException {
        refgeneToBed(args[0], args[1]);
    }

}