/* * The MIT License (MIT) * * Copyright (c) 2016 University of California San Diego * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. * * */ package org.broad.igv.tools; import org.broad.igv.Globals; import org.broad.igv.util.ParsingUtils; import htsjdk.tribble.readers.AsciiLineReader; import java.io.*; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; import java.util.Properties; /** * Splits a repeat mask file downloaded from UCSC into multiple files, one per repeat class. * Assumes downloaded columns as follows (use table browser, and "select columns" option * <p/> * genoName genoStart genoEnd strand repName repClass repFamily * <p/> * Assumes file is sorted by chromosome * * @author jrobinso */ public class RepeatMaskSplitter { public static void main(String[] args) { File file = new File(args[0]); split(file); } public static void split(File inputFile) { int binCol = 0; int millDivCol = 2; int millDelCol = 3; int millInsCol = 4; int chrCol = 5; int startCol = 6; int endCol = 7; int strandCol = 9; int nameCol = 10; int classCol = 11; int famCol = 12; Map<String, LinkedHashMap<String, String>> fileMappings = new HashMap(); AsciiLineReader reader = null; HashMap<String, PrintWriter> writers = new HashMap(); PrintWriter allWriter = null; try { reader = new AsciiLineReader(new FileInputStream(inputFile)); // Skip header reader.readLine(); String nextLine; File dir = inputFile.getParentFile(); allWriter = new PrintWriter(new BufferedWriter(new FileWriter("rmsk.bed"))); allWriter.println("track name=\"Repeat Masker\" \" gffTags=\"on\""); while ((nextLine = reader.readLine()) != null) { String[] tokens = Globals.tabPattern.split(nextLine, -1); String chr = tokens[chrCol]; String repClass = tokens[classCol]; if (repClass.contains("?")) { continue; } String filename = repClass + ".bed"; // Get or create file writer for the class PrintWriter pw = writers.get(filename); if (pw == null) { File outputFile = new File(dir, filename); pw = new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); pw.println("track name=\"" + repClass + "\" gffTags=\"on\""); writers.put(filename, pw); } String nm = tokens[nameCol]; String fam = tokens[famCol]; String name = "Name=" + nm + ";Class=" + repClass + ";Family=" + fam; pw.print(chr); pw.print("\t"); pw.print(Integer.parseInt(tokens[startCol])); pw.print("\t"); pw.print(Integer.parseInt(tokens[endCol])); pw.print("\t"); pw.print(name); pw.print("\t"); pw.print(tokens[strandCol]); pw.println(); allWriter.print(chr); allWriter.print("\t"); allWriter.print(Integer.parseInt(tokens[startCol])); allWriter.print("\t"); allWriter.print(Integer.parseInt(tokens[endCol])); allWriter.print("\t"); allWriter.print(name); allWriter.print("\t"); allWriter.print(tokens[strandCol]); allWriter.println(); } } catch (Exception e) { e.printStackTrace(); } finally { reader.close(); // allWriter.close(); closeWriters(writers); } } private static void closeWriters(HashMap<String, PrintWriter> writers) { for (PrintWriter pw : writers.values()) { pw.close(); } writers.clear(); } }