package picard.util; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.programgroups.Intervals; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.samtools.reference.ReferenceSequenceFileFactory; import htsjdk.samtools.util.Interval; import htsjdk.samtools.util.IntervalList; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.ProgressLogger; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.StringUtil; import java.io.File; import java.lang.Boolean;import java.lang.Override;import java.lang.String;import java.util.Collections; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Set; /** * A CLP for breaking up a reference into intervals of Ns and ACGTs bases. * Used for creating a broken-up interval list for calling WGS. * * @author Yossi Farjoun */ @CommandLineProgramProperties( usage = "Writes an interval list based on splitting the reference by Ns.", usageShort = "Writes an interval list based on splitting the reference by Ns", programGroup = Intervals.class ) public class ScatterIntervalsByNs extends CommandLineProgram { @Option(shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME, doc = "Reference sequence to use.") public File REFERENCE; @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Output file for interval list.") public File OUTPUT; @Option(shortName = "OT", doc = "Type of intervals to output.", optional = true) public OutputType OUTPUT_TYPE = OutputType.BOTH; @Option(shortName = "N", doc = "Maximal number of contiguous N bases to tolerate, thereby continuing the current ACGT interval.", optional = true) public int MAX_TO_MERGE = 1; //not using an enum since Interval.name is a String, and am using that to define the type of the Interval static final String ACGTmer = "ACGTmer", Nmer = "Nmer"; //an enum to determine which types of intervals get outputted private enum OutputType { N(Nmer), ACGT(ACGTmer), BOTH(Nmer, ACGTmer); private final Set acceptedTypes; public Boolean accepts(final String string) {return acceptedTypes.contains(string);} OutputType(final String... strings) { acceptedTypes = new HashSet<String>(); Collections.addAll(acceptedTypes, strings); } } private static final Log log = Log.getInstance(ScatterIntervalsByNs.class); final ProgressLogger locusProgress = new ProgressLogger(log, (int) 1e7, "examined", "loci"); final ProgressLogger intervalProgress = new ProgressLogger(log, (int) 10, "found", "intervals"); public static void main(final String[] args) { new ScatterIntervalsByNs().instanceMainWithExit(args); } @Override protected int doWork() { IOUtil.assertFileIsReadable(REFERENCE); IOUtil.assertFileIsWritable(OUTPUT); final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE, true); // get the intervals final IntervalList intervals = segregateReference(refFile, MAX_TO_MERGE); log.info(String.format("Found %d intervals in %d loci during %s seconds", intervalProgress.getCount(), locusProgress.getCount(), locusProgress.getElapsedSeconds())); /********************************** * Now output regions for calling * **********************************/ final IntervalList outputIntervals = new IntervalList(intervals.getHeader().clone()); log.info(String.format("Collecting requested type of intervals (%s)", OUTPUT_TYPE)); for (final Interval i : intervals.getIntervals()) { if (OUTPUT_TYPE.accepts(i.getName())) { outputIntervals.add(i); } } log.info("Writing Intervals."); outputIntervals.write(OUTPUT); log.info(String.format("Execution ending. Total time %d seconds", locusProgress.getElapsedSeconds())); return 0; } /** * **************************************************************** * Generate an interval list that alternates between Ns and ACGTs * * **************************************************************** */ public static IntervalList segregateReference(final ReferenceSequenceFile refFile, final int maxNmerToMerge) { final List<Interval> preliminaryIntervals = new LinkedList<Interval>(); final SAMFileHeader header = new SAMFileHeader(); header.setSequenceDictionary(refFile.getSequenceDictionary()); header.setSortOrder(SAMFileHeader.SortOrder.coordinate); final IntervalList finalIntervals = new IntervalList(header); //iterate over all the sequences in the dictionary for (final SAMSequenceRecord rec : refFile.getSequenceDictionary().getSequences()) { final ReferenceSequence ref = refFile.getSequence(rec.getSequenceName()); final byte[] bytes = ref.getBases(); StringUtil.toUpperCase(bytes); boolean nBlockIsOpen = (bytes[0] == 'N'); int start = 0; for (int i = 0; i < bytes.length; ++i) { final boolean currentBaseIsN = (bytes[i] == 'N'); //create intervals when switching, i.e "nBlockIsOpen" disagrees with "currentBaseIsN" if (nBlockIsOpen != currentBaseIsN) { preliminaryIntervals.add(new Interval(rec.getSequenceName(), start + 1, i, false, nBlockIsOpen ? Nmer : ACGTmer)); start = i; nBlockIsOpen = !nBlockIsOpen; } } // Catch the last block of chromosome preliminaryIntervals.add(new Interval(rec.getSequenceName(), start + 1, bytes.length, false, nBlockIsOpen ? Nmer : ACGTmer)); } // now that we have the whole list, we need to remove the short Nmers. // process the list, replacing trios with short Nmers in the middle with longer intervals: while (!preliminaryIntervals.isEmpty()) { //if top trio match the bill, replace them with a merged interval, // and push it back the top of the list (we expect alternating Nmers and ACGTmers, but // not assuming it in the logic) //(I want this to be fast and the strings are all copies of the static prototypes Nmer and ACGTmer ) //noinspection StringEquality if (preliminaryIntervals.size() >= 3 && // three or more intervals preliminaryIntervals.get(0).getName() == ACGTmer && //an N-mer preliminaryIntervals.get(1).getName() == Nmer && //between two preliminaryIntervals.get(2).getName() == ACGTmer && //ACGT-mers preliminaryIntervals.get(0).abuts(preliminaryIntervals.get(1)) && // all abutting preliminaryIntervals.get(1).abuts(preliminaryIntervals.get(2)) && // each other (there are many contigs...) preliminaryIntervals.get(1).length() <= maxNmerToMerge) //and the N-mer is of length N or less { // create the new ACGTmer interval final Interval temp = new Interval( preliminaryIntervals.get(0).getSequence(), preliminaryIntervals.get(0).getStart(), preliminaryIntervals.get(2).getEnd(), false, ACGTmer); //remove the first 3 elements of the list for (int i = 0; i < 3; ++i) { preliminaryIntervals.remove(0); } //and replace them with the newly created one preliminaryIntervals.add(0, temp); } else { //if cannot merge top three intervals, transfer the top intervals to finalIntervals finalIntervals.add(preliminaryIntervals.remove(0)); } } return finalIntervals; } }