/* * The MIT License * * Copyright (c) 2016 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.reference; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.samtools.reference.ReferenceSequenceFileFactory; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.IntervalList; import htsjdk.samtools.util.IntervalListReferenceSequenceMask; import htsjdk.samtools.util.ReferenceSequenceMask; import htsjdk.samtools.util.SequenceUtil; import htsjdk.samtools.util.StringUtil; import htsjdk.samtools.util.WholeGenomeReferenceSequenceMask; import picard.PicardException; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import picard.cmdline.programgroups.Fasta; import java.io.BufferedWriter; import java.io.File; import java.io.IOException; /** * A tool to count the number of non-N bases in a fasta file */ @CommandLineProgramProperties( usage = NonNFastaSize.USAGE_SUMMARY + NonNFastaSize.USAGE_DETAILS, usageShort = NonNFastaSize.USAGE_SUMMARY, programGroup = Fasta.class ) public class NonNFastaSize extends CommandLineProgram { static final String USAGE_SUMMARY = "Counts the number of non-N bases in a fasta file."; static final String USAGE_DETAILS = "This tool takes any FASTA-formatted file and counts the number of non-N bases in it." + "Note that it requires that the fasta file have associated index (.fai) and dictionary (.dict) files.<br />" + "<h4>Usage example:</h4>" + "<pre>" + "java -jar picard.jar NonNFastaSize \\<br />" + " I=input_sequence.fasta \\<br />" + " O=count.txt" + "</pre>" + "<hr />" ; @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The input FASTA file.") public File INPUT; @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The output file in which to record the count.") public File OUTPUT; @Option(shortName = "INTERVALS", doc = "An interval list file that contains the locations of the positions to assess. If not provided, the entire reference will be used", optional = true) public File INTERVALS = null; public static void main(final String[] args) { new NonNFastaSize().instanceMainWithExit(args); } @Override protected int doWork() { IOUtil.assertFileIsReadable(INPUT); IOUtil.assertFileIsWritable(OUTPUT); // set up the reference and a mask so that we only count the positions requested by the user final ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(INPUT); final ReferenceSequenceMask referenceSequenceMask; if (INTERVALS != null) { IOUtil.assertFileIsReadable(INTERVALS); final IntervalList intervalList = IntervalList.fromFile(INTERVALS); referenceSequenceMask = new IntervalListReferenceSequenceMask(intervalList); } else { final SAMFileHeader header = new SAMFileHeader(); header.setSequenceDictionary(ref.getSequenceDictionary()); referenceSequenceMask = new WholeGenomeReferenceSequenceMask(header); } long nonNbases = 0L; for (final SAMSequenceRecord rec : ref.getSequenceDictionary().getSequences()) { // pull out the contig and set up the bases final ReferenceSequence sequence = ref.getSequence(rec.getSequenceName()); final byte[] bases = sequence.getBases(); StringUtil.toUpperCase(bases); for (int i = 0; i < bases.length; i++) { // only investigate this position if it's within our mask if (referenceSequenceMask.get(sequence.getContigIndex(), i+1)) { nonNbases += bases[i] == SequenceUtil.N ? 0 : 1; } } } try { final BufferedWriter out = IOUtil.openFileForBufferedWriting(OUTPUT); out.write(nonNbases + "\n"); out.close(); } catch (IOException ioe) { throw new PicardException("Error writing to file " + OUTPUT.getAbsolutePath(), ioe); } return 0; } }