MarkDuplicatesWithMateCigarIterator.java example

Explorer

picard-tools-master
- htsjdk
  - src
    - java
      - htsjdk
        samtools
        AbstractBAMFileIndex.java
        AbstractSAMHeaderRecord.java
        AlignmentBlock.java
        AsyncSAMFileWriter.java
        BAMFileConstants.java
        BAMFileReader.java
        BAMFileWriter.java
        BAMIndex.java
        BAMIndexContent.java
        BAMIndexMetaData.java
        BAMIndexWriter.java
        BAMIndexer.java
        BAMRecord.java
        BAMRecordCodec.java
        BamFileIoUtils.java
        BamIndexValidator.java
        Bin.java
        BinList.java
        BinaryBAMIndexWriter.java
        BinaryCigarCodec.java
        BinaryTagCodec.java
        BinningIndexBuilder.java
        BinningIndexContent.java
        BrowseableBAMIndex.java
        CachingBAMFileIndex.java
        Chunk.java
        Cigar.java
        CigarElement.java
        CigarOperator.java
        ComparableSamRecordIterator.java
        CoordinateSortedPairInfoMap.java
        CustomReaderFactory.java
        DefaultSAMRecordFactory.java
        Defaults.java
        DiskBasedBAMFileIndex.java
        DuplicateScoringStrategy.java
        FileTruncatedException.java
        FixBAMFile.java
        GenomicIndexUtil.java
        LinearIndex.java
        MergingSamRecordIterator.java
        NotPrimarySkippingIterator.java
        QueryInterval.java
        ReservedTagConstants.java
        SAMBinaryTagAndUnsignedArrayValue.java
        SAMBinaryTagAndValue.java
        SAMException.java
        SAMFileHeader.java
        SAMFileReader.java
        SAMFileSource.java
        SAMFileSpan.java
        SAMFileTruncatedReader.java
        SAMFileWriter.java
        SAMFileWriterFactory.java
        SAMFileWriterImpl.java
        SAMFormatException.java
        SAMHeaderRecordComparator.java
        SAMLineParser.java
        SAMProgramRecord.java
        SAMReadGroupRecord.java
        SAMRecord.java
        SAMRecordComparator.java
        SAMRecordCoordinateComparator.java
        SAMRecordFactory.java
        SAMRecordIterator.java
        SAMRecordQueryNameComparator.java
        SAMRecordSetBuilder.java
        SAMRecordUtil.java
        SAMSequenceDictionary.java
        SAMSequenceRecord.java
        SAMSortOrderChecker.java
        SAMTag.java
        SAMTagUtil.java
        SAMTestUtil.java
        SAMTextHeaderCodec.java
        SAMTextReader.java
        SAMTextWriter.java
        SAMTools.java
        SAMUtils.java
        SAMValidationError.java
        SQTagUtil.java
        SamFileHeaderMerger.java
        SamFileValidator.java
        SamFiles.java
        SamInputResource.java
        SamPairUtil.java
        SamReader.java
        SamReaderFactory.java
        SamStreams.java
        SecondaryOrSupplementarySkippingIterator.java
        StreamInflatingIndexingOutputStream.java
        TagValueAndUnsignedArrayFlag.java
        TextCigarCodec.java
        TextTagCodec.java
        TextualBAMIndexWriter.java
        ValidationStringency.java
        apps
        TimeChannel.java
        TimeRandomAccessFile.java
        example
        ExampleSamUsage.java
        fastq
        AsyncFastqWriter.java
        BasicFastqWriter.java
        FastqConstants.java
        FastqReader.java
        FastqRecord.java
        FastqWriter.java
        FastqWriterFactory.java
        filter
        AggregateFilter.java
        AlignedFilter.java
        DuplicateReadFilter.java
        FailsVendorReadQualityFilter.java
        FilteringIterator.java
        IntervalFilter.java
        NotPrimaryAlignmentFilter.java
        ReadNameFilter.java
        SamRecordFilter.java
        SecondaryAlignmentFilter.java
        SecondaryOrSupplementaryFilter.java
        SolexaNoiseFilter.java
        TagFilter.java
        WholeReadClippedFilter.java
        liftover
        Chain.java
        LiftOver.java
        metrics
        Header.java
        MetricBase.java
        MetricsFile.java
        StringHeader.java
        VersionHeader.java
        reference
        AbstractFastaSequenceFile.java
        FastaSequenceFile.java
        FastaSequenceIndex.java
        IndexedFastaSequenceFile.java
        ReferenceSequence.java
        ReferenceSequenceFile.java
        ReferenceSequenceFileFactory.java
        ReferenceSequenceFileWalker.java
        seekablestream
        ISeekableStreamFactory.java
        SeekableBufferedStream.java
        SeekableFTPStream.java
        SeekableFTPStreamHelper.java
        SeekableFileStream.java
        SeekableHTTPStream.java
        SeekableStream.java
        SeekableStreamFactory.java
        UserPasswordInput.java
        util
        AbstractAsyncWriter.java
        AbstractIterator.java
        AsciiWriter.java
        BinaryCodec.java
        BlockCompressedFilePointerUtil.java
        BlockCompressedInputStream.java
        BlockCompressedOutputStream.java
        BlockCompressedStreamConstants.java
        BlockGunzipper.java
        BufferedLineReader.java
        CigarUtil.java
        CloseableIterator.java
        CloserUtil.java
        CollectionUtil.java
        CoordMath.java
        DateParser.java
        DelegatingIterator.java
        DiskBackedQueue.java
        FastLineReader.java
        FastqQualityFormat.java
        FileAppendStreamLRUCache.java
        FormatUtil.java
        Histogram.java
        HttpUtils.java
        IOUtil.java
        Interval.java
        IntervalList.java
        IntervalListReferenceSequenceMask.java
        IntervalTree.java
        IntervalTreeMap.java
        IntervalUtil.java
        Iso8601Date.java
        IterableAdapter.java
        IterableOnceIterator.java
        Iterables.java
        Lazy.java
        LineReader.java
        ListMap.java
        LocationAware.java
        Locus.java
        LocusComparator.java
        LocusImpl.java
        Log.java
        Md5CalculatingInputStream.java
        Md5CalculatingOutputStream.java
        MergingIterator.java
        Objects.java
        OverlapDetector.java
        PeekIterator.java
        PeekableIterator.java
        ProcessExecutor.java
        ProgressLogger.java
        ProgressLoggerInterface.java
        QualityEncodingDetector.java
        QualityUtil.java
        ReferenceSequenceMask.java
        RelativeIso8601Date.java
        ResourceLimitedMap.java
        ResourceLimitedMapFunctor.java
        RuntimeEOFException.java
        RuntimeIOException.java
        SamLocusIterator.java
        SamRecordIntervalIteratorFactory.java
        SamRecordTrackingBuffer.java
        SamRecordWithOrdinal.java
        SequenceUtil.java
        SnappyLoader.java
        SolexaQualityConverter.java
        SortingCollection.java
        SortingLongCollection.java
        StopWatch.java
        StringLineReader.java
        StringUtil.java
        TempStreamFactory.java
        TestUtil.java
        Tuple.java
        WholeGenomeReferenceSequenceMask.java
        ftp
        FTPClient.java
        FTPReply.java
        FTPStream.java
        FTPUtils.java
        zip
        DeflaterFactory.java
        IntelDeflater.java
        ZStreamRef.java
        tribble
        AbstractFeatureCodec.java
        AbstractFeatureReader.java
        AsciiFeatureCodec.java
        BasicFeature.java
        BinaryFeatureCodec.java
        CloseableTribbleIterator.java
        Feature.java
        FeatureCodec.java
        FeatureCodecHeader.java
        FeatureReader.java
        NameAwareCodec.java
        TabixFeatureReader.java
        Tribble.java
        TribbleException.java
        TribbleIndexedFeatureReader.java
        annotation
        Strand.java
        bed
        BEDCodec.java
        BEDFeature.java
        FullBEDFeature.java
        SimpleBEDFeature.java
        dbsnp
        OldDbSNPCodec.java
        OldDbSNPFeature.java
        example
        CountRecords.java
        ExampleBinaryCodec.java
        IndexToTable.java
        IndicesAreEqual.java
        ProfileIndexReading.java
        exception
        CodecLineParsingException.java
        UnsortedFileException.java
        gelitext
        DiploidGenotype.java
        GeliTextCodec.java
        GeliTextFeature.java
        index
        AbstractIndex.java
        Block.java
        ChrIndex.java
        DynamicIndexCreator.java
        Index.java
        IndexCreator.java
        IndexFactory.java
        MutableIndex.java
        TribbleIndexCreator.java
        interval
        Interval.java
        IntervalIndexCreator.java
        IntervalTree.java
        IntervalTreeIndex.java
        linear
        LinearIndex.java
        LinearIndexCreator.java
        tabix
        TabixFormat.java
        TabixIndex.java
        TabixIndexCreator.java
        readers
        AsciiLineReader.java
        AsciiLineReaderIterator.java
        AsynchronousLineReader.java
        LineIterator.java
        LineIteratorImpl.java
        LineReader.java
        LineReaderUtil.java
        LongLineBufferedReader.java
        Positional.java
        PositionalBufferedStream.java
        TabixIteratorLineReader.java
        TabixReader.java
        util
        FTPHelper.java
        HTTPHelper.java
        LittleEndianInputStream.java
        LittleEndianOutputStream.java
        MathUtils.java
        ParsingUtils.java
        RemoteURLHelper.java
        TabixUtils.java
        URLHelper.java
        popgen
        HardyWeinbergCalculation.java
        variant
        bcf2
        BCF2Codec.java
        BCF2Decoder.java
        BCF2GenotypeFieldDecoders.java
        BCF2LazyGenotypesDecoder.java
        BCF2Type.java
        BCF2Utils.java
        BCFVersion.java
        utils
        GeneralUtils.java
        SAMSequenceDictionaryExtractor.java
        variantcontext
        Allele.java
        CommonInfo.java
        FastGenotype.java
        Genotype.java
        GenotypeBuilder.java
        GenotypeLikelihoods.java
        GenotypeType.java
        GenotypesContext.java
        JEXLMap.java
        LazyGenotypesContext.java
        VariantContext.java
        VariantContextBuilder.java
        VariantContextComparator.java
        VariantContextUtils.java
        VariantJEXLContext.java
        writer
        AsyncVariantContextWriter.java
        BCF2Encoder.java
        BCF2FieldEncoder.java
        BCF2FieldWriter.java
        BCF2FieldWriterManager.java
        BCF2Writer.java
        IndexingVariantContextWriter.java
        IntGenotypeFieldAccessors.java
        Options.java
        SortingVariantContextWriter.java
        SortingVariantContextWriterBase.java
        VCFWriter.java
        VariantContextWriter.java
        VariantContextWriterBuilder.java
        VariantContextWriterFactory.java
        vcf
        AbstractVCFCodec.java
        VCF3Codec.java
        VCFCodec.java
        VCFCompoundHeaderLine.java
        VCFConstants.java
        VCFContigHeaderLine.java
        VCFEncoder.java
        VCFFileReader.java
        VCFFilterHeaderLine.java
        VCFFormatHeaderLine.java
        VCFHeader.java
        VCFHeaderLine.java
        VCFHeaderLineCount.java
        VCFHeaderLineTranslator.java
        VCFHeaderLineType.java
        VCFHeaderVersion.java
        VCFIDHeaderLine.java
        VCFInfoHeaderLine.java
        VCFRecordCodec.java
        VCFSimpleHeaderLine.java
        VCFStandardHeaderLines.java
        VCFUtils.java
    - tests
      - java
        htsjdk
        samtools
        AbstractBAMFileIndexTest.java
        BAMFileIndexTest.java
        BAMFileWriterTest.java
        BAMIndexWriterTest.java
        BAMIteratorTest.java
        BAMRemoteFileTest.java
        BinTest.java
        ChunkTest.java
        CigarCodecTest.java
        CigarTest.java
        MergingSamRecordIteratorGroupCollisionTest.java
        MergingSamRecordIteratorTest.java
        ProgramRecordChainingTest.java
        SAMCloneTest.java
        SAMFileReaderTest.java
        SAMFileWriterFactoryTest.java
        SAMIntegerTagTest.java
        SAMTextReaderTest.java
        SAMTextWriterTest.java
        SAMUtilsTest.java
        SamFileHeaderMergerTest.java
        SamHeaderRecordComparatorTest.java
        SamPairUtilTest.java
        SamReaderFactoryTest.java
        SamReaderSortTest.java
        SequenceNameTruncationAndValidationTest.java
        ValidateSamFileTest.java
        filter
        FailsVendorReadQualityFilterTest.java
        SolexaNoiseFilterTest.java
        TagFilterTest.java
        liftover
        LiftOverTest.java
        metrics
        MetricsFileTest.java
        reference
        FastaSequenceFileTest.java
        FastaSequenceIndexTest.java
        IndexedFastaSequenceFileTest.java
        ReferenceSequenceFileWalkerTest.java
        ReferenceSequenceTests.java
        seekablestream
        SeekableBufferedStreamTest.java
        SeekableFTPStreamTest.java
        SeekableFileStreamTest.java
        SeekableStreamFactoryTest.java
        util
        BinaryCodecTest.java
        BlockCompressedFilePointerUtilTest.java
        BlockCompressedOutputStreamTest.java
        BlockCompressedTerminatorTest.java
        CigarUtilTest.java
        DiskBackedQueueTest.java
        HistogramTest.java
        IntervalListTest.java
        IntervalTreeMapTest.java
        IntervalTreeTest.java
        IoUtilTest.java
        Iso8601DateTest.java
        IupacTest.java
        MergingIteratorTest.java
        QualityEncodingDetectorTest.java
        RelativeIso8601DateTest.java
        SamLocusIteratorTest.java
        SequenceUtilTest.java
        SolexaQualityConverterTest.java
        SortingCollectionTest.java
        SortingLongCollectionTest.java
        StringLineReaderTest.java
        StringUtilTest.java
        tribble
        AbstractFeatureReaderTest.java
        BinaryFeaturesTest.java
        FeatureReaderTest.java
        TestUtils.java
        bed
        BEDCodecTest.java
        dbsnp
        OldDbSNPCodecTest.java
        gelitext
        GeliTextTest.java
        index
        IndexFactoryTest.java
        IndexTest.java
        interval
        IntervalTreeTest.java
        linear
        LinearIndexTest.java
        tabix
        TabixIndexTest.java
        readers
        AsciiLineReaderTest.java
        AsynchronousLineReaderTest.java
        LineReaderUtilTest.java
        LongLineBufferedReaderTest.java
        PositionalBufferedStreamTest.java
        ReaderTest.java
        TabixReaderTest.java
        util
        ParsingUtilsTest.java
        ftp
        FTPClientTest.java
        FTPUtilsTest.java
        popgen
        HardyWeinbergCalculationTest.java
        variant
        VariantBaseTest.java
        bcf2
        BCF2EncoderDecoderUnitTest.java
        BCF2UtilsUnitTest.java
        utils
        SAMSequenceDictionaryExtractorTest.java
        variantcontext
        AlleleUnitTest.java
        GenotypeLikelihoodsUnitTest.java
        GenotypeUnitTest.java
        GenotypesContextUnitTest.java
        VariantContextTestProvider.java
        VariantContextUnitTest.java
        VariantJEXLContextUnitTest.java
        writer
        TabixOnTheFlyIndexCreationTest.java
        VCFWriterUnitTest.java
        VariantContextWriterBuilderUnitTest.java
        VariantContextWritersUnitTest.java
        vcf
        AbstractVCFCodecTest.java
        IndexFactoryUnitTest.java
        VCFCompoundHeaderLineUnitTest.java
        VCFEncoderTest.java
        VCFHeaderUnitTest.java
        VCFStandardHeaderLinesUnitTest.java
- src
  - java
    - picard
  - tests
    - java
      - picard
        analysis
        CollectAlignmentSummaryMetricsTest.java
        CollectInsertSizeMetricsTest.java
        CollectRnaSeqMetricsTest.java
        MultiLevelCollectorTest.java
        cmdline
        CommandLineParserTest.java
        CommandLineProgramTest.java
        illumina
        CheckIlluminaDirectoryTest.java
        CollectIlluminaBasecallingMetricsTest.java
        CollectIlluminaSummaryMetricsTest.java
        ExtractIlluminaBarcodesTest.java
        IlluminaBasecallsToFastqTest.java
        IlluminaBasecallsToSamAdapterClippingTest.java
        IlluminaBasecallsToSamTest.java
        IlluminaLaneMetricsCollectorTest.java
        ReadStructureTest.java
        parser
        BclParserTest.java
        BinTdUtil.java
        CycleIlluminaFileMapTest.java
        FilterParserTest.java
        IlluminaDataProviderFactoryTest.java
        IlluminaDataProviderTest.java
        IlluminaFileUtilTest.java
        PerTileParserTest.java
        PerTilePerCycleParserTest.java
        PosParserTest.java
        fakers
        BclFileFakerTest.java
        readers
        AbstractIlluminaPositionFileReaderTest.java
        BclReaderTest.java
        ClocsFileReaderTest.java
        FilterFileReaderTest.java
        IlluminaFileUtilTest.java
        LocsFileReaderTest.java
        MMapBackedIteratorFactoryTest.java
        PosFileReaderTest.java
        metrics
        CollectRrbsMetricsTest.java
        pedigree
        PedFileTest.java
        sam
        AddCommentsToBamTest.java
        CleanSamTest.java
        CompareSAMsTest.java
        CreateSequenceDictionaryTest.java
        FastqToSamTest.java
        GatherBamFilesTest.java
        MergeBamAlignmentTest.java
        MergeSamFilesTest.java
        RevertSamTest.java
        SamToFastqTest.java
        SplitSamByLibraryTest.java
        ViewSamTest.java
        markduplicates
        AbstractMarkDuplicatesCommandLineProgramTest.java
        AbstractMarkDuplicatesCommandLineProgramTester.java
        MarkDuplicatesTest.java
        MarkDuplicatesTester.java
        MarkDuplicatesWithMateCigarTest.java
        MarkDuplicatesWithMateCigarTester.java
        util
        OpticalDuplicateFinderTest.java
        testers
        CleanSamTester.java
        SamFileTester.java
        ValidateSamTester.java
        util
        BedToIntervalListTest.java
        ClippingUtilityTest.java
        DelimitedTextFileWithHeaderIteratorTest.java
        FifoBufferTest.java
        FileChannelJDKBugWorkAroundTest.java
        IlluminaUtilTest.java
        IntervalListScattererTest.java
        MathUtilTest.java
        MergingIteratorTest.java
        RExecutorTest.java
        ScatterIntervalsByNsTest.java
        TabbedTextFileWithHeaderParserTest.java
        TextFileParsersTest.java
        UnsignedTypeUtilTest.java
        vcf
        AbstractVcfMergingClpTester.java
        GenotypeConcordanceTest.java
        MergeVcfsTest.java
        SortVcfsTest.java
        SplitVcfsTest.java
        UpdateVcfSequenceDictionaryTest.java
        VariantContextComparatorTest.java
        VcfFormatConverterTest.java

/*
 * The MIT License
 *
 * Copyright (c) 2014 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package picard.sam.markduplicates;

import htsjdk.samtools.util.SamRecordWithOrdinal;
import htsjdk.samtools.util.SamRecordTrackingBuffer;
import picard.PicardException;
import htsjdk.samtools.util.Histogram;
import picard.sam.DuplicationMetrics;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.PeekableIterator;
import htsjdk.samtools.*;
import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy;
import htsjdk.samtools.util.CloseableIterator;
import picard.sam.markduplicates.util.*;

import java.io.File;
import java.util.*;

/**
 * This will iterate through a coordinate sorted SAM file (iterator) and either mark or
 * remove duplicates as appropriate.  This class relies on the coordinate sort order as
 * well as the mate cigar (MC) optional SAM tag.
 */
public class MarkDuplicatesWithMateCigarIterator implements SAMRecordIterator {

    private SAMFileHeader header = null;

    /** The iterator from which records are read. */
    private PeekableIterator<SAMRecord> backingIterator = null;

    /** The ordinal of the next record to be read from the backing iterator */
    private int backingIteratorRecordIndex = 0;

    private boolean removeDuplicates = false;

    /** Should we skip pairs with no mate cigars or should we throw an error? */
    private boolean skipPairsWithNoMateCigar = true;
    private int numRecordsWithNoMateCigar = 0;

    /** When we hit unmapped reads that are just before the EOF, we can greedily process them as they will not have coordinates */
    private boolean foundUnmappedEOFReads = false;

    /** We can flush our queues and buffers if we move to a different reference index */
    private int referenceIndex = 0;

    /**
     * This buffer contains all the records read from input in the same order.  Nonetheless, each record
     * must be examined for duplicate marking, and so we may need to wait for this process to occur.  This
     * buffer stores the records in coordinate order, whether or not they can be emitted, and their associated
     * duplicate marking flag.  By definition, any record in the toMarkQueue will also be in the outputBuffer,
     * so we can omit checking the size of the toMarkQueue in some cases.
     */
    private SamRecordTrackingBuffer outputBuffer = null;

    /**
     * The queue that stores the records that currently are not marked as duplicates.  These need to be kept until
     * they can be proven not to be duplicates, which depends on later records having greater coordinates.  The queue is stored in 5' unclipped
     * ordering, along with keeping the record with the best score, defined by the scoring strategies.  If any record
     * is added to this queue and can be identified as a duplicate, the outputBuffer is notified of its
     * status and it can be emitted.  Therefore, we limit the amount of records in this queue to only those that will NOT
     * be duplicates.
     */
    private final MarkQueue toMarkQueue;

    /** The next record to be returned by next() */
    private SAMRecord nextRecord = null;

    /** This gets various information about the library id for a given record */
    private final LibraryIdGenerator libraryIdGenerator;

    /** This is used to identify optical duplicates among sets of duplicates */
    private OpticalDuplicateFinder opticalDuplicateFinder = null;

    /** We use this to check that the input data was in coordinate sort order */
    private final SAMRecordCoordinateComparator sortComparator = new SAMRecordCoordinateComparator();

    boolean isClosed = false;

    /**
     * Initializes the mark duplicates iterator and reads ahead to the first record that can be returned.
     *
     * @param header                     the SAM header; its sort order must be coordinate
     * @param iterator                   an iterator over the SAM records to consider
     * @param opticalDuplicateFinder     the algorithm for optical duplicate detection
     * @param duplicateScoringStrategy   the scoring strategy for choosing duplicates.  This cannot be SUM_OF_BASE_QUALITIES.
     * @param toMarkQueueMinimumDistance minimum distance for which to buffer
     * @param removeDuplicates           true to remove duplicates, false to mark duplicates
     * @param skipPairsWithNoMateCigar   true to skip paired records whose mate is mapped but that lack a mate cigar
     *                                   (they bypass the mark queue), false to fail with an error on such records
     * @param maxRecordsInRam            forwarded to the output buffer; presumably the number of records it may hold
     *                                   in memory before spilling to disk (confirm against SamRecordTrackingBuffer)
     * @param blockSize                  the size of the blocks in the underlying buffer/queue
     * @param tmpDirs                    the temporary directories to use if we spill records to disk
     * @throws PicardException if the header does not declare coordinate sort order, or if the scoring strategy is
     *                         SUM_OF_BASE_QUALITIES
     */
    public MarkDuplicatesWithMateCigarIterator(final SAMFileHeader header,
                                               final CloseableIterator<SAMRecord> iterator,
                                               final OpticalDuplicateFinder opticalDuplicateFinder,
                                               final ScoringStrategy duplicateScoringStrategy,
                                               final int toMarkQueueMinimumDistance,
                                               final boolean removeDuplicates,
                                               final boolean skipPairsWithNoMateCigar,
                                               final int maxRecordsInRam,
                                               final int blockSize,
                                               final List<File> tmpDirs) throws PicardException {
        // Validate every argument up front so that no buffer or queue is built for an unusable configuration.
        if (header.getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            throw new PicardException(getClass().getName() + " expects the input to be in coordinate sort order.");
        }

        // Check for supported scoring strategies
        if (duplicateScoringStrategy == ScoringStrategy.SUM_OF_BASE_QUALITIES) {
            throw new PicardException("SUM_OF_BASE_QUALITIES not supported as this may cause inconsistencies across ends in a pair.  Please use a different scoring strategy.");
        }

        this.header = header;
        backingIterator = new PeekableIterator<SAMRecord>(iterator);
        outputBuffer = new SamRecordTrackingBuffer<SamRecordWithOrdinalAndSetDuplicateReadFlag>(maxRecordsInRam, blockSize, tmpDirs, header, SamRecordWithOrdinalAndSetDuplicateReadFlag.class);

        this.removeDuplicates = removeDuplicates;
        this.skipPairsWithNoMateCigar = skipPairsWithNoMateCigar;
        this.opticalDuplicateFinder = opticalDuplicateFinder;
        toMarkQueue = new MarkQueue(duplicateScoringStrategy);
        libraryIdGenerator = new LibraryIdGenerator(header);

        // set up metrics: make sure every library named by a read group has a metrics entry
        for (final SAMReadGroupRecord readGroup : header.getReadGroups()) {
            final String library = readGroup.getLibrary();
            DuplicationMetrics metrics = libraryIdGenerator.getMetricsByLibrary(library);
            if (metrics == null) {
                metrics = new DuplicationMetrics();
                metrics.LIBRARY = library;
                libraryIdGenerator.addMetricsByLibrary(library, metrics);
            }
        }

        // This sets the window size we need to keep to guarantee we can mark duplicates correctly
        toMarkQueue.setToMarkQueueMinimumDistance(toMarkQueueMinimumDistance);

        // Prime the iterator: stage the first record to return, or null if none is available
        nextRecord = markDuplicatesAndGetTheNextAvailable();
    }

    /**
     * Writes a one-line snapshot of JVM memory usage and of the sizes of the output buffer and
     * the mark queue to the given log at info level.  A garbage collection is requested first.
     *
     * @param log the log that receives the message
     */
    public void logMemoryStats(final Log log) {
        System.gc();
        final Runtime jvm = Runtime.getRuntime();

        // Values are appended in the same order in which the message lists them.
        final StringBuilder message = new StringBuilder();
        message.append("freeMemory: ").append(jvm.freeMemory());
        message.append("; totalMemory: ").append(jvm.totalMemory());
        message.append("; maxMemory: ").append(jvm.maxMemory());
        message.append("; output buffer size: ").append(outputBuffer.size());
        message.append("; duplicate queue size: ").append(toMarkQueue.size());

        log.info(message.toString());
    }

    /**
     * Declares the sort order in which callers expect records to be returned.  This iterator only
     * accepts coordinate order, so this method merely verifies the request; it does not change how
     * records are validated.
     *
     * @param sortOrder The order in which records are expected to be returned; must be
     *                  SAMFileHeader.SortOrder.coordinate
     * @return This SAMRecordIterator
     * @throws IllegalStateException if sortOrder is anything other than coordinate (including null)
     */
    @Override
    public SAMRecordIterator assertSorted(final SAMFileHeader.SortOrder sortOrder) {
        if (sortOrder != SAMFileHeader.SortOrder.coordinate) {
            throw new IllegalStateException("Cannot assert " + sortOrder + " when expecting coordinate sorted input");
        }
        return this;
    }

    /**
     * Reports whether another record can be obtained: either one is already staged in nextRecord,
     * or the backing iterator still has input, or the output buffer still holds records.
     */
    @Override
    public boolean hasNext() {
        // A record is already staged for return.
        if (nextRecord != null) {
            return true;
        }

        // Nothing is staged, so more input is needed to feed the mark queue ...
        if (backingIterator.hasNext()) {
            return true;
        }

        // ... or buffered records are still waiting to be returned.  The mark queue is not checked
        // because every record in it should also be tracked by the output buffer.
        return !outputBuffer.isEmpty();
    }

    /**
     * Returns the staged record and stages the one that follows it, verifying that the two are in
     * coordinate order.
     *
     * @return the record that was staged before this call
     * @throws NoSuchElementException if no record is staged
     * @throws PicardException        if the newly staged record sorts before the returned one
     */
    @Override
    public SAMRecord next() throws PicardException {
        // Nothing staged means there is nothing to hand out.
        if (nextRecord == null) {
            throw new NoSuchElementException();
        }
        final SAMRecord current = nextRecord;

        // Stage the following record, if any.  nextRecord is still non-null here, so hasNext() is
        // expected to succeed.
        nextRecord = hasNext() ? markDuplicatesAndGetTheNextAvailable() : null;

        // With nothing staged there is no ordering to verify.
        if (nextRecord == null) {
            return current;
        }

        // Check for sorted order between the record being returned and the one just staged
        final boolean outOfOrder = sortComparator.fileOrderCompare(current, nextRecord) > 0;
        if (outOfOrder) {
            System.err.print("Previous record: " + current.getSAMString());
            System.err.print("Current record:" + nextRecord.getSAMString());
            throw new PicardException("Records were not found coordinate sort order");
        }

        return current;
    }

    /**
     * Handles records that are paired and have a mapped mate, but lack a mate cigar.  Such a record
     * is counted in the duplication metrics (unless it is secondary or supplementary) and is then
     * either passed straight to the output buffer or rejected, depending on skipPairsWithNoMateCigar.
     *
     * The record is expected to be the one currently at the head of the backing iterator (obtained
     * by peeking); when it is skipped, it is consumed from the backing iterator here.
     *
     * @param samRecordWithOrdinal the wrapped record to examine
     * @return true if the record was handled here and the caller can ignore it, false if the record
     *         is not affected (it is unpaired, its mate is unmapped, or it has a mate cigar)
     * @throws PicardException if the record lacks a mate cigar and skipPairsWithNoMateCigar is false
     */
    private boolean ignoreDueToMissingMateCigar(final SamRecordWithOrdinal samRecordWithOrdinal) {
        final SAMRecord record = samRecordWithOrdinal.getRecord();

        // ignore/except-on paired records with a mapped mate and no mate cigar
        final boolean missingMateCigar = record.getReadPairedFlag()
                && !record.getMateUnmappedFlag()
                && null == SAMUtils.getMateCigar(record);
        if (!missingMateCigar) {
            return false;
        }

        // NB: we are not truly examining these records. Do we want to count them?
        if (!record.isSecondaryOrSupplementary()) {
            // update metrics
            final DuplicationMetrics metrics = getMetrics(record);
            if (record.getReadUnmappedFlag()) {
                ++metrics.UNMAPPED_READS;
            } else {
                // The record is known to be paired with a mapped mate at this point, so it can
                // never count as an unpaired read.
                ++metrics.READ_PAIRS_EXAMINED;
            }
        }

        if (!skipPairsWithNoMateCigar) {
            throw new PicardException("Read " + record.getReadName() + " was mapped and had a mapped mate, but no mate cigar (\"MC\") tag.");
        }

        // pseudo-silently ignores them
        // NB: need to addRecordToTheOutputBuffer/set-flag as chunking/flushing of the toMarkQueue may need to occur
        addRecordToTheOutputBuffer(samRecordWithOrdinal); // now samRecordWithOrdinal will be stored in outputBuffer for return
        backingIteratorRecordIndex++;
        outputBuffer.setResultState(samRecordWithOrdinal, false); // indicate the present wrapped samRecordWithOrdinal is available for return
        numRecordsWithNoMateCigar++;
        backingIterator.next(); // remove it, since we called backingIterator.peek()
        return true;
    }

    /**
     * Handles an unmapped record found at the end of the input.
     *
     * The first time such a record is seen, nothing is consumed from the backing iterator: the reference index is
     * moved past the last sequence in the dictionary, the toMarkQueue is fully polled, and the result of
     * markDuplicatesAndGetTheNextAvailable is returned.  On later calls the record is consumed from the backing
     * iterator, the unmapped-read metric is updated (unless the record is secondary or supplementary), and the
     * consumed record is returned as-is.
     *
     * @param record the record most recently peeked from the backing iterator
     * @return the next record to emit, as described above
     * @throws PicardException if the output buffer is not empty when a later unmapped record is consumed
     */
    private SAMRecord nextIfRecordIsUnmappedAtEOF(final SAMRecord record) {
        // when we find unmapped reads with -1 as their reference index, there should be no mapped reads later in the file.
        if (!foundUnmappedEOFReads) {
            // first unmapped end-of-file read: remember that we have seen one
            foundUnmappedEOFReads = true;

            // move past all mapped reads
            referenceIndex = header.getSequenceDictionary().getSequences().size();

            // final round of duplicate marking on whatever is still queued
            tryPollingTheToMarkQueue(true, null);

            // NB: the record stays in the backing iterator; the recursive call will either flush buffered records
            // or come back here with foundUnmappedEOFReads already set
            return markDuplicatesAndGetTheNextAvailable();
        }

        // unmapped reads at the end of the file were seen before: consume this one (it was only peeked so far)
        final SAMRecord consumed = backingIterator.next();

        if (!record.isSecondaryOrSupplementary()) {
            // update metrics
            getMetrics(record).UNMAPPED_READS++;
        }

        // nothing should be waiting in the buffer at this point
        if (!outputBuffer.isEmpty()) {
            throw new PicardException("Encountered unmapped reads at the end of the file, but the alignment start buffer was not empty.");
        }

        // unmapped end-of-file records are emitted directly, without duplicate marking
        return consumed;
    }

    /**
     * Sanity check that duplicate marking is not being performed with too few records.  The original authors note
     * this can happen when the alignment start is increasing but 5' soft-clipping is increasing such that reads with
     * the same 5' unclipped alignment start are missed, which is especially true for RNAseq.
     *
     * The head of the toMarkQueue is compared with the given end.  If both are on the same reference and the head's
     * coordinate exceeds the given end's coordinate by at least the queue's minimum distance, an exception is thrown.
     * The message differs depending on whether the head record's cigar contains skipped reference bases.
     *
     * @param current the read end about to be considered for duplicate marking
     * @throws PicardException if the minimum distance is found to be too small
     */
    private void checkForMinimumDistanceFailure(final ReadEndsForMateCigar current) {
        if (toMarkQueue.isEmpty()) {
            return; // nothing queued, nothing to compare against
        }

        final ReadEndsForMateCigar head = toMarkQueue.peek();

        // only ends on the same reference can be compared by coordinate
        if (head.read1ReferenceIndex != current.read1ReferenceIndex) {
            return;
        }

        // the minimum distance is still large enough
        if (toMarkQueue.getToMarkQueueMinimumDistance() > head.read1Coordinate - current.read1Coordinate) {
            return;
        }

        if (checkCigarForSkips(head.getRecord().getCigar())) {
            // skipped reference bases (N) in the cigar: likely a spliced alignment
            final String splicedMessage = "Found a samRecordWithOrdinal with sufficiently large code length that we may have\n"
                    + " missed including it in an early duplicate marking iteration.  Alignment contains skipped"
                    + " reference bases (N's). If this is an\n RNAseq aligned bam, please use MarkDuplicates instead,"
                    + " as this tool does not work well with spliced reads.\n Minimum distance set to "
                    + toMarkQueue.getToMarkQueueMinimumDistance() + " but " + (head.read1Coordinate - current.read1Coordinate - 1)
                    + " would be required.\n" + "Record was: " + head.getRecord().getSAMString();
            throw new PicardException(splicedMessage);
        }

        // no skips: report both records and ask for a larger minimum distance
        System.err.print("record #1: " + head.getRecord().getSAMString());
        System.err.print("record #2: " + current.getRecord().getSAMString());
        final String clippingMessage = "Found a samRecordWithOrdinal with sufficiently large clipping that we may have\n"
                + " missed including it in an early duplicate marking iteration.  Please increase the"
                + " minimum distance to at least " + (head.read1Coordinate - current.read1Coordinate - 1)
                + "bp\nto ensure it is considered (was " + toMarkQueue.getToMarkQueueMinimumDistance() + ").\n"
                + "Record was: " + head.getRecord().getSAMString();
        throw new PicardException(clippingMessage);
    }

    /**
     * This tries to get a record that has been evaluated for duplicate marking.  It does this by first seeing if there
     * are any records in the output buffer that have been through duplicate marking.  If none are available, it will
     * try to get more records from the input iterator until there are reads available that have been duplicate marked.
     * If there are no more records available from the input iterator, it will duplicate mark the final chunk of
     * records.  Finally, if there are no more records, it will return null.
     *
     * Records are only peeked from the input iterator at first; they are consumed later, once it is known that they
     * are not unmapped reads at the end of the file.
     *
     * @return the next record available for emission, or null when the input iterator, the toMarkQueue, and the
     *         output buffer are all empty
     */
    private SAMRecord markDuplicatesAndGetTheNextAvailable() {

        // Check if there are any records we can flush from the output buffer
        { // NB: braces to limit the scope of 'record'
            final SAMRecord record = flush();
            if (null != record) return record;
        }

        // Check if there are any more records to read in
        if (!backingIterator.hasNext()) { // no more records to read in

            // Check if there are any more to mark
            if (toMarkQueue.isEmpty()) {
                // check if there are any that can be outputted
                if (outputBuffer.isEmpty()) {
                    return null;
                } // no need to flush; no records in the queue and buffer
            } else {
                // force marking duplicates on the remaining records
                tryPollingTheToMarkQueue(true, null);
            }

            /** Since we have no more records to read in, and no more records that need duplicate marking run, we
             * update our coordinate to past the end of the reference
             */
            referenceIndex = header.getSequenceDictionary().getSequences().size();

            /** Now we recurse, so that we can flush from the outputBuffer until it is empty, then return null when
             * all of the input, queue, and buffer are empty */
            return markDuplicatesAndGetTheNextAvailable();
        }

        /** We need to retrieve more records from the input iterator and duplicate mark, until we can return one that
         *  has been through duplicate marking.
         */
        while (backingIterator.hasNext()) {

            // NB: we could get rid of this if we made nextRecord into a list...
            // NB: we do not actually remove this record from the backing iterator until much later, to help with processing unmapped reads at the EOF
            SAMRecord record = backingIterator.peek(); // peek: used for unmapped reads
            final SamRecordWithOrdinal samRecordWithOrdinal = new SamRecordWithOrdinalAndSetDuplicateReadFlag(record, backingIteratorRecordIndex);

            // Only set for mapped records (in the else-branch below); stays null for unmapped records
            ReadEndsForMateCigar readEnds = null;
            boolean performedChunkAndMarkTheDuplicates = false;

            // remove any duplicate information already present on the input record
            record.setDuplicateReadFlag(false);

            /**
             * Check for paired records with a mapped mate but no mate cigar.  When this returns true the record has
             * already been added to the output buffer and consumed from the backing iterator, so we move straight on.
             */
            if (ignoreDueToMissingMateCigar(samRecordWithOrdinal)) {
                continue;
            }

            // check for an unmapped read
            if (record.getReadUnmappedFlag()) {
                // unmapped reads at the end of the file!
                if (-1 == record.getReferenceIndex()) {
                    // NB: this may call markDuplicatesAndGetTheNextAvailable if this is the first time a EOF unmapped record has been seen
                    return nextIfRecordIsUnmappedAtEOF(record);
                } else if (!record.isSecondaryOrSupplementary()) {
                    // update metrics
                    final DuplicationMetrics metrics = getMetrics(record);
                    ++metrics.UNMAPPED_READS;
                }
                // we will check for unmapped reads later so as not to add them to mark queue
            } else {
                // If not already set (-1), this sets the minimum distance to twice the read length, or 100, whichever is larger
                if (-1 == toMarkQueue.getToMarkQueueMinimumDistance()) {
                    // use twice the first read's length
                    toMarkQueue.setToMarkQueueMinimumDistance(Math.max(2 * record.getReadBases().length, 100));
                }

                // build a read end for use in the toMarkQueue
                readEnds = new ReadEndsForMateCigar(header, samRecordWithOrdinal, opticalDuplicateFinder, libraryIdGenerator.getLibraryId(samRecordWithOrdinal.getRecord()));

                // check that the minimumDistance was not too small
                checkForMinimumDistanceFailure(readEnds);

                /**
                 * If we can do some duplicate marking, lets do it!
                 * IMPORTANT: this does not flush the to-mark-queue, so the minimum distance needs to be set for us to infer
                 * which records will never be supplemented (i.e. are non-duplicate).
                 */
                performedChunkAndMarkTheDuplicates = tryPollingTheToMarkQueue(false, readEnds);
            }

            // We can now remove the record from the input
            backingIterator.next();

            // Add this to the outputBuffer so it can be tracked.  It will not be available to emit until it has been through duplicate marking.
            addRecordToTheOutputBuffer(samRecordWithOrdinal);
            backingIteratorRecordIndex++; // Each record has an index and is emitted in the same order as it was read.

            // We do not consider secondary, supplementary, or unmapped alignments for duplicate marking. We can thus mark that duplicate marking on them has been completed.
            if (record.isSecondaryOrSupplementary() || record.getReadUnmappedFlag()) {
                outputBuffer.setResultState(samRecordWithOrdinal, false);
            } else {
                // Bring the simple metrics up to date
                final DuplicationMetrics metrics = getMetrics(record);
                if (!record.getReadPairedFlag() || record.getMateUnmappedFlag()) {
                    ++metrics.UNPAIRED_READS_EXAMINED;
                } else {
                    ++metrics.READ_PAIRS_EXAMINED; // will need to be divided by 2 at the end
                }

                // Add the record for duplicate marking, which may in fact cause it to be duplicate marked or stored for later
                toMarkQueue.add(readEnds, outputBuffer, getMetrics(readEnds.getRecord()));
            }

            // Check if there are any we can flush, which happens if we just performed duplicate marking
            if (performedChunkAndMarkTheDuplicates) {
                record = flush();
                if (null != record) return record;
            }
        }

        // try again, as we may have records we can flush, or we want to see if we are at the EOF
        return markDuplicatesAndGetTheNextAvailable();
    }


    /**
     * Removal is not supported by this iterator.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /**
     * Closes the backing input iterator and the output buffer, then marks this iterator as closed so that the
     * post-close accessors (which call enforceClosed) become usable.
     *
     * The output buffer is closed even if closing the backing iterator throws, so that it is never leaked.  The
     * iterator is only marked as closed when both close calls complete normally.
     */
    @Override
    public void close() {
        try {
            // close the input
            backingIterator.close();
        } finally {
            // always close the output, even when closing the input failed
            outputBuffer.close();
        }
        isClosed = true;
    }

    /**
     * Reports whether a Cigar contains at least one N operator.  Reads with skipped bases may be spliced RNAseq reads.
     *
     * @param cigar the cigar whose elements are inspected
     * @return true if any element of the cigar has operator N, false otherwise
     */
    private boolean checkCigarForSkips(final Cigar cigar) {
        for (final CigarElement element : cigar.getCigarElements()) {
            if (CigarOperator.N == element.getOperator()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Guard used by accessors that are only valid once this iterator has been closed.
     *
     * @throws PicardException if the iterator has not been closed yet
     */
    private void enforceClosed() {
        if (isClosed) {
            return;
        }
        throw new PicardException("Calling a method that assumes the iterator is closed");
    }

    /**
     * Useful for statistics after the iterator has been exhausted and closed.
     *
     * @return the number of records that were skipped because they had a mapped mate but no mate cigar
     * @throws PicardException if the iterator has not been closed
     */
    public int getNumRecordsWithNoMateCigar() {
        enforceClosed();
        return numRecordsWithNoMateCigar;
    }

    /**
     * Only valid after the iterator has been closed.
     *
     * @return the number of duplicates as reported by the toMarkQueue
     * @throws PicardException if the iterator has not been closed
     */
    public int getNumDuplicates() {
        enforceClosed();
        return toMarkQueue.getNumDuplicates();
    }

    /**
     * Only valid after the iterator has been closed.
     *
     * @return the library id generator used by this iterator
     * @throws PicardException if the iterator has not been closed
     */
    public LibraryIdGenerator getLibraryIdGenerator() {
        enforceClosed();
        return libraryIdGenerator;
    }

    /**
     * Only valid after the iterator has been closed.
     *
     * @return the histogram obtained from the library id generator's optical-duplicates-by-library-id map
     *         (NOTE(review): presumably keyed by library id with optical duplicate counts as values; confirm
     *         against LibraryIdGenerator)
     * @throws PicardException if the iterator has not been closed
     */
    public Histogram<Short> getOpticalDupesByLibraryId() {
        enforceClosed();
        return libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap();
    }

    /**
     * Pulls the next emittable SAMRecord from the output buffer, if there is one.  Records are taken from the head
     * of the buffer only, which keeps the output in the original coordinate sort order in a stable fashion.
     *
     * When removeDuplicates is set, records flagged as duplicates are taken off the buffer and dropped rather than
     * returned.
     *
     * @return the unwrapped record from the head of the buffer, or null if the buffer is empty, its head cannot yet
     *         be emitted, or every emittable record was a duplicate that got removed
     */
    private SAMRecord flush() {
        // keep going while the buffer has a head record that has been through duplicate marking
        while (!outputBuffer.isEmpty() && outputBuffer.canEmit()) {
            // unwrap the SAMRecord held by the buffer
            final SAMRecord candidate = outputBuffer.next().getRecord();

            // duplicates are silently dropped when removal was requested; try the next record instead
            final boolean dropAsDuplicate = removeDuplicates && candidate.getDuplicateReadFlag();
            if (dropAsDuplicate) {
                continue;
            }
            return candidate;
        }
        return null;
    }

    /**
     * Registers a record with the output buffer so that it can be tracked.  Being in the buffer does not make the
     * record ready for emission, since it may still need to go through duplicate marking.
     *
     * If the record lies on a later reference than the one currently being processed, the toMarkQueue is fully
     * polled first and the current reference index is advanced to the record's reference index.
     *
     * @param samRecordWithOrdinal the wrapped record to track
     * @throws PicardException if the record's reference index is lower than the current reference index
     */
    private void addRecordToTheOutputBuffer(final SamRecordWithOrdinal samRecordWithOrdinal) throws PicardException {
        final int incomingReferenceIndex = samRecordWithOrdinal.getRecord().getReferenceIndex();

        if (incomingReferenceIndex < referenceIndex) {
            throw new PicardException("Records out of order: " + incomingReferenceIndex + " < " + referenceIndex);
        }

        if (incomingReferenceIndex > referenceIndex) {
            // We have moved on to a new reference, so duplicate mark everything still queued for the previous one.
            // NB: we will not miss inter-chromosomal alignments since presumably one end will have been mapped to this chromosome and processed, and we do not need the other end to do so.
            tryPollingTheToMarkQueue(true, null);
            // advance the genomic coordinate to the new reference index
            referenceIndex = incomingReferenceIndex;
        }

        // track the record in the output buffer
        outputBuffer.add(samRecordWithOrdinal);
    }

    /**
     * Polls entries off the toMarkQueue, which sends them through duplicate marking.  Either flush must be true or
     * current must be non-null.
     *
     * When flushing, the queue is polled until it is empty.  Otherwise polling continues only while the current end
     * is on a different reference than the one being processed, or while the head of the queue is further than the
     * queue's minimum distance behind the current end.
     *
     * @param flush   true if we should empty the toMarkQueue fully.
     * @param current the current end to ensure we consider all possible ends for a duplicate; may be null only when
     *                flush is true
     * @return true if at least one entry was polled from the toMarkQueue, false otherwise
     * @throws PicardException if flush is false and current is null, or if the toMarkQueue is non-empty while the
     *                         output buffer is empty
     */
    private boolean tryPollingTheToMarkQueue(final boolean flush, final ReadEndsForMateCigar current) {
        if (!flush && null == current) throw new PicardException("Flush cannot be false and current be null");

        // nothing queued, nothing to do
        if (toMarkQueue.isEmpty()) {
            return false;
        }

        // the queue is known to be non-empty here, so the output buffer must be tracking its records
        if (outputBuffer.isEmpty()) {
            throw new PicardException("0 < toMarkQueue && outputBuffer.isEmpty()");
        }

        boolean polledAny = false;
        while (!toMarkQueue.isEmpty()) {
            // When not flushing, stop as soon as the head of the queue is on the current reference and still within
            // the minimum distance of the current end: later ends may yet be duplicates of it.
            if (!flush
                    && referenceIndex == current.read1ReferenceIndex
                    && toMarkQueue.getToMarkQueueMinimumDistance() >= current.read1Coordinate - toMarkQueue.peek().read1Coordinate) {
                break;
            }

            // Poll will track that this samRecordWithOrdinal has been through duplicate marking. It is not marked as a duplicate :)
            final ReadEndsForMateCigar polled = toMarkQueue.poll(outputBuffer, header, opticalDuplicateFinder, libraryIdGenerator);
            polledAny = true;

            // track optical duplicates using only those reads that are the first end...
            if (toMarkQueue.shouldBeInLocations(polled) && polled.getRecord().getFirstOfPairFlag()) {
                final Set<ReadEnds> locations = toMarkQueue.getLocations(polled);
                if (!locations.isEmpty()) {
                    AbstractMarkDuplicatesCommandLineProgram.trackOpticalDuplicates(new ArrayList<ReadEnds>(locations),
                            opticalDuplicateFinder, libraryIdGenerator);
                }
            }
            // NB: we could try to greedily return a record if one is available here.  Instead we continue processing the mark queue
        }
        return polledAny;
    }

    /**
     * Looks up the duplication metrics for the library the given record belongs to, creating and registering a new
     * metrics object for that library if none exists yet.
     *
     * @param record the record whose library name is resolved against the header
     * @return the (possibly newly created) metrics for the record's library
     */
    private DuplicationMetrics getMetrics(final SAMRecord record) {
        final String libraryName = LibraryIdGenerator.getLibraryName(header, record);

        final DuplicationMetrics existing = libraryIdGenerator.getMetricsByLibrary(libraryName);
        if (existing != null) {
            return existing;
        }

        // first record seen for this library: start a fresh metrics object and register it
        final DuplicationMetrics created = new DuplicationMetrics();
        created.LIBRARY = libraryName;
        libraryIdGenerator.addMetricsByLibrary(libraryName, created);
        return created;
    }
}