/* * The MIT License * * Copyright (c) 2014 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.illumina.quality; import htsjdk.samtools.metrics.MetricBase; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Log; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import picard.cmdline.programgroups.Metrics; import picard.illumina.parser.ClusterData; import picard.illumina.parser.IlluminaDataProvider; import picard.illumina.parser.IlluminaDataProviderFactory; import picard.illumina.parser.IlluminaDataType; import picard.illumina.parser.ReadData; import picard.illumina.parser.ReadStructure; import picard.illumina.parser.readers.BclQualityEvaluationStrategy; import java.io.File; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Random; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; /** * Collect metrics regarding the reason for reads (sequenced by HiSeqX) not passing the Illumina PF Filter. (BETA) * * @author Yossi Farjoun */ @CommandLineProgramProperties( usage = "Classify PF-Failing reads in a HiSeqX Illumina Basecalling directory into various categories. The classification is based on a heuristic that was derived by looking at a few titration experiments.", usageShort = "Classify PF-Failing reads in a HiSeqX Illumina Basecalling directory into various categories.", programGroup = Metrics.class ) public class CollectHiSeqXPfFailMetrics extends CommandLineProgram { @Option(doc = "The Illumina basecalls directory. ", shortName = "B") public File BASECALLS_DIR; @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Basename for metrics file. Resulting file will be" + " <OUTPUT>" + summaryMetricsExtension, optional = false) public File OUTPUT; @Option(shortName = "P", doc = "The fraction of (non-PF) reads for which to output explicit classification. Output file will be <OUTPUT>" + detailedMetricsExtension + " (if PROB_EXPLICIT_READS != 0)", optional = true) public double PROB_EXPLICIT_READS = 0; @Option(doc = "Lane number.", shortName = StandardOptionDefinitions.LANE_SHORT_NAME) public Integer LANE; @Option(shortName = "NP", doc = "Run this many PerTileBarcodeExtractors in parallel. If NUM_PROCESSORS = 0, number of cores is automatically set to " + "the number of cores available on the machine. If NUM_PROCESSORS < 0 then the number of cores used will be " + "the number available on the machine less NUM_PROCESSORS.", optional = true) public int NUM_PROCESSORS = 1; @Option(doc = "Number of cycles to look at. At time of writing PF status gets determined at cycle 24 so numbers greater than this will yield strange results. " + "In addition, PF status is currently determined at cycle 24, so running this with any other value is neither tested nor recommended.", optional = true) public int N_CYCLES = 24; private static final Log LOG = Log.getInstance(CollectHiSeqXPfFailMetrics.class); private final Map<Integer, PFFailSummaryMetric> tileToSummaryMetrics = new LinkedHashMap<Integer, PFFailSummaryMetric>(); private final Map<Integer, List<PFFailDetailedMetric>> tileToDetailedMetrics = new LinkedHashMap<Integer, List<PFFailDetailedMetric>>(); //Add "T" to the number of cycles to create a "TemplateRead" of the desired length. private final ReadStructure READ_STRUCTURE = new ReadStructure(N_CYCLES + "T"); public final static String detailedMetricsExtension = ".pffail_detailed_metrics"; public final static String summaryMetricsExtension = ".pffail_summary_metrics"; @Override protected String[] customCommandLineValidation() { final List<String> errors = new ArrayList<String>(); if (N_CYCLES < 0) { errors.add("Number of Cycles to look at must be greater than 0"); } if (PROB_EXPLICIT_READS > 1 || PROB_EXPLICIT_READS < 0) { errors.add("PROB_EXPLICIT_READS must be a probability, i.e., 0 <= PROB_EXPLICIT_READS <= 1"); } if (errors.size() > 0) { return errors.toArray(new String[errors.size()]); } else { return super.customCommandLineValidation(); } } /** Stock main method. */ public static void main(final String[] args) { new CollectHiSeqXPfFailMetrics().instanceMainWithExit(args); } @Override protected int doWork() { final IlluminaDataProviderFactory factory = new IlluminaDataProviderFactory(BASECALLS_DIR, LANE, READ_STRUCTURE, new BclQualityEvaluationStrategy(BclQualityEvaluationStrategy.ILLUMINA_ALLEGED_MINIMUM_QUALITY), IlluminaDataType.BaseCalls, IlluminaDataType.PF, IlluminaDataType.QualityScores, IlluminaDataType.Position); final File summaryMetricsFileName = new File(OUTPUT + summaryMetricsExtension); final File detailedMetricsFileName = new File(OUTPUT + detailedMetricsExtension); IOUtil.assertFileIsWritable(summaryMetricsFileName); if (PROB_EXPLICIT_READS != 0) { IOUtil.assertFileIsWritable(detailedMetricsFileName); } final int numProcessors; if (NUM_PROCESSORS == 0) { numProcessors = Runtime.getRuntime().availableProcessors(); } else if (NUM_PROCESSORS < 0) { numProcessors = Runtime.getRuntime().availableProcessors() + NUM_PROCESSORS; } else { numProcessors = NUM_PROCESSORS; } // Create thread-pool submit jobs and what for their completion LOG.info("Processing with " + numProcessors + " PerTilePFMetricsExtractor(s)."); final ExecutorService pool = Executors.newFixedThreadPool(numProcessors); final List<PerTilePFMetricsExtractor> extractors = new ArrayList<PerTilePFMetricsExtractor>(factory.getAvailableTiles().size()); for (final int tile : factory.getAvailableTiles()) { tileToSummaryMetrics.put(tile, new PFFailSummaryMetric(Integer.toString(tile))); tileToDetailedMetrics.put(tile, new ArrayList<PFFailDetailedMetric>()); final PerTilePFMetricsExtractor extractor = new PerTilePFMetricsExtractor( tile, tileToSummaryMetrics.get(tile), tileToDetailedMetrics.get(tile), factory, PROB_EXPLICIT_READS ); extractors.add(extractor); } try { for (final PerTilePFMetricsExtractor extractor : extractors) { pool.submit(extractor); } pool.shutdown(); // Wait forever for tasks to terminate pool.awaitTermination(Long.MAX_VALUE, TimeUnit.DAYS); } catch (final Throwable e) { // Cancel if current thread also interrupted LOG.error(e, "Parent thread encountered problem submitting extractors to thread pool or awaiting shutdown of threadpool. Attempting to kill threadpool."); pool.shutdownNow(); return 2; } LOG.info("Processed " + extractors.size() + " tiles."); // Check for exceptions from extractors for (final PerTilePFMetricsExtractor extractor : extractors) { if (extractor.getException() != null) { LOG.error("Abandoning metrics calculation because one or more PerTilePFMetricsExtractors failed."); return 4; } } // Add detailed metrics to file final MetricsFile<PFFailDetailedMetric, ?> detailedMetrics = getMetricsFile(); for (final Collection<PFFailDetailedMetric> detailedMetricCollection : tileToDetailedMetrics.values()) { for (final PFFailDetailedMetric metric : detailedMetricCollection) { detailedMetrics.addMetric(metric); } } // If detailed metrics were requested, write them now. if (PROB_EXPLICIT_READS > 0) { detailedMetrics.write(detailedMetricsFileName); } // Finish metrics tallying. Looping twice so that the "All" metrics will come out on top. final PFFailSummaryMetric totalMetric = new PFFailSummaryMetric("All"); // a "fake" tile that will contain the total tally for (final PFFailSummaryMetric summaryMetric : tileToSummaryMetrics.values()) { totalMetric.merge(summaryMetric); } // Derive fields for total metric and add to file totalMetric.calculateDerivedFields(); final MetricsFile<PFFailSummaryMetric, ?> summaryMetricsFile = getMetricsFile(); summaryMetricsFile.addMetric(totalMetric); // Prepare each tile's derived fields and add it to the file for (final PFFailSummaryMetric summaryMetric : tileToSummaryMetrics.values()) { summaryMetric.calculateDerivedFields(); summaryMetricsFile.addMetric(summaryMetric); } // Actually write the summary metrics to their file. summaryMetricsFile.write(summaryMetricsFileName); return 0; } /** Extracts metrics from a HiSeqX tile. */ private static class PerTilePFMetricsExtractor implements Runnable { private final int tile; private final PFFailSummaryMetric summaryMetric; final Collection<PFFailDetailedMetric> detailedMetrics; private Exception exception = null; private final IlluminaDataProvider provider; final private double pWriteDetailed; final private Random random = new Random(); /** * Constructor * * @param tile The number of the tile being processed. * @param summaryMetric A summaryMetric for collecting the tile data in. * @param detailedMetrics A set of metrics for collecting the classification data in. * @param factory A dataprovider for IlluminaData */ public PerTilePFMetricsExtractor( final int tile, final PFFailSummaryMetric summaryMetric, final Collection<PFFailDetailedMetric> detailedMetrics, final IlluminaDataProviderFactory factory, final double pWriteDetailed ) { this.tile = tile; this.summaryMetric = summaryMetric; this.detailedMetrics = detailedMetrics; this.pWriteDetailed = pWriteDetailed; this.provider = factory.makeDataProvider(Arrays.asList(tile)); } public Exception getException() { return this.exception; } /** run method which extracts accumulates metrics for a tile */ public void run() { try { LOG.info("Extracting PF metrics for tile " + tile); /** * Sometimes makeDataProvider takes a while waiting for slow file IO, for each tile the needed set of files * is non-overlapping sets of files so make the data providers in the individual threads for Extractors * so they are not all waiting for each others file operations */ while (provider.hasNext()) { // Extract the PF status and infer reason if FAIL from the cluster and update the summaryMetric for the tile final ClusterData cluster = provider.next(); this.summaryMetric.READS++; if (!cluster.isPf()) { this.summaryMetric.PF_FAIL_READS++; final ReadClassifier readClassifier = new ReadClassifier(cluster.getRead(0)); if (random.nextDouble() < pWriteDetailed) { detailedMetrics.add(new PFFailDetailedMetric(tile, cluster.getX(), cluster.getY(), readClassifier.numNs, readClassifier.numQGtTwo, readClassifier.failClass)); } switch (readClassifier.failClass) { case EMPTY: this.summaryMetric.PF_FAIL_EMPTY++; break; case MISALIGNED: this.summaryMetric.PF_FAIL_MISALIGNED++; break; case POLYCLONAL: this.summaryMetric.PF_FAIL_POLYCLONAL++; break; case UNKNOWN: this.summaryMetric.PF_FAIL_UNKNOWN++; break; default: LOG.error("Got unexpected fail Reason"); } } } } catch (final Exception e) { LOG.error(e, "Error processing tile ", this.tile); this.exception = e; } finally { provider.close(); } } } protected static class ReadClassifier { public enum PfFailReason { EMPTY, POLYCLONAL, MISALIGNED, UNKNOWN } private final int numNs; // The number of Ns in the base calls private final int numQGtTwo; // The number of quality scores greater than 2 private PfFailReason failClass = null; // The classification of the failure mode /** * Heart of CLP. * This class actually classifies ReadData into the reason why it failed PF * classification is based on a small set of titrated flowcells sequenced at the Broad Institute by the Genomics Platform. * Three cluster were observed: * - numNs~24 and was found only near the boundaries of tiles. it didn't seem to depend on concentration. For this reason it * was classified as MISALIGNED * <p/> * - numNs~0 and numQGtTwo<=8 these were found throughout the tiles and _decreased_ in number as the concentration of the library increased * Thus it was concluded that these correspond to the EMPTY wells * <p/> * - numNs~0 and numQGtTwo>=12 there were found throughout the tiles and _increased_ in number as the concentration of the library increased * Thus it was concluded that these correspond to the POLYCLONAL wells * <p/> * - the remaining reads were few in number the classification for them wasn't clear. Thus they are left as UNKNOWN. * <p/> * We use the length of the read as a parameter and scale the 8 and the 12 accordingly as length/3 and length/2, but in reality this has only * been tested on length=24. * * @param read The read to classify. */ public ReadClassifier(final ReadData read) { final int length = read.getBases().length; numNs = countEquals(read.getBases(), (byte) '.'); // Ns are returned as periods from Illumina numQGtTwo = countGreaterThan(read.getQualities(), (byte) 2); failClass = PfFailReason.UNKNOWN; //for cases not covered below if (numNs >= (length - 1)) { failClass = PfFailReason.MISALIGNED; } else if (numNs <= 1) { if (numQGtTwo <= length / 3) { failClass = PfFailReason.EMPTY; } else if (numQGtTwo >= length / 2) { failClass = PfFailReason.POLYCLONAL; } } } } /** a metric class for describing FP failing reads from an Illumina HiSeqX lane * */ public static class PFFailDetailedMetric extends MetricBase { // The Tile that is described by this metric. public Integer TILE; //The X coordinate of the read within the tile public int X; //The Y coordinate of the read within the tile public int Y; //The number of Ns found in this read. public int NUM_N; //The number of Quality scores greater than 2 found in this read public int NUM_Q_GT_TWO; /** * The classification of this read: {EMPTY, POLYCLONAL, MISALIGNED, UNKNOWN} * (See PFFailSummaryMetric for explanation regarding the possible classification.) */ public ReadClassifier.PfFailReason CLASSIFICATION; public PFFailDetailedMetric(final Integer TILE, final int x, final int y, final int NUM_N, final int NUM_Q_GT_TWO, final ReadClassifier.PfFailReason CLASSIFICATION) { this.TILE = TILE; X = x; Y = y; this.NUM_N = NUM_N; this.NUM_Q_GT_TWO = NUM_Q_GT_TWO; this.CLASSIFICATION = CLASSIFICATION; } /** This ctor is necessary for when reading metrics from file */ public PFFailDetailedMetric() { } } /** * Metrics produced by the GetHiSeqXPFFailMetrics program. Used to diagnose lanes from HiSeqX * Sequencing, providing the number and fraction of each of the reasons that reads could have not passed PF. * Possible reasons are EMPTY (reads from empty wells with no template strand), POLYCLONAL (reads from wells that had more than one strand * cloned in them), MISALIGNED (reads from wells that are near the edge of the tile), UNKNOWN (reads that didn't pass PF but couldn't be diagnosed) */ public static class PFFailSummaryMetric extends MetricBase { /** The Tile that is described by this metric. Can be a string (like "All") to mean some marginal over tiles. * */ public String TILE = null; /** The total number of reads examined */ public int READS = 0; /** The number of non-PF reads in this tile. */ public int PF_FAIL_READS = 0; /** The fraction of PF_READS */ public double PCT_PF_FAIL_READS = 0.0; /** The number of non-PF reads in this tile that are deemed empty. */ public int PF_FAIL_EMPTY = 0; /** The fraction of non-PF reads in this tile that are deemed empty (as fraction of all non-PF reads). */ public double PCT_PF_FAIL_EMPTY = 0.0; /** The number of non-PF reads in this tile that are deemed multiclonal. */ public int PF_FAIL_POLYCLONAL = 0; /** The fraction of non-PF reads in this tile that are deemed multiclonal (as fraction of all non-PF reads). */ public double PCT_PF_FAIL_POLYCLONAL = 0.0; /** The number of non-PF reads in this tile that are deemed "misaligned". */ public int PF_FAIL_MISALIGNED = 0; /** The fraction of non-PF reads in this tile that are deemed "misaligned" (as fraction of all non-PF reads). */ public double PCT_PF_FAIL_MISALIGNED = 0.0; /** The number of non-PF reads in this tile that have not been classified. */ public int PF_FAIL_UNKNOWN = 0; /** The fraction of non-PF reads in this tile that have not been classified (as fraction of all non-PF reads). */ public double PCT_PF_FAIL_UNKNOWN = 0.0; // constructor takes a String for tile since we want to have one instance with tile="All". This tile will contain the summary of all the tiles public PFFailSummaryMetric(final String tile) { TILE = tile; } /** This ctor is necessary for when reading metrics from file */ public PFFailSummaryMetric() { } /** * Adds the non-calculated fields from the other metric to this one. * * @param metric */ public void merge(final PFFailSummaryMetric metric) { this.READS += metric.READS; this.PF_FAIL_READS += metric.PF_FAIL_READS; this.PF_FAIL_EMPTY += metric.PF_FAIL_EMPTY; this.PF_FAIL_MISALIGNED += metric.PF_FAIL_MISALIGNED; this.PF_FAIL_POLYCLONAL += metric.PF_FAIL_POLYCLONAL; this.PF_FAIL_UNKNOWN += metric.PF_FAIL_UNKNOWN; } public void calculateDerivedFields() { //protect against divide by zero if (this.READS != 0) { this.PCT_PF_FAIL_READS = (double) this.PF_FAIL_READS / this.READS; this.PCT_PF_FAIL_EMPTY = (double) this.PF_FAIL_EMPTY / this.READS; this.PCT_PF_FAIL_MISALIGNED = (double) this.PF_FAIL_MISALIGNED / this.READS; this.PCT_PF_FAIL_POLYCLONAL = (double) this.PF_FAIL_POLYCLONAL / this.READS; this.PCT_PF_FAIL_UNKNOWN = (double) this.PF_FAIL_UNKNOWN / this.READS; } } } /** * a simple function that counts how many elements in array are equal to 'toCount' * * @param array * @param toCount * @return number of elements in array that == 'toCount' */ static private int countEquals(final byte[] array, final byte toCount) { int count = 0; for (final byte t : array) { if (t == toCount) count++; } return count; } /** * a simple function that counts how many elements in array are greater-than-or-equal-to 'value' * * @param array * @param value * @return number of elements in array that >= 'value' */ static private int countGreaterThan(final byte[] array, final byte value) { int count = 0; for (final int t : array) { if (t > value) count++; } return count; } }