/* * The MIT License * * Copyright (c) 2011 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.illumina; import htsjdk.samtools.metrics.MetricBase; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.SequenceUtil; import htsjdk.samtools.util.StringUtil; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.programgroups.Illumina; import picard.cmdline.StandardOptionDefinitions; import picard.illumina.parser.ClusterData; import picard.illumina.parser.IlluminaDataProvider; import picard.illumina.parser.IlluminaDataProviderFactory; import picard.illumina.parser.IlluminaDataType; import picard.illumina.parser.ReadDescriptor; import picard.illumina.parser.ReadStructure; import picard.illumina.parser.ReadType; import picard.illumina.parser.readers.BclQualityEvaluationStrategy; import picard.util.IlluminaUtil; import picard.util.TabbedTextFileWithHeaderParser; import java.io.BufferedWriter; import java.io.File; import java.text.NumberFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; /** * Determine the barcode for each read in an Illumina lane. * For each tile, a file is written to the basecalls directory of the form s_<lane>_<tile>_barcode.txt. * An output file contains a line for each read in the tile, aligned with the regular basecall output * The output file contains the following tab-separated columns: * - read subsequence at barcode position * - Y or N indicating if there was a barcode match * - matched barcode sequence (empty if read did not match one of the barcodes). If there is no match * but we're close to the threshold of calling it a match we output the barcode that would have been * matched but in lower case * * @author jburke@broadinstitute.org */ @CommandLineProgramProperties( usage = "Determine the barcode for each read in an Illumina lane.\n" + "For each tile, a file is written to the basecalls directory of the form s_<lane>_<tile>_barcode.txt. " + "An output file contains a line for each read in the tile, aligned with the regular basecall output. \n" + "The output file contains the following tab-separated columns: \n" + " * read subsequence at barcode position\n" + " * Y or N indicating if there was a barcode match\n" + " * matched barcode sequence\n" + "Note that the order of specification of barcodes can cause arbitrary differences in output for poorly matching barcodes.\n\n", usageShort = "Tool to determine the barcode for each read in an Illumina lane", programGroup = Illumina.class ) public class ExtractIlluminaBarcodes extends CommandLineProgram { // The following attributes define the command-line arguments @Option(doc = "The Illumina basecalls directory. ", shortName = "B") public File BASECALLS_DIR; @Option(doc = "Where to write _barcode.txt files. By default, these are written to BASECALLS_DIR.", optional = true) public File OUTPUT_DIR; @Option(doc = "Lane number. ", shortName = StandardOptionDefinitions.LANE_SHORT_NAME) public Integer LANE; @Option(doc = ReadStructure.PARAMETER_DOC, shortName = "RS") public String READ_STRUCTURE; @Option(doc = "Barcode sequence. These must be unique, and all the same length. This cannot be used with reads that " + "have more than one barcode; use BARCODE_FILE in that case. ", mutex = {"BARCODE_FILE"}) public List<String> BARCODE = new ArrayList<String>(); @Option(doc = "Tab-delimited file of barcode sequences, barcode name and, optionally, library name. " + "Barcodes must be unique and all the same length. Column headers must be 'barcode_sequence_1', " + "'barcode_sequence_2' (optional), 'barcode_name', and 'library_name'.", mutex = {"BARCODE"}) public File BARCODE_FILE; @Option(doc = "Per-barcode and per-lane metrics written to this file.", shortName = StandardOptionDefinitions.METRICS_FILE_SHORT_NAME) public File METRICS_FILE; @Option(doc = "Maximum mismatches for a barcode to be considered a match.") public int MAX_MISMATCHES = 1; @Option(doc = "Minimum difference between number of mismatches in the best and second best barcodes for a barcode to be considered a match.") public int MIN_MISMATCH_DELTA = 1; @Option(doc = "Maximum allowable number of no-calls in a barcode read before it is considered unmatchable.") public int MAX_NO_CALLS = 2; @Option(shortName = "Q", doc = "Minimum base quality. Any barcode bases falling below this quality will be considered a mismatch even in the bases match.") public int MINIMUM_BASE_QUALITY = 0; @Option(doc = "The minimum quality (after transforming 0s to 1s) expected from reads. If qualities are lower than this value, an error is thrown." + "The default of 2 is what the Illumina's spec describes as the minimum, but in practice the value has been observed lower.") public int MINIMUM_QUALITY = BclQualityEvaluationStrategy.ILLUMINA_ALLEGED_MINIMUM_QUALITY; @Option(shortName = "GZIP", doc = "Compress output s_l_t_barcode.txt files using gzip and append a .gz extension to the file names.") public boolean COMPRESS_OUTPUTS = false; @Option(doc = "Run this many PerTileBarcodeExtractors in parallel. If NUM_PROCESSORS = 0, number of cores is automatically set to " + "the number of cores available on the machine. If NUM_PROCESSORS < 0 then the number of cores used will be " + "the number available on the machine less NUM_PROCESSORS.") public int NUM_PROCESSORS = 1; private static final Log LOG = Log.getInstance(ExtractIlluminaBarcodes.class); /** The read structure of the actual Illumina Run, i.e. the readStructure of the input data */ private ReadStructure readStructure; private IlluminaDataProviderFactory factory; private final Map<String, BarcodeMetric> barcodeToMetrics = new LinkedHashMap<String, BarcodeMetric>(); private final NumberFormat tileNumberFormatter = NumberFormat.getNumberInstance(); private BclQualityEvaluationStrategy bclQualityEvaluationStrategy; public ExtractIlluminaBarcodes() { tileNumberFormatter.setMinimumIntegerDigits(4); tileNumberFormatter.setGroupingUsed(false); } @Override protected int doWork() { IOUtil.assertFileIsWritable(METRICS_FILE); if (OUTPUT_DIR == null) { OUTPUT_DIR = BASECALLS_DIR; } IOUtil.assertDirectoryIsWritable(OUTPUT_DIR); // Create BarcodeMetric for counting reads that don't match any barcode final String[] noMatchBarcode = new String[readStructure.barcodes.length()]; int index = 0; for (final ReadDescriptor d : readStructure.descriptors) { if (d.type == ReadType.Barcode) { noMatchBarcode[index++] = StringUtil.repeatCharNTimes('N', d.length); } } final BarcodeMetric noMatchMetric = new BarcodeMetric(null, null, IlluminaUtil.barcodeSeqsToString(noMatchBarcode), noMatchBarcode); final int numProcessors; if (NUM_PROCESSORS == 0) { numProcessors = Runtime.getRuntime().availableProcessors(); } else if (NUM_PROCESSORS < 0) { numProcessors = Runtime.getRuntime().availableProcessors() + NUM_PROCESSORS; } else { numProcessors = NUM_PROCESSORS; } LOG.info("Processing with " + numProcessors + " PerTileBarcodeExtractor(s)."); final ExecutorService pool = Executors.newFixedThreadPool(numProcessors); // TODO: This is terribly inefficient; we're opening a huge number of files via the extractor constructor and we never close them. final List<PerTileBarcodeExtractor> extractors = new ArrayList<PerTileBarcodeExtractor>(factory.getAvailableTiles().size()); for (final int tile : factory.getAvailableTiles()) { final PerTileBarcodeExtractor extractor = new PerTileBarcodeExtractor( tile, getBarcodeFile(tile), barcodeToMetrics, noMatchMetric, factory, MINIMUM_BASE_QUALITY, MAX_NO_CALLS, MAX_MISMATCHES, MIN_MISMATCH_DELTA ); extractors.add(extractor); } try { for (final PerTileBarcodeExtractor extractor : extractors) { pool.submit(extractor); } pool.shutdown(); // Wait a while for existing tasks to terminate if (!pool.awaitTermination(6, TimeUnit.HOURS)) { pool.shutdownNow(); // Cancel any still-executing tasks // Wait a while for tasks to respond to being cancelled if (!pool.awaitTermination(60, TimeUnit.SECONDS)) LOG.error("Pool did not terminate"); return 1; } } catch (final Throwable e) { // (Re-)Cancel if current thread also interrupted LOG.error(e, "Parent thread encountered problem submitting extractors to thread pool or awaiting shutdown of threadpool. Attempting to kill threadpool."); pool.shutdownNow(); return 2; } LOG.info("Processed " + extractors.size() + " tiles."); for (final PerTileBarcodeExtractor extractor : extractors) { for (final String key : barcodeToMetrics.keySet()) { barcodeToMetrics.get(key).merge(extractor.getMetrics().get(key)); } noMatchMetric.merge(extractor.getNoMatchMetric()); if (extractor.getException() != null) { LOG.error("Abandoning metrics calculation because one or more PerTileBarcodeExtractors failed."); return 4; } } // Finish metrics tallying. int totalReads = noMatchMetric.READS; int totalPfReads = noMatchMetric.PF_READS; int totalPfReadsAssigned = 0; for (final BarcodeMetric barcodeMetric : barcodeToMetrics.values()) { totalReads += barcodeMetric.READS; totalPfReads += barcodeMetric.PF_READS; totalPfReadsAssigned += barcodeMetric.PF_READS; } if (totalReads > 0) { noMatchMetric.PCT_MATCHES = noMatchMetric.READS / (double) totalReads; double bestPctOfAllBarcodeMatches = 0; for (final BarcodeMetric barcodeMetric : barcodeToMetrics.values()) { barcodeMetric.PCT_MATCHES = barcodeMetric.READS / (double) totalReads; if (barcodeMetric.PCT_MATCHES > bestPctOfAllBarcodeMatches) { bestPctOfAllBarcodeMatches = barcodeMetric.PCT_MATCHES; } } if (bestPctOfAllBarcodeMatches > 0) { noMatchMetric.RATIO_THIS_BARCODE_TO_BEST_BARCODE_PCT = noMatchMetric.PCT_MATCHES / bestPctOfAllBarcodeMatches; for (final BarcodeMetric barcodeMetric : barcodeToMetrics.values()) { barcodeMetric.RATIO_THIS_BARCODE_TO_BEST_BARCODE_PCT = barcodeMetric.PCT_MATCHES / bestPctOfAllBarcodeMatches; } } } if (totalPfReads > 0) { noMatchMetric.PF_PCT_MATCHES = noMatchMetric.PF_READS / (double) totalPfReads; double bestPctOfAllBarcodeMatches = 0; for (final BarcodeMetric barcodeMetric : barcodeToMetrics.values()) { barcodeMetric.PF_PCT_MATCHES = barcodeMetric.PF_READS / (double) totalPfReads; if (barcodeMetric.PF_PCT_MATCHES > bestPctOfAllBarcodeMatches) { bestPctOfAllBarcodeMatches = barcodeMetric.PF_PCT_MATCHES; } } if (bestPctOfAllBarcodeMatches > 0) { noMatchMetric.PF_RATIO_THIS_BARCODE_TO_BEST_BARCODE_PCT = noMatchMetric.PF_PCT_MATCHES / bestPctOfAllBarcodeMatches; for (final BarcodeMetric barcodeMetric : barcodeToMetrics.values()) { barcodeMetric.PF_RATIO_THIS_BARCODE_TO_BEST_BARCODE_PCT = barcodeMetric.PF_PCT_MATCHES / bestPctOfAllBarcodeMatches; } } } // Warn about minimum qualities and assert that we've achieved the minimum. for (Map.Entry<Byte, Integer> entry : bclQualityEvaluationStrategy.getPoorQualityFrequencies().entrySet()) { LOG.warn(String.format("Observed low quality of %s %s times.", entry.getKey(), entry.getValue())); } bclQualityEvaluationStrategy.assertMinimumQualities(); // Calculate the normalized matches if (totalPfReadsAssigned > 0) { final double mean = (double) totalPfReadsAssigned / (double) barcodeToMetrics.values().size(); for (final BarcodeMetric m : barcodeToMetrics.values()) { m.PF_NORMALIZED_MATCHES = m.PF_READS / mean; } } final MetricsFile<BarcodeMetric, Integer> metrics = getMetricsFile(); for (final BarcodeMetric barcodeMetric : barcodeToMetrics.values()) { metrics.addMetric(barcodeMetric); } metrics.addMetric(noMatchMetric); metrics.write(METRICS_FILE); return 0; } /** Create a barcode filename corresponding to the given tile qseq file. */ private File getBarcodeFile(final int tile) { return new File(OUTPUT_DIR, "s_" + LANE + "_" + tileNumberFormatter.format(tile) + "_barcode.txt" + (COMPRESS_OUTPUTS ? ".gz" : "")); } /** * Validate that POSITION >= 1, and that all BARCODEs are the same length and unique * * @return null if command line is valid. If command line is invalid, returns an array of error message * to be written to the appropriate place. */ @Override protected String[] customCommandLineValidation() { final ArrayList<String> messages = new ArrayList<String>(); this.bclQualityEvaluationStrategy = new BclQualityEvaluationStrategy(MINIMUM_QUALITY); /** * In extract illumina barcodes we NEVER want to look at the template reads, therefore replace them with skips because * IlluminaDataProvider and its factory will not open these nor produce ClusterData with the template reads in them, thus reducing * the file IO and value copying done by the data provider */ readStructure = new ReadStructure(READ_STRUCTURE.replaceAll("T", "S")); final IlluminaDataType[] datatypes = (MINIMUM_BASE_QUALITY > 0) ? new IlluminaDataType[]{IlluminaDataType.BaseCalls, IlluminaDataType.PF, IlluminaDataType.QualityScores} : new IlluminaDataType[]{IlluminaDataType.BaseCalls, IlluminaDataType.PF}; factory = new IlluminaDataProviderFactory(BASECALLS_DIR, LANE, readStructure, bclQualityEvaluationStrategy, datatypes); if (BARCODE_FILE != null) { parseBarcodeFile(messages); } else { final Set<String> barcodes = new HashSet<String>(); for (final String barcode : BARCODE) { if (barcodes.contains(barcode)) { messages.add("Barcode " + barcode + " specified more than once."); } barcodes.add(barcode); final BarcodeMetric metric = new BarcodeMetric(null, null, barcode, new String[]{barcode}); barcodeToMetrics.put(barcode, metric); } } if (barcodeToMetrics.keySet().size() == 0) { messages.add("No barcodes have been specified."); } if (messages.size() == 0) { return null; } return messages.toArray(new String[messages.size()]); } public static void main(final String[] argv) { new ExtractIlluminaBarcodes().instanceMainWithExit(argv); } private static final String BARCODE_SEQUENCE_COLUMN = "barcode_sequence"; private static final String BARCODE_SEQUENCE_1_COLUMN = "barcode_sequence_1"; private static final String BARCODE_NAME_COLUMN = "barcode_name"; private static final String LIBRARY_NAME_COLUMN = "library_name"; private void parseBarcodeFile(final ArrayList<String> messages) { final TabbedTextFileWithHeaderParser barcodesParser = new TabbedTextFileWithHeaderParser(BARCODE_FILE); final String sequenceColumn = barcodesParser.hasColumn(BARCODE_SEQUENCE_COLUMN) ? BARCODE_SEQUENCE_COLUMN : barcodesParser.hasColumn(BARCODE_SEQUENCE_1_COLUMN) ? BARCODE_SEQUENCE_1_COLUMN : null; if (sequenceColumn == null) { messages.add(BARCODE_FILE + " does not have " + BARCODE_SEQUENCE_COLUMN + " or " + BARCODE_SEQUENCE_1_COLUMN + " column header"); return; } final boolean hasBarcodeName = barcodesParser.hasColumn(BARCODE_NAME_COLUMN); final boolean hasLibraryName = barcodesParser.hasColumn(LIBRARY_NAME_COLUMN); final int numBarcodes = readStructure.barcodes.length(); final Set<String> barcodes = new HashSet<String>(); for (final TabbedTextFileWithHeaderParser.Row row : barcodesParser) { final String bcStrings[] = new String[numBarcodes]; int barcodeNum = 1; for (final ReadDescriptor rd : readStructure.descriptors) { if (rd.type != ReadType.Barcode) continue; final String header = barcodeNum == 1 ? sequenceColumn : "barcode_sequence_" + String.valueOf(barcodeNum); bcStrings[barcodeNum - 1] = row.getField(header); barcodeNum++; } final String bcStr = IlluminaUtil.barcodeSeqsToString(bcStrings); if (barcodes.contains(bcStr)) { messages.add("Barcode " + bcStr + " specified more than once in " + BARCODE_FILE); } barcodes.add(bcStr); final String barcodeName = (hasBarcodeName ? row.getField(BARCODE_NAME_COLUMN) : ""); final String libraryName = (hasLibraryName ? row.getField(LIBRARY_NAME_COLUMN) : ""); final BarcodeMetric metric = new BarcodeMetric(barcodeName, libraryName, bcStr, bcStrings); barcodeToMetrics.put(StringUtil.join("", bcStrings), metric); } barcodesParser.close(); } /** * Metrics produced by the ExtractIlluminaBarcodes program that is used to parse data in * the basecalls directory and determine to which barcode each read should be assigned. */ public static class BarcodeMetric extends MetricBase { /** * The barcode (from the set of expected barcodes) for which the following metrics apply. * Note that the "symbolic" barcode of NNNNNN is used to report metrics for all reads that * do not match a barcode. */ public String BARCODE; public String BARCODE_NAME = ""; public String LIBRARY_NAME = ""; /** The total number of reads matching the barcode. */ public int READS = 0; /** The number of PF reads matching this barcode (always less than or equal to READS). */ public int PF_READS = 0; /** The number of all reads matching this barcode that matched with 0 errors or no-calls. */ public int PERFECT_MATCHES = 0; /** The number of PF reads matching this barcode that matched with 0 errors or no-calls. */ public int PF_PERFECT_MATCHES = 0; /** The number of all reads matching this barcode that matched with 1 error or no-call. */ public int ONE_MISMATCH_MATCHES = 0; /** The number of PF reads matching this barcode that matched with 1 error or no-call. */ public int PF_ONE_MISMATCH_MATCHES = 0; /** The percentage of all reads in the lane that matched to this barcode. */ public double PCT_MATCHES = 0d; /** * The rate of all reads matching this barcode to all reads matching the most prevelant barcode. For the * most prevelant barcode this will be 1, for all others it will be less than 1 (except for the possible * exception of when there are more orphan reads than for any other barcode, in which case the value * may be arbitrarily large). One over the lowest number in this column gives you the fold-difference * in representation between barcodes. */ public double RATIO_THIS_BARCODE_TO_BEST_BARCODE_PCT = 0d; /** The percentage of PF reads in the lane that matched to this barcode. */ public double PF_PCT_MATCHES = 0d; /** * The rate of PF reads matching this barcode to PF reads matching the most prevelant barcode. For the * most prevelant barcode this will be 1, for all others it will be less than 1 (except for the possible * exception of when there are more orphan reads than for any other barcode, in which case the value * may be arbitrarily large). One over the lowest number in this column gives you the fold-difference * in representation of PF reads between barcodes. */ public double PF_RATIO_THIS_BARCODE_TO_BEST_BARCODE_PCT = 0d; /** * The "normalized" matches to each barcode. This is calculated as the number of pf reads matching * this barcode over the sum of all pf reads matching any barcode (excluding orphans). If all barcodes * are represented equally this will be 1. */ public double PF_NORMALIZED_MATCHES; protected byte[][] barcodeBytes; public BarcodeMetric(final String barcodeName, final String libraryName, final String barcodeDisplay, final String[] barcodeSeqs) { this.BARCODE = barcodeDisplay; this.BARCODE_NAME = barcodeName; this.LIBRARY_NAME = libraryName; this.barcodeBytes = new byte[barcodeSeqs.length][]; for (int i = 0; i < barcodeSeqs.length; i++) { barcodeBytes[i] = htsjdk.samtools.util.StringUtil.stringToBytes(barcodeSeqs[i]); } } /** This ctor is necessary for when reading metrics from file */ public BarcodeMetric() { barcodeBytes = null; } /** Creates a copy of metric initialized with only non-accumulated and non-calculated values set */ public static BarcodeMetric copy(final BarcodeMetric metric) { final BarcodeMetric result = new BarcodeMetric(); result.BARCODE = metric.BARCODE; result.BARCODE_NAME = metric.BARCODE_NAME; result.LIBRARY_NAME = metric.LIBRARY_NAME; result.barcodeBytes = metric.barcodeBytes; return result; } /** * Adds the non-calculated * * @param metric */ public void merge(final BarcodeMetric metric) { this.READS += metric.READS; this.PF_READS += metric.PF_READS; this.PERFECT_MATCHES += metric.PERFECT_MATCHES; this.PF_PERFECT_MATCHES += metric.PF_PERFECT_MATCHES; this.ONE_MISMATCH_MATCHES += metric.ONE_MISMATCH_MATCHES; this.PF_ONE_MISMATCH_MATCHES += metric.PF_ONE_MISMATCH_MATCHES; } } /** Extracts barcodes and accumulates metrics for an entire tile. */ private static class PerTileBarcodeExtractor implements Runnable { private final int tile; private final File barcodeFile; private final Map<String, BarcodeMetric> metrics; private final BarcodeMetric noMatch; private Exception exception = null; private final boolean usingQualityScores; private final IlluminaDataProvider provider; private final ReadStructure outputReadStructure; private final int maxNoCalls, maxMismatches, minMismatchDelta, minimumBaseQuality; /** Utility class to hang onto data about the best match for a given barcode */ class BarcodeMatch { boolean matched; String barcode; int mismatches; int mismatchesToSecondBest; } /** * Constructor * * @param tile The number of the tile being processed; used for logging only. * @param barcodeFile The file to write the barcodes to * @param noMatchMetric A "template" metric that is cloned and the clone is stored internally for accumulating data * @param barcodeToMetrics A "template" metric map whose metrics are cloned, and the clones are stored internally for accumulating data */ public PerTileBarcodeExtractor( final int tile, final File barcodeFile, final Map<String, BarcodeMetric> barcodeToMetrics, final BarcodeMetric noMatchMetric, final IlluminaDataProviderFactory factory, final int minimumBaseQuality, final int maxNoCalls, final int maxMismatches, final int minMismatchDelta ) { this.tile = tile; this.barcodeFile = barcodeFile; this.usingQualityScores = minimumBaseQuality > 0; this.maxNoCalls = maxNoCalls; this.maxMismatches = maxMismatches; this.minMismatchDelta = minMismatchDelta; this.minimumBaseQuality = minimumBaseQuality; this.metrics = new LinkedHashMap<String, BarcodeMetric>(barcodeToMetrics.size()); for (final String key : barcodeToMetrics.keySet()) { this.metrics.put(key, BarcodeMetric.copy(barcodeToMetrics.get(key))); } this.noMatch = BarcodeMetric.copy(noMatchMetric); this.provider = factory.makeDataProvider(Arrays.asList(tile)); this.outputReadStructure = factory.getOutputReadStructure(); } // These methods return the results of the extraction public synchronized Map<String, BarcodeMetric> getMetrics() { return this.metrics; } public synchronized BarcodeMetric getNoMatchMetric() { return this.noMatch; } public synchronized Exception getException() { return this.exception; } /** run method which extracts barcodes and accumulates metrics for an entire tile */ synchronized public void run() { try { LOG.info("Extracting barcodes for tile " + tile); //Sometimes makeDataProvider takes a while waiting for slow file IO, for each tile the needed set of files //is non-overlapping sets of files so make the data providers in the individual threads for PerTileBarcodeExtractors //so they are not all waiting for each others file operations //Most likely we have SKIPS in our read structure since we replace all template reads with skips in the input data structure //(see customCommnandLineValidation), therefore we must use the outputReadStructure to index into the output cluster data final int[] barcodeIndices = outputReadStructure.barcodes.getIndices(); final BufferedWriter writer = IOUtil.openFileForBufferedWriting(barcodeFile); final byte barcodeSubsequences[][] = new byte[barcodeIndices.length][]; final byte qualityScores[][] = usingQualityScores ? new byte[barcodeIndices.length][] : null; while (provider.hasNext()) { // Extract the barcode from the cluster and write it to the file for the tile final ClusterData cluster = provider.next(); for (int i = 0; i < barcodeIndices.length; i++) { barcodeSubsequences[i] = cluster.getRead(barcodeIndices[i]).getBases(); if (usingQualityScores) qualityScores[i] = cluster.getRead(barcodeIndices[i]).getQualities(); } final boolean passingFilter = cluster.isPf(); final BarcodeMatch match = findBestBarcodeAndUpdateMetrics(barcodeSubsequences, qualityScores, passingFilter, metrics, noMatch); final String yOrN = (match.matched ? "Y" : "N"); for (final byte[] bc : barcodeSubsequences) { writer.write(StringUtil.bytesToString(bc)); } writer.write("\t" + yOrN + "\t" + match.barcode + "\t" + String.valueOf(match.mismatches) + "\t" + String.valueOf(match.mismatchesToSecondBest)); writer.newLine(); } writer.close(); } catch (final Exception e) { LOG.error(e, "Error processing tile ", this.tile); this.exception = e; } finally{ provider.close(); } } /** * Find the best barcode match for the given read sequence, and accumulate metrics * * @param readSubsequences portion of read containing barcode * @param passingFilter PF flag for the current read * @return perfect barcode string, if there was a match within tolerance, or null if not. */ private BarcodeMatch findBestBarcodeAndUpdateMetrics(final byte[][] readSubsequences, final byte[][] qualityScores, final boolean passingFilter, final Map<String, BarcodeMetric> metrics, final BarcodeMetric noMatchBarcodeMetric) { BarcodeMetric bestBarcodeMetric = null; int totalBarcodeReadBases = 0; int numNoCalls = 0; // NoCalls are calculated for all the barcodes combined for (final byte[] bc : readSubsequences) { totalBarcodeReadBases += bc.length; for (final byte b : bc) if (SequenceUtil.isNoCall(b)) ++numNoCalls; } // PIC-506 When forcing all reads to match a single barcode, allow a read to match even if every // base is a mismatch. int numMismatchesInBestBarcode = totalBarcodeReadBases + 1; int numMismatchesInSecondBestBarcode = totalBarcodeReadBases + 1; for (final BarcodeMetric barcodeMetric : metrics.values()) { final int numMismatches = countMismatches(barcodeMetric.barcodeBytes, readSubsequences, qualityScores); if (numMismatches < numMismatchesInBestBarcode) { if (bestBarcodeMetric != null) { numMismatchesInSecondBestBarcode = numMismatchesInBestBarcode; } numMismatchesInBestBarcode = numMismatches; bestBarcodeMetric = barcodeMetric; } else if (numMismatches < numMismatchesInSecondBestBarcode) { numMismatchesInSecondBestBarcode = numMismatches; } } final boolean matched = bestBarcodeMetric != null && numNoCalls <= maxNoCalls && numMismatchesInBestBarcode <= maxMismatches && numMismatchesInSecondBestBarcode - numMismatchesInBestBarcode >= minMismatchDelta; final BarcodeMatch match = new BarcodeMatch(); // If we have something that's not a "match" but matches one barcode // slightly, we output that matching barcode in lower case if (numNoCalls + numMismatchesInBestBarcode < totalBarcodeReadBases) { match.mismatches = numMismatchesInBestBarcode; match.mismatchesToSecondBest = numMismatchesInSecondBestBarcode; match.barcode = bestBarcodeMetric.BARCODE.toLowerCase().replaceAll(IlluminaUtil.BARCODE_DELIMITER, ""); } else { match.mismatches = totalBarcodeReadBases; match.barcode = ""; } if (matched) { ++bestBarcodeMetric.READS; if (passingFilter) { ++bestBarcodeMetric.PF_READS; } if (numMismatchesInBestBarcode == 0) { ++bestBarcodeMetric.PERFECT_MATCHES; if (passingFilter) { ++bestBarcodeMetric.PF_PERFECT_MATCHES; } } else if (numMismatchesInBestBarcode == 1) { ++bestBarcodeMetric.ONE_MISMATCH_MATCHES; if (passingFilter) { ++bestBarcodeMetric.PF_ONE_MISMATCH_MATCHES; } } match.matched = true; match.barcode = bestBarcodeMetric.BARCODE.replaceAll(IlluminaUtil.BARCODE_DELIMITER, ""); } else { ++noMatchBarcodeMetric.READS; if (passingFilter) { ++noMatchBarcodeMetric.PF_READS; } } return match; } /** * Compare barcode sequence to bases from read * * @return how many bases did not match */ private int countMismatches(final byte[][] barcodeBytes, final byte[][] readSubsequence, final byte[][] qualities) { int numMismatches = 0; // Read sequence and barcode length may not be equal, so we just use the shorter of the two for (int j = 0; j < barcodeBytes.length; j++) { final int basesToCheck = Math.min(barcodeBytes[j].length, readSubsequence[j].length); for (int i = 0; i < basesToCheck; ++i) { if (!SequenceUtil.isNoCall(readSubsequence[j][i])) { if (!SequenceUtil.basesEqual(barcodeBytes[j][i], readSubsequence[j][i])) ++numMismatches; else if (qualities != null && qualities[j][i] < minimumBaseQuality) ++numMismatches; } } } return numMismatches; } } }