// Copyright (C) 2011-2012 CRS4. // // This file is part of Seal. // // Seal is free software: you can redistribute it and/or modify it // under the terms of the GNU General Public License as published by the Free // Software Foundation, either version 3 of the License, or (at your option) // any later version. // // Seal is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY // or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License // for more details. // // You should have received a copy of the GNU General Public License along // with Seal. If not, see <http://www.gnu.org/licenses/>. package it.crs4.seal.recab; import it.crs4.seal.common.IMRContext; import it.crs4.seal.common.AbstractTaggedMapping; import it.crs4.seal.common.ReadPair; import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; public class RecabTableMapper { private static final Log LOG = LogFactory.getLog(RecabTableMapper.class); public static final String CONF_SKIP_KNOWN_VAR_SITES = "seal.recab.skip-known-variant-sites"; // mainly for testing private static final byte SANGER_OFFSET = 33; public static enum BaseCounters { All, Used, BadBases, VariantMismatches, VariantBases, NonVariantMismatches, AdaptorBasesTrimmed }; public static enum ReadCounters { Processed, FilteredTotal, FilteredUnmapped, FilteredMapQ, FilteredDuplicate, FilteredQC, FilteredSecondaryAlignment }; private VariantTable snps; private AbstractTaggedMapping currentMapping; private ArrayList<Integer> referenceCoordinates; private ArrayList<Boolean> referenceMatches; private ArrayList<Covariate> covariateList; private IMRContext<Text, ObservationCount> context; private boolean skipKnownVariantPositions = true; private Text key = new Text(); private ObservationCount value = new ObservationCount(); public void setup(VariantReader reader, IMRContext<Text, ObservationCount> context, Configuration conf) throws IOException { this.context = context; snps = new ArrayVariantTable(); LOG.info("Using " + snps.getClass().getName() + " snp table implementation."); LOG.info("loading known variation sites."); snps.load(reader); if (LOG.isInfoEnabled()) LOG.info("loaded " + snps.size() + " known variation sites."); referenceCoordinates = new ArrayList<Integer>(200); referenceMatches = new ArrayList<Boolean>(200); // TODO: make it configurable covariateList = new ArrayList<Covariate>(5); covariateList.add( new ReadGroupCovariate(conf) ); covariateList.add( new QualityCovariate() ); covariateList.add( new CycleCovariate() ); covariateList.add( new DinucCovariate() ); // set counters for (BaseCounters c: BaseCounters.class.getEnumConstants()) context.increment(c, 0); for (ReadCounters c: ReadCounters.class.getEnumConstants()) context.increment(c, 0); skipKnownVariantPositions = conf.getBoolean(CONF_SKIP_KNOWN_VAR_SITES, true); if (!skipKnownVariantPositions) LOG.warn("Not skipping known variant sites. This is not recommended for regular usage."); } protected boolean readFailsFilters(AbstractTaggedMapping map) { boolean fails = false; if (map.isUnmapped()) { context.increment(ReadCounters.FilteredUnmapped, 1); fails = true; } else if (map.getMapQ() == 0 || map.getMapQ() == 255) { context.increment(ReadCounters.FilteredMapQ, 1); fails = true; } else if (map.isDuplicate()) { context.increment(ReadCounters.FilteredDuplicate, 1); fails = true; } else if (map.isFailedQC()) { context.increment(ReadCounters.FilteredQC, 1); fails = true; } else if (map.isSecondaryAlign()) { context.increment(ReadCounters.FilteredSecondaryAlignment, 1); fails = true; } if (fails) context.increment(ReadCounters.FilteredTotal, 1); return fails; } public void map(LongWritable ignored, ReadPair pair, IMRContext<Text, ObservationCount> context) throws IOException, InterruptedException { for (AbstractTaggedMapping mapping : pair) processMapping(mapping, context); } protected void processMapping(AbstractTaggedMapping currentMapping, IMRContext<Text, ObservationCount> context) throws IOException, InterruptedException { context.increment(ReadCounters.Processed, 1); context.increment(BaseCounters.All, currentMapping.getLength()); if (readFailsFilters(currentMapping)) return; final String contig = currentMapping.getContig(); int left, right; // left and right "limits" within the current sequence if (currentMapping.isTemplateLengthAvailable() && currentMapping.getTemplateLength() < currentMapping.getLength()) { // Insert size is less than the read size, so we've sequenced part of the read adapter. // We need to trim it the last read_length - insert_size bases sequenced. if (currentMapping.isOnReverse()) { // trim from front left = currentMapping.getLength() - currentMapping.getTemplateLength(); right = currentMapping.getLength(); } else { // trim from the back left = 0; right = currentMapping.getTemplateLength(); } context.increment(BaseCounters.AdaptorBasesTrimmed, currentMapping.getLength() - (right - left)); } else { left = 0; right = currentMapping.getLength(); } currentMapping.calculateReferenceCoordinates(referenceCoordinates); currentMapping.calculateReferenceMatches(referenceMatches); for (Covariate cov: covariateList) cov.applyToMapping(currentMapping); final ByteBuffer seq = currentMapping.getSequence(); final ByteBuffer qual = currentMapping.getBaseQualities(); if (left > 0) { // we're using a relative get() method in the loop below, so here we // advance the buffer position to our starting point. seq.position( seq.position() + left ); qual.position( qual.position() + left ); } for (int i = left; i < right; ++i) { byte base = seq.get(); byte quality = qual.get(); if (base == 'N' || quality <= SANGER_OFFSET) context.increment(BaseCounters.BadBases, 1); else { int pos = referenceCoordinates.get(i); if (pos > 0) // valid reference position { // is it a known variation site? if (skipKnownVariantPositions && snps.isVariantLocation(contig, pos)) { context.increment(BaseCounters.VariantBases, 1); if (!referenceMatches.get(i)) context.increment(BaseCounters.VariantMismatches, 1); } else { // use this base context.increment(BaseCounters.Used, 1); key.clear(); for (Covariate cov: covariateList) { String tmp = cov.getValue(i); key.append(tmp.getBytes(RecabTable.ASCII), 0, tmp.length()); key.append(RecabTable.TableDelimBytes, 0, RecabTable.TableDelimBytes.length); } boolean match = referenceMatches.get(i); if (match) { value.set(1, 0); // (num observations, num mismatches) } else { // mismatch context.increment(BaseCounters.NonVariantMismatches, 1); value.set(1, 1); } context.write(key, value); } } } } } }