// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal.  If not, see <http://www.gnu.org/licenses/>.

package it.crs4.seal.prq;

import it.crs4.seal.common.IMRContext;
import it.crs4.seal.common.SequenceId;

import org.seqdoop.hadoop_bam.SequencedFragment;

import org.apache.hadoop.io.Text;

import java.io.IOException;

/**
 * Maps a single sequenced read to a (SequenceId, Text) pair.
 *
 * <p>The key combines a location string with the read number. The location is
 * built from the fragment's instrument/run/flowcell/lane/tile/x/y fields plus
 * the index sequence when lane, tile, x and y are all available; otherwise it
 * is taken from the record id with its trailing two characters removed.
 *
 * <p>The value is {@code sequence<TAB>quality<TAB>flag}, where flag is
 * {@code 1} when the filter flag is absent or true and {@code 0} otherwise.
 *
 * <p>The key, the value and the internal string buffer are single instances
 * that are overwritten on every call to
 * {@link #map(Text, SequencedFragment, IMRContext)}.
 */
public class PairReadsQSeqMapper {
    private static final int LINE_SIZE = 500;
    private static final byte[] DELIM = { 9 }; // tab
    private static final byte[] ZERO_ONE = { '0', '1' };

    // Allocated eagerly so that map() does not depend on setup() having been
    // called first.
    private final StringBuilder builder = new StringBuilder(LINE_SIZE);
    private SequenceId sequenceKey = new SequenceId();
    private Text sequenceValue = new Text();

    private boolean makeTraditionalIds = false;

    /**
     * Selects the id layout produced by
     * {@link #appendIdToBuilder(StringBuilder, SequencedFragment)}.
     *
     * @param v if true, ids start with {@code instrument_run}; if false, they
     *          start with {@code instrument:run:flowcell}
     */
    public void setMakeTraditionalIds(boolean v) {
        makeTraditionalIds = v;
    }

    /**
     * Resets the internal key buffer. Calling this before
     * {@link #map(Text, SequencedFragment, IMRContext)} is optional, since the
     * buffer is allocated at construction time.
     */
    public void setup() {
        builder.setLength(0);
    }

    /**
     * Builds the key and value for one read and writes them to the context.
     *
     * @param readId  the record id; only consulted when the fragment lacks
     *                lane, tile, x or y
     * @param read    the fragment to convert
     * @param context receives the key/value pair and a progress notification
     * @throws RuntimeException if the read number is missing, or if the
     *         fragment lacks location fields and {@code readId} is too short
     *         or has no {@code '/'} in its second-to-last position
     * @throws IOException if decoding the id or writing to the context fails
     * @throws InterruptedException propagated from the context
     */
    public void map(Text readId, SequencedFragment read, IMRContext<SequenceId, Text> context)
            throws IOException, InterruptedException {
        if (read.getRead() == null) {
            throw new RuntimeException("Cannot get read number from read: " + readId);
        }

        final boolean hasLocationFields = read.getLane() != null
                && read.getTile() != null
                && read.getXpos() != null
                && read.getYpos() != null;

        if (hasLocationFields) {
            setKeyFromFragmentFields(read);
        } else {
            // maybe it's a fastq id with a trailing read number (/1 or /2)
            setKeyFromFastqId(readId, read);
        }

        fillValue(read);

        context.write(sequenceKey, sequenceValue);
        context.progress();
    }

    /**
     * Sets the key from the fragment's own fields. Everything up to and
     * including the index sequence forms the location string; the read number
     * is stored separately in the key.
     */
    private void setKeyFromFragmentFields(SequencedFragment read) {
        builder.setLength(0);
        appendIdToBuilder(builder, read);

        // finally the index field; a missing index is written as 0
        builder.append('#');
        if (read.getIndexSequence() == null) {
            builder.append('0');
        } else {
            builder.append(read.getIndexSequence());
        }

        sequenceKey.set(builder.toString(), read.getRead());
    }

    /**
     * Sets the key from a record id that ends in {@code /<read number>}. Only
     * the presence of the {@code '/'} is checked; the final character itself
     * is not inspected.
     */
    private void setKeyFromFastqId(Text readId, SequencedFragment read) throws IOException {
        final int idLength = readId.getLength();
        if (idLength <= 2) {
            throw new RuntimeException("Read id " + readId + " is too short.  Please use qseq files or fastq with illumina-formatted name tags.");
        }

        // The '/' is expected immediately before the last character.
        final int slashPos = idLength - 2;
        if (readId.charAt(slashPos) != '/') {
            throw new RuntimeException("Didn't find /read_number at end of the read id.  Please use qseq files or fastq with illumina-formatted name tags.");
        }

        // Keep the first (length - 2) bytes, i.e. drop the trailing "/N".
        sequenceKey.set(Text.decode(readId.getBytes(), 0, slashPos), read.getRead());
    }

    /** Fills the reusable value with {@code sequence<TAB>quality<TAB>flag}. */
    private void fillValue(SequencedFragment read) {
        sequenceValue.clear();
        sequenceValue.append(read.getSequence().getBytes(), 0, read.getSequence().getLength());
        sequenceValue.append(DELIM, 0, DELIM.length);
        sequenceValue.append(read.getQuality().getBytes(), 0, read.getQuality().getLength());
        sequenceValue.append(DELIM, 0, DELIM.length);
        // the filter flag is optional.  If it's absent we assume the read passes filtering.
        final int flagOffset = (read.getFilterPassed() == null || read.getFilterPassed()) ? 1 : 0;
        sequenceValue.append(ZERO_ONE, flagOffset, 1);
    }

    /**
     * Appends the read's location id to the given builder.
     *
     * <p>With traditional ids the layout is
     * {@code instrument_run:lane:tile:x:y}; otherwise it is
     * {@code instrument:run:flowcell:lane:tile:x:y}. A null instrument, run
     * number or flowcell id contributes nothing between its separators. Lane,
     * tile, x and y are appended as-is, so callers should ensure they are
     * non-null.
     *
     * @param builder destination buffer; existing content is kept
     * @param read    fragment supplying the fields
     * @return the same builder instance that was passed in
     */
    protected StringBuilder appendIdToBuilder(StringBuilder builder, SequencedFragment read) {
        appendIfPresent(builder, read.getInstrument());

        if (makeTraditionalIds) {
            builder.append('_');
            appendIfPresent(builder, read.getRunNumber());
        } else {
            builder.append(':');
            appendIfPresent(builder, read.getRunNumber());
            builder.append(':');
            appendIfPresent(builder, read.getFlowcellId());
        }

        builder.append(':').append(read.getLane());
        builder.append(':').append(read.getTile());
        builder.append(':').append(read.getXpos());
        builder.append(':').append(read.getYpos());

        return builder;
    }

    /** Appends the field's string form, or nothing at all when it is null. */
    private static void appendIfPresent(StringBuilder target, Object field) {
        if (field != null) {
            target.append(field);
        }
    }
}