/* * Eoulsan development code * * This code may be freely distributed and modified under the * terms of the GNU Lesser General Public License version 2.1 or * later and CeCILL-C. This should be distributed with the code. * If you do not have a copy, see: * * http://www.gnu.org/licenses/lgpl-2.1.txt * http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt * * Copyright for this code is held jointly by the Genomic platform * of the Institut de Biologie de l'École normale supérieure and * the individual authors. These should be listed in @author doc * comments. * * For more information on the Eoulsan project and its aims, * or to join the Eoulsan Google group, visit the home page * at: * * http://outils.genomique.biologie.ens.fr/eoulsan * */ package fr.ens.biologie.genomique.eoulsan.bio.io.hadoop; import static fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.Counters.ENTRIES_WRITTEN; import java.io.IOException; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; /** * This class define a RecordReader for FASTQ files for the Hadoop MapReduce * framework. * @since 1.0 * @author Laurent Jourdren */ public class FastqRecordReader extends RecordReader<Text, Text> { private static final String COUNTERS_GROUP = "FASTQ Input Format Counters"; private final TaskAttemptContext context; private Text key = new Text(); private Text value = new Text(); private final String[] lines = new String[4]; private FastqLineRecordReader lrr; @Override public synchronized void close() throws IOException { this.lrr.close(); } @Override public Text getCurrentKey() throws IOException, InterruptedException { return this.key; } @Override public Text getCurrentValue() throws IOException, InterruptedException { return this.value; } @Override public float getProgress() throws IOException, InterruptedException { return this.lrr.getProgress(); } @Override public void initialize(final InputSplit inputSplit, final TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException { this.lrr = new FastqLineRecordReader(); this.lrr.initialize(inputSplit, taskAttemptContext); } @Override public synchronized boolean nextKeyValue() throws IOException, InterruptedException { int count = 0; boolean found = false; while (!found) { if (!this.lrr.nextKeyValue(count != 0)) { return false; } final String s = this.lrr.getCurrentValue().toString().trim(); // Prevent empty lines if (s.length() == 0) { continue; } this.lines[count] = s; if (count < 3) { count++; } else { if (this.lines[0].charAt(0) == '@' && this.lines[2].charAt(0) == '+') { found = true; } else { // Shift lines this.lines[0] = this.lines[1]; this.lines[1] = this.lines[2]; this.lines[2] = this.lines[3]; } } } // Set key this.key = new Text(memberId(this.lines[0].substring(1))); // Set value this.value = new Text(this.lines[0].substring(1) + '\t' + this.lines[1] + '\t' + this.lines[3]); // Clean array this.lines[0] = this.lines[1] = this.lines[2] = this.lines[3] = null; this.context.getCounter(COUNTERS_GROUP, ENTRIES_WRITTEN).increment(1); return true; } /** * Get the member id of a sequence Id * @param s sequence id * @return the member of the sequence id */ private static String memberId(final String s) { if (s == null) { return null; } // New Illumina Id final int pos1 = s.indexOf(' '); if (pos1 != -1) { return s.substring(0, pos1); } // Old Illumina Id final int pos2 = s.indexOf('/'); if (pos2 != -1) { return s.substring(0, pos2); } // Other, do nothing return s; } // // Constructor // /** * Public constructor. * @param context the context */ public FastqRecordReader(final TaskAttemptContext context) { this.context = context; } }