/*
 * Copyright 2015 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.hpg.bigdata.tools.sequence;

import hbparquet.hadoop.util.ContextUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.seqdoop.hadoop_bam.*;
import org.seqdoop.hadoop_bam.FormatConstants.BaseQualityEncoding;
import org.seqdoop.hadoop_bam.util.ConfHelper;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by jtarraga on 21/05/15.
 * Modification of the Hadoop-BAM FastqInputFormat that avoids calling context.setStatus(...),
 * since TaskAttemptContext became an interface (not a class) from r2.4.1.
 */
public class FastqInputFormatMODIF extends FileInputFormat<Text, SequencedFragment> {

    public static final String CONF_BASE_QUALITY_ENCODING = "hbam.fastq-input.base-quality-encoding";
    public static final String CONF_FILTER_FAILED_QC = "hbam.fastq-input.filter-failed-qc";
    public static final String CONF_BASE_QUALITY_ENCODING_DEFAULT = "sanger";

    public static class FastqRecordReader extends RecordReader<Text, SequencedFragment> {
        /*
         * fastq format:
         * <fastq>   := <block>+
         * <block>   := @<seqname>\n<seq>\n+[<seqname>]\n<qual>\n
         * <seqname> := [A-Za-z0-9_.:-]+
         * <seq>     := [A-Za-z\n\.~]+
         * <qual>    := [!-~\n]+
         *
         * LP: this format is broken, no? You can have multi-line sequence and quality strings,
         * and the quality encoding includes '@' in its valid character range. So how should one
         * distinguish between \n@ as a record delimiter and \n@ as part of a multi-line
         * quality string?
         *
         * For now I'm going to assume single-line sequences. This works for our sequencing
         * application. We'll see if someone complains in other applications.
         */
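        /*
         * Illustrative single record under the single-line assumption above (example data,
         * not taken from this project):
         *
         *   @SEQ_ID
         *   GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT
         *   +
         *   !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65
         */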
        // start: first valid data index
        private long start;
        // end: first index value beyond the slice, i.e. slice is in range [start,end)
        private long end;
        // pos: current position in file
        private long pos;
        // file: the file being read
        private Path file;

        private LineReader lineReader;
        private InputStream inputStream;
        private Text currentKey = new Text();
        private SequencedFragment currentValue = new SequencedFragment();

        /* If true, will scan the identifier for read data as specified in the Casava
         * users' guide v1.8:
         * @<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos> <read>:<is filtered>:
         * <control number>:<index sequence>
         * After the first name that doesn't match, lookForIlluminaIdentifier will be
         * set to false and no further scanning will be done.
         */
        private boolean lookForIlluminaIdentifier = true;
        private static final Pattern ILLUMINA_PATTERN =
                Pattern.compile("([^:]+):(\\d+):([^:]*):(\\d+):(\\d+):(-?\\d+):(-?\\d+)\\s+([123]):([YN]):(\\d+):(.*)");

        private Text buffer = new Text();

        private BaseQualityEncoding qualityEncoding;
        private boolean filterFailedQC = false;

        // How long can a read get?
        private static final int MAX_LINE_LENGTH = 10000;

        public FastqRecordReader(Configuration conf, FileSplit split) throws IOException {
            setConf(conf);
            file = split.getPath();
            start = split.getStart();
            end = start + split.getLength();

            FileSystem fs = file.getFileSystem(conf);
            FSDataInputStream fileIn = fs.open(file);

            CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
            CompressionCodec codec = codecFactory.getCodec(file);

            if (codec == null) { // no codec. Uncompressed file.
                positionAtFirstRecord(fileIn);
                inputStream = fileIn;
            } else { // compressed file
                if (start != 0) {
                    throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");
                }
                inputStream = codec.createInputStream(fileIn);
                end = Long.MAX_VALUE; // read until the end of the file
            }

            lineReader = new LineReader(inputStream);
        }

        protected void setConf(Configuration conf) {
            String encoding = conf.get(FastqInputFormat.CONF_BASE_QUALITY_ENCODING,
                    conf.get(FormatConstants.CONF_INPUT_BASE_QUALITY_ENCODING,
                            FastqInputFormat.CONF_BASE_QUALITY_ENCODING_DEFAULT));

            if ("illumina".equals(encoding)) {
                qualityEncoding = BaseQualityEncoding.Illumina;
            } else if ("sanger".equals(encoding)) {
                qualityEncoding = BaseQualityEncoding.Sanger;
            } else {
                throw new RuntimeException("Unknown input base quality encoding value " + encoding);
            }

            filterFailedQC = ConfHelper.parseBoolean(
                    conf.get(FastqInputFormat.CONF_FILTER_FAILED_QC,
                            conf.get(FormatConstants.CONF_INPUT_FILTER_FAILED_QC)),
                    false);
        }
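        /*
         * Configuration sketch (illustrative only). Note that setConf() reads the Hadoop-BAM
         * FastqInputFormat/FormatConstants keys rather than the constants duplicated on this class:
         *
         *   Configuration conf = new Configuration();
         *   conf.set(FastqInputFormat.CONF_BASE_QUALITY_ENCODING, "illumina"); // or "sanger" (the default)
         *   conf.set(FastqInputFormat.CONF_FILTER_FAILED_QC, "true");          // skip reads that failed QC
         */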
        /*
         * Position the input stream at the start of the first record.
         */
        private void positionAtFirstRecord(FSDataInputStream stream) throws IOException {
            if (start > 0) {
                // Advance to the start of the first record.
                // We use a temporary LineReader to read lines until we find the
                // position of the right one. We then seek the file to that position.
                stream.seek(start);
                LineReader reader = new LineReader(stream);

                int bytesRead = 0;
                do {
                    bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                    if (bytesRead > 0 && (buffer.getLength() <= 0 || buffer.getBytes()[0] != '@')) {
                        start += bytesRead;
                    } else {
                        // Line starts with @. Read two more lines and verify that the second starts with a '+'.
                        //
                        // If this isn't the start of a record, we want to backtrack to its end.
                        long backtrackPosition = start + bytesRead;

                        bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                        bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                        if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+') {
                            break; // all good!
                        } else {
                            // backtrack to the end of the record we thought was the start.
                            start = backtrackPosition;
                            stream.seek(start);
                            reader = new LineReader(stream);
                        }
                    }
                } while (bytesRead > 0);

                stream.seek(start);
            }
            // else
            //   if start == 0 we presume it starts with a valid fastq record
            pos = start;
        }

        /*
         * Added to use mapreduce API.
         */
        public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        }

        /*
         * Added to use mapreduce API.
         */
        public Text getCurrentKey() {
            return currentKey;
        }

        /*
         * Added to use mapreduce API.
         */
        public SequencedFragment getCurrentValue() {
            return currentValue;
        }

        /*
         * Added to use mapreduce API.
         */
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return next(currentKey, currentValue);
        }

        /*
         * Close this RecordReader to future operations.
         */
        public void close() throws IOException {
            inputStream.close();
        }

        /*
         * Create an object of the appropriate type to be used as a key.
         */
        public Text createKey() {
            return new Text();
        }

        /*
         * Create an object of the appropriate type to be used as a value.
         */
        public SequencedFragment createValue() {
            return new SequencedFragment();
        }

        /*
         * Returns the current position in the input.
         */
        public long getPos() {
            return pos;
        }

        /*
         * How much of the input has the RecordReader consumed, i.e. the fraction of the slice processed so far.
         */
        public float getProgress() {
            if (start == end) {
                return 1.0f;
            } else {
                return Math.min(1.0f, (pos - start) / (float) (end - start));
            }
        }

        public String makePositionMessage() {
            return file.toString() + ":" + pos;
        }

        protected boolean lowLevelFastqRead(Text key, SequencedFragment value) throws IOException {
            // ID line
            long skipped = lineReader.skip(1); // skip @
            pos += skipped;
            if (skipped == 0) {
                return false; // EOF
            }

            // ID
            readLineInto(key);
            // sequence
            value.clear();
            readLineInto(value.getSequence());
            readLineInto(buffer);
            if (buffer.getLength() == 0 || buffer.getBytes()[0] != '+') {
                throw new RuntimeException("unexpected fastq line separating sequence and quality at "
                        + makePositionMessage() + ". Line: " + buffer + ". \nSequence ID: " + key);
            }
            readLineInto(value.getQuality());

            // look for the Illumina-formatted name. Once it isn't found, lookForIlluminaIdentifier will be set to false
            lookForIlluminaIdentifier = lookForIlluminaIdentifier && scanIlluminaId(key, value);
            if (!lookForIlluminaIdentifier) {
                scanNameForReadNumber(key, value);
            }
            return true;
        }
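        /*
         * Note added for clarity: Sanger/Phred+33 stores Q0 as ASCII 33 ('!'), while the old
         * Illumina/Phred+64 scale stores Q0 as ASCII 64 ('@'), so converting Illumina to Sanger
         * amounts to subtracting 31 from every quality byte. next() below delegates that
         * conversion to SequencedFragment.convertQuality() and, for Sanger input, only verifies
         * that the qualities are within range.
         */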
        /*
         * Reads the next key/value pair from the input for processing.
         */
        public boolean next(Text key, SequencedFragment value) throws IOException {
            if (pos >= end) {
                return false; // past end of slice
            }
            try {
                boolean gotData;
                boolean goodRecord;
                do {
                    gotData = lowLevelFastqRead(key, value);
                    goodRecord = gotData && (!filterFailedQC || value.getFilterPassed() == null || value.getFilterPassed());
                } while (gotData && !goodRecord);

                if (goodRecord) { // goodRecord is also false when we couldn't read any more data
                    if (qualityEncoding == BaseQualityEncoding.Illumina) {
                        try {
                            // convert illumina to sanger scale
                            SequencedFragment.convertQuality(value.getQuality(),
                                    BaseQualityEncoding.Illumina, BaseQualityEncoding.Sanger);
                        } catch (FormatException e) {
                            throw new FormatException(e.getMessage() + " Position: " + makePositionMessage()
                                    + "; Sequence ID: " + key);
                        }
                    } else { // sanger qualities.
                        int outOfRangeElement = SequencedFragment.verifyQuality(value.getQuality(),
                                BaseQualityEncoding.Sanger);
                        if (outOfRangeElement >= 0) {
                            throw new FormatException("Base quality out of range for Sanger Phred+33 format (found "
                                    + (value.getQuality().getBytes()[outOfRangeElement] - FormatConstants.SANGER_OFFSET)
                                    + ").\n"
                                    + "Maybe qualities are in Illumina Phred+64 format?\n"
                                    + "Position: " + makePositionMessage() + "; Sequence ID: " + key);
                        }
                    }
                }

                return goodRecord;
            } catch (EOFException e) {
                throw new RuntimeException("unexpected end of file in fastq record at " + makePositionMessage()
                        + ". Id: " + key.toString());
            }
        }

        private void scanNameForReadNumber(Text name, SequencedFragment fragment) {
            // look for a /[0-9] at the end of the name
            if (name.getLength() >= 2) {
                byte[] bytes = name.getBytes();
                int last = name.getLength() - 1;

                if (bytes[last - 1] == '/' && bytes[last] >= '0' && bytes[last] <= '9') {
                    fragment.setRead(bytes[last] - '0');
                }
            }
        }

        private boolean scanIlluminaId(Text name, SequencedFragment fragment) {
            Matcher m = ILLUMINA_PATTERN.matcher(name.toString());
            boolean matches = m.matches();
            if (matches) {
                fragment.setInstrument(m.group(1));
                fragment.setRunNumber(Integer.parseInt(m.group(2)));
                fragment.setFlowcellId(m.group(3));
                fragment.setLane(Integer.parseInt(m.group(4)));
                fragment.setTile(Integer.parseInt(m.group(5)));
                fragment.setXpos(Integer.parseInt(m.group(6)));
                fragment.setYpos(Integer.parseInt(m.group(7)));
                fragment.setRead(Integer.parseInt(m.group(8)));
                fragment.setFilterPassed("N".equals(m.group(9)));
                fragment.setControlNumber(Integer.parseInt(m.group(10)));
                fragment.setIndexSequence(m.group(11));
            }
            return matches;
        }

        private int readLineInto(Text dest) throws IOException {
            int bytesRead = lineReader.readLine(dest, MAX_LINE_LENGTH);
            if (bytesRead <= 0) {
                throw new EOFException();
            }
            pos += bytesRead;
            return bytesRead;
        }
    }

    @Override
    public boolean isSplitable(JobContext context, Path path) {
        CompressionCodec codec = new CompressionCodecFactory(ContextUtil.getConfiguration(context)).getCodec(path);
        return codec == null;
    }

    public RecordReader<Text, SequencedFragment> createRecordReader(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        //context.setStatus(genericSplit.toString());
        return new FastqRecordReader(ContextUtil.getConfiguration(context), (FileSplit) genericSplit);
    }
}
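/*
 * Usage sketch (illustrative only; the job name and input path below are assumptions, not part
 * of this class):
 *
 *   Job job = Job.getInstance(conf, "fastq-processing");
 *   job.setInputFormatClass(FastqInputFormatMODIF.class);
 *   FileInputFormat.addInputPath(job, new Path("/path/to/reads.fq"));
 *   // the mapper then receives Text keys (read names) and SequencedFragment values
 */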