MUTCodec.java example

Explorer
igv-master
- src
- test
  - src
/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2007-2015 Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package org.broad.igv.feature.tribble;

import org.apache.log4j.Logger;
import org.broad.igv.Globals;
import org.broad.igv.exceptions.DataLoadException;
import org.broad.igv.feature.Mutation;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.util.ParsingUtils;
import org.broad.igv.util.ResourceLocator;
import org.broad.igv.util.collections.MultiMap;
import htsjdk.tribble.AsciiFeatureCodec;
import htsjdk.tribble.readers.AsciiLineReader;
import htsjdk.tribble.readers.LineIterator;
import htsjdk.tribble.readers.LineIteratorImpl;

import java.io.BufferedReader;
import java.io.IOException;

/**
 * Codec for .mut and .maf mutation files.
 * <p>
 * The header of the file at {@code path} is read when the codec is constructed. The header
 * determines whether the file is treated as MAF (first header column is "Hugo_Symbol" and at
 * least 16 columns are present) or as a generic MUT file, which in turn fixes the column
 * indices used by {@link #decode(String)}.
 *
 * @author jrobinso
 *         Date: 10/24/12
 *         Time: 12:05 PM
 */
public class MUTCodec extends AsciiFeatureCodec<Mutation> {

    private static final Logger log = Logger.getLogger(MUTCodec.class);

    /**
     * Number of unparseable data lines tolerated (logged and skipped) before loading is aborted.
     */
    private static final int MAX_ERROR_COUNT = 100;

    private String path;  // for error messages
    private boolean isMAF;
    private String[] headers;
    private String[] samples;
    private Genome genome;
    private int chrColumn;
    private int startColumn;
    private int endColumn;
    private int sampleColumn;
    private int typeColumn;
    // Allele columns are -1 when the format does not define them (MUT files).
    private int refAlleleColumn;
    private int tumorAllele1Column;
    private int tumorAllele2Column;
    private int errorCount = 0;


    /**
     * Creates a codec and eagerly reads the header of the file at {@code path}.
     * <p>
     * An {@link IOException} raised while opening or reading the file is logged, not rethrown.
     * A malformed or missing header line results in the {@link RuntimeException} thrown by
     * {@link #readActualHeader(LineIterator)}.
     *
     * @param path   path of the mutation file; also used in error messages
     * @param genome genome used to canonicalize chromosome names, or {@code null} to use
     *               chromosome names as they appear in the file
     */
    public MUTCodec(String path, Genome genome) {
        super(Mutation.class);
        this.path = path;
        this.genome = genome;
        try {
            AsciiLineReader lineReader = new AsciiLineReader(ParsingUtils.openInputStream(path));
            try {
                readActualHeader(new LineIteratorImpl(lineReader));
            } finally {
                // The stream is only needed for the header; release it even if parsing fails.
                lineReader.close();
            }
        } catch (IOException e) {
            log.error(e.getMessage(), e);
        }
    }

    /**
     * Consumes leading "#" lines and the column header line from {@code reader}.
     * <p>
     * A "#samples" line supplies a comma separated list of sample names. The first line that does
     * not start with "#" is taken as the tab-delimited column header and must have at least
     * 5 columns.
     *
     * @param reader iterator positioned at the start of the file
     * @return always {@code null}
     * @throws RuntimeException if the header line has fewer than 5 columns, or if the input ends
     *                          before a header line is found
     */
    public Void readActualHeader(LineIterator reader) {
        String nextLine = null;

        while (reader.hasNext()) {
            nextLine = reader.peek();
            if (nextLine.startsWith("#")) {
                if (nextLine.startsWith("#samples")) {
                    String[] tokens = Globals.whitespacePattern.split(nextLine, 2);
                    if (tokens.length < 2) {
                        log.error("Error parsing sample header in mutation file: " + path);
                    } else {
                        samples = Globals.commaPattern.split(tokens[1]);
                        for (int i = 0; i < samples.length; i++) {
                            samples[i] = samples[i].trim();
                        }
                    }
                }
                reader.next();
                continue;
            }

            String[] tokens = Globals.tabPattern.split(nextLine);
            if (tokens.length >= 5) {
                reader.next();
                headers = tokens;
                isMAF = headers.length >= 16 && headers[0].equalsIgnoreCase("Hugo_Symbol");
                setColumns(isMAF);
                return null;
            } else {
                throw new RuntimeException(String.format("Not enough columns in header line found in %s: %s", path, nextLine));
            }
        }
        throw new RuntimeException("Unexpected end-of-file (no header line): " + path);
    }

    /**
     * @return the sample names listed on the "#samples" header line, or {@code null} if the file
     *         had no such line
     */
    public String[] getSamples() {
        return samples;
    }

    /**
     * @return zero-based index of the chromosome column
     */
    public int getChrColumn() {
        return chrColumn;
    }

    /**
     * @return zero-based index of the start position column
     */
    public int getStartColumn() {
        return startColumn;
    }

    /**
     * Decodes a single tab-delimited data line into a {@link Mutation}.
     * <p>
     * Comment lines, the MAF header line and blank lines yield {@code null}. Lines with too few
     * columns or with non-numeric start/end values are logged and skipped (also {@code null})
     * until more than {@value #MAX_ERROR_COUNT} such lines have been seen, at which point loading
     * is aborted.
     *
     * @param line one line of the mutation file
     * @return the decoded mutation, or {@code null} if the line carries no feature or was skipped
     * @throws DataLoadException once more than {@value #MAX_ERROR_COUNT} lines have failed to parse
     */
    @Override
    public Mutation decode(String line) {

        if (line.startsWith("#") || line.startsWith("Hugo_Symbol")) {
            return null;
        }
        if (line.trim().isEmpty()) {
            // Blank lines (e.g. at the end of the file) carry no feature and are not errors.
            return null;
        }

        String[] tokens = Globals.tabPattern.split(line);

        int required = requiredColumnCount();
        if (tokens.length < required) {
            // Without this check the column lookups below would fail with an index error.
            recordParseError(line, "Expected at least " + required + " tab-delimited columns but found "
                    + tokens.length + ".");
            return null;
        }

        String chr = genome == null ? tokens[chrColumn].trim() : genome.getCanonicalChrName(tokens[chrColumn].trim());

        Integer startValue = parsePosition(tokens, startColumn, line);
        if (startValue == null) {
            return null;
        }
        Integer endValue = parsePosition(tokens, endColumn, line);
        if (endValue == null) {
            return null;
        }
        int start = startValue;
        int end = endValue;

        // MAF files use the 1-based inclusive convention for coordinates.  The convention is not
        // specified for MUT files, and it appears both conventions have been used.  We can detect
        // the convention used for single base mutations by testing start == end.
        if (isMAF || (start == end)) {
            start--;
        }

        String sampleId = tokens[sampleColumn].trim();
        String type = tokens[typeColumn].trim();

        // Keep every non-empty column as an attribute, keyed by its header name.
        MultiMap<String, String> attributes = new MultiMap<String, String>();
        int n = Math.min(headers.length, tokens.length);
        for (int i = 0; i < n; i++) {
            String value = tokens[i];
            if (value.length() > 0) {
                attributes.put(headers[i], value);
            }
        }

        Mutation mut = new Mutation(sampleId, chr, start, end, type);
        mut.setAttributes(attributes);

        // Allele columns are only defined for MAF; the length check above already covers them
        // because the MAF sample column lies beyond all three.
        if (refAlleleColumn >= 0) {
            mut.setRefAllele(tokens[refAlleleColumn].trim());
        }
        if (tumorAllele1Column >= 0) {
            mut.setAltAllele1(tokens[tumorAllele1Column].trim());
        }
        if (tumorAllele2Column >= 0) {
            mut.setAltAllele2(tokens[tumorAllele2Column].trim());
        }

        return mut;
    }

    /**
     * Parses the integer in {@code tokens[column]}.
     *
     * @return the parsed value, or {@code null} if the token is not an integer and the error
     *         limit has not yet been exceeded
     * @throws DataLoadException if the token is not an integer and the error limit is exceeded
     */
    private Integer parsePosition(String[] tokens, int column, String line) {
        try {
            return Integer.parseInt(tokens[column].trim());
        } catch (NumberFormatException e) {
            recordParseError(line, "Column " + (column + 1) + " must be a numeric value.");
            return null;
        }
    }

    /**
     * Counts a bad data line; logs it while within the tolerance, otherwise aborts loading.
     *
     * @throws DataLoadException with {@code message} once more than {@value #MAX_ERROR_COUNT}
     *                           bad lines have been recorded
     */
    private void recordParseError(String line, String message) {
        errorCount++;
        if (errorCount > MAX_ERROR_COUNT) {
            throw new DataLoadException(message, path);
        }
        log.info("Error parsing line: " + line);
    }

    /**
     * @return the minimum number of columns a data line needs so that every mandatory column
     *         index is in range
     */
    private int requiredColumnCount() {
        int maxIndex = Math.max(Math.max(chrColumn, startColumn), Math.max(endColumn, sampleColumn));
        maxIndex = Math.max(maxIndex, typeColumn);
        return maxIndex + 1;
    }


    /**
     * Sets the zero-based column indices for the detected file format.
     *
     * @param isMAF {@code true} to use the MAF column layout, {@code false} for the MUT layout
     */
    private void setColumns(boolean isMAF) {

        this.isMAF = isMAF;
        if (isMAF) {
            chrColumn = 4;
            startColumn = 5;
            endColumn = 6;
            sampleColumn = 15;
            typeColumn = 8;
            refAlleleColumn = 10;
            tumorAllele1Column = 11;
            tumorAllele2Column = 12;
        } else {
            chrColumn = 0;
            startColumn = 1;
            endColumn = 2;
            sampleColumn = 3;
            typeColumn = 4;
            refAlleleColumn = -1;
            tumorAllele1Column = -1;
            tumorAllele2Column = -1;
        }
    }

    /**
     * Tests whether the resource is a mutation file this codec can read.
     * <p>
     * Resources whose type string ends with ".maf.annotated", or whose IGV extension is "mut",
     * are accepted without inspecting the content. For the "maf" extension the file is opened
     * and the first line that does not start with "#" must have more than 15 tab-delimited
     * columns, the first being "Hugo_Symbol" (case-insensitive). I/O errors are logged and
     * reported as {@code false}.
     *
     * @param locator the resource to test
     * @return {@code true} if the resource looks like a mutation file, {@code false} otherwise
     *         (including when the file is empty, contains only comment lines, or cannot be read)
     */
    public static boolean isMutationAnnotationFile(ResourceLocator locator) {

        String ext;
        {
            //Only want pathLC when checking extension, so we limit its scope
            String typeStringLC = locator.getTypeString().toLowerCase();
            if (typeStringLC.endsWith(".maf.annotated")) {
                // TCGA extension
                return true;
            }

            ext = ParsingUtils.getIGVExtension(typeStringLC);
        }
        if (ext.equals("mut")) {
            return true;
        } else if (ext.equals("maf")) {

            BufferedReader reader = null;
            String path = locator.getPath();
            try {
                reader = ParsingUtils.openBufferedReader(path);
                if (reader == null) {
                    return false;
                }

                // Skip leading comment lines; stop at the first candidate header line or at EOF.
                String nextLine;
                do {
                    nextLine = reader.readLine();
                } while (nextLine != null && nextLine.startsWith("#"));

                if (nextLine == null) {
                    // Empty file or comments only: there is no header to identify it as a mutation MAF.
                    return false;
                }

                String[] tokens = nextLine.split("\t");
                return tokens.length > 15 && tokens[0].equalsIgnoreCase("Hugo_Symbol");
            } catch (IOException e) {
                log.error("Error reading: " + path, e);
                return false;
            } finally {
                if (reader != null) {
                    try {
                        reader.close();
                    } catch (IOException e) {
                        log.error("Error closing: " + path, e);
                    }
                }
            }
        } else {
            return false;
        }
    }

    /**
     * Tests, by file name only, whether this codec handles the given path.
     * <p>
     * A trailing ".gz" is ignored; the remaining name must end with ".mut", ".maf" or
     * ".maf.annotated" (case-insensitive), the same extensions recognized by
     * {@link #isMutationAnnotationFile(ResourceLocator)}.
     *
     * @param path path or file name to test
     * @return {@code true} if the name has a mutation file extension
     */
    @Override
    public boolean canDecode(String path) {
        String fn = path.toLowerCase();
        if (fn.endsWith(".gz")) {
            fn = fn.substring(0, fn.length() - 3);
        }
        return fn.endsWith(".mut") || fn.endsWith(".maf") || fn.endsWith(".maf.annotated");
    }
}