ExpressionFileParser.java example

Explorer
igv-master
- src
- test
  - src
/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2007-2015 Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/*
 * ExpressionFileParser.java
 *
 * Parser for GCT and related file formats (e.g. RES).
 *
 * Created on October 18, 2007, 2:33 PM
 *
 * To change this template, choose Tools | Template Manager
 * and open the template in the editor.
 */
package org.broad.igv.data.expression;

//~--- non-JDK imports --------------------------------------------------------

import org.apache.log4j.Logger;
import org.broad.igv.Globals;
import org.broad.igv.exceptions.ParserException;
import org.broad.igv.feature.Locus;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.tools.StatusMonitor;
import org.broad.igv.track.TrackType;
import org.broad.igv.ui.IGV;
import org.broad.igv.ui.util.MagetabSignalDialog;
import org.broad.igv.util.ParsingUtils;
import org.broad.igv.util.ResourceLocator;
import htsjdk.tribble.readers.AsciiLineReader;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;

/**
 * TODO -- handle case with probe file
 *
 * @author jrobinso
 */
public class ExpressionFileParser {

    private static Logger log = Logger.getLogger(ExpressionFileParser.class);
    private static final int MAX_ERROR_COUNT = 200;

    public enum FileType {

        RES, GCT, MAPPED, TAB, MET, DCHIP, MAGE_TAB
    }

    ResourceLocator dataFileLocator;

    FileType type;
    Genome genome;

    /**
     * Map chr -> longest feature
     */
    Map<String, Integer> longestProbeMap;

    Map<String, List<Row>> rowMap = new HashMap();
    StatusMonitor statusMonitor;
    GeneToLocusHelper locusHelper;

    // For effecient lookup, data column name -> index
    Map<String, Integer> dataColumnIndexMap;

    int errorCount = 0;

    /**
     * Test to determine if the path referes to a GCT file.  The GCT specification requires that the first line
     * be equal to #1.2
     *
     * @param path
     * @return
     * @throws IOException
     */
    public static boolean isGCT(String path) throws IOException {
        BufferedReader reader = null;
        try {
            reader = ParsingUtils.openBufferedReader(path);
            String firstLine = reader.readLine();
            return firstLine != null && firstLine.trim().equals("#1.2");
        } finally {
            if (reader != null) reader.close();
        }
    }


    public ExpressionFileParser(ResourceLocator resFile, String probeFile, Genome genome) throws IOException {

        this.dataFileLocator = resFile;
        this.type = determineType(resFile);
        this.genome = genome;
        longestProbeMap = new HashMap();
        locusHelper = new GeneToLocusHelper(probeFile);

    }


    public ExpressionFileParser(File resFile, String probeFile, Genome genome) throws IOException {
        this(new ResourceLocator(resFile.getAbsolutePath()), probeFile, genome);
    }

    /*
     */

    public static boolean parsableMAGE_TAB(ResourceLocator file) throws IOException {
        AsciiLineReader reader = null;
        try {
            reader = ParsingUtils.openAsciiReader(file);
            String nextLine = null;

            //skip first row
            reader.readLine();

            //check second row for MAGE_TAB identifiers
            if ((nextLine = reader.readLine()) != null && (nextLine.contains("Reporter REF") ||
                    nextLine.contains("Composite Element REF") || nextLine.contains("Term Source REF") ||
                    nextLine.contains("CompositeElement REF") || nextLine.contains("TermSource REF") ||
                    nextLine.contains("Coordinates REF"))) {
                return true;
            }
        } finally {
            if (reader != null) {
                reader.close();
            }
        }

        return false;
    }

    public ExpressionDataset createDataset() {
        ExpressionDataset dataset = new ExpressionDataset(genome);
        parse(dataset);
        return dataset;
    }

    public static FileType determineType(ResourceLocator dataFileLocator) {
        String fn = dataFileLocator.getPath().toLowerCase();
        if (fn.endsWith(".gz")) {
            int l = fn.length() - 3;
            fn = fn.substring(0, l);
        }
        if (fn.endsWith(".txt")) {
            int l = fn.length() - 4;
            fn = fn.substring(0, l);
        }

        //TODO genomespace hack
        if (dataFileLocator.getPath().contains("?") && dataFileLocator.getPath().contains("dataformat/gct")) {
            fn = ".gct";
        }
        FileType type;

        if (fn.endsWith("res")) {
            type = FileType.RES;
        } else if (fn.endsWith("gct")) {
            type = FileType.GCT;
        } else if (fn.endsWith("mapped")) {
            type = FileType.MAPPED;
        } else if (fn.endsWith("met")) {
            type = FileType.MET;
        } else if (fn.endsWith("dchip")) {
            type = FileType.DCHIP;
        } else if ("mage-tab".equals(dataFileLocator.getType()) || "MAGE_TAB".equals(dataFileLocator.getDescription())) {
            type = FileType.MAGE_TAB;
        } else {
            type = FileType.TAB;
        }
        return type;
    }

    /**
     * Parse the file and return a Dataset
     *
     * @return
     */
    public void parse(ExpressionDataset dataset) {

        // Create a buffer for the string split utility.  We use  a custom utility as opposed
        // to String.split() for performance.

        dataset.setType(TrackType.GENE_EXPRESSION);

        BufferedReader reader = null;
        String nextLine = null;
        int lineCount = 0;
        //String[] columnHeadings = null;
        try {

            reader = ParsingUtils.openBufferedReader(dataFileLocator);

            // Parse the header(s) to determine the precise format.
            FormatDescriptor formatDescriptor = parseHeader(reader, type, dataset);
            final int probeColumn = formatDescriptor.probeColumn;
            final int descriptionColumn = formatDescriptor.descriptionColumn;
            int nDataColumns = formatDescriptor.dataColumns.length;

            dataset.setColumnHeadings(formatDescriptor.dataHeaders);

            dataColumnIndexMap = new HashMap<String, Integer>();
            for (int i = 0; i < formatDescriptor.dataHeaders.length; i++) {
                dataColumnIndexMap.put(formatDescriptor.dataHeaders[i], i);
            }

            // Loop through the data rows

            while ((nextLine = reader.readLine()) != null) {

                String[] tokens = Globals.tabPattern.split(nextLine, -1);
                int nTokens = tokens.length;
                String probeId = new String(tokens[probeColumn]);
                float[] values = new float[nDataColumns];

                String description = (descriptionColumn >= 0) ? tokens[descriptionColumn] : null;

                if (type == FileType.MAGE_TAB && probeId.startsWith("cg")) {
                    // TODO -- this is a very ugly and fragile method to determine data type! Change this!
                    dataset.setType(TrackType.DNA_METHYLATION);
                }

                for (int i = 0; i < nDataColumns; i++) {
                    try {
                        int dataIndex = formatDescriptor.dataColumns[i];

                        // If we are out of value tokens, or the cell is blank, assign NAN to the cell.
                        if ((dataIndex >= nTokens) || (tokens[dataIndex].length() == 0)) {
                            values[i] = Float.NaN;
                        } else {
                            values[i] = Float.parseFloat(tokens[dataIndex]);
                        }

                    } catch (NumberFormatException numberFormatException) {

                        // This s an expected condition.  IGV uses NaN to
                        // indicate non numbers (missing data values)
                        values[i] = Float.NaN;
                    }

                }
                addRow(probeId, description, values);
                lineCount++;

                // This method is designed to be interruptable (canceled by
                // user.  Check every 1000 lines for an interrupt.
                if (lineCount == 1000) {
                    checkForInterrupt();
                    lineCount = 0;
                    if (statusMonitor != null) {
                        statusMonitor.incrementPercentComplete(1);
                    }
                }
            }    // End loop through lines


            // Sort row for each chromosome by start location
            sortRows();

            // Update dataset
            for (String chr : rowMap.keySet()) {
                dataset.setStartLocations(chr, getStartLocations(chr));
                dataset.setEndLocations(chr, getEndLocations(chr));
                dataset.setFeatureNames(chr, getProbes(chr));


                for (String heading : formatDescriptor.dataHeaders) {
                    dataset.setData(heading, chr, getData(heading, chr));
                }
            }

            dataset.setLongestFeatureMap(longestProbeMap);

        } catch (FileNotFoundException ex) {
            throw new RuntimeException(ex);
        } catch (InterruptedException e) {
            throw new RuntimeException("Operation cancelled");
        } catch (Exception e) {
            e.printStackTrace();
            if (nextLine != null && lineCount != 0) {
                throw new ParserException(e.getMessage(), e, lineCount, nextLine);
            } else {
                throw new RuntimeException(e);
            }
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {

                }
            }
        }

    }


    public void addRow(String probeId, String description, float[] values) {
        List<Locus> loci = locusHelper.getLoci(probeId, description, genome.getId());
        if (loci != null) {
            for (Locus locus : loci) {
                if ((locus != null) && locus.isValid()) {

                    String chr = genome == null ? locus.getChr() : genome.getCanonicalChrName(locus.getChr());

                    List<Row> rows = rowMap.get(chr);
                    if (rows == null) {
                        rows = new ArrayList();
                        rowMap.put(chr, rows);
                    }

                    int length = locus.getEnd() - locus.getStart();
                    if (longestProbeMap.containsKey(chr)) {
                        longestProbeMap.put(chr, Math.max(longestProbeMap.get(chr), length));
                    } else {
                        longestProbeMap.put(chr, length);
                    }

                    rows.add(new Row(probeId, chr, locus.getStart(), locus.getEnd(), values));

                }
            }
        } else {
            if (errorCount < MAX_ERROR_COUNT) {
                log.info("Probe: '" + probeId + "' could not be mapped to a genomic position.");
            } else if (errorCount == MAX_ERROR_COUNT) {
                log.info("Maximum probe mapping warning count exceeded.  Further mapping errors will not be logged");
            }
            errorCount++;

        }
    }


    /**
     * Sort all row collections by ascending start location
     */
    private void sortRows() {
        Comparator<Row> c = new Comparator<Row>() {

            public int compare(ExpressionFileParser.Row arg0, ExpressionFileParser.Row arg1) {
                return arg0.start - arg1.start;
            }
        };
        for (List<Row> rows : rowMap.values()) {
            Collections.sort(rows, c);
        }
    }

    public String[] getProbes(String chr) {
        List<Row> rows = rowMap.get(chr);
        String[] labels = new String[rows.size()];
        for (int i = 0; i < rows.size(); i++) {
            labels[i] = rows.get(i).feature;
        }
        return labels;

    }


    public int[] getStartLocations(String chr) {

        List<Row> rows = rowMap.get(chr);
        int[] startLocations = new int[rows.size()];
        for (int i = 0; i < rows.size(); i++) {
            startLocations[i] = rows.get(i).start;
        }
        return startLocations;
    }


    public int[] getEndLocations(String chr) {

        List<Row> rows = rowMap.get(chr);
        int[] endLocations = new int[rows.size()];
        for (int i = 0; i < rows.size(); i++) {
            endLocations[i] = rows.get(i).end;
        }
        return endLocations;
    }


    public float[] getData(String heading, String chr) {

        int columnIndex = dataColumnIndexMap.get(heading);
        List<Row> rows = rowMap.get(chr);
        float[] data = new float[rows.size()];
        for (int i = 0; i < rows.size(); i++) {
            data[i] = rows.get(i).values[columnIndex];
        }
        return data;

    }


    /**
     * Note:  This is an exact copy of the method in IGVDatasetParser.  Refactor to merge these
     * two parsers, or share a common base class.
     *
     * @param comment
     * @param dataset
     */
    private static void parseComment(String comment, ExpressionDataset dataset) {

        // If no dataset is supplied there is nothing to do
        if (dataset == null) return;

        String tmp = comment.substring(1, comment.length());
        if (tmp.startsWith("track")) {
            dataset.setTrackLine(tmp);

        } else {
            String[] tokens = tmp.split("=");
            if (tokens.length != 2) {
                return;
            }
            String key = tokens[0].trim().toLowerCase();
            if (key.equals("name")) {
                dataset.setName(tokens[1].trim());
            } else if (key.equals("type")) {

                try {
                    dataset.setType(TrackType.valueOf(tokens[1].trim().toUpperCase()));
                } catch (Exception exception) {
                    // Ignore
                }
            }
        }
    }

    private void checkForInterrupt() throws InterruptedException {
        Thread.sleep(1);    // <- check for interrupted thread
    }

    public static FormatDescriptor parseHeader(BufferedReader reader, FileType type, ExpressionDataset dataset) throws IOException {


        int descriptionColumn = -1;    // Default - no description column
        int dataStartColumn = -1;
        int probeColumn = 0;

        switch (type) {
            case RES:
                dataStartColumn = 2;
                probeColumn = 1;
                descriptionColumn = 0;
                break;
            case GCT:
                dataStartColumn = 2;
                probeColumn = 0;
                descriptionColumn = 1;
                break;
            case MAPPED:
                dataStartColumn = 4;
                probeColumn = 0;
                break;
            case MET:
                dataStartColumn = 4;
                probeColumn = 0;
                break;
            case DCHIP:
                dataStartColumn = 1;
                probeColumn = 0;
                descriptionColumn = -1;
                break;
            case MAGE_TAB:
                descriptionColumn = -1;
                probeColumn = 0;
                break;
            case TAB:
                dataStartColumn = 1;
                probeColumn = 0;
            default:
                // throw exception?
        }


        String headerLine = findHeaderLine(reader, type, dataset);
        String[] firstHeaderRowTokens = headerLine.split("\t");

        List<String> dataHeadingList = new ArrayList(firstHeaderRowTokens.length);
        List<Integer> dataColumnList = new ArrayList<Integer>(firstHeaderRowTokens.length);
        if (type != FileType.MAGE_TAB) {
            int skip = type == FileType.RES ? 2 : 1;
            for (int i = dataStartColumn; i < firstHeaderRowTokens.length; i += skip) {
                String heading = firstHeaderRowTokens[i].replace('\"', ' ').trim();
                dataHeadingList.add(heading);
                dataColumnList.add(i);
            }
        } else {
            //Mage-Tab
            String nextLine = reader.readLine();
            String[] secondHeaderRowTokens = nextLine.split("\t");

            // Remove the label columns.
            for (int i = 1; i < firstHeaderRowTokens.length; i++) {
                if (firstHeaderRowTokens[i].trim().length() > 0) {
                    dataStartColumn = i;
                    break;
                }
            }

            String[] dataHeaderColumns = new String[secondHeaderRowTokens.length - dataStartColumn];
            System.arraycopy(secondHeaderRowTokens, dataStartColumn, dataHeaderColumns, 0, dataHeaderColumns.length);
            String qCol = findSignalColumnHeading(dataHeaderColumns);
            for (int i = 0; i < secondHeaderRowTokens.length; i++) {
                String heading = secondHeaderRowTokens[i].replace('\"', ' ').trim();
                if (heading.toLowerCase().contains(qCol.toLowerCase())) {
                    dataHeadingList.add(firstHeaderRowTokens[i]);
                    dataColumnList.add(i);
                }
            }
        }
        String[] dataHeaders = dataHeadingList.toArray(new String[dataHeadingList.size()]);
        int[] dataColumns = new int[dataColumnList.size()];
        for (int i = 0; i < dataColumnList.size(); i++) dataColumns[i] = dataColumnList.get(i);

        //nColumns = dataHeadings.length;

        // If format is RES skip the two lines following the header
        if (type == FileType.RES) {
            reader.readLine();
            reader.readLine();
        }

        return new FormatDescriptor(probeColumn, descriptionColumn, dataColumns, dataHeaders, firstHeaderRowTokens.length);
    }

    private static String findHeaderLine(BufferedReader reader, FileType type, ExpressionDataset dataset) throws IOException {
        String nextLine;
        String headerLine;
        if (type == FileType.GCT) {
            nextLine = reader.readLine();
            if (nextLine.startsWith("#")) {
                parseComment(nextLine, dataset);
            }
            nextLine = reader.readLine();
            if (nextLine.startsWith("#")) {
                parseComment(nextLine, dataset);
            }
            headerLine = reader.readLine();
        } else if (type != FileType.MAGE_TAB) {
            // Skip meta data, if any
            while ((nextLine = reader.readLine()).startsWith("#") && (nextLine != null)) {
                parseComment(nextLine, dataset);
            }
            headerLine = nextLine;
        } else {
            headerLine = reader.readLine();
        }
        return headerLine;
    }

    private static String findSignalColumnHeading(String[] tokens) {

        //Use a map so comparisons are case insensitive,
        //but returned column heading retains original case
        Map<String, String> columnHeaderSet = new HashMap<String, String>();
        for (String tok : tokens) {
            columnHeaderSet.put(tok.toLowerCase(), tok);
        }

        String[] signals = new String[]{"beta_value", "beta value", "log2 signal", "signal"};

        for (String sig : signals) {
            if (columnHeaderSet.containsKey(sig.toLowerCase())) {
                return columnHeaderSet.get(sig.toLowerCase());
            }
        }

        String qCol = null;

        HashSet<String> uniqueColumns = new HashSet<String>(Arrays.asList(tokens));
        List<String> qColumns = new ArrayList<String>(uniqueColumns);
        if (qColumns.size() == 1) {
            qCol = qColumns.get(0);
        } else if (!Globals.isHeadless()) {
            // Let user choose the signal column
            Collections.sort(qColumns);

            MagetabSignalDialog msDialog = new MagetabSignalDialog(IGV.getMainFrame(), qColumns.toArray(new String[0]));
            msDialog.setVisible(true);

            if (!msDialog.isCanceled()) {
                qCol = msDialog.getQuantitationColumn();
            } else {
                throw new RuntimeException("Could not find any signal columns in the MAGE-TAB file");
            }
        }
        return qCol;
    }

    class Row {

        String feature;
        String chr;
        int start;
        int end;
        float[] values;

        Row(String feature, String chr, int start, int end, float[] values) {
            this.feature = feature;
            this.chr = chr;
            this.start = start;
            this.end = end;
            this.values = values;
        }
    }

    public static class FormatDescriptor {
        private int probeColumn;
        private int descriptionColumn;
        private int[] dataColumns;
        private String[] dataHeaders;
        private int totalColumnCount;

        FormatDescriptor(int probeColumn, int descriptionColumn, int[] dataColumns, String[] dataHeaders, int totalColumnCount) {
            this.probeColumn = probeColumn;
            this.descriptionColumn = descriptionColumn;
            this.dataColumns = dataColumns;
            this.dataHeaders = dataHeaders;
            this.totalColumnCount = totalColumnCount;
        }

        public int getProbeColumn() {
            return probeColumn;
        }

        public int getDescriptionColumn() {
            return descriptionColumn;
        }

        public int[] getDataColumns() {
            return dataColumns;
        }

        public String[] getDataHeaders() {
            return dataHeaders;
        }

        public int getTotalColumnCount() {
            return totalColumnCount;
        }
    }
}