GFFCodec.java example

Explorer
igv-master
- src
- test
  - src
/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2007-2015 Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package org.broad.igv.feature.tribble;

import org.apache.log4j.Logger;
import org.broad.igv.Globals;
import org.broad.igv.exceptions.ParserException;
import org.broad.igv.feature.BasicFeature;
import org.broad.igv.feature.FeatureDB;
import org.broad.igv.feature.SequenceOntology;
import org.broad.igv.feature.Strand;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.track.TrackProperties;
import org.broad.igv.ui.IGV;
import org.broad.igv.ui.color.ColorUtilities;
import org.broad.igv.util.ParsingUtils;
import org.broad.igv.util.StringUtils;
import org.broad.igv.util.collections.CI;
import org.broad.igv.util.collections.MultiMap;
import htsjdk.tribble.AsciiFeatureCodec;
import htsjdk.tribble.Feature;
import htsjdk.tribble.exception.CodecLineParsingException;
import htsjdk.tribble.readers.LineIterator;

import java.util.*;

/**
 * Notes from GFF3 spec  http://www.sequenceontology.org/gff3.shtml
 * These tags have predefined meanings (tags are case sensitive):
 * <p/>
 * ID	   Indicates the name of the feature (unique).
 * Name   Display name for the feature.
 * Alias  A secondary name for the feature.
 * Parent Indicates the parent of the feature.
 * <p/>
 * Specs:
 * GFF3  http://www.sequenceontology.org/gff3.shtml
 * GFF2 specification: http://www.sanger.ac.uk/resources/software/gff/spec.html
 * UCSC GFF (GFF "1") http://genome.ucsc.edu/FAQ/FAQformat#format3
 * GTF  http://mblab.wustl.edu/GTF2.html
 * UCSC GTF  http://genome.ucsc.edu/FAQ/FAQformat#format4
 * Feature type definitions http://www.ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html#7.2
 */
public class GFFCodec extends AsciiFeatureCodec<Feature> {

    private static Logger log = Logger.getLogger(GFFCodec.class);


    /**
     * Feature types (column 3) that {@link #decode(String)} drops without building a feature.
     * Shared by all codec instances.
     */
    static HashSet<String> ignoredTypes = new HashSet<String>();


    /** Properties parsed from a "#track" / "##track" header line; null until such a line is read. */
    private TrackProperties trackProperties = null;

    /**
     * Feature types named by a "#hide" header directive. {@link #decode(String)} returns null for
     * these types after optionally registering the feature with {@link FeatureDB}.
     */
    private CI.CIHashSet featuresToHide = new CI.CIHashSet();

    /** Header object returned by {@link #getHeader()}; created lazily when header lines are read. */
    private FeatureFileHeader header;

    /** Version-specific attribute (column 9) handling; swapped to GFF3 by a "##gff-version 3" line. */
    private Helper helper;

    /** Genome used to canonicalise chromosome names; may be null. */
    private Genome genome;

    /** Set once a "##FASTA" line is seen; every later line passed to decode is skipped. */
    private boolean fastaSection = false;

    /**
     * GFF dialects a caller can request explicitly when constructing the codec.
     */
    public enum Version {
        GFF2,
        GFF3
    }


    /**
     * Creates a codec that starts in GFF2 mode. It switches to GFF3 handling if a
     * "##gff-version 3" header line is later passed to {@link #readHeaderLine(String)}.
     *
     * @param genome genome used to resolve chromosome names; may be null
     */
    public GFFCodec(Genome genome) {
        super(Feature.class);
        this.genome = genome;
        // GFF2 is the working assumption until the header proves otherwise
        this.helper = new GFF2Helper();
    }

    public GFFCodec(Version version, Genome genome) {
        super(Feature.class);
        this.genome = genome;
        if (version == Version.GFF2) {
            helper = new GFF2Helper();
        } else {
            helper = new GFF3Helper();
        }
    }

    /**
     * Interprets one header line and updates the codec state accordingly.
     * <p/>
     * Recognised directives (each accepted with one or two leading '#'):
     * <ul>
     * <li>{@code track} -- parsed into track properties and stored on the header</li>
     * <li>{@code ##gff-version 3...} -- switches attribute handling to GFF3 (this replaces the
     * current helper, so helper settings made by earlier lines are not carried over)</li>
     * <li>{@code nodecode} -- disables URL decoding of attributes</li>
     * <li>{@code hide=type1,type2} -- feature types that decode() should not return</li>
     * <li>{@code displayName=field1,field2} -- attribute names to try, in order, for the feature
     * name; without a value the name fields are cleared</li>
     * </ul>
     * Any other line is ignored.
     *
     * @param line a header line, expected to start with '#'
     */
    public void readHeaderLine(String line) {
        if (header == null) {
            header = new FeatureFileHeader();
        }
        if (line.startsWith("#track") || line.startsWith("##track")) {
            trackProperties = new TrackProperties();
            ParsingUtils.parseTrackLine(line, trackProperties);
            header.setTrackProperties(trackProperties);
        } else if (line.startsWith("##gff-version") && line.contains("3")) {
            String[] tokens = Globals.whitespacePattern.split(line);
            if (tokens.length > 1 && tokens[1].startsWith("3")) {
                helper = new GFF3Helper();
            }
        } else if (line.startsWith("#nodecode") || line.startsWith("##nodecode")) {
            helper.setUrlDecoding(false);
        } else if (line.startsWith("#hide") || line.startsWith("##hide")) {
            String[] kv = line.split("=");
            if (kv.length > 1) {
                // decode() compares against the trimmed feature type, so an untrimmed entry
                // such as " exon" (from "hide=gene, exon") could never match
                featuresToHide.addAll(Arrays.asList(splitCommaList(kv[1])));
            }
        } else if (line.startsWith("#displayName") || line.startsWith("##displayName")) {
            String[] nameTokens = line.split("=");
            if (nameTokens.length < 2) {
                helper.setNameFields(null);
            } else {
                // Attribute keys are stored trimmed, so the field names must be trimmed too
                String[] fields = splitCommaList(nameTokens[1]);
                helper.setNameFields(fields);
            }
        }
    }

    /** Splits a comma separated list and strips surrounding whitespace from every entry. */
    private static String[] splitCommaList(String csv) {
        String[] entries = csv.split(",");
        for (int i = 0; i < entries.length; i++) {
            entries[i] = entries[i].trim();
        }
        return entries;
    }

    /**
     * Consumes the leading run of '#'-prefixed lines from the reader, feeding each one to
     * {@link #readHeaderLine(String)}. The first line that does not start with '#' is left
     * unread.
     *
     * @param reader line source positioned at the start of the file
     * @return the header object, with the track properties seen so far attached
     * @throws CodecLineParsingException wrapping any exception raised while reading the header
     */
    public Object readActualHeader(LineIterator reader) {

        if (header == null) {
            header = new FeatureFileHeader();
        }
        try {
            while (reader.hasNext()) {
                final String candidate = reader.peek();
                if (!candidate.startsWith("#")) {
                    // First data line: stop without consuming it
                    break;
                }
                readHeaderLine(candidate);
                reader.next();
            }

            header.setTrackProperties(trackProperties);
            return header;
        } catch (Exception e) {
            throw new CodecLineParsingException("Error parsing header: " + e.getMessage(), e);
        }
    }

    /**
     * Reports whether a file can be parsed by this codec, judged purely by its name.
     * <p/>
     * A trailing ".gz" is ignored; the remaining name must end with ".gff", ".gff3", ".gvf" or
     * ".gtf" (case-insensitive).
     * <p/>
     * There is an assumption that there's never a situation where two different Codecs
     * return true for the same file.  If this occurs, the recommendation would be to error out.
     * <p/>
     * Note this function must never throw an error.  All errors should be trapped
     * and false returned.
     *
     * @param path the file to test for parsability with this codec; null yields false
     * @return true if the path has a recognised extension, false otherwise
     */
    public boolean canDecode(String path) {
        // Honour the "never throws" contract: a missing path is simply not decodable
        if (path == null) {
            return false;
        }
        // Locale.ROOT keeps the comparison independent of the user's default locale
        String pathLowerCase = path.toLowerCase(Locale.ROOT);
        if (pathLowerCase.endsWith(".gz")) {
            pathLowerCase = pathLowerCase.substring(0, pathLowerCase.length() - 3);
        }
        return pathLowerCase.endsWith(".gff") || pathLowerCase.endsWith(".gff3") ||
                pathLowerCase.endsWith(".gvf") || pathLowerCase.endsWith(".gtf");
    }

    /**
     * Decodes one line of a GFF/GTF file into a {@link BasicFeature}.
     * <p/>
     * Null is returned (the line yields no feature) when:
     * <ul>
     * <li>a "##FASTA" directive has been seen on this or an earlier line,</li>
     * <li>the line starts with '#',</li>
     * <li>the line has fewer than 9 tab separated columns,</li>
     * <li>the feature type is in {@code ignoredTypes}, or</li>
     * <li>the feature type was named by a "hide" header directive; in that case the feature is
     * still registered with {@link FeatureDB} when an IGV instance exists.</li>
     * </ul>
     * GFF coordinates (1-based, inclusive) are converted to 0-based, end-exclusive coordinates.
     * Features without an ID receive a generated "igv_"-prefixed UUID.
     *
     * @param line a single line of the file
     * @return the decoded feature, or null if the line is skipped
     * @throws ParserException       if the start or end column is not an integer, or the start is
     *                               smaller than 1
     * @throws NumberFormatException if the phase column is neither "." nor an integer
     */
    public BasicFeature decode(String line) {

        if (fastaSection) {
            return null;
        }
        if (line.startsWith("#")) {
            if (line.toUpperCase().startsWith("##FASTA")) {
                // Everything after this directive is sequence data, not features
                fastaSection = true;
            }
            return null;
        }

        String[] tokens = Globals.tabPattern.split(line, -1);
        int nTokens = tokens.length;

        // GFF3 files have 9 tokens,
        // TODO -- the attribute column is optional for GFF 2 and earlier (8 tokens required)
        if (nTokens < 9) {
            return null;
        }

        String chrToken = tokens[0].trim();
        String featureType = StringUtils.intern(tokens[2].trim());

        if (ignoredTypes.contains(featureType)) {
            return null;
        }

        String chromosome = genome == null ? StringUtils.intern(chrToken) : genome.getCanonicalChrName(chrToken);

        // GFF coordinates are 1-based inclusive (length = end - start + 1)
        // IGV (UCSC) coordinates are 0-based exclusive.  Adjust start and end accordingly
        int start;
        int end;
        // Tracks the column being parsed so the error message can name it
        int col = 3;
        try {
            start = Integer.parseInt(tokens[col]) - 1;
            if (start < 0) throw new ParserException("Start index must be 1 or larger; GFF is 1-based", -1, line);
            col++;
            end = Integer.parseInt(tokens[col]);
        } catch (NumberFormatException ne) {
            String msg = String.format("Column %d must contain a numeric value. %s", col + 1, ne.getMessage());
            throw new ParserException(msg, -1, line);
        }
        Strand strand = convertStrand(tokens[6]);

        String attributeString = tokens[8];

        MultiMap<String, String> attributes = new MultiMap<String, String>();

        helper.parseAttributes(attributeString, attributes);

        String id = helper.getID(attributes, featureType);
        String[] parentIds = helper.getParentIds(attributes, attributeString);

        BasicFeature f = new BasicFeature(chromosome, start, end, strand);


        // Set "thick start/end" => corresponds to coding start & end, for UTRs
        if (SequenceOntology.utrTypes.contains(featureType)) {
            // "plus" means the UTR precedes the coding region in genomic coordinates
            boolean plus = (SequenceOntology.fivePrimeUTRTypes.contains(featureType) && strand == Strand.POSITIVE) ||
                    (SequenceOntology.threePrimeUTRTypes.contains(featureType) && strand == Strand.NEGATIVE);
            if (plus) {
                // Coding sequence begins where this UTR ends
                f.setThickStart(end);
            } else {
                // Coding sequence ends where this UTR begins. Using "end" here would leave the
                // whole UTR inside the thick (coding) interval.
                f.setThickEnd(start);
            }
        }

        String phaseString = tokens[7].trim();
        if (!phaseString.equals(".")) {
            int phaseNum = Integer.parseInt(phaseString);
            f.setReadingFrame(phaseNum);
        }

        f.setName(helper.getName(attributes));
        f.setType(featureType);

        // Every feature needs an identifier; invent one when the file supplies none
        id = id != null ? id : "igv_" + UUID.randomUUID().toString();
        f.setIdentifier(id);

        f.setParentIds(parentIds);
        f.setAttributes(attributes);

        // The first spelling of the colour attribute that is present wins
        String[] colorNames = new String[]{"color", "Color", "colour", "Colour"};
        for (String colorName : colorNames) {
            if (attributes.containsKey(colorName)) {
                f.setColor(ColorUtilities.stringToColor(attributes.get(colorName)));
                break;
            }
        }

        if (featuresToHide.contains(featureType)) {
            // Hidden features are not returned, but are still registered with FeatureDB
            if (IGV.hasInstance()) FeatureDB.addFeature(f, genome);
            return null;
        }

        return f;

    }

    /**
     * @return the header built from the header lines read so far, or null if none has been read
     */
    public Object getHeader() {
        return this.header;
    }

    /**
     * Maps the GFF strand column to a {@link Strand}.
     *
     * @param strandString the raw strand column value
     * @return NEGATIVE for "-", POSITIVE for "+", NONE for anything else
     */
    private Strand convertStrand(String strandString) {
        if (strandString.equals("-")) {
            return Strand.NEGATIVE;
        }
        if (strandString.equals("+")) {
            return Strand.POSITIVE;
        }
        return Strand.NONE;
    }

    /**
     * Retained only so existing references keep compiling; getDescription no longer uses it.
     */
    static StringBuffer buf = new StringBuffer();

    /**
     * Builds an HTML description consisting of the feature type, a line break and the
     * attributes as rendered by {@code MultiMap.printHtml}.
     *
     * @param attributes the feature's attributes
     * @param type       the feature type, written first
     * @return the HTML text
     */
    static String getDescription(MultiMap<String, String> attributes, String type) {
        // Use a buffer private to this call. The previous shared static buffer was reset and
        // filled by every caller, so concurrent calls could corrupt each other's output.
        StringBuffer description = new StringBuffer();
        description.append(type);
        description.append("<br>");
        attributes.printHtml(description, 100);
        return description.toString();
    }


    /**
     * Version-specific handling of the attribute column (column 9).
     */
    public interface Helper {

        /**
         * Parses the raw attribute column into key/value pairs.
         *
         * @param attributeString raw text of the attribute column
         * @param map             receives the parsed pairs
         */
        void parseAttributes(String attributeString, MultiMap<String, String> map);

        /**
         * @param attributes parsed attributes
         * @param type       the feature type (column 3)
         * @return the feature's identifier, or null if none can be determined
         */
        String getID(MultiMap<String, String> attributes, String type);

        /**
         * @param attributes      parsed attributes
         * @param attributeString raw text of the attribute column
         * @return the identifiers of the feature's parents, or null if it has none
         */
        String[] getParentIds(MultiMap<String, String> attributes, String attributeString);

        /**
         * @param attributes parsed attributes
         * @return the display name, or null if no name field is present
         */
        String getName(MultiMap<String, String> attributes);

        /**
         * @param fields attribute names to try, in order, when looking up the display name
         */
        void setNameFields(String[] fields);

        /**
         * @param b whether attribute keys and values should be URL decoded
         */
        void setUrlDecoding(boolean b);

    }

    /**
     * Attribute handling for GFF2-style files, where column 9 holds semicolon separated
     * {@code key value} (or {@code key=value}) pairs and values may be double-quoted.
     */
    public static class GFF2Helper implements Helper {

        //TODO Almost identical
        /** Attribute names tried, in order, for the display name unless overridden. */
        static String[] DEFAULT_NAME_FIELDS = {"alias", "gene", "ID", "Locus", "locus", "Name", "name", "gene_name", "primary_name", "systematic_id", "transcript_id"};

        /** Attribute names tried, in order, by getID after the type-based lookups fail. */
        static List<String> idFields = new ArrayList<String>(Arrays.asList(DEFAULT_NAME_FIELDS));

        static {
            // Note: "transcript_id" is already part of DEFAULT_NAME_FIELDS, so this is a repeat
            idFields.add("transcript_id");
        }

        /** Attribute names tried, in order, when looking for a parent identifier. */
        static String[] possParentNames = new String[]{"transcript_id", "id", "mRNA", "systematic_id", "gene", "transcriptId", "Parent", "proteinId"};

        private String[] nameFields;

        /** Creates a helper that uses {@link #DEFAULT_NAME_FIELDS} for the display name. */
        GFF2Helper() {
            this(DEFAULT_NAME_FIELDS);
        }

        /**
         * @param nameFields attribute names to try for the display name; if null the list is
         *                   left unset and getName returns null
         */
        GFF2Helper(String[] nameFields) {
            if (nameFields != null) {
                this.nameFields = nameFields;
            }

        }

        public void setUrlDecoding(boolean b) {
            // Ignored,  GFF2 files are never url DECODED
        }


        /**
         * Parses semicolon separated attributes into key/value pairs. Each pair is split on
         * spaces or, when it contains no separating space, on '='. Only the first value of a
         * pair is kept; double quotes are removed from key and value.
         *
         * @param description raw text of the attribute column
         * @param kvalues     receives the parsed pairs
         */
        public void parseAttributes(String description, MultiMap<String, String> kvalues) {

            List<String> kvPairs = StringUtils.breakQuotedString(description.trim(), ';');
            for (String kv : kvPairs) {
                // Split with the quote-aware splitter, as is already done for ';' above.
                // A plain kv.split(" ") cut quoted values at their first embedded space,
                // e.g. note "hello world" was stored as "hello".
                List<String> tokens = StringUtils.breakQuotedString(kv, ' ');
                if (tokens.size() == 1) {
                    //Not space delimited, check =
                    tokens = StringUtils.breakQuotedString(kv, '=');
                }
                if (tokens.size() >= 2) {
                    String key = tokens.get(0).trim().replaceAll("\"", "");
                    String value = tokens.get(1).trim().replaceAll("\"", "");
                    kvalues.put(StringUtils.intern(key), value);
                }
            }
        }

        /**
         * Looks for a parent identifier under each name in {@link #possParentNames}, in order.
         *
         * @param attributes      parsed attributes
         * @param attributeString raw attribute text (unused here)
         * @return a one-element array holding the first non-empty value found, or null
         */
        public String[] getParentIds(MultiMap<String, String> attributes, String attributeString) {

            if (attributes.size() > 0) {
                for (String possName : possParentNames) {
                    if (attributes.containsKey(possName)) {
                        String parent = attributes.get(possName).trim();
                        if (parent.length() > 0) {
                            return new String[]{parent};
                        }
                    }
                }
            }
            return null;
        }


        /**
         * Determines the feature identifier. Tried in order: an attribute named like the
         * feature type, an attribute named {@code <type>_id}, the attributes in
         * {@link #idFields}, and finally the display name.
         *
         * @param attributes parsed attributes
         * @param type       the feature type (column 3)
         * @return the first non-empty candidate, or null if there is none
         */
        public String getID(MultiMap<String, String> attributes, String type) {

            //Search for an attribute == type,  take this as ID
            String id = attributes.get(type);
            if (id != null && id.length() > 0) {
                return id;
            }

            // Try <type>_id convention
            id = attributes.get(type + "_id");
            if (id != null && id.length() > 0) {
                return id;
            }


            for (String nf : idFields) {
                if (attributes.containsKey(nf)) {
                    String tmp = attributes.get(nf).trim();
                    if (tmp.length() > 0) return tmp;
                }
            }

            String tmp = getName(attributes);
            if (tmp != null && tmp.trim().length() > 0) {
                return tmp.trim();
            }

            return null;
        }

        /**
         * @param attributes parsed attributes
         * @return the value of the first configured name field present in the attributes, or
         * null if there are no attributes, no name fields, or no match
         */
        public String getName(MultiMap<String, String> attributes) {

            if (attributes.size() > 0 && nameFields != null) {
                for (String nf : nameFields) {
                    if (attributes.containsKey(nf)) {
                        return attributes.get(nf);
                    }
                }
            }

            return null;
        }

        /**
         * @param nameFields attribute names to try, in order, for the display name; null
         *                   disables name lookup
         */
        public void setNameFields(String[] nameFields) {
            this.nameFields = nameFields;
        }

    }

    /**
     * Attribute handling for GFF3 files, where column 9 holds semicolon separated
     * {@code key=value} pairs that are URL decoded unless decoding has been switched off.
     */
    public static class GFF3Helper implements Helper {

        /** Attribute names tried, in order, for the display name unless overridden. */
        static String[] DEFAULT_NAME_FIELDS = {"Name", "Alias", "ID", "gene", "locus", "gene_name"};

        private String[] nameFields;
        private boolean useUrlDecoding = true;

        /** Creates a helper that uses {@link #DEFAULT_NAME_FIELDS} for the display name. */
        public GFF3Helper() {
            this(DEFAULT_NAME_FIELDS);
        }

        /**
         * @param nameFields attribute names to try for the display name; if null the list is
         *                   left unset
         */
        GFF3Helper(String[] nameFields) {
            if (nameFields == null) {
                return;
            }
            this.nameFields = nameFields;
        }

        /**
         * @param attributes parsed attributes
         * @param ignored    raw attribute text, not used by this implementation
         * @return the comma separated entries of the "Parent" attribute, or null if it is absent
         */
        public String[] getParentIds(MultiMap<String, String> attributes, String ignored) {
            final String parents = attributes.get("Parent");
            return parents == null ? null : parents.split(",");
        }

        /**
         * Parse the column 9 attributes.  Attributes are separated by semicolons, and each one
         * is split into key and value at '='. A key without a value is stored with an empty
         * string.
         * <p/>
         * TODO -- quotes (column 9) are explicitly forbidden in GFF3 -- should breakQuotedString be used?
         *
         * @param description raw text of the attribute column
         * @param kvalues     receives the parsed pairs
         */
        public void parseAttributes(String description, MultiMap<String, String> kvalues) {

            for (String pair : StringUtils.breakQuotedString(description.trim(), ';')) {
                final List<String> parts = StringUtils.breakQuotedString(pair, '=');
                if (parts.isEmpty()) {
                    log.info("No attributes: " + description);
                    continue;
                }

                String key = parts.get(0).trim();
                String value;
                if (parts.size() == 1) {
                    value = "";
                } else {
                    value = parts.get(1).trim();
                }

                if (useUrlDecoding) {
                    key = StringUtils.decodeURL(key);
                    value = StringUtils.decodeURL(value);
                }
                kvalues.put(StringUtils.intern(key), value);
            }
        }

        /**
         * @param useUrlDecoding whether attribute keys and values are URL decoded while parsing
         */
        public void setUrlDecoding(boolean useUrlDecoding) {
            this.useUrlDecoding = useUrlDecoding;
        }

        /**
         * @param attributes parsed attributes
         * @return the value of the first configured name field present in the attributes, or
         * null if there are no attributes, no name fields, or no match
         */
        public String getName(MultiMap<String, String> attributes) {

            if (attributes.size() <= 0 || nameFields == null) {
                return null;
            }
            for (int i = 0; i < nameFields.length; i++) {
                final String candidate = nameFields[i];
                if (attributes.containsKey(candidate)) {
                    return attributes.get(candidate);
                }
            }
            return null;
        }

        /**
         * @param attributes parsed attributes
         * @param ignore     feature type, not used by this implementation
         * @return the value of the "ID" attribute
         */
        public String getID(MultiMap<String, String> attributes, String ignore) {
            final String identifier = attributes.get("ID");
            return identifier;
        }

        /**
         * @param nameFields attribute names to try, in order, for the display name
         */
        public void setNameFields(String[] nameFields) {
            this.nameFields = nameFields;
        }
    }


}