REPMaskCodec.java example

Explorer
igv-master
- src
- test
  - src
/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2007-2015 Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package org.broad.igv.feature.tribble;

import org.broad.igv.Globals;
import org.broad.igv.feature.BasicFeature;
import org.broad.igv.feature.Strand;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.track.TrackProperties;
import org.broad.igv.track.TrackType;
import org.broad.igv.util.ParsingUtils;
import org.broad.igv.util.collections.MultiMap;
import htsjdk.tribble.AsciiFeatureCodec;
import htsjdk.tribble.Feature;
import htsjdk.tribble.exception.CodecLineParsingException;
import htsjdk.tribble.readers.LineIterator;

/**
 * Basically BED format with some columns rearranged
 * <p/> Columns, from UCSC documentation
 * <p/>
 * 0  bin	585	smallint(5) unsigned	Indexing field to speed chromosome range queries.
 * 1  swScore	1504	int(10) unsigned	Smith Waterman alignment score
 * 2  milliDiv	13	int(10) unsigned	Base mismatches in parts per thousand
 * 3  milliDel	4	int(10) unsigned	Bases deleted in parts per thousand
 * 4  milliIns	13	int(10) unsigned	Bases inserted in parts per thousand
 * 5  genoName	chr1	varchar(255)	Genomic sequence name
 * 6  genoStart	10000	int(10) unsigned	Start in genomic sequence
 * 7  genoEnd	10468	int(10) unsigned	End in genomic sequence
 * 8  genoLeft	-249240153	int(11)	-#bases after match in genomic sequence
 * 9  strand	+	char(1)	Relative orientation + or -
 * 10 repName	(CCCTAA)n	varchar(255)	Name of repeat
 * 11 repClass	Simple_repeat	varchar(255)	Class of repeat
 * 12 repFamily	Simple_repeat	varchar(255)	Family of repeat
 * 13 repStart	1	int(11)	Start (if strand is +) or -#bases after match (if strand is -) in repeat sequence
 * 14 repEnd	463	int(11)	End in repeat sequence
 * 15 repLeft	0	int(11)	-#bases after match (if strand is +) or start (if strand is -) in repeat sequence
 * 16 id	1	char(1)	First digit of id field in RepeatMasker .out file. Best ignored.
 */
public class REPMaskCodec extends AsciiFeatureCodec<BasicFeature> {

    FeatureFileHeader header;
    Genome genome;

    public REPMaskCodec(Genome genome) {
        super(BasicFeature.class);
        this.genome = genome;
    }

    public Object readActualHeader(LineIterator reader) {

        String nextLine;
        header = new FeatureFileHeader();
        header.setTrackType(TrackType.REPMASK);
        int nLines = 0;

        try {
            while (reader.hasNext()){
                nextLine = reader.peek();
                if( !nextLine.startsWith("#") && !nextLine.startsWith("track") &&
                    !nextLine.startsWith("browser") ){
                    break;
                }
                reader.next();
                nLines++;
                if (nextLine.startsWith("#type")) {
                    String[] tokens = nextLine.split("=");
                    if (tokens.length > 1) {
                        try {
                            header.setTrackType(TrackType.valueOf(tokens[1]));
                        } catch (Exception e) {
                            // log.error("Error converting track type: " + tokens[1]);
                        }
                    }
                } else if (nextLine.startsWith("track")) {
                    TrackProperties tp = new TrackProperties();
                    ParsingUtils.parseTrackLine(nextLine, tp);
                    header.setTrackProperties(tp);
                }
            }
            return header;
        } catch (Exception e) {
            throw new CodecLineParsingException("Error parsing header: " + e.getMessage(), e);
        }
    }

    /**
     * This function returns true iff the File potentialInput can be parsed by this
     * codec.
     * <p/>
     * There is an assumption that there's never a situation where two different Codecs
     * return true for the same file.  If this occurs, the recommendation would be to error out.
     * <p/>
     * Note this function must never throw an error.  All errors should be trapped
     * and false returned.
     *
     * @param path the file to test for parsability with this codec
     * @return true if potentialInput can be parsed, false otherwise
     */
    public boolean canDecode(String path) {
        return true; // Optimisitic!
    }

    /**
     * Return an abbreviated feature, containing only location information.  Used for indexing.
     *
     * @param line
     * @return
     */
    public Feature decodeLoc(String line) {
        String[] tokens = Globals.singleTabMultiSpacePattern.split(line);
        if(tokens.length < 15) {
            return decodeLegacy(tokens);
        }
        String chr = genome == null ? tokens[5] : genome.getCanonicalChrName(tokens[5]);
        int start = Integer.parseInt(tokens[6]);
        int end = Integer.parseInt(tokens[7]);
        return new BasicFeature(chr, start, end);
    }

    public BasicFeature decode(String nextLine) {

        if (nextLine.trim().length() == 0 || nextLine.startsWith("#")) {
            return null;
        }

        String[] tokens = Globals.singleTabMultiSpacePattern.split(nextLine);
        int tokenCount = tokens.length;

        if (tokenCount < 15) {
            return decodeLegacy(tokens);
        }

        String chr = genome == null ? tokens[5] : genome.getCanonicalChrName(tokens[5]);
        int start = Integer.parseInt(tokens[6]);
        int end = Integer.parseInt(tokens[7]);
        BasicFeature feature = new BasicFeature(chr, start, end);
        feature.setRepresentation(nextLine);

        String strandString = tokens[3].trim();
        char strand = (strandString.length() == 0) ? ' ' : strandString.charAt(0);
        if (strand == '-') {
            feature.setStrand(Strand.NEGATIVE);
        } else if (strand == '+') {
            feature.setStrand(Strand.POSITIVE);
        } else {
            feature.setStrand(Strand.NONE);
        }
        String name = tokens[10];
        feature.setName(name);
        feature.setIdentifier(name);

        MultiMap<String, String> attributes = new MultiMap<String, String>();
        attributes.put("Smith Waterman score", tokens[1]);
        attributes.put("base mismatches per thousand", tokens[2]);
        attributes.put("bases deleted per thousand", tokens[3]);
        attributes.put("bases inserted per thousand", tokens[4]);
        attributes.put("repeat class", tokens[11]);
        attributes.put("repeat family", tokens[12]);
        attributes.put("repeat start", tokens[13]);
        attributes.put("repeat end", tokens[14]);
        feature.setAttributes(attributes);

        return feature;
    }


    public BasicFeature decodeLegacy(String [] tokens) {


        // The first 3 columns are non optional for BED.  We will relax this
        // and only require 2.
        int tokenCount = tokens.length;

        if (tokenCount < 2) {
            return null;
        }

        String chr = genome == null ? tokens[0] : genome.getCanonicalChrName(tokens[0]);
        int start = Integer.parseInt(tokens[1]);

        int end = start + 1;
        if (tokenCount > 2) {
            end = Integer.parseInt(tokens[2]);
        }

        BasicFeature feature = new BasicFeature(chr, start, end);

        // The rest of the columns are optional.  Stop parsing upon encountering
        // a non-expected value

        // Strand
        if (tokenCount > 3) {
            String strandString = tokens[3].trim();
            char strand = (strandString.length() == 0) ? ' ' : strandString.charAt(0);

            if (strand == '-') {
                feature.setStrand(Strand.NEGATIVE);
            } else if (strand == '+') {
                feature.setStrand(Strand.POSITIVE);
            } else {
                feature.setStrand(Strand.NONE);
            }
        }

        // Name
        if (tokenCount > 4) {
            String name = tokens[4].replaceAll("\"", "");
            feature.setName(name);
            feature.setIdentifier(name);
        }


        return feature;
    }

}