/*
* The MIT License (MIT)
*
* Copyright (c) 2007-2015 Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.broad.igv.track;
import org.apache.log4j.Logger;
import org.broad.igv.feature.*;
import htsjdk.tribble.Feature;
import java.io.IOException;
import java.util.*;
/**
* User: jacob
* Date: 2012-Jun-22
*/
public class GFFFeatureSource implements org.broad.igv.track.FeatureSource {
/** log4j logger for this class. */
private static Logger log = Logger.getLogger(GFFFeatureSource.class);
/** Underlying source; every query and setting in this class is forwarded to it. */
private FeatureSource wrappedSource;
/**
 * Decides from the file name alone whether a path looks like a GFF-family file.
 * <p/>
 * The comparison is case-insensitive. A trailing ".gz" is removed first, then a trailing
 * ".txt" (each at most once, in that order), and the remainder must end with one of
 * "gff3", "gvf", "gff" or "gtf". No leading dot is required for the final suffix.
 *
 * @param path file path or name to inspect; must not be null
 * @return true if the path ends with a recognised GFF-family suffix
 */
public static boolean isGFF(String path) {
    String name = path.toLowerCase();

    // Peel off the optional wrapper extensions, compression first.
    for (String wrapper : new String[]{".gz", ".txt"}) {
        if (name.endsWith(wrapper)) {
            name = name.substring(0, name.length() - wrapper.length());
        }
    }

    for (String suffix : new String[]{"gff3", "gvf", "gff", "gtf"}) {
        if (name.endsWith(suffix)) {
            return true;
        }
    }
    return false;
}
/**
 * Creates a GFF-aware view over another feature source.
 *
 * @param wrappedSource source supplying the raw features; stored as given, without a null check
 * @throws IOException declared in the signature; nothing in this constructor body throws it
 */
public GFFFeatureSource(FeatureSource wrappedSource) throws IOException {
this.wrappedSource = wrappedSource;
}
/**
 * Fetches the raw features for the interval from the wrapped source, runs them through a
 * fresh {@link GFFCombiner}, and returns an iterator over the combined result.
 *
 * @param chr   chromosome name, passed through to the wrapped source
 * @param start interval start, passed through to the wrapped source
 * @param end   interval end, passed through to the wrapped source
 * @return iterator over the combined features
 * @throws IOException if the wrapped source fails to read
 */
@Override
public Iterator<Feature> getFeatures(String chr, int start, int end) throws IOException {
    GFFCombiner combiner = new GFFCombiner();
    combiner.addFeatures(wrappedSource.getFeatures(chr, start, end));
    List<Feature> combined = combiner.combineFeatures();
    return new WrappedIterator(combined.iterator());
}
/**
 * Returns coverage scores for the interval by delegating directly to the wrapped source;
 * no GFF-specific processing is applied.
 */
@Override
public List<LocusScore> getCoverageScores(String chr, int start, int end, int zoom) {
return wrappedSource.getCoverageScores(chr, start, end, zoom);
}
/**
 * Returns the feature window size reported by the wrapped source.
 */
@Override
public int getFeatureWindowSize() {
return wrappedSource.getFeatureWindowSize();
}
/**
 * Forwards the feature window size to the wrapped source; nothing is stored in this class.
 */
@Override
public void setFeatureWindowSize(int size) {
wrappedSource.setFeatureWindowSize(size);
}
/**
* The GFF spec is available at http://www.sequenceontology.org/gff3.shtml
* <p/>
* GFF Combiner is needed because IGV represents a transcript (e.g. an mRNA) as a single feature with,
* optionally, a collection of child exons. The feature can have a "thick" start and end, corresponding
* to coding start and end. This representation comes from a literal translation of certain UCSC formats.
* The same feature in a GFF file, on the other hand, is represented as a graph of sub-features
* defined by parent-child relations. GFF3 formalizes the feature types and their relationships to some
* degree in the Sequence Ontology.
*/
public static class GFFCombiner {
// Output list: features used as-is, plus everything produced by combineFeatures().
List<Feature> igvFeatures;
// Candidate parent features, keyed by their identifier.
Map<String, GFFFeature> gffFeatures;
// mRNA parts whose type is in SequenceOntology.exonTypes.
List<BasicFeature> gffExons;
// CDS records grouped into one collection per parent id.
Map<String, GFFCdsCltn> gffCdss;
// mRNA parts whose type is in SequenceOntology.utrTypes.
List<BasicFeature> gffUtrs;
// Every mRNA part that has parent ids (exons, UTRs, CDSs and any other part type).
List<BasicFeature> gffMrnaParts;
public GFFCombiner() {
int numElements = 10000;
igvFeatures = new ArrayList<Feature>(numElements);
gffFeatures = new HashMap<String, GFFFeature>(numElements);
gffExons = new ArrayList<BasicFeature>(numElements);
gffCdss = new LinkedHashMap<String, GFFCdsCltn>(numElements);
gffUtrs = new ArrayList<BasicFeature>(numElements);
gffMrnaParts = new ArrayList<>(numElements);
}
/**
 * First pass: drains the iterator and files each feature via
 * {@link #addFeature(BasicFeature)}, so that parents are known before
 * the features are combined.
 *
 * @param rawIter raw features; each element is cast to BasicFeature, so any other
 *                Feature implementation causes a ClassCastException
 * @return this, for chaining
 */
public GFFCombiner addFeatures(Iterator<Feature> rawIter) {
    while (rawIter.hasNext()) {
        Feature raw = rawIter.next();
        addFeature((BasicFeature) raw);
    }
    return this;
}
/**
 * Files a single raw feature into the appropriate internal collection.
 * <ul>
 * <li>An mRNA part (type in SequenceOntology.mrnaParts) that has parent ids is recorded in
 * gffMrnaParts and, depending on its type, also as an exon, a UTR, or a CDS
 * (one CDS collection per parent id).</li>
 * <li>Any other feature with an identifier becomes a potential parent in gffFeatures.</li>
 * <li>Anything else is passed through to the output unchanged.</li>
 * </ul>
 *
 * @param bf feature to classify
 */
public void addFeature(BasicFeature bf) {
    final String type = bf.getType();
    final String[] parents = bf.getParentIds();
    final String featureId = bf.getIdentifier();

    final boolean isTranscriptPart = SequenceOntology.mrnaParts.contains(type) && parents != null;
    if (!isTranscriptPart) {
        if (featureId == null) {
            // Just use this feature as is.
            igvFeatures.add(bf);
        } else {
            gffFeatures.put(featureId, new GFFFeature(bf));
        }
        return;
    }

    gffMrnaParts.add(bf);

    if (SequenceOntology.exonTypes.contains(type)) {
        gffExons.add(bf);
        return;
    }
    if (SequenceOntology.utrTypes.contains(type)) {
        gffUtrs.add(bf);
        return;
    }
    if (SequenceOntology.cdsTypes.contains(type)) {
        // A CDS record is registered once under each of its parents.
        for (String parentId : parents) {
            GFFCdsCltn collection = gffCdss.get(parentId);
            if (collection == null) {
                collection = new GFFCdsCltn(parentId);
                gffCdss.put(parentId, collection);
            }
            collection.addPart(bf);
        }
    }
}
/**
 * Second pass: assembles the features collected by addFeature into transcript-style
 * features. Exons are attached first, then UTRs, then CDS records; afterwards the
 * attributes of every mRNA part are merged into its parent, exons are numbered, and the
 * result is sorted.
 *
 * @return the combiner's own output list (not a copy), sorted by FeatureUtils.sortFeatureList
 */
public List<Feature> combineFeatures() {
    attachExons();
    attachUtrs();
    overlayCdss();
    mergePartAttributes();

    igvFeatures.addAll(gffFeatures.values());
    numberExons();

    FeatureUtils.sortFeatureList(igvFeatures);
    return igvFeatures;
}

/** Adds each exon record to every one of its parents, creating missing parents on the fly. */
private void attachExons() {
    for (BasicFeature exonRecord : gffExons) {
        for (String parentId : exonRecord.getParentIds()) {
            GFFFeature transcript = findOrCreateParent(parentId, exonRecord);
            Exon exon = new Exon(exonRecord);
            exon.setNonCoding(!SequenceOntology.isCoding(exonRecord.getType()));
            transcript.addExon(exon);
        }
    }
}

/** Applies each UTR record to every one of its parents, creating missing parents on the fly. */
private void attachUtrs() {
    for (BasicFeature utrRecord : gffUtrs) {
        for (String parentId : utrRecord.getParentIds()) {
            findOrCreateParent(parentId, utrRecord).addUTRorCDS(utrRecord);
        }
    }
}

/**
 * Overlays the CDS records on their parents. There are 2 conventions in use for describing
 * the coding section of mRNAs: (1) all cds records for the same isoform get the same id, so
 * different ids imply different isoforms and each needs its own parent object; (2) each CDS
 * has a unique id, and all cds records with the same parent id belong to the same isoform.
 */
private void overlayCdss() {
    for (GFFCdsCltn cltn : gffCdss.values()) {
        String parentId = cltn.getParentId();
        GFFFeature transcript = gffFeatures.get(parentId);
        if (transcript == null) {
            // Create a "dummy" transcript spanning the orphaned cds records.
            transcript = new GFFFeature(cltn.chr, cltn.start, cltn.end, cltn.strand);
            registerParent(parentId, transcript);
        }

        if (cltn.isUniqueIds()) {
            // Convention (2): everything belongs to the one parent.
            for (BasicFeature part : cltn.getParts()) {
                transcript.addUTRorCDS(part);
            }
            continue;
        }

        // Convention (1): the first id group reuses the parent, every further group gets a copy.
        boolean reuseParent = true;
        for (List<BasicFeature> sameIdParts : cltn.getPartsById().values()) {
            BasicFeature isoform;
            if (reuseParent) {
                isoform = transcript;
                reuseParent = false;
            } else {
                isoform = copyForCDS(transcript);
                igvFeatures.add(isoform);
            }
            for (BasicFeature part : sameIdParts) {
                isoform.addUTRorCDS(part);
            }
        }
    }
}

/** Merges the attributes (column 9) of every mRNA part into each of its parents. */
private void mergePartAttributes() {
    for (BasicFeature part : gffMrnaParts) {
        for (String parentId : part.getParentIds()) {
            GFFFeature transcript = gffFeatures.get(parentId);
            if (transcript != null) {
                transcript.mergeAttributes(part);
            } else {
                // No parent was found or created for this part; use the feature directly.
                igvFeatures.add(part);
            }
        }
    }
}

/** Sorts the exons of every output feature and numbers them 1..n in transcription order. */
private void numberExons() {
    for (Feature feature : igvFeatures) {
        BasicFeature bf = (BasicFeature) feature;
        if (!bf.hasExons()) {
            continue;
        }
        bf.sortExons();
        List<Exon> exons = bf.getExons();
        // On the negative strand the last exon in list order is exon 1.
        boolean reverse = bf.getStrand() == Strand.NEGATIVE;
        int number = reverse ? exons.size() : 1;
        int step = reverse ? -1 : 1;
        for (Exon exon : exons) {
            exon.setNumber(number);
            number += step;
        }
    }
}

/** Returns the registered parent for the id, or registers a new one modelled on the child. */
private GFFFeature findOrCreateParent(String parentId, BasicFeature child) {
    GFFFeature parent = gffFeatures.get(parentId);
    if (parent == null) {
        parent = createParent(child);
        registerParent(parentId, parent);
    }
    return parent;
}

/** Names a newly created parent after its id and stores it in gffFeatures. */
private void registerParent(String parentId, GFFFeature parent) {
    parent.setIdentifier(parentId);
    parent.setName(parentId);
    gffFeatures.put(parentId, parent);
}
/**
 * Builds a stand-in parent for a child whose parent id is not in gffFeatures.
 * The new feature takes the child's chromosome, start, end and strand; the caller
 * sets its identifier and name. Despite the parameter name, this is also called
 * with UTR records.
 *
 * @param gffExon child feature whose coordinates seed the new parent
 * @return a new GFFFeature with the child's chr, start, end and strand
 */
private GFFFeature createParent(BasicFeature gffExon) {
return new GFFFeature(gffExon.getChr(), gffExon.getStart(), gffExon.getEnd(), gffExon.getStrand());
}
/**
 * Makes a copy of a transcript to serve as an additional isoform for a further CDS id.
 * Position, strand, name, color, identifier, URL and type are carried over. Each exon is
 * copied and the copy is marked non-coding before it is added.
 *
 * @param bf transcript to copy; its exon list is iterated directly
 * @return a new BasicFeature carrying copies of the source's exons
 */
private static BasicFeature copyForCDS(BasicFeature bf) {
    final BasicFeature duplicate =
            new BasicFeature(bf.getChr(), bf.getStart(), bf.getEnd(), bf.getStrand());

    duplicate.setName(bf.getName());
    duplicate.setColor(bf.getColor());
    duplicate.setIdentifier(bf.getIdentifier());
    duplicate.setURL(bf.getURL());
    duplicate.setType(bf.getType());

    final List<Exon> sourceExons = bf.getExons();
    for (Exon sourceExon : sourceExons) {
        Exon exonCopy = new Exon(sourceExon);
        exonCopy.setNonCoding(true);
        duplicate.addExon(exonCopy);
    }
    return duplicate;
}
/**
 * Container to hold all the cds records for a given parent (usually an mRNA).
 * <p/>
 * Besides the records themselves it tracks the overall extent (smallest start, largest end)
 * plus the chromosome and strand of the most recently added record, so that a stand-in
 * parent can be built when the real parent is missing.
 */
public static class GFFCdsCltn {
    String parentId;
    List<BasicFeature> cdsParts;
    String chr;
    // Extent of all parts added so far; the sentinels mean "no part added yet".
    int start = Integer.MAX_VALUE;
    int end = Integer.MIN_VALUE;
    Strand strand;

    /**
     * @param parentId id of the parent that all records in this collection refer to
     */
    public GFFCdsCltn(String parentId) {
        this.parentId = parentId;
        cdsParts = new ArrayList<BasicFeature>(5);
    }

    /**
     * Adds a cds record, widening the tracked extent and taking chr and strand from it.
     *
     * @param bf cds record belonging to this collection's parent
     */
    public void addPart(BasicFeature bf) {
        cdsParts.add(bf);
        this.chr = bf.getChr();
        this.start = Math.min(this.start, bf.getStart());
        this.end = Math.max(this.end, bf.getEnd());
        this.strand = bf.getStrand();
    }

    /**
     * @return id of the parent shared by all records in this collection
     */
    public String getParentId() {
        return parentId;
    }

    /**
     * @return the live internal list of cds records, in insertion order
     */
    public List<BasicFeature> getParts() {
        return cdsParts;
    }

    /**
     * Groups the cds records by their own identifier. Records without an identifier are
     * grouped together under the null key.
     *
     * @return a new map from cds identifier to the records carrying it
     */
    public Map<String, List<BasicFeature>> getPartsById() {
        Map<String, List<BasicFeature>> map = new HashMap<String, List<BasicFeature>>();
        for (BasicFeature bf : cdsParts) {
            String id = bf.getIdentifier();
            List<BasicFeature> parts = map.get(id);
            if (parts == null) {
                parts = new ArrayList<BasicFeature>();
                map.put(id, parts);
            }
            parts.add(bf);
        }
        return map;
    }

    /**
     * Reports whether no identifier is shared by two or more cds records.
     * <p/>
     * Every record is checked against all the others, not only against the first one, so a
     * repeat anywhere in the list is detected. Records without an identifier are skipped:
     * they cannot indicate the "shared id per isoform" convention and must not cause a
     * NullPointerException.
     *
     * @return true if the collection is empty or all non-null identifiers are distinct
     */
    public boolean isUniqueIds() {
        Set<String> seenIds = new HashSet<String>();
        for (BasicFeature part : cdsParts) {
            String id = part.getIdentifier();
            // Set.add returns false when the id was already present.
            if (id != null && !seenIds.add(id)) {
                return false;
            }
        }
        return true;
    }
}
}
}