/**
* Copyright 2010-14 Simon Andrews
*
* This file is part of BamQC.
*
* BamQC is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* BamQC is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with BamQC; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* Changelog:
* - Piero Dalle Pezze: Added protoFeatures, code optimisation, progress listener.
* - Simon Andrews: Class creation.
*/
package uk.ac.babraham.BamQC.AnnotationParsers;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.Enumeration;
import java.util.HashMap;
import uk.ac.babraham.BamQC.DataTypes.ProgressListener;
import uk.ac.babraham.BamQC.DataTypes.Genome.AnnotationSet;
import uk.ac.babraham.BamQC.DataTypes.Genome.Chromosome;
import uk.ac.babraham.BamQC.DataTypes.Genome.Feature;
import uk.ac.babraham.BamQC.DataTypes.Genome.Location;
/**
* The Class GTFAnnotationParser reads sequence features from GTF files
* @author Simon Andrews
* @author Piero Dalle Pezze
*/
public class GTFAnnotationParser extends AnnotationParser {

    /** Creates a new GTF parser. */
    public GTFAnnotationParser () {
        super();
    }

    /**
     * GTF annotation always comes from a file on disk.
     * @return true, always
     */
    @Override
    public boolean requiresFile() {
        return true;
    }

    /**
     * No-op for this parser: annotation is loaded through
     * {@link #parseAnnotation(AnnotationSet, File)} instead.
     * @param baseLocation ignored
     */
    @Override
    public void parseGenome (File baseLocation) throws Exception {}

    /**
     * @return the display name of this parser
     */
    @Override
    public String name() {
        return "GTF Parser";
    }

    /**
     * Parses a GTF annotation file and adds the features it describes to the
     * supplied annotation set.
     *
     * Genes are added immediately. Transcripts are collected first so their
     * exons and start/stop codons can be attached, and are added once the whole
     * file has been read. Every other feature type is aggregated into a single
     * ProtoFeature per (type, biotype) pair to keep the object count down on
     * very large files. Progress and completion are reported to the registered
     * listeners.
     *
     * @param annotationSet the set that receives the parsed features
     * @param file the GTF file to read
     * @throws Exception if the file is malformed (fewer than 9 columns, or a
     *         child feature references a transcript that has not been seen)
     */
    @Override
    public void parseAnnotation(AnnotationSet annotationSet, File file) throws Exception {

        // Tell the listeners that loading has started.
        Enumeration<ProgressListener> e = listeners.elements();
        while (e.hasMoreElements()) {
            e.nextElement().progressUpdated("Loading annotation file "+file.getName(), 0, 1);
        }

        annotationSet.setFile(file);

        // Transcripts keyed by transcript_id, waiting for their exons/codons.
        HashMap<String, Transcript> groupedFeatures = new HashMap<String, Transcript>();
        // One aggregated location per (feature type, biotype) pair for everything else.
        HashMap<String, ProtoFeature> protoFeatures = new HashMap<String, ProtoFeature>();

        BufferedReader br = null;
        long totalBytes = file.length();
        long bytesRead = 0;
        int previousPercent = 0;

        try {
            br = new BufferedReader(new FileReader(file));
            String line;
            BiotypeMapping bm = BiotypeMapping.getInstance();

            while ((line = br.readLine()) != null) {

                // readLine() strips the terminator, so add one byte back to keep
                // the progress estimate close to the real file position (still a
                // slight underestimate on CRLF files, which is harmless).
                bytesRead += line.length() + 1;
                int percent = (int)(bytesRead * 100 / totalBytes);
                if (percent > 100) percent = 100;
                if (previousPercent < percent && percent % 5 == 0) {
                    // Update the listeners
                    e = listeners.elements();
                    while (e.hasMoreElements()) {
                        e.nextElement().progressUpdated("Parsing annotation file " + file.getName() + " (" + percent + "%)", percent, 100);
                    }
                    previousPercent = percent;
                }

                if (line.trim().length() == 0) continue; // Ignore blank lines
                if (line.startsWith("#")) continue;      // Skip comments

                /*
                 * The GTF fields are:
                 * 1. seqname (which must be the chromosome here)
                 * 2. source (which is actually the biotype for Ensembl GTF files)
                 * 3. feature type
                 * 4. start pos
                 * 5. end pos
                 * 6. score (which we ignore)
                 * 7. strand
                 * 8. frame (which we ignore)
                 * 9. attributes (structured field allowing us to group features together)
                 */
                String [] sections = line.split("\t");

                // Check to see if we've got enough data to work with
                if (sections.length < 9) {
                    throw new Exception("Not enough data from line '"+line+"'");
                }

                // The biotype mapping may rename the biotype or flag the feature for removal.
                sections[1] = bm.getEffectiveBiotype(sections[1]);
                if (sections[1].equals("DELETE")) continue;

                int strand;
                int start;
                int end;
                try {
                    start = Integer.parseInt(sections[3]);
                    end = Integer.parseInt(sections[4]);

                    // Normalise so that start <= end.
                    if (end < start) {
                        int temp = start;
                        start = end;
                        end = temp;
                    }

                    // sections.length >= 9 is guaranteed above, so the strand
                    // column is always present.
                    if (sections[6].equals("+")) {
                        strand = Location.FORWARD;
                    }
                    else if (sections[6].equals("-")) {
                        strand = Location.REVERSE;
                    }
                    else {
                        strand = Location.UNKNOWN;
                    }
                }
                catch (NumberFormatException ex) {
                    // Unparseable coordinates: skip this line rather than abort the file.
                    continue;
                }

                Chromosome c = annotationSet.chromosomeFactory().getChromosome(sections[0]);
                String featureType = sections[2];

                // Now see what we're doing. The only primary features we care
                // about are genes and transcripts.
                if (featureType.equals("gene")) {
                    Feature feature = new Feature(featureType, sections[1], c);
                    feature.setLocation(new Location(start, end, strand));
                    annotationSet.addFeature(feature);
                }
                else if (featureType.equals("transcript")) {
                    // Held back in groupedFeatures until its exons and
                    // start/stop codons have been collected.
                    Feature feature = new Feature(featureType, sections[1], c);
                    String transcriptID = getTranscriptIDFromAttributes(sections[8]);
                    groupedFeatures.put(transcriptID, new Transcript(feature));
                }
                else if (featureType.equals("exon")) {
                    // Attach this exon to its transcript as a sub-location.
                    Transcript transcript = lookupTranscript(groupedFeatures, sections[8], featureType);
                    transcript.addSublocation(new Location(start, end, strand));
                }
                else if (featureType.equals("stop_codon")) {
                    // Which end of the codon marks the stop depends on the strand.
                    Transcript transcript = lookupTranscript(groupedFeatures, sections[8], featureType);
                    if (strand == Location.FORWARD) {
                        transcript.addStopCodon(start);
                    }
                    else {
                        transcript.addStopCodon(end);
                    }
                }
                else if (featureType.equals("start_codon")) {
                    // Which end of the codon marks the start depends on the strand.
                    Transcript transcript = lookupTranscript(groupedFeatures, sections[8], featureType);
                    if (strand == Location.FORWARD) {
                        transcript.addStartCodon(end);
                    }
                    else {
                        transcript.addStartCodon(start);
                    }
                }
                else if (featureType.equals("UTR")) {
                    // Nothing to do: UTRs can be derived from the transcript and
                    // codon positions.
                }
                else {
                    // Anything else is treated as a single-span feature. Adding
                    // every occurrence as its own Feature object proved too
                    // expensive on large files (excessive object counts slow the
                    // parse and can exhaust the heap), so instead we keep one
                    // ProtoFeature per (type, biotype) pair and extend its bounds
                    // as we go -- the same result as the SplitLocation algorithm,
                    // computed incrementally instead of collecting and sorting
                    // every location.
                    String key = featureType + "_" + sections[1];
                    ProtoFeature protoFeature = protoFeatures.get(key);
                    if (protoFeature != null) {
                        protoFeature.update(start, end, strand);
                    }
                    else {
                        Feature feature = new Feature(featureType, sections[1], c);
                        protoFeatures.put(key, new ProtoFeature(feature, start, end, strand));
                    }
                }
            }

            // All lines read: the grouped transcripts and aggregated features
            // are complete, so move them into the annotation set.
            for (Transcript t : groupedFeatures.values()) {
                annotationSet.addFeature(t.getFeature());
            }
            for (ProtoFeature pf : protoFeatures.values()) {
                annotationSet.addFeature(pf.getFeature());
            }
        }
        finally {
            if (br != null) {
                br.close();
                // Tell the listeners we're done.
                e = listeners.elements();
                while (e.hasMoreElements()) {
                    e.nextElement().progressComplete("Parsing annotation file " + file.getName() + " (100%)\n" +
                            "Processed features: "+groupedFeatures.size() + "\n" +
                            "Parsed annotation file " + file.getName(), null);
                }
            }
        }
    }

    /**
     * Looks up the transcript that a child feature (exon, start_codon or
     * stop_codon) belongs to.
     *
     * @param groupedFeatures the transcripts seen so far, keyed by transcript_id
     * @param attributes the attributes (9th) column of the child feature's line
     * @param featureType the child feature's type, used in the error message
     * @return the matching transcript
     * @throws Exception if the attributes are malformed or no matching transcript exists
     */
    private Transcript lookupTranscript(HashMap<String, Transcript> groupedFeatures, String attributes, String featureType) throws Exception {
        String transcriptID = getTranscriptIDFromAttributes(attributes);
        Transcript transcript = groupedFeatures.get(transcriptID);
        if (transcript == null) {
            // Not sure if this can ever be valid, but we'll treat it as an error for now.
            throw new Exception("Found "+featureType+" with transcript ID "+transcriptID+" but there was no matching transcript feature");
        }
        return transcript;
    }

    /**
     * Extracts the mandatory transcript_id value from a GTF attributes field.
     *
     * @param attribString the attributes (9th) column of a GTF line
     * @return the transcript id token exactly as written (surrounding quotes,
     *         if any, are retained)
     * @throws Exception if an attribute does not split into a key/value pair or
     *         no transcript_id attribute is present
     */
    private String getTranscriptIDFromAttributes (String attribString) throws Exception {
        String [] attributes = attribString.split(" *; *"); // Should check for escaped semicolons
        for (int a = 0; a < attributes.length; a++) {
            String [] keyValue = attributes[a].split(" +");
            if (keyValue.length != 2) {
                throw new Exception("Not 2 values from splitting "+attributes[a]);
            }
            if (keyValue[0].equals("transcript_id")) {
                return keyValue[1];
            }
        }
        throw new Exception("Couldn't find transcript_id from within "+attribString);
    }
}