/** * Copyright 2010-14 Simon Andrews * * This file is part of BamQC. * * BamQC is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * * BamQC is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with BamQC; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* * Changelog: * - Piero Dalle Pezze: Added protoFeatures, code optimisation, progress listener. * - Simon Andrews: Class creation. */ package uk.ac.babraham.BamQC.AnnotationParsers; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.util.Enumeration; import java.util.HashMap; import uk.ac.babraham.BamQC.DataTypes.ProgressListener; import uk.ac.babraham.BamQC.DataTypes.Genome.AnnotationSet; import uk.ac.babraham.BamQC.DataTypes.Genome.Chromosome; import uk.ac.babraham.BamQC.DataTypes.Genome.Feature; import uk.ac.babraham.BamQC.DataTypes.Genome.Location; /** * The Class GTFAnnotationParser reads sequence features from GTF files * @author Simon Andrews * @author Piero Dalle Pezze */ public class GTFAnnotationParser extends AnnotationParser { public GTFAnnotationParser () { super(); } /* (non-Javadoc) * @see uk.ac.babraham.BamQC.AnnotationParsers.AnnotationParser#requiresFile() */ @Override public boolean requiresFile() { return true; } /* * (non-Javadoc) * @see uk.ac.babraham.BamQC.AnnotationParsers.AnnotationParser#parseGenome(java.io.File) */ @Override public void parseGenome (File baseLocation) throws Exception {} /* (non-Javadoc) * @see uk.ac.babraham.BamQC.AnnotationParsers.AnnotationParser#name() */ @Override public String name() { return "GTF Parser"; } /* * (non-Javadoc) * @see uk.ac.babraham.BamQC.AnnotationParsers.AnnotationParser#parseAnnotation(uk.ac.babraham.BamQC.DataTypes.Genome.AnnotationSet, java.io.File) */ @Override public void parseAnnotation(AnnotationSet annotationSet, File file) throws Exception { // Update the listeners Enumeration<ProgressListener> e = listeners.elements(); while (e.hasMoreElements()) { e.nextElement().progressUpdated("Loading annotation file "+file.getName(), 0, 1); } annotationSet.setFile(file); HashMap<String, Transcript> groupedFeatures = new HashMap<String, Transcript>(); // This will contain all the other features (the else case) HashMap<String, ProtoFeature> protoFeatures = new HashMap<String, ProtoFeature>(); BufferedReader br = null; long totalBytes = file.length(); long bytesRead = 0; int previousPercent = 0; try { br = new BufferedReader(new FileReader(file)); String line; BiotypeMapping bm = BiotypeMapping.getInstance(); while ((line = br.readLine())!= null) { bytesRead += line.length(); int percent = (int)(bytesRead * 100 / totalBytes); if (previousPercent < percent && percent%5 == 0){ // Update the listeners e = listeners.elements(); while (e.hasMoreElements()) { e.nextElement().progressUpdated("Parsing annotation file " + file.getName() + " (" + percent + "%)", percent, 100); } previousPercent = percent; } if (line.trim().length() == 0) continue; //Ignore blank lines if (line.startsWith("#")) continue; //Skip comments String [] sections = line.split("\t"); /* * The GFFv3 file fileds are: * 1. name (which must be the chromosome here) * 2. source (which is actually the biotype for Ensembl GTF files) * 3. feature type * 4. start pos * 5. end pos * 6. score (which we ignore) * 7. strand * 8. frame (which we ignore) * 9. attributes (structured field allowing us to group features together) * */ // Check to see if we've got enough data to work with if (sections.length < 9) { throw new Exception("Not enough data from line '"+line+"'"); } // Check if we need to modify the biotype or maybe even discard the feature sections[1] = bm.getEffectiveBiotype(sections[1]); if (sections[1].equals("DELETE")) continue; int strand; int start; int end; try { start = Integer.parseInt(sections[3]); end = Integer.parseInt(sections[4]); // End must always be later than start if (end < start) { int temp = start; start = end; end = temp; } if (sections.length >= 7) { if (sections[6].equals("+")) { strand = Location.FORWARD; } else if (sections[6].equals("-")) { strand = Location.REVERSE; } else { strand = Location.UNKNOWN; } } else { strand = Location.UNKNOWN; } } catch (NumberFormatException ex) { // progressWarningReceived(new BamQCException("Location "+sections[3]+"-"+sections[4]+" was not an integer")); continue; } Chromosome c = annotationSet.chromosomeFactory().getChromosome(sections[0]); // Now see what we're doing. The only primary features we care about are genes and transcripts // If we've got one of these we just make up a new feature and get out. if (sections[2].equals("gene")) { Feature feature = new Feature(sections[2],sections[1],c); feature.setLocation(new Location(start,end,strand)); annotationSet.addFeature(feature); } else if (sections[2].equals("transcript")) { // We do the same but we add it to the grouped features set waiting to have // some exons and maybe a start/stop codon Feature feature = new Feature(sections[2],sections[1],c); Transcript transcript = new Transcript(feature); // We need to get the transcript id. String transcriptID = getTranscriptIDFromAttributes(sections[8]); groupedFeatures.put(transcriptID, transcript); } else if (sections[2].equals("exon")) { // We need to find the transcript to which this exon belongs and then add this // location as a sub-location for that transcript. String transcriptID = getTranscriptIDFromAttributes(sections[8]); if (! groupedFeatures.containsKey(transcriptID)) { // Not sure if this can ever be valid, but we'll treat it as an error for now. throw new Exception("Found exon with transcript ID "+transcriptID+" but there was no matching transcript feature"); } groupedFeatures.get(transcriptID).addSublocation(new Location(start, end, strand)); } else if (sections[2].equals("stop_codon")) { String transcriptID = getTranscriptIDFromAttributes(sections[8]); if (! groupedFeatures.containsKey(transcriptID)) { // Not sure if this can ever be valid, but we'll treat it as an error for now. throw new Exception("Found stop_codon with transcript ID "+transcriptID+" but there was no matching transcript feature"); } if (strand == Location.FORWARD) { groupedFeatures.get(transcriptID).addStopCodon(start); } else { groupedFeatures.get(transcriptID).addStopCodon(end); } } else if (sections[2].equals("start_codon")) { String transcriptID = getTranscriptIDFromAttributes(sections[8]); if (! groupedFeatures.containsKey(transcriptID)) { // Not sure if this can ever be valid, but we'll treat it as an error for now. throw new Exception("Found start_codon with transcript ID "+transcriptID+" but there was no matching transcript feature"); } if (strand == Location.FORWARD) { groupedFeatures.get(transcriptID).addStartCodon(end); } else { groupedFeatures.get(transcriptID).addStartCodon(start); } } else if (sections[2].equals("UTR")) { // I don't think we need to do anything with these. We can probably // figure them out from the transcript and codon positions. } else { // We assume that anything else we don't understand is a single span feature // class so we just enter it directly. // THIS CODE HERE CAN BE DETRIMENTAL FOR COMPUTATION // The creation of the annotation set can fail if the file is too large. // There are just too many objects which can cause a GC crash // This also causes a delay in the feature collection. // and increase the analysis when the sam/bam file is parsed. // Feature feature = new Feature(sections[2],sections[1],c); // feature.setLocation(new Location(start,end,strand)); // annotationSet.addFeature(feature); // Instead of adding all these features separately or using sublocation mechanism // implemented in FeatureGroup, only one location is saved and kept updated. We do something similar // to the SplitLocation algorithm, but immediately instead of saving all the locations, sorting them, // and then extract the values from the smaller and the larger. String str = sections[2]+"_"+sections[1]; if(protoFeatures.containsKey(str)) { protoFeatures.get(str).update(start, end, strand); } else { Feature feature = new Feature(sections[2],sections[1],c); ProtoFeature protoFeature = new ProtoFeature(feature, start, end, strand); protoFeatures.put(str, protoFeature); } } } for(Transcript t : groupedFeatures.values()) { annotationSet.addFeature(t.getFeature()); } for(ProtoFeature pf : protoFeatures.values()) { annotationSet.addFeature(pf.getFeature()); } } catch (Exception ex) { throw ex; } finally { if(br != null) { br.close(); // Update the listeners e = listeners.elements(); while (e.hasMoreElements()) { e.nextElement().progressComplete("Parsing annotation file " + file.getName() + " (100%)\n" + "Processed features: "+groupedFeatures.size() + "\n" + "Parsed annotation file " + file.getName(), null); } } } } private String getTranscriptIDFromAttributes (String attribString) throws Exception { String [] attributes = attribString.split(" *; *"); // Should check for escaped colons for (int a=0;a<attributes.length;a++) { String [] keyValue = attributes[a].split(" +"); if (keyValue.length !=2) { throw new Exception("Not 2 values from splitting "+attributes[a]); } if (keyValue[0].equals("transcript_id")) { return new String(keyValue[1]); } } throw new Exception("Coudn't find transcript_id from within "+attribString); } }