/**
 * Copyright 2010-14 Simon Andrews
 *
 * This file is part of BamQC.
 *
 * BamQC is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * BamQC is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with BamQC; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * Changelog:
 * - Piero Dalle Pezze: Added protoFeatures, code optimisation, progress listener.
 * - Simon Andrews: Class creation.
 */

package uk.ac.babraham.BamQC.AnnotationParsers;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;

import uk.ac.babraham.BamQC.DataTypes.ProgressListener;
import uk.ac.babraham.BamQC.DataTypes.Genome.AnnotationSet;
import uk.ac.babraham.BamQC.DataTypes.Genome.Chromosome;
import uk.ac.babraham.BamQC.DataTypes.Genome.Feature;
import uk.ac.babraham.BamQC.DataTypes.Genome.Location;

/**
 * The class GFF3AnnotationParser reads sequence features from GFFv3 files.
 * @author Simon Andrews
 * @author Piero Dalle Pezze
 */
public class GFF3AnnotationParser extends AnnotationParser {

	public GFF3AnnotationParser() {
		super();
	}

	/* (non-Javadoc)
	 * @see uk.ac.babraham.BamQC.AnnotationParsers.AnnotationParser#requiresFile()
	 */
	@Override
	public boolean requiresFile() {
		return true;
	}

	/* (non-Javadoc)
	 * @see uk.ac.babraham.BamQC.AnnotationParsers.AnnotationParser#parseGenome(java.io.File)
	 */
	@Override
	public void parseGenome(File baseLocation) throws Exception {}

	/* (non-Javadoc)
	 * @see uk.ac.babraham.BamQC.AnnotationParsers.AnnotationParser#name()
	 */
	@Override
	public String name() {
		return "GFF Parser";
	}

	/* (non-Javadoc)
	 * @see uk.ac.babraham.BamQC.AnnotationParsers.AnnotationParser#parseAnnotation(uk.ac.babraham.BamQC.DataTypes.Genome.AnnotationSet, java.io.File)
	 */
	@Override
	public void parseAnnotation(AnnotationSet annotationSet, File file) throws Exception {

		// Update the listeners
		Enumeration<ProgressListener> e = listeners.elements();
		while (e.hasMoreElements()) {
			e.nextElement().progressUpdated("Loading annotation file " + file.getName(), 0, 1);
		}

		annotationSet.setFile(file);

		HashMap<String, FeatureGroup> groupedFeatures = new HashMap<String, FeatureGroup>();
		// This will contain all the other features (the else case)
		HashMap<String, ProtoFeature> protoFeatures = new HashMap<String, ProtoFeature>();

		BufferedReader br = null;
		long totalBytes = file.length();
		long bytesRead = 0;
		int previousPercent = 0;

		try {
			br = new BufferedReader(new FileReader(file));
			String line;

			while ((line = br.readLine()) != null) {

				bytesRead += line.length();
				int percent = (int) (bytesRead * 100 / totalBytes);
				if (previousPercent < percent && percent % 5 == 0) {
					// Update the listeners
					e = listeners.elements();
					while (e.hasMoreElements()) {
						e.nextElement().progressUpdated("Parsing annotation file " + file.getName() + " (" + percent + "%)", percent, 100);
					}
					previousPercent = percent;
				}

				if (line.trim().length() == 0) continue; // Ignore blank lines
				if (line.startsWith("#")) continue;      // Skip comments

				String[] sections = line.split("\t");
				/*
				 * The GFFv3 file fields are:
				 *  1. name (which must be the chromosome here)
				 *  2. source (which we ignore)
				 *  3. feature type
				 *  4. start pos
				 *  5. end pos
				 *  6. score (which we ignore)
				 *  7. strand
				 *  8. frame (which we ignore)
				 *  9. attributes (structured field allowing us to group features together)
				 */

				// Check to see if we've got enough data to work with
				// if (sections.length < 7) {
				//     progressWarningReceived(new BamQCException("Not enough data from line '"+line+"'"));
				//     continue;
				// }

				int strand;
				int start;
				int end;

				try {
					start = Integer.parseInt(sections[3]);
					end = Integer.parseInt(sections[4]);

					// End must always be later than start
					if (end < start) {
						int temp = start;
						start = end;
						end = temp;
					}

					if (sections.length >= 7) {
						if (sections[6].equals("+")) {
							strand = Location.FORWARD;
						}
						else if (sections[6].equals("-")) {
							strand = Location.REVERSE;
						}
						else {
							strand = Location.UNKNOWN;
						}
					}
					else {
						strand = Location.UNKNOWN;
					}
				}
				catch (NumberFormatException ex) {
					// progressWarningReceived(new BamQCException("Location "+sections[3]+"-"+sections[4]+" was not an integer"));
					continue;
				}

				Chromosome c = annotationSet.chromosomeFactory().getChromosome(sections[0]);

				if (sections.length > 8 && sections[8].trim().length() > 0) {
					String[] attributes = sections[8].split(" *; *"); // Should check for escaped semicolons

					// Make up a data structure of the attributes we have
					HashMap<String, ArrayList<String>> keyValuePairs = new HashMap<String, ArrayList<String>>();

					for (int a = 0; a < attributes.length; a++) {
						String[] keyValue = attributes[a].split("=", 2); // Should check for escaped equals

						// See if we didn't get split
						if (keyValue.length == 1) {
							// This could be a GTF file, which uses quoted values in space delimited fields
							keyValue = attributes[a].split(" \"");
							if (keyValue.length == 2) {
								// We need to remove the quote from the end of the value
								keyValue[1] = keyValue[1].substring(0, keyValue[1].length() - 1);
								// System.out.println("Key='"+keyValue[0]+"' value='"+keyValue[1]+"'");
							}
						}

						if (keyValue.length == 2) {
							if (keyValuePairs.containsKey(keyValue[0])) {
								keyValuePairs.get(keyValue[0]).add(keyValue[1]);
							}
							else {
								ArrayList<String> newVector = new ArrayList<String>();
								newVector.add(keyValue[1]);
								keyValuePairs.put(keyValue[0], newVector);
							}
						}
						else {
							// progressWarningReceived(new BamQCException("No key value delimiter in "+attributes[a]));
						}
					}

					// We now need to figure out what we're going to do with this feature.
					// If it's a GFFv3 file and this feature is a subfeature of another
					// type of feature then we simply add it as a sublocation to the
					// existing feature. We only allow this for exon and CDS features,
					// since mRNA has gene as a parent and we don't want to lose that.

					if (keyValuePairs.containsKey("Parent") && !sections[2].equals("mRNA")) {
						// Features of a type get combined under their parent

						// We change exons to mRNA so we don't end up with spliced exon objects
						if (sections[2].equals("exon")) sections[2] = "mRNA";

						String[] parents = keyValuePairs.get("Parent").get(0).split(",");

						for (int p = 0; p < parents.length; p++) {
							// System.out.println("Adding feature "+sections[2]+" to GFFv3 parent "+parents[p]);
							if (!groupedFeatures.containsKey(sections[2] + "_" + parents[p])) {
								// Make a new feature to which we can add this
								Feature feature = new Feature(sections[2], c);
								groupedFeatures.put(sections[2] + "_" + parents[p], new FeatureGroup(feature));
							}
							groupedFeatures.get(sections[2] + "_" + parents[p]).addSublocation(new Location(start, end, strand));
						}
					}
					// This could be a GTF file. If so then we add the subfeature
					// to the appropriate parent feature.
					else if (keyValuePairs.containsKey("transcript_id")) {
						if (sections[2].equals("exon")) sections[2] = "mRNA";
						// System.out.println("Adding feature "+sections[2]+" to GTF parent "+keyValuePairs.get("transcript_id").get(0));
						if (!groupedFeatures.containsKey(sections[2] + "_" + keyValuePairs.get("transcript_id").get(0))) {
							Feature feature = new Feature(sections[2], c);
							groupedFeatures.put(sections[2] + "_" + keyValuePairs.get("transcript_id").get(0), new FeatureGroup(feature));
						}
						groupedFeatures.get(sections[2] + "_" + keyValuePairs.get("transcript_id").get(0)).addSublocation(new Location(start, end, strand));
					}
					else {
						// If we get here we're making a feature with attributes
						if (keyValuePairs.containsKey("ID")) {
							// This is a feature which may end up having subfeatures
							Feature feature = new Feature(sections[2], c);
							groupedFeatures.put(sections[2] + "_" + keyValuePairs.get("ID").get(0), new FeatureGroup(feature));
							groupedFeatures.get(sections[2] + "_" + keyValuePairs.get("ID").get(0)).addSublocation(new Location(start, end, strand));
						}
					}
				}
				else {
					// We assume that anything else we don't understand is a single-span
					// feature class, so we just enter it directly.

					// THIS CODE HERE CAN BE DETRIMENTAL FOR COMPUTATION
					// The creation of the annotation set can fail if the file is too large:
					// there are just too many features, which can cause a GC crash.
					// It also causes a delay in the feature collection and increases the
					// analysis time when the sam/bam file is parsed.
					// Feature feature = new Feature(sections[2],sections[1],c);
					// feature.setLocation(new Location(start,end,strand));
					// annotationSet.addFeature(feature);

					// Instead of adding all these features separately, or using the sublocation
					// mechanism implemented in FeatureGroup, only one location is saved and kept
					// updated. We do something similar to the SplitLocation algorithm, but
					// immediately, instead of saving all the locations, sorting them, and then
					// extracting the values from the smallest and the largest.
					String str = sections[2] + "_" + sections[1];
					if (protoFeatures.containsKey(str)) {
						protoFeatures.get(str).update(start, end, strand);
					}
					else {
						Feature feature = new Feature(sections[2], sections[1], c);
						ProtoFeature protoFeature = new ProtoFeature(feature, start, end, strand);
						protoFeatures.put(str, protoFeature);
					}
				}
			}

			// Now go through the grouped features adding them to the annotation set
			for (FeatureGroup fg : groupedFeatures.values()) {
				annotationSet.addFeature(fg.getFeature());
			}
			for (ProtoFeature pf : protoFeatures.values()) {
				annotationSet.addFeature(pf.getFeature());
			}
		}
		catch (Exception ex) {
			throw ex;
		}
		finally {
			if (br != null) {
				br.close();
				// Update the listeners
				e = listeners.elements();
				while (e.hasMoreElements()) {
					e.nextElement().progressComplete("Parsing annotation file " + file.getName() + " (100%)\n"
							+ "Processed features: " + groupedFeatures.size() + "\n"
							+ "Parsed annotation file " + file.getName(), null);
				}
			}
		}
	}

}
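
/*
 * Hypothetical usage sketch (not part of the BamQC sources): it shows how this
 * parser might be driven from calling code. It assumes that AnnotationSet has a
 * no-argument constructor and that the AnnotationParser base class exposes an
 * addProgressListener(ProgressListener) method for registering the listeners
 * notified above; check the actual BamQC API before relying on these names.
 *
 *   AnnotationSet annotationSet = new AnnotationSet();
 *   GFF3AnnotationParser parser = new GFF3AnnotationParser();
 *   parser.addProgressListener(myProgressListener); // myProgressListener: a hypothetical ProgressListener implementation
 *   parser.parseAnnotation(annotationSet, new File("annotation.gff3"));
 */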