package OperonEvolutionInHalos;

import genomeObjects.AnnotatedGenome;
import genomeObjects.ContextSet;
import genomeObjects.GenomicElement;
import genomeObjects.OrganismSet;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;

import org.biojava3.core.sequence.Strand;

/**
 * Loads haloarchaeal genome annotations (.gff), computes intergenic-distance
 * "operon" context sets, and exports/queries cluster and sequence data.
 *
 * All state is held in public static fields and all methods are static; callers
 * are expected to invoke the Import* methods before the methods that consume
 * the loaded data (OS, TranslationFiles, GenomeFiles, ...).
 */
public class LoadData {

	// ------ Fields ------ //

	// Hard-coded input locations (machine-specific paths).
	public static String GFFDir = "/Users/phillipseitzer/Dropbox/GenomeSets/Sets/Haloarchaea";
	public static String SpeciesNamesFile = "/Users/phillipseitzer/Dropbox/GenomeSets/Process/Halo/SpeciesNames.txt";
	public static String TranslationFile = "/Users/phillipseitzer/Documents/Halophiles_2012/EightyHalophiles/MarkerGenesAnalysis/TranslationFiles.txt";
	public static String GenomeFile = "/Users/phillipseitzer/Documents/Halophiles_2012/EightyHalophiles/GeneMisannotations/GenomeFiles.txt";

	// Cluster ID counts.
	public static LinkedHashMap<String, LinkedList<Integer>> ClusterIDsinOrgs;
	public static LinkedHashMap<String, LinkedList<Integer>> OperonClusterIDsinOrgs;
	public static LinkedHashMap<String, String> TranslationFiles; // OrgName, File
	public static LinkedHashMap<String, String> GenomeFiles;      // OrgName, File
	public static LinkedList<Integer> AllClusters;
	public static LinkedList<Integer> NonSingleCopyClusters;
	public static int MaxClusterNum = 0;

	// Expected number of nucleotides per sequence line in the genome FASTA files.
	public static int FastaLineLength = 70;

	// Gene order processing.
	public static LinkedList<Integer> Clusters2Include;

	// Organism set (in this case, effectively a hash map of species name -> genome).
	public static OrganismSet OS;

	// ------- Set up OS --------//

	/**
	 * Imports all genomes listed in {@link #SpeciesNamesFile} using the JCE data
	 * structures. Each species name is mapped to a .gff file in {@link #GFFDir};
	 * CDS features are fully included, while tRNA/rRNA/mobile_element/IS_element
	 * features are loaded for display only. Populates {@link #OS}.
	 */
	public static void ImportGenomes(){

		// Initialize organism set.
		OS = new OrganismSet();

		// Instructions on how to process various feature types.
		LinkedList<String> IncludeTypes = new LinkedList<String>();
		IncludeTypes.add("CDS");

		LinkedList<String> FeatureOnlyTypes = new LinkedList<String>();
		FeatureOnlyTypes.add("tRNA");
		FeatureOnlyTypes.add("rRNA");
		FeatureOnlyTypes.add("mobile_element");
		FeatureOnlyTypes.add("IS_element");

		try {

			BufferedReader br = new BufferedReader(new FileReader(SpeciesNamesFile));
			String Line = null;
			while ((Line = br.readLine()) != null){

				// Object from JCE library, with appropriate data included.
				AnnotatedGenome AG = new AnnotatedGenome();
				String OrgFile = GFFDir + "/" + Line + ".gff";
				AG.setSpecies(Line);
				AG.setIncludeTypes(IncludeTypes);
				AG.setDisplayOnlyTypes(FeatureOnlyTypes);
				AG.importFromGFFFile(OrgFile);

				// Add to the set, keyed by species name.
				OS.getSpecies().put(Line, AG);
			}
			br.close();

		} catch (Exception e) {
			e.printStackTrace();
		}

		// Message.
		System.out.println("Genomes loaded from .GFF files.");
	}

	/**
	 * Computes the intergenic-distance context set named "BasicOperons" for every
	 * genome in {@link #OS}.
	 *
	 * @param OpD maximum intergenic distance (nt) for two genes to share an operon
	 */
	public static void BasicOperons(int OpD){

		// Iterate through all genomes.
		for (AnnotatedGenome AG : OS.getSpecies().values()){
			AG.ComputeContextSet("BasicOperons", OpD, true);
			//System.out.println("Basic operons computed for " + AG.getSpecies() + ".");
		}

		//System.out.println("All basic operons computed!");
	}

	// ------ Export ------//

	/**
	 * Exports operons as a context set file (tab-separated: species, contig,
	 * start, stop, context number) for use with JCE / visualization.
	 *
	 * @param OperonCSFile                    output file path
	 * @param CSName                          name of the context set to export
	 * @param IncludeSingleGeneOperonInstances if false, single-gene contexts are skipped
	 */
	public static void ExportOperonsAsContextSet(String OperonCSFile, String CSName, boolean IncludeSingleGeneOperonInstances){

		try {

			// Open file stream.
			BufferedWriter bw = new BufferedWriter(new FileWriter(OperonCSFile));

			// Initialize counter (unique ID across all organisms/contexts).
			int ContextCounter = 0;

			// Write for each organism in the set of organisms.
			for (String s : OS.getSpecies().keySet()){

				// Retrieve the context set.
				// NOTE(review): if no context set named CSName exists for this genome,
				// TheContextSet stays null and the getContextMapping() call below
				// throws NPE (caught and printed by the outer catch) — confirm intended.
				ContextSet TheContextSet = null;
				AnnotatedGenome AG = OS.getSpecies().get(s);
				for (ContextSet CS : AG.getGroupings()){
					if (CS.getName().equals(CSName)){
						TheContextSet = CS;
						break;
					}
				}

				// Retrieve mapping of context ID -> member genes.
				HashMap<Integer,LinkedList<GenomicElement>> Mapping = TheContextSet.getContextMapping();

				// Iterate through mapping and write to file.
				for (LinkedList<GenomicElement> L : Mapping.values()){

					if (IncludeSingleGeneOperonInstances || (!IncludeSingleGeneOperonInstances && L.size() > 1)){

						// Increment the context counter.
						ContextCounter++;

						// Write each line to file.
						for (GenomicElement E : L){
							String Line = s + "\t" + E.getContig() + "\t" + E.getStart() + "\t" + E.getStop() + "\t" + String.valueOf(ContextCounter) + "\n";
							bw.write(Line);
							bw.flush();
						}
					}
				}
			}

			// Close file stream.
			bw.close();

			// Output statement.
			System.out.println("Operons successfully exported!");

		} catch (Exception ex){
			ex.printStackTrace();
		}
	}

	/**
	 * Exports a gene-order-pair-appropriate query set.
	 * Not yet implemented (intentionally empty).
	 *
	 * @param QuerySetFile output file path (currently unused)
	 */
	public static void ExportGeneOrderPairQuerySet(String QuerySetFile){
		/*
		 * 
		 */
	}

	/**
	 * Imports a list of the cluster IDs under investigation (one integer per
	 * line) into {@link #Clusters2Include}.
	 *
	 * @param ClusterFile path to the cluster list file
	 */
	public static void ImportClustersToInclude(String ClusterFile){

		Clusters2Include = new LinkedList<Integer>();
		try {
			BufferedReader br = new BufferedReader(new FileReader(ClusterFile));
			String Line = null;
			while ((Line = br.readLine()) != null){
				int Cluster = Integer.parseInt(Line.trim());
				Clusters2Include.add(Cluster);
			}
			br.close();
		} catch (Exception ex){
			ex.printStackTrace();
		}
	}

	/**
	 * Prints all unique examples of local operon duplications: a window of four
	 * consecutive genes where the middle pair is in an operon (same strand/contig,
	 * gap <= OpD) and either flanking gene duplicates its operon neighbor's
	 * homology cluster. Requires {@link #OS} to be loaded.
	 *
	 * @param OpD maximum intergenic distance (nt) for operon co-membership
	 */
	public static void ShowLocalOperonDuplications(int OpD){

		LinkedList<LinkedList<Integer>> LocalDups = new LinkedList<LinkedList<Integer>>();

		// Iterate through all genomes.
		for (AnnotatedGenome AG : OS.getSpecies().values()){

			// Slide a 4-gene window [E1, E2, E3, E4] along the genome.
			for (int i = 1; i < AG.getElements().size()-2; i++){

				GenomicElement E1 = AG.getElements().get(i-1);
				GenomicElement E2 = AG.getElements().get(i);
				GenomicElement E3 = AG.getElements().get(i+1);
				GenomicElement E4 = AG.getElements().get(i+2);

				// Check for same strand, contig, and distance - E2/E3 in an operon together.
				if (E2.getStrand().equals(E3.getStrand()) && E2.getContig().equals(E3.getContig()) && E3.getStart()-E2.getStop() <= OpD){

					// Case 1: E2 is a duplication of E1 (same cluster, still in-operon).
					if (E1.getStrand().equals(E2.getStrand()) && E1.getContig().equals(E2.getContig()) && E2.getStart()-E1.getStop() <= OpD && E1.getClusterID() == E2.getClusterID()){
						LinkedList<Integer> ClustList = new LinkedList<Integer>();
						ClustList.add(E1.getClusterID());
						ClustList.add(E3.getClusterID());
						Collections.sort(ClustList);
						if (!LocalDups.contains(ClustList)){
							LocalDups.add(ClustList);
						}
					}

					// Case 2: E3 is a duplication of E4 (same cluster, still in-operon).
					if (E3.getStrand().equals(E4.getStrand()) && E3.getContig().equals(E4.getContig()) && E4.getStart()-E3.getStop() <= OpD && E3.getClusterID() == E4.getClusterID()){
						LinkedList<Integer> ClustList = new LinkedList<Integer>();
						ClustList.add(E2.getClusterID());
						ClustList.add(E4.getClusterID());
						Collections.sort(ClustList);
						if (!LocalDups.contains(ClustList)){
							LocalDups.add(ClustList);
						}
					}
				}
			}
		}

		// Sort the list of cluster-ID pairs.
		Collections.sort(LocalDups, new OperonSet.SortListOfPairs());

		// Print statements.
		for (LinkedList<Integer> L : LocalDups){
			String str = L.get(0) + " ; " + L.get(1);
			System.out.println(str);
		}
	}

	// ------ Deprecated ------ //

	/**
	 * Old import path that parses .gff files by hand instead of using the JCE
	 * importer. Populates {@link #OS} from {@link #SpeciesNamesFile}/{@link #GFFDir}.
	 *
	 * @deprecated superseded by {@link #ImportGenomes()}.
	 */
	@Deprecated
	public static void ImportGenomesOld(){

		OS = new OrganismSet();

		try {

			// First, build an empty hash map of species name -> empty genome.
			BufferedReader br = new BufferedReader(new FileReader(SpeciesNamesFile));
			String Line = null;
			while ((Line = br.readLine()) != null){
				AnnotatedGenome AG = new AnnotatedGenome();
				OS.getSpecies().put(Line, AG);
			}
			br.close();

			for (String s : OS.getSpecies().keySet()){

				// Info.
				String OrgFile = GFFDir + "/" + s + ".gff";
				LinkedList<GenomicElement> Elements = new LinkedList<GenomicElement>();

				// Read file, build list.
				BufferedReader br2 = new BufferedReader(new FileReader(OrgFile));
				String Line2 = null;
				while ((Line2 = br2.readLine()) != null){

					// Import each line of the .gff file.
					String ImportedLine[] = Line2.split("\t");

					// Create a new element.
					GenomicElement E = new GenomicElement();

					// Set appropriate fields of this genomic element from the GFF columns.
					E.setGenome(s);
					E.setContig(ImportedLine[0]);
					E.setType(ImportedLine[2]);
					E.setStart(Integer.parseInt(ImportedLine[3]));
					E.setStop(Integer.parseInt(ImportedLine[4]));
					E.DetermineCenter();

					// Strand column may be numeric (1/-1) or symbolic (+/-):
					// the parse failure falls through to the symbolic branch.
					try {
						if(Integer.parseInt(ImportedLine[6])==1){
							E.setStrand(Strand.POSITIVE);
						}else{
							E.setStrand(Strand.NEGATIVE);
						}
					} catch (Exception ex) {
						if (ImportedLine[6].contentEquals("+")){
							E.setStrand(Strand.POSITIVE);
						} else {
							E.setStrand(Strand.NEGATIVE);
						}
					}

					// Set annotation.
					E.setAnnotation(ImportedLine[8]);

					// Add gene IDs and homology clusters, if available (extra columns).
					if (ImportedLine.length > 9){
						int ClustID = Integer.parseInt(ImportedLine[9]);
						E.setClusterID(ClustID);
						//System.out.println("Set!");
						if (ImportedLine.length > 10){
							E.setGeneID(ImportedLine[10]);
						}
					}

					// Add to list.
					Elements.add(E);
				}

				// Close file stream.
				br2.close();

				// Update annotated genome.
				OS.getSpecies().get(s).setElements(Elements);
			}

		} catch (Exception ex){
			ex.printStackTrace();
		}

		// Output message.
		System.out.println("All organisms loaded.");
	}

	/**
	 * Determines homology clusters per organism directly from the .gff files
	 * (column 10 = cluster ID on CDS rows). Populates {@link #ClusterIDsinOrgs},
	 * {@link #AllClusters}, {@link #NonSingleCopyClusters}, and updates
	 * {@link #MaxClusterNum}.
	 */
	public static void DetermineClustersInOrgs(){

		// Initialize output structures.
		ClusterIDsinOrgs = new LinkedHashMap<String, LinkedList<Integer>>();
		AllClusters = new LinkedList<Integer>();
		NonSingleCopyClusters = new LinkedList<Integer>();

		try {

			// First, build empty hash map keyed by species name.
			BufferedReader br = new BufferedReader(new FileReader(SpeciesNamesFile));
			String Line = null;
			while ((Line = br.readLine()) != null){
				ClusterIDsinOrgs.put(Line, new LinkedList<Integer>());
			}
			br.close();

			// Next, import each file.
			for (String s : ClusterIDsinOrgs.keySet()){

				// Info.
				String OrgFile = GFFDir + "/" + s + ".gff";
				LinkedList<Integer> Clusters = new LinkedList<Integer>();

				// Read file, build list.
				BufferedReader br2 = new BufferedReader(new FileReader(OrgFile));
				String Line2 = null;
				while ((Line2 = br2.readLine()) != null){

					String[] L = Line2.split("\t");

					// Consider only CDS rows.
					if (L[2].equals("CDS")){

						int ClusterNum = Integer.parseInt(L[9]);
						if (ClusterNum > MaxClusterNum){
							MaxClusterNum = ClusterNum;
						}

						// If the cluster is not already there, add it; otherwise note
						// it as multi-copy. NOTE(review): NonSingleCopyClusters may
						// accumulate the same ID more than once — confirm acceptable.
						if (!Clusters.contains(ClusterNum)){
							Clusters.add(ClusterNum);
						} else {
							NonSingleCopyClusters.add(ClusterNum);
						}

						if (!AllClusters.contains(ClusterNum)){
							AllClusters.add(ClusterNum);
						}
					}
				}
				br2.close();

				// Sort.
				Collections.sort(Clusters);

				// Store.
				ClusterIDsinOrgs.put(s, Clusters);
				//System.out.println("Loaded " + s + ".");
			}
			System.out.println("Loaded Cluster ID Mappings.");

		} catch (Exception ex){
			ex.printStackTrace();
		}
	}

	/**
	 * Imports the protein translation file map: each line of
	 * {@link #TranslationFile} is a path whose basename (minus ".fasta") keys
	 * {@link #TranslationFiles}.
	 */
	public static void ImportTranslationFileList(){

		TranslationFiles = new LinkedHashMap<String,String>();
		try {
			BufferedReader br = new BufferedReader(new FileReader(TranslationFile));
			String Line = null;
			while((Line = br.readLine()) != null){
				String[] Path = Line.split("/");
				// FIX: escape the dot — split() takes a regex, and an unescaped "."
				// matches any character (e.g. "Xfasta" would also split).
				String[] Name = Path[Path.length-1].split("\\.fasta");
				TranslationFiles.put(Name[0],Line);
			}
			br.close();
		} catch (Exception ex){
			ex.printStackTrace();
		}
	}

	/**
	 * Retrieves a single stretch of nucleotides (uppercased) from the genome
	 * FASTA file of the given organism. Requires {@link #ImportGenomeFileList()}
	 * to have been called first. Assumes sequence lines of exactly
	 * {@link #FastaLineLength} characters and 1-based coordinates.
	 *
	 * @param Organism   organism key into {@link #GenomeFiles}
	 * @param Contig     contig name searched for as a substring of FASTA headers
	 * @param StartCoord 1-based start coordinate
	 * @param StopCoord  1-based stop coordinate
	 * @return the extracted nucleotide string, or "" on failure
	 */
	public static String GetNucleotides(String Organism, String Contig, int StartCoord, int StopCoord){

		// Initialize output.
		String nt = "";

		try{

			// Retrieve genome.
			BufferedReader br = new BufferedReader(new FileReader(GenomeFiles.get(Organism)));
			String Line = null;
			boolean FoundContig = false;
			boolean StartedSequence = false;
			boolean NewLine = true;
			int CoordCounter = 0;

			// NOTE(review): FoundContig is never reset on a subsequent ">" header,
			// so if StopCoord lies past the contig end the read continues into the
			// next contig. Also, the stop-line append takes substring(StopCoord-
			// CoordCounter-1, end) rather than a prefix — confirm the coordinate
			// convention against callers before altering.
			while ((Line = br.readLine()) != null){

				// Every new line - set switch.
				NewLine = true;

				// Find appropriate header.
				if (Line.startsWith(">")){
					if (Line.contains(Contig)){
						FoundContig = true;
					}
				} else if (FoundContig){

					// Check for starting a sequence.
					if (!StartedSequence){
						if (StartCoord - CoordCounter <= FastaLineLength){
							nt = Line.substring((StartCoord-CoordCounter-1), Line.length()).toUpperCase();
							StartedSequence = true;
							NewLine = false;
						}
					} else {
						NewLine = true;
					}

					// Once started, write until appropriate to stop.
					if (StartedSequence){
						if (StopCoord - CoordCounter <= FastaLineLength){
							nt = nt + Line.substring((StopCoord-CoordCounter-1),Line.length()).toUpperCase();
							break;
						} else if (NewLine){
							// Add whole line.
							nt = nt + Line.toUpperCase();
						}
					}

					// Increment counter, unless stop is nearby.
					CoordCounter = CoordCounter + FastaLineLength;
				}
			}

			// FIX: close the reader — it was previously leaked.
			br.close();

		} catch (Exception ex){
			ex.printStackTrace();
		}

		// Return statement.
		return nt;
	}

	/**
	 * Imports the genome sequence file map: each line of {@link #GenomeFile} is
	 * a path whose basename (minus ".fasta") keys {@link #GenomeFiles}.
	 */
	public static void ImportGenomeFileList(){

		GenomeFiles = new LinkedHashMap<String,String>();
		try {
			BufferedReader br = new BufferedReader(new FileReader(GenomeFile));
			String Line = null;
			while((Line = br.readLine()) != null){
				String[] Path = Line.split("/");
				// FIX: escape the dot — split() takes a regex (see ImportTranslationFileList).
				String[] Name = Path[Path.length-1].split("\\.fasta");
				GenomeFiles.put(Name[0],Line);
			}
			// FIX: close the reader — it was previously leaked.
			br.close();
		} catch (Exception ex){
			ex.printStackTrace();
		}
	}

	// NOTE(review): large commented-out implementations of ProteinSequence(GenomicElement)
	// and CreateFilteredSets(int) were removed here; recover them from version control
	// if they are ever needed again.
}