package OperonEvolutionInHalos;
import genomeObjects.AnnotatedGenome;
import genomeObjects.ContextSet;
import genomeObjects.GenomicElement;
import genomeObjects.OrganismSet;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.Set;
import org.biojava3.core.sequence.Strand;
public class LoadData {
/*
* Static configuration (input locations) and shared state for this class.
* Every member is public and static; the collections below start out null and
* are assigned by the Import.../Determine... methods of this class.
*/
//Fields
//directory searched for one "<species name>.gff" annotation file per organism
public static String GFFDir = "/Users/phillipseitzer/Dropbox/GenomeSets/Sets/Haloarchaea";
//text file read line by line; each line is used as a species name
public static String SpeciesNamesFile = "/Users/phillipseitzer/Dropbox/GenomeSets/Process/Halo/SpeciesNames.txt";
//text file listing one translation .fasta path per line (read by ImportTranslationFileList)
public static String TranslationFile = "/Users/phillipseitzer/Documents/Halophiles_2012/EightyHalophiles/MarkerGenesAnalysis/TranslationFiles.txt";
//text file listing one genome .fasta path per line (read by ImportGenomeFileList)
public static String GenomeFile = "/Users/phillipseitzer/Documents/Halophiles_2012/EightyHalophiles/GeneMisannotations/GenomeFiles.txt";
//cluster ID counts
//species name -> sorted list of the distinct CDS cluster IDs in that organism (filled by DetermineClustersInOrgs)
public static LinkedHashMap<String, LinkedList<Integer>> ClusterIDsinOrgs;
//NOTE(review): within this file this map is only referenced from commented-out code; no active code here assigns it
public static LinkedHashMap<String, LinkedList<Integer>> OperonClusterIDsinOrgs;
public static LinkedHashMap<String, String> TranslationFiles; // OrgName,File
public static LinkedHashMap<String, String> GenomeFiles; //OrgName, File
//every distinct CDS cluster ID seen across all organisms (filled by DetermineClustersInOrgs)
public static LinkedList<Integer> AllClusters;
//cluster IDs that occurred more than once within a single organism; an ID is appended once per repeat, so the list may contain duplicates
public static LinkedList<Integer> NonSingleCopyClusters;
//largest CDS cluster ID encountered by DetermineClustersInOrgs
public static int MaxClusterNum = 0;
//number of sequence characters expected on each line of a genome FASTA file
public static int FastaLineLength = 70;
//gene order processing
//cluster IDs read from file by ImportClustersToInclude
public static LinkedList<Integer> Clusters2Include;
//Organism set; its getSpecies() map is used throughout this class as species name -> AnnotatedGenome
public static OrganismSet OS;
//------- Set up OS --------//
/**
 * Rebuilds {@link #OS} by importing one GFF file per species.
 *
 * Every non-blank line of {@link #SpeciesNamesFile} is taken as a species name;
 * the annotation file is expected at {@code GFFDir + "/" + name + ".gff"}.
 * "CDS" features are registered as include types, while "tRNA", "rRNA",
 * "mobile_element" and "IS_element" are registered as display-only types.
 *
 * Any exception is printed and stops the import; genomes loaded up to that
 * point remain in {@link #OS}. The species-name reader is always closed.
 */
public static void ImportGenomes(){
    //initialize organism set
    OS = new OrganismSet();

    //instructions on how to process various types
    LinkedList<String> IncludeTypes = new LinkedList<String>();
    IncludeTypes.add("CDS");
    LinkedList<String> FeatureOnlyTypes = new LinkedList<String>();
    FeatureOnlyTypes.add("tRNA");
    FeatureOnlyTypes.add("rRNA");
    FeatureOnlyTypes.add("mobile_element");
    FeatureOnlyTypes.add("IS_element");

    try {
        BufferedReader br = new BufferedReader(new FileReader(SpeciesNamesFile));
        try {
            String Line = null;
            while ((Line = br.readLine()) != null){
                //a blank line (e.g. trailing newline) is not a species; it would
                //otherwise resolve to "<GFFDir>/.gff"
                if (Line.trim().length() == 0){
                    continue;
                }

                //object from JCE library + with appropriate data included
                AnnotatedGenome AG = new AnnotatedGenome();
                String OrgFile = GFFDir + "/" + Line + ".gff";
                AG.setSpecies(Line);
                AG.setIncludeTypes(IncludeTypes);
                AG.setDisplayOnlyTypes(FeatureOnlyTypes);
                AG.importFromGFFFile(OrgFile);

                //add to list
                OS.getSpecies().put(Line, AG);
            }
        } finally {
            //release the file handle even when an import fails part-way
            br.close();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    //message
    System.out.println("Genomes loaded from .GFF files.");
}
/**
 * Asks every genome in {@link #OS} to compute a context set named "BasicOperons".
 *
 * @param OpD forwarded unchanged as the second argument of
 *            {@code ComputeContextSet}; presumably the maximum intergenic
 *            distance for two genes to be grouped (confirm against AnnotatedGenome)
 */
public static void BasicOperons(int OpD){
    //walk the species map by key and look each genome up in turn
    for (String SpeciesName : OS.getSpecies().keySet()){
        AnnotatedGenome Genome = OS.getSpecies().get(SpeciesName);
        Genome.ComputeContextSet("BasicOperons", OpD, true);
    }
}
// ------ Export ------//
/**
 * Writes the contents of a named context set, for every genome in {@link #OS},
 * to a tab-delimited text file (intended for use with JCE, for visualization).
 *
 * One line is written per genomic element:
 * {@code species <TAB> contig <TAB> start <TAB> stop <TAB> contextNumber}.
 * The context number is a running counter shared across all organisms and is
 * incremented once for each exported context.
 *
 * A genome that has no context set called {@code CSName} is reported on
 * standard error and skipped. Any other failure is printed and ends the export;
 * the output file is closed in every case.
 *
 * @param OperonCSFile path of the file to create or overwrite
 * @param CSName name of the context set to export (matched with {@code equals})
 * @param IncludeSingleGeneOperonInstances when false, contexts holding at most
 *        one element are left out
 */
public static void ExportOperonsAsContextSet(String OperonCSFile, String CSName, boolean IncludeSingleGeneOperonInstances){
    try {
        //open file stream
        BufferedWriter bw = new BufferedWriter(new FileWriter(OperonCSFile));
        try {
            //Initialize counter
            int ContextCounter = 0;

            //write for each organism in the set of organisms.
            for (String s : OS.getSpecies().keySet()){
                AnnotatedGenome AG = OS.getSpecies().get(s);

                //retrieve the context set
                ContextSet TheContextSet = null;
                for (ContextSet CS : AG.getGroupings()){
                    if (CS.getName().equals(CSName)){
                        TheContextSet = CS;
                        break;
                    }
                }

                //without this guard a missing set caused a NullPointerException
                //that aborted the export for all remaining organisms
                if (TheContextSet == null){
                    System.err.println("No context set named \"" + CSName
                            + "\" found for " + s + "; skipping.");
                    continue;
                }

                //retrieve mapping
                HashMap<Integer,LinkedList<GenomicElement>> Mapping =
                        TheContextSet.getContextMapping();

                //iterate through mapping + write to file
                for (LinkedList<GenomicElement> L : Mapping.values()){
                    if (!IncludeSingleGeneOperonInstances && L.size() <= 1){
                        continue;
                    }

                    //Increment the context counter
                    ContextCounter++;

                    //Write each line to file.
                    for (GenomicElement E : L){
                        bw.write(s + "\t"
                                + E.getContig() + "\t"
                                + E.getStart() + "\t"
                                + E.getStop() + "\t"
                                + ContextCounter + "\n");
                    }
                }
            }
        } finally {
            //close (and thereby flush) the stream on success and on failure
            bw.close();
        }

        //output statement
        System.out.println("Operons successfully exported!");
    } catch (Exception ex){
        ex.printStackTrace();
    }
}
//Export a gene order pair appropriate query set
//NOTE: not implemented - the body is empty, so calling this method does nothing
//and QuerySetFile is never read or written.
public static void ExportGeneOrderPairQuerySet(String QuerySetFile){
/*
* Placeholder: no export logic has been written yet.
*/
}
/**
 * Replaces {@link #Clusters2Include} with the cluster IDs listed in a text file.
 *
 * The file holds one integer per line; surrounding whitespace is ignored and
 * blank lines are skipped. A line that is not an integer, or an I/O failure,
 * is printed and ends the import, leaving the IDs read so far in the list.
 * The reader is always closed.
 *
 * @param ClusterFile path of the cluster list file
 */
public static void ImportClustersToInclude(String ClusterFile){
    Clusters2Include = new LinkedList<Integer>();
    try {
        BufferedReader br = new BufferedReader(new FileReader(ClusterFile));
        try {
            String Line = null;
            while ((Line = br.readLine()) != null){
                String Trimmed = Line.trim();
                //a blank (often trailing) line previously threw
                //NumberFormatException and dropped every later entry
                if (Trimmed.length() == 0){
                    continue;
                }
                Clusters2Include.add(Integer.parseInt(Trimmed));
            }
        } finally {
            br.close();
        }
    } catch (Exception ex){
        ex.printStackTrace();
    }
}
/**
 * Prints every distinct pair of cluster IDs that forms a "local duplication"
 * pattern in any genome of {@link #OS}.
 *
 * For each pair of neighbouring elements (E2, E3) that lie on the same strand
 * and contig with {@code E3.start - E2.stop <= OpD}:
 * <ul>
 * <li>case 1: the element before E2 is linked to E2 in the same way and shares
 *     E2's cluster ID - the pair (cluster of E1, cluster of E3) is recorded;</li>
 * <li>case 2: the element after E3 is linked to E3 in the same way and shares
 *     E3's cluster ID - the pair (cluster of E2, cluster of E4) is recorded.</li>
 * </ul>
 * Each pair is stored in ascending order, de-duplicated, sorted with
 * {@code OperonSet.SortListOfPairs} and printed as {@code "a ; b"}.
 *
 * Every adjacent pair is examined, including the first and the last one in a
 * genome; each case is evaluated whenever the extra neighbour it needs exists.
 *
 * @param OpD largest allowed gap (next start minus previous stop) between
 *            neighbouring elements
 */
public static void ShowLocalOperonDuplications(int OpD){
    LinkedList<LinkedList<Integer>> LocalDups = new LinkedList<LinkedList<Integer>>();

    //iterate through all.
    for (AnnotatedGenome AG : OS.getSpecies().values()){
        //snapshot into an array: repeated positional get() on a linked list
        //walks the list each time and makes the scan quadratic
        GenomicElement[] Elems = AG.getElements().toArray(new GenomicElement[0]);

        //i is the index of E2, the left member of the adjacent pair (E2,E3)
        for (int i = 0; i + 1 < Elems.length; i++){
            GenomicElement E2 = Elems[i];
            GenomicElement E3 = Elems[i+1];

            //check for same strand, contig, + distance - in op together
            if (!InSameOperon(E2, E3, OpD)){
                continue;
            }

            //case 1: E2 duplication (needs an upstream neighbour)
            if (i >= 1){
                GenomicElement E1 = Elems[i-1];
                if (InSameOperon(E1, E2, OpD) && SameCluster(E1, E2)){
                    AddClusterPair(LocalDups, E1.getClusterID(), E3.getClusterID());
                }
            }

            //case 2: E3 duplication (needs a downstream neighbour)
            if (i + 2 < Elems.length){
                GenomicElement E4 = Elems[i+2];
                if (InSameOperon(E3, E4, OpD) && SameCluster(E3, E4)){
                    AddClusterPair(LocalDups, E2.getClusterID(), E4.getClusterID());
                }
            }
        }
    }

    //sort the list
    Collections.sort(LocalDups, new OperonSet.SortListOfPairs());

    //print statements
    for (LinkedList<Integer> L : LocalDups){
        String str = L.get(0) + " ; " + L.get(1);
        System.out.println(str);
    }
}

//True when Left and Right share strand and contig and Right starts at most OpD after Left stops.
private static boolean InSameOperon(GenomicElement Left, GenomicElement Right, int OpD){
    return Left.getStrand().equals(Right.getStrand())
            && Left.getContig().equals(Right.getContig())
            && Right.getStart() - Left.getStop() <= OpD;
}

//True when both elements carry the same cluster ID (compared by value).
private static boolean SameCluster(GenomicElement A, GenomicElement B){
    int ClusterA = A.getClusterID();
    int ClusterB = B.getClusterID();
    return ClusterA == ClusterB;
}

//Stores the two cluster IDs as an ascending pair unless an equal pair is already present.
private static void AddClusterPair(LinkedList<LinkedList<Integer>> Pairs, int First, int Second){
    LinkedList<Integer> ClustList = new LinkedList<Integer>();
    ClustList.add(First);
    ClustList.add(Second);
    Collections.sort(ClustList);
    if (!Pairs.contains(ClustList)){
        Pairs.add(ClustList);
    }
}
// ------ Deprecated ------ //
/**
 * Old import path that parses the GFF files by hand instead of using the JCE
 * import routine.
 *
 * Rebuilds {@link #OS}: every line of {@link #SpeciesNamesFile} becomes a
 * species entry, then {@code GFFDir + "/" + species + ".gff"} is read and each
 * tab-separated line is turned into a {@code GenomicElement}:
 * column 0 = contig, 2 = type, 3 = start, 4 = stop, 6 = strand
 * ("1" or "+" means positive, anything else negative), 8 = annotation,
 * optional column 9 = cluster ID, optional column 10 = gene ID.
 *
 * Any exception (missing file, short or malformed line) is printed and stops
 * the import. Both readers are always closed.
 *
 * @deprecated kept for reference; {@link #ImportGenomes()} is the current import path.
 */
@Deprecated
public static void ImportGenomesOld(){
    OS = new OrganismSet();
    try {
        //First, build empty hash map
        BufferedReader br = new BufferedReader(new FileReader(SpeciesNamesFile));
        try {
            String Line = null;
            while ((Line = br.readLine()) != null){
                AnnotatedGenome AG = new AnnotatedGenome();
                OS.getSpecies().put(Line, AG);
            }
        } finally {
            br.close();
        }

        for (String s : OS.getSpecies().keySet()){
            //info
            String OrgFile = GFFDir + "/" + s + ".gff";
            LinkedList<GenomicElement> Elements = new LinkedList<GenomicElement>();

            //read file, build list
            BufferedReader br2 = new BufferedReader(new FileReader(OrgFile));
            try {
                String Line2 = null;
                while ((Line2 = br2.readLine()) != null){
                    //import each line of the .gff file
                    String ImportedLine[] = Line2.split("\t");

                    //create a new element
                    GenomicElement E = new GenomicElement();

                    //set appropriate fields of this genomic element with inputs achieved from the GFF file
                    E.setGenome(s);
                    E.setContig(ImportedLine[0]);
                    E.setType(ImportedLine[2]);
                    E.setStart(Integer.parseInt(ImportedLine[3]));
                    E.setStop(Integer.parseInt(ImportedLine[4]));
                    E.DetermineCenter();

                    //strand is either numeric (1 = positive) or symbolic ("+" = positive)
                    String StrandField = ImportedLine[6];
                    boolean Positive;
                    try {
                        Positive = (Integer.parseInt(StrandField) == 1);
                    } catch (NumberFormatException nfe) {
                        Positive = StrandField.contentEquals("+");
                    }
                    E.setStrand(Positive ? Strand.POSITIVE : Strand.NEGATIVE);

                    //set annotation
                    E.setAnnotation(ImportedLine[8]);

                    //add gene IDs + homology clusters, if available
                    if (ImportedLine.length > 9){
                        int ClustID = Integer.parseInt(ImportedLine[9]);
                        E.setClusterID(ClustID);
                        if (ImportedLine.length > 10){
                            E.setGeneID(ImportedLine[10]);
                        }
                    }

                    //add to list (every parsed line is kept; no duplicate check is made)
                    Elements.add(E);
                }
            } finally {
                //close file stream, also when a line fails to parse
                br2.close();
            }

            //update annotatedgenome
            OS.getSpecies().get(s).setElements(Elements);
        }
    } catch (Exception ex){
        ex.printStackTrace();
    }

    //output message
    System.out.println("All organisms loaded.");
}
/**
 * Scans the GFF file of every species and collects CDS cluster IDs without
 * building genome objects.
 *
 * Results (all reset at the start of the call except {@link #MaxClusterNum},
 * which only ever grows):
 * <ul>
 * <li>{@link #ClusterIDsinOrgs}: species name -> ascending list of the distinct
 *     cluster IDs found on its CDS lines;</li>
 * <li>{@link #AllClusters}: distinct cluster IDs over all species, in order of
 *     first appearance;</li>
 * <li>{@link #NonSingleCopyClusters}: one entry for every repeated occurrence
 *     of a cluster ID within a single species (so IDs may appear several times);</li>
 * <li>{@link #MaxClusterNum}: largest cluster ID seen.</li>
 * </ul>
 *
 * A CDS line is a tab-separated line whose column 2 equals "CDS"; its cluster
 * ID is read from column 9. Lines with fewer than ten columns are skipped.
 * Blank lines in the species file are ignored. Any exception is printed and
 * ends the scan; both readers are always closed.
 */
public static void DetermineClustersInOrgs(){
    //initialize output structures
    ClusterIDsinOrgs = new LinkedHashMap<String, LinkedList<Integer>>();
    AllClusters = new LinkedList<Integer>();
    NonSingleCopyClusters = new LinkedList<Integer>();

    //hash-based mirror of AllClusters: contains() on a linked list is a linear scan
    Set<Integer> SeenAnywhere = new HashSet<Integer>();

    try {
        //First, build empty hash map
        BufferedReader br = new BufferedReader(new FileReader(SpeciesNamesFile));
        try {
            String Line = null;
            while ((Line = br.readLine()) != null){
                if (Line.trim().length() == 0){
                    continue;
                }
                ClusterIDsinOrgs.put(Line, new LinkedList<Integer>());
            }
        } finally {
            br.close();
        }

        //next, import each file
        for (String s : ClusterIDsinOrgs.keySet()){
            //info
            String OrgFile = GFFDir + "/" + s + ".gff";
            LinkedList<Integer> Clusters = new LinkedList<Integer>();
            Set<Integer> SeenInOrg = new HashSet<Integer>();

            //read file, build list
            BufferedReader br2 = new BufferedReader(new FileReader(OrgFile));
            try {
                String Line2 = null;
                while ((Line2 = br2.readLine()) != null){
                    String[] L = Line2.split("\t");

                    //consider only CDS lines that actually carry a cluster column;
                    //short lines (headers, blanks) would otherwise abort the whole scan
                    if (L.length <= 9 || !L[2].equals("CDS")){
                        continue;
                    }

                    int ClusterNum = Integer.parseInt(L[9]);
                    if (ClusterNum > MaxClusterNum){
                        MaxClusterNum = ClusterNum;
                    }

                    //Set.add returns false when the value was already present
                    if (SeenInOrg.add(ClusterNum)){
                        Clusters.add(ClusterNum);
                    } else {
                        //repeat within this organism: note this
                        NonSingleCopyClusters.add(ClusterNum);
                    }
                    if (SeenAnywhere.add(ClusterNum)){
                        AllClusters.add(ClusterNum);
                    }
                }
            } finally {
                br2.close();
            }

            //sort
            Collections.sort(Clusters);

            //store (replaces the value of an existing key, so iteration stays valid)
            ClusterIDsinOrgs.put(s, Clusters);
        }
        System.out.println("Loaded Cluster ID Mappings.");
    } catch (Exception ex){
        ex.printStackTrace();
    }
}
/**
 * Replaces {@link #TranslationFiles} with a map built from {@link #TranslationFile}.
 *
 * Each non-blank line of that file is a path. The key is the last
 * '/'-separated path component, cut at the first literal ".fasta"; the value
 * is the whole line. A later line with the same key overwrites the earlier one.
 *
 * The ".fasta" marker is matched literally. (With a regular-expression split,
 * "." matches any character, so a name such as "H_fasta_v2.fasta" would be cut
 * at "H".)
 *
 * Any exception is printed and ends the import; the reader is always closed.
 */
public static void ImportTranslationFileList(){
    TranslationFiles = new LinkedHashMap<String,String>();
    try {
        BufferedReader br = new BufferedReader(new FileReader(TranslationFile));
        try {
            String Line = null;
            while((Line = br.readLine()) != null){
                if (Line.trim().length() == 0){
                    continue;
                }

                //a line made only of '/' characters has no file name component
                String[] Path = Line.split("/");
                if (Path.length == 0){
                    continue;
                }
                String FileName = Path[Path.length-1];

                //organism name = file name up to the first literal ".fasta"
                int ExtIdx = FileName.indexOf(".fasta");
                String Name = (ExtIdx >= 0) ? FileName.substring(0, ExtIdx) : FileName;

                TranslationFiles.put(Name, Line);
            }
        } finally {
            br.close();
        }
    } catch (Exception ex){
        ex.printStackTrace();
    }
}
/**
 * Returns the upper-cased sequence between two coordinates of one contig, read
 * from the organism's genome FASTA file ({@code GenomeFiles.get(Organism)}).
 *
 * Coordinates are treated as 1-based and inclusive on both ends. The contig is
 * the first FASTA record whose header line contains {@code Contig}; reading
 * stops at the requested stop coordinate or at the next header, whichever
 * comes first. Positions are counted from the actual length of each sequence
 * line, so the file does not need a fixed line width.
 *
 * NOTE(review): the header is matched with {@code String.contains}, so a
 * contig name that is a substring of another (e.g. "contig1" / "contig10")
 * can select the wrong record - confirm header format before relying on this.
 *
 * @param Organism key into {@link #GenomeFiles}
 * @param Contig text identifying the contig's FASTA header
 * @param StartCoord first position to return (1-based)
 * @param StopCoord last position to return (1-based, inclusive)
 * @return the requested bases; shorter than requested if the contig ends
 *         early, and empty if the contig is not found, the range is empty,
 *         or reading fails before any base was collected (the exception is printed)
 */
public static String GetNucleotides(String Organism, String Contig, int StartCoord, int StopCoord){
    //Initialize output
    StringBuilder nt = new StringBuilder();
    try {
        //Retrieve genome
        BufferedReader br = new BufferedReader(new FileReader(GenomeFiles.get(Organism)));
        try {
            String Line = null;
            boolean InContig = false;
            //number of bases of the target contig that precede the current line
            int Consumed = 0;

            while ((Line = br.readLine()) != null){
                if (Line.startsWith(">")){
                    //a new header after the target contig means the contig is over
                    if (InContig){
                        break;
                    }
                    InContig = Line.contains(Contig);
                    continue;
                }
                if (!InContig){
                    continue;
                }

                //1-based coordinates covered by this line
                int LineFirst = Consumed + 1;
                int LineLast = Consumed + Line.length();

                //copy the part of the line that overlaps [StartCoord, StopCoord]
                if (LineLast >= StartCoord && LineFirst <= StopCoord){
                    int From = Math.max(StartCoord, LineFirst) - LineFirst;
                    int To = Math.min(StopCoord, LineLast) - LineFirst + 1;
                    nt.append(Line.substring(From, To).toUpperCase());
                }

                Consumed = LineLast;
                if (Consumed >= StopCoord){
                    break;
                }
            }
        } finally {
            br.close();
        }
    } catch (Exception ex){
        ex.printStackTrace();
    }

    //return statement
    return nt.toString();
}
/**
 * Replaces {@link #GenomeFiles} with a map built from {@link #GenomeFile}.
 *
 * Each non-blank line of that file is a path. The key is the last
 * '/'-separated path component, cut at the first literal ".fasta"; the value
 * is the whole line. A later line with the same key overwrites the earlier one.
 *
 * The ".fasta" marker is matched literally. (With a regular-expression split,
 * "." matches any character, so a name such as "H_fasta_v2.fasta" would be cut
 * at "H".)
 *
 * Any exception is printed and ends the import; the reader is always closed.
 */
public static void ImportGenomeFileList(){
    GenomeFiles = new LinkedHashMap<String,String>();
    try {
        BufferedReader br = new BufferedReader(new FileReader(GenomeFile));
        try {
            String Line = null;
            while((Line = br.readLine()) != null){
                if (Line.trim().length() == 0){
                    continue;
                }

                //a line made only of '/' characters has no file name component
                String[] Path = Line.split("/");
                if (Path.length == 0){
                    continue;
                }
                String FileName = Path[Path.length-1];

                //organism name = file name up to the first literal ".fasta"
                int ExtIdx = FileName.indexOf(".fasta");
                String Name = (ExtIdx >= 0) ? FileName.substring(0, ExtIdx) : FileName;

                GenomeFiles.put(Name, Line);
            }
        } finally {
            //the reader was previously never closed
            br.close();
        }
    } catch (Exception ex){
        ex.printStackTrace();
    }
}
// //Retrieve a single protein sequence
// public static LinkedList<String> ProteinSequence(GenomicElement E){
//
// //Initialize output
// LinkedList<String> ProteinSequence = new LinkedList<String>();
//
// try {
//
// //Retrieve file name
// String TransFile = TranslationFiles.get(E.Genome);
//
// BufferedReader br = new BufferedReader(new FileReader(TransFile));
// String Line = null;
// boolean FoundTheProtein = false;
//
// while ((Line = br.readLine()) != null){
//
// //find protein, or read protein sequence.
// if (Line.startsWith(">")){
//
// //Recover components from line
// String[] GetContig = Line.split("\\{");
// String[] FinishContig = GetContig[1].split("\\}");
// String Contig = FinishContig[0];
//
// String[] GetCoords = Line.split("\\[");
// String[] FinishCoords = GetCoords[1].split("\\]");
// String[] GetIndCoords = FinishCoords[0].split(":");
// int StartPos = Integer.parseInt(GetIndCoords[0]);
// int StopPos = Integer.parseInt(GetIndCoords[1]);
//
// //check for match
// if (E.Contig.equals(Contig) &&
// E.Start == StartPos && E.Stop == StopPos){
// FoundTheProtein = true;
// String LineAndNewLine = Line + "\n";
// ProteinSequence.add(LineAndNewLine);
// }
//
// } else if (FoundTheProtein){
// String LineAndNewLine = Line + "\n";
// ProteinSequence.add(LineAndNewLine);
//
// //after the empty line, break out of loop.
// if (Line.equals("")){
// break;
// }
// }
//
// }
//
// //close file stream
// br.close();
//
// } catch (Exception ex){
// ex.printStackTrace();
// }
//
// //return statement
// return ProteinSequence;
// }
// //Create a list of sets that are filtered
// public static void CreateFilteredSets(int OpD){
//
// //initialize output structures
// ClusterIDsinOrgs = new LinkedHashMap<String, LinkedList<Integer>>();
// OperonClusterIDsinOrgs = new LinkedHashMap<String, LinkedList<Integer>>();
// AllClusters = new LinkedList<Integer>();
//
// //create filtered sets
// for (String s : OS.keySet()){
//
// //Initialize lists
// LinkedList<Integer> Clusters = new LinkedList<Integer>();
// LinkedList<Integer> OperonClusters = new LinkedList<Integer>();
// //check every element
// LinkedList<GenomicElement> Elements = OS.get(s);
//
// //check each for operon
// for (int i = 0; i < Elements.size(); i++){
//
// //reset designation - default = false
// boolean InAnOperon = false;
//
// //Element
// GenomicElement ECurrent = Elements.get(i);
//
// //only consider coding regions
// if (ECurrent.Type.equals("CDS")){
//
// //check previous, when applicable
// if (i > 0){
//
// //retrieve previous element
// GenomicElement EPrevious = Elements.get(i-1);
//
// //strandedness + distance match
// if ((ECurrent.IntStrand == 1 && EPrevious.IntStrand == 1) ||
// (ECurrent.IntStrand == -1 && EPrevious.IntStrand == -1)){
// if (ECurrent.Start - EPrevious.Stop <= OpD){
// InAnOperon = true;
// }
// }
//
// }
//
// //check following, when applicable
// if (i < Elements.size() - 1){
//
// //retrieve following element
// GenomicElement EFollowing = Elements.get(i+1);
//
// //strandedness + distance match
// if ((ECurrent.IntStrand == 1 && EFollowing.IntStrand == 1) ||
// (ECurrent.IntStrand == -1 && EFollowing.IntStrand == -1)){
// if (EFollowing.Start - ECurrent.Stop <= OpD){
// InAnOperon = true;
// }
// }
// }
//
// //if the gene is still to be regarded as in an operon, proceed
// if (InAnOperon){
// if (!OperonClusters.contains(ECurrent.ClusterID)){
// OperonClusters.add(ECurrent.ClusterID);
// }
// }
//
// //also note genes that may or may not be in an operon
// if (!Clusters.contains(ECurrent.ClusterID)){
// Clusters.add(ECurrent.ClusterID);
// }
//
// //keep track of every CDS cluster across the whole set
// if (!AllClusters.contains(ECurrent.ClusterID)){
// AllClusters.add(ECurrent.ClusterID);
// }
//
// }
//
// }
//
// //store this in hash map
// //sort
// Collections.sort(Clusters);
//
// //store
// ClusterIDsinOrgs.put(s, Clusters);
// OperonClusterIDsinOrgs.put(s, OperonClusters);
//
// }
//
// System.out.println("Determined organism-specific cluster IDs.");
//
// }
}