package OperonEvolutionInHalos;
import genomeObjects.AnnotatedGenome;
import genomeObjects.ContextSet;
import genomeObjects.GenomicElement;
import genomeObjects.OrganismSet;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import operonClustering.CustomDissimilarity;
//An object to contain the operons across a set of organisms.
public class OperonSet {
//Fields
//source data structure
public OrganismSet OS;
//Initial import data structures
public LinkedHashMap<String,HashMap<Integer,LinkedList<GenomicElement>>> OperonHash;
//Mapping of distances to phylogeny
public LinkedHashMap<LinkedList<String>,Double> PhyDistHash;
public Double MaxDist = 0.0;
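//NOTE: hard-coded absolute path - point this at the local copy of the PhyML distance file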
public static final String PhyDistHashFile = "/Users/phillipseitzer/Dropbox/OperonEvolutionInHalophiles/Phylogeny/halo.phy_phyml_sh.txt";
//Trajectories
public LinkedHashMap<Integer, OperonTrajectory> Trajectories;
public LinkedHashMap<LinkedList<Integer>, DoubleOperonTrajectory> DoubleTrajectories;
public int RoundingConstant = 10000;
//constructor for direct loading of OS context to new object
public OperonSet(OrganismSet oS, String CSName){
//note data
this.OS = oS;
//Initialize hash
OperonHash = new LinkedHashMap<String,HashMap<Integer,LinkedList<GenomicElement>>>();
//build hash
for (String s : OS.getSpecies().keySet()){
//retrieve genome
AnnotatedGenome AG = OS.getSpecies().get(s);
//retrieve map
LinkedList<ContextSet> GroupNames = AG.getGroupings();
//add data
for (ContextSet s1: GroupNames){
if (s1.getName().equals(CSName)){
OperonHash.put(s, s1.getContextMapping());
break;
}
}
}
//output message
System.out.println("Created set of basic operons");
}
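//A minimal pipeline sketch (hypothetical helper, not part of the original class):
//the phylogenetic distance mapping must be built before BuildOperonTrajectories,
//which reads PhyDistHash via DetermineMaxDist. Assumes oS is an already-populated
//OrganismSet and CSName names an existing context set.
public static OperonSet BuildBasicOperonSet(OrganismSet oS, String CSName){
OperonSet OpSet = new OperonSet(oS, CSName);
OpSet.BuildPhylogeneticDistanceMapping();
OpSet.BuildOperonTrajectories();
return OpSet;
}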
//dummy constructor
public OperonSet(){
}
// ====================== //
// ===== Methods ======== //
// ====================== //
// ===== Creation ======= //
//re-organize hash into list of trajectories
public void BuildOperonTrajectories(){
/*
* (1) Creates a mapping of cluster ID number to cross-species operon trajectories.
* (2) Simultaneously, basic statistics about each operon are noted.
*
* A cluster ID has to be found in at least 10 different organisms.
* A cluster may not have a total number of instances greater than 100.
* Cluster IDs are scanned over the range 272 to 5276 (the major set).
*/
//Initialize trajectory mapping + associated statistics
Trajectories = new LinkedHashMap<Integer, OperonTrajectory>();
//Counter variable
int TrajectoryCounter = 0;
//check all clusters
//for (int i = 272; i <= 5276; i++){ //the major set
for (int i = 5200; i <= 5300; i++){ //for testing
//Initialize this operon trajectory
OperonTrajectory OT = new OperonTrajectory();
//Initialize the map for this cluster ID
LinkedHashMap<String, LinkedList<LinkedList<GenomicElement>>> Trajectory
= new LinkedHashMap<String, LinkedList<LinkedList<GenomicElement>>>();
//Initialize a mapping to the operon data
LinkedHashMap<String, LinkedList<OperonData>> OpDataTraj
= new LinkedHashMap<String, LinkedList<OperonData>>();
//initialize organism counter
int OrgCounter = 0;
int TotalOpCounter = 0;
int NumNonSingle = 0;
LinkedList<Integer> Clusts = new LinkedList<Integer>();
LinkedList<Integer> Genes = new LinkedList<Integer>();
//check operons in each organism
for (String s : OperonHash.keySet()){
//recover operon mapping
HashMap<Integer, LinkedList<GenomicElement>> Operons = OperonHash.get(s);
//Initialize a list of operons to include
LinkedList<LinkedList<GenomicElement>> OrgList = new LinkedList<LinkedList<GenomicElement>>();
LinkedList<OperonData> OpDataList = new LinkedList<OperonData>();
//check each and every operon in the mapping
for (Integer x : Operons.keySet()){
//default: operon does not match
boolean OperonContainsCluster = false;
//recover genes in each operon
LinkedList<GenomicElement> L = Operons.get(x);
//check each gene for cluster ID
for (GenomicElement E : L){
if (E.getClusterID() == i){
OperonContainsCluster = true;
break;
}
}
//if the operon contains the cluster, add to list
if (OperonContainsCluster){
//increment counter
TotalOpCounter++;
//count "operons" that are more than a singleton gene
if (L.size() > 1){
NumNonSingle++;
}
//Initialize a new operon data object
OperonData OD = new OperonData();
//add all appropriate cluster ids to the list of cluster ids
for (GenomicElement E : L){
//exclude self
if (E.getClusterID() != i){
//list of clusters
if (!Clusts.contains(E.getClusterID())){
Clusts.add(E.getClusterID());
}
//all genes - so, include every instance
Genes.add(E.getClusterID());
}
//adjust operon data
//adjust operon start coordinate
if (E.getStart() < OD.Start){
OD.Start = E.getStart();
}
//adjust operon end coordinate
if (E.getStop() > OD.Stop){
OD.Stop = E.getStop();
}
//strand
OD.TheStrand = E.getStrand();
//contig
OD.Contig = E.getContig();
}
//add data to lists
OrgList.add(L);
OpDataList.add(OD);
}
}
//add these to the trajectory mapping + inc org counter
if (OrgList.size() != 0){
Trajectory.put(s, OrgList);
OpDataTraj.put(s, OpDataList);
OrgCounter++;
}
}
//System.out.println("Breakpoint!");
//Write the entire trajectory to the hash map, if appropriate.
if (OrgCounter >= 10){
// ==== Add data to trajectory ====== //
//(2) ratio of non-single gene 'operons'
Double RatioNonSingle = (double) NumNonSingle / (double) TotalOpCounter;
RatioNonSingle = (double) Math.round(RoundingConstant*RatioNonSingle)/RoundingConstant;
//(3) total number of additional protein families included with cluster across set
Double RatioNovel;
if (Genes.size() != 0){
RatioNovel = (double) Clusts.size() / (double) Genes.size();
} else{
//genes that always occur alone don't mean much in our discussion of operon evolution.
RatioNovel = -1.0;
OT.AlwaysASingleGene = true;
}
RatioNovel = (double) Math.round(RoundingConstant*RatioNovel)/RoundingConstant;
//associate data
OT.ClusterID = i;
OT.OrgsFeatured = OrgCounter;
OT.Operonicity = RatioNonSingle;
OT.Variety = RatioNovel;
OT.TrajectoryHash = Trajectory;
OT.amalgamate(); //puts all ops in same organisms together
OT.OperonHash = OpDataTraj;
OT.OtherClusters = Clusts; //all other non-seed types of genes featured in the trajectory
//Determine overall phylogenetic spread of protein family
LinkedList<String> ListOfOrganisms = new LinkedList<String>(OT.OperonHash.keySet());
OT.ClusterPhylogeneticRange = DetermineMaxDist(ListOfOrganisms);
//Segregate trajectory - intensive, not always necessary
//Important: determine if these should be amalgamated or not!
//OT.OperonGroups = SegregateTrajectory(OT);
OT.OperonGroups = this.SegregateTrajectoryNoAmalg(OT);
//compute evo rate - open to re-analysis
OT.computeEvoRate();
//classify detected single gene modification events
OT.predictSingleGeneOperonModifications();
//add to map + inc counter
Trajectories.put(i,OT);
TrajectoryCounter++;
}
//output message.
if (i%100 == 0){
System.out.println("Processed " + i +"/5276 operon trajectories.");
// System.out.println("Present in: " + Trajectories.get(i).OrgsFeatured);
// System.out.println("Non-singleton ratio: " + Trajectories.get(i).NonsingletonRatio);
// System.out.println("Novel cluster ratio: " + Trajectories.get(i).Novelty);
}
}
//final output message
System.out.println(TrajectoryCounter + " operon trajectories determined.");
}
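//Helper encapsulating the rounding convention used above: ratios are rounded
//to the nearest 1/RoundingConstant (four decimal places). Not called by the
//legacy code; provided as a sketch for reuse.
public double RoundRatio(double Ratio){
return (double) Math.round(RoundingConstant*Ratio)/RoundingConstant;
}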
//Check for disagreement with the phylogeny at the lowest level
public void AddLowestLevelPhyloDisagreement(OperonTrajectory OT, double PhyloMargin, boolean ignoreSingleGenes){
/*
* This algorithm checks the organismic sources of operons
* of identical topology, and identifies cases where a more
* distant relative exists that is not found in the group.
*
* In that case, the operon is said not to agree exactly with
* the phylogeny.
*
* If the "distance relative" is within a provided margin of
* error, this offender can be discarded.
*
*/
//default: this operon follows the phylogeny
boolean FollowsPhylogeny = true;
//only need to perform the comparison if alternative operon topologies are featured
if (OT.OperonGroups.size() > 1){
for (OperonCluster OC1 : OT.OperonGroups){
//if an operon cluster contains only one organism,
//no intra-cluster divergence distance to measure.
if (OC1.Operons.size() != 1){
//optionally ignore single gene topology
if (!ignoreSingleGenes || OC1.OperonSize > 1){
//find the smallest inter-cluster distance
double MinDist = Double.MAX_VALUE; //sentinel: larger than any real distance
//compare this cluster to every other cluster
for (OperonCluster OC2: OT.OperonGroups){
//don't compare same
if (!OC1.equals(OC2)){
//optionally, ignore single gene topology
if (!ignoreSingleGenes || OC2.OperonSize > 1){
//compare
double NewDist = DetermineMinDist(OC1.Organisms,OC2.Organisms);
//update minimum
if (NewDist < MinDist){
MinDist = NewDist;
}
}
}
}
//compare internal phylo distances to smallest external distance
if (OC1.MaxInternalDist - PhyloMargin > MinDist){
FollowsPhylogeny = false;
break;
}
}
}
}
}
//update operon trajectory field
OT.AgreesWithPhylogenyAtLowestLevel = FollowsPhylogeny;
}
//calculate cassette for each trajectory: measure of operonic dispersion across species
public OperonTrajectory BuildCODFromBasic(OperonTrajectory Basic_OT, CODParameters CP){
/*
* a Cassette is a grouping encompassing local dispersion of genes, across species.
*
* Specifically, genes that are ever predicted to be in an operon in any species are
* searched nearby to the operon in species where they are not found.
*
* Same strand may be required, or not.
*
* It is NOT AN OPERON PREDICTION APPROACH, but rather, a technique to examine local
* dispersion. Note that by setting the "local" parameter very high, this could effectively
* encompass the whole genome.
*/
//Initialize output
OperonTrajectory Cassette_OT = new OperonTrajectory();
//this analysis is only relevant to actual operons.
if (!Basic_OT.AlwaysASingleGene){
//Transfer Identification information
Cassette_OT.ClusterID = Basic_OT.ClusterID;
Cassette_OT.OtherClusters = Basic_OT.OtherClusters;
Cassette_OT.OrgsFeatured = Basic_OT.OrgsFeatured;
//Initialize (important) gene mapping
LinkedHashMap<String, LinkedList<LinkedList<GenomicElement>>> CassetteTrajectoryHash
= new LinkedHashMap<String, LinkedList<LinkedList<GenomicElement>>>();
//Find new genomic mapping
for (String s : Basic_OT.OperonHash.keySet()){
//Retrieve initial lists
LinkedList<OperonData> DataList = Basic_OT.OperonHash.get(s);
//Initialize output
LinkedList<LinkedList<GenomicElement>> OrgList = new LinkedList<LinkedList<GenomicElement>>();
//check each operon in the list
for (OperonData OD : DataList){
//Initialize a list of genes
LinkedList<GenomicElement> L = new LinkedList<GenomicElement>();
//a flag for starting writing
boolean StartedWriting = false;
//check against genes in original organism
AnnotatedGenome AG = OS.getSpecies().get(s);
//check all elements
for (GenomicElement E : AG.getElements()){
//match same contig, collect all potentially interesting genes
if (E.getContig().equals(OD.Contig)){
//does this gene belong to the seed cluster or an associated cluster?
boolean ClusterMatch = (E.getClusterID() == Cassette_OT.ClusterID ||
Cassette_OT.OtherClusters.contains(E.getClusterID()));
//approaching from the left: the first matching gene within range starts the scan
if (!StartedWriting){
if ((E.getCenter() + CP.RangeAroundOperon >= OD.Start) && ClusterMatch){
//First: flag is on
StartedWriting = true;
//record the gene, for later processing
L.add(E);
}
//passing through
} else {
//within range
if (E.getCenter() - CP.RangeAroundOperon <= OD.Stop){
if (ClusterMatch){
//record gene, for later processing
L.add(E);
}
} else {
//stop searching through genes - all candidates have been noted.
break;
}
}
}
}
//post-processing of list: optionally restrict to the operon's strand
if (CP.RequireSameStrand){
LinkedList<GenomicElement> StrandCorrected = new LinkedList<GenomicElement>();
for (GenomicElement E : L){
if (E.getStrand().equals(OD.TheStrand)){
StrandCorrected.add(E);
}
}
OrgList.add(StrandCorrected);
} else {
OrgList.add(L);
}
}
//add these to the trajectory mapping
if (OrgList.size() != 0){
CassetteTrajectoryHash.put(s, OrgList);
}
}
//add values
Cassette_OT.TrajectoryHash = CassetteTrajectoryHash;
} else {
Cassette_OT = Basic_OT;
}
//output message.
//System.out.println("Cassette trajectory determined for " + Cassette_OT.ClusterID + ".");
//return statement
return Cassette_OT;
}
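//Usage sketch (hypothetical parameter values; RangeAroundOperon and
//RequireSameStrand are the only CODParameters fields read above):
// CODParameters CP = new CODParameters();
// CP.RangeAroundOperon = 5000; //bp window to search on either side of an operon
// CP.RequireSameStrand = true;
// OperonTrajectory COD = BuildCODFromBasic(Trajectories.get(300), CP);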
// ---- PhyloDist Mapping ----- //
//Import phylogenetic mapping info
public void BuildPhylogeneticDistanceMapping(){
//Initialize hash map
PhyDistHash = new LinkedHashMap<LinkedList<String>,Double>();
//First, load in file
try {
//open file stream
BufferedReader br = new BufferedReader(new FileReader(PhyDistHashFile));
String Line = null;
//read through lines
while ((Line = br.readLine()) != null) {
//split by tabs
String[] L = Line.split("\t");
//check matches
boolean OneMatch = false;
boolean TwoMatch = false;
//org names
String Org1 = "";
String Org2 = "";
//check first entry (direct key lookup replaces the linear scan)
if (OS.getSpecies().containsKey(L[0])){
OneMatch = true;
Org1 = L[0];
}
//check second entry
if (OS.getSpecies().containsKey(L[1])){
TwoMatch = true;
Org2 = L[1];
}
if (OneMatch && TwoMatch){
//build key
LinkedList<String> OrgRelation = new LinkedList<String>();
OrgRelation.add(Org1);
OrgRelation.add(Org2);
Collections.sort(OrgRelation);
//build value
double Dist = Double.parseDouble(L[3]);
//update maximum distance
if (Dist > MaxDist){
MaxDist = Dist;
}
//write to hash
PhyDistHash.put(OrgRelation, Dist);
}
}
//print summary
System.out.println(PhyDistHash.size() + " phylogenetic distances mapped.");
//3160 = 79*80/2, lower triangle of dissimilarity matrix
//close file stream
br.close();
} catch (Exception ex){
ex.printStackTrace();
}
}
//Build a generic histogram
public PhyloHistData BuildGenericDistanceMapping(String FileName){
//Initialize output
PhyloHistData PHD = new PhyloHistData();
//Initialize hash map
LinkedHashMap<LinkedList<String>,Double> PhyDistHash = new LinkedHashMap<LinkedList<String>,Double>();
Double MaxDist = -1.0;
//First, load in file
try {
//open file stream
BufferedReader br = new BufferedReader(new FileReader(FileName));
String Line = null;
//read through lines
while ((Line = br.readLine()) != null) {
//split by tabs
String[] L = Line.split("\t");
//updated - don't try to filter out entries (3/17/2014)
String Org1 = L[0].trim();
String Org2 = L[1].trim();
//build key
LinkedList<String> OrgRelation = new LinkedList<String>();
OrgRelation.add(Org1);
OrgRelation.add(Org2);
Collections.sort(OrgRelation);
//build value
double Dist = Double.parseDouble(L[2]);
//update maximum distance
if (Dist > MaxDist){
MaxDist = Dist;
}
//write to hash
PhyDistHash.put(OrgRelation, Dist);
}
//print summary
System.out.println(PhyDistHash.size() + " phylogenetic distances mapped.");
//close file stream
br.close();
//record data in structure
PHD.MaxDist = MaxDist;
PHD.PhyDistHash = PhyDistHash;
} catch (Exception ex){
ex.printStackTrace();
}
//return data
return PHD;
}
//Create a histogram of distances
public void PhyloHist(String HistDataFile, int bins, Double MaxDist, LinkedHashMap<LinkedList<String>,Double> PhyDistHash){
//based on number of bins, determine ranges appropriate for bins.
Double separationUnit = MaxDist/(double) bins;
//counts
LinkedHashMap<LinkedList<Double>,Integer> Counts
= new LinkedHashMap<LinkedList<Double>, Integer>();
//Initialize counts
for (int i = 0; i < bins; i++){
//create range
LinkedList<Double> Range = new LinkedList<Double>();
Range.add((double) i * separationUnit);
Range.add((double)(i+1)*separationUnit);
//add to counts
Counts.put(Range, 0);
}
//Iterate through hash map, and store in appropriate bin.
for (Double d : PhyDistHash.values()){
//find appropriate range
for (LinkedList<Double> L : Counts.keySet()){
//when the appropriate range is found, update counts + break out of inner loop.
if (d >= L.get(0) && d <= L.get(1)){
int count = Counts.get(L);
count++;
Counts.put(L, count);
break;
}
}
}
//write to file
try {
//create file writer
BufferedWriter bw = new BufferedWriter(new FileWriter(HistDataFile));
//display data
int counter = 0;
for (LinkedList<Double> L : Counts.keySet()){
counter++;
int Val = Counts.get(L);
Double Avg = 0.5*(L.get(0)+L.get(1));
bw.write(String.valueOf(Avg) + "\t" + String.valueOf(Val) + "\n");
bw.flush();
//System.out.println("bin " + counter + ": "+ L.get(0) + "-" + L.get(1) +":" + Val);
}
// System.out.println("Bins: " + Counts.size());
// System.out.println("Max: " + MaxDist);
//close file stream
bw.close();
} catch (Exception ex){
ex.printStackTrace();
}
}
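//Convenience sketch tying the two methods above together: read a generic
//tab-delimited distance file and write its histogram. File names are
//caller-supplied; bins as in PhyloHist.
public void PhyloHistFromFile(String DistFile, String HistDataFile, int bins){
PhyloHistData PHD = BuildGenericDistanceMapping(DistFile);
PhyloHist(HistDataFile, bins, PHD.MaxDist, PHD.PhyDistHash);
}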
// ------- Data Creation -------- //
//Create a set of CODs
public LinkedHashMap<Integer, OperonTrajectory> CreateCODSet(CODParameters COD){
//Initialize output
LinkedHashMap<Integer, OperonTrajectory> CODset = new LinkedHashMap<Integer, OperonTrajectory>();
for (Integer x : Trajectories.keySet()){
OperonTrajectory Basic = Trajectories.get(x);
CODset.put(x, BuildCODFromBasic(Basic,COD));
if (x%100 ==0){
System.out.println("Computed " + x + "/" + Trajectories.size() + " CODs.");
}
}
//return
return CODset;
}
//Create a list of multistable operon topologies
public LinkedList<Integer> FindMultiStableOperonTopologies(BistableParams BP){
/*
* Algorithm:
* (1) Segment each operon trajectory into non-overlapping clusters of identical
* protein family content
* (2) Filter set by minimum number of operonic members in each group
* (3) Filter set by minimum size requirement for operons
* (4) For each operon, determine the range of phylogenetic variety
* (5) Filter set by maximum internal range of phylogenetic variety
* (6) Compute gene content variety between each group (Dice/Jaccard for non-seed)
* (7) Compute the minimal phylogenetic distance between two groups (compare all indiv. members)
* (8) Retain protein families that match content variety + min grp distance specifications
* (9) Num of operon groups that are retained from this process describes stability
*/
//Initialize output
LinkedList<Integer> MultiStableTrajectories = new LinkedList<Integer>();
for (OperonTrajectory OT : Trajectories.values()){
//only check actual operons
if (!OT.AlwaysASingleGene){
//generate report
OperonStabilityReport OSR = MultiStableOperonTopology(BP,OT);
if (OSR.MultiStable){
MultiStableTrajectories.add(OSR.ClusterID);
System.out.println("Multistable trajectory discovered: " + OT.ClusterID + ".");
}
}
}
//return output
return MultiStableTrajectories;
}
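//Usage sketch (hypothetical thresholds; the BistableParams fields shown are
//exactly those read in MultiStableOperonTopology below):
// BistableParams BP = new BistableParams();
// BP.MinGrpMemSize = 3; //min operons per topology group
// BP.MinOpSize = 2; //min genes per operon
// BP.MaxSameGrpPhyloDist = 0.1; //max phylo spread within a group
// BP.MinDiffGrpPhyloDist = 0.5; //min phylo distance between groups
// BP.MaxContentDiss = 0.2; //max Dice dissimilarity between groups
// LinkedList<Integer> MultiStable = FindMultiStableOperonTopologies(BP);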
//Single multistable trajectory
public OperonStabilityReport MultiStableOperonTopology(BistableParams BP, OperonTrajectory OT){
/*
* Algorithm:
* (1) Segment operon trajectory into non-overlapping clusters of identical
* protein family content
* (2) For each operon, determine the range of phylogenetic variety
* (3) Filter set by minimum number of operonic members in each group
* (4) Filter set by minimum size requirement for operons
* (5) Filter set by maximum internal range of phylogenetic variety
* (6) Compute gene content variety between each group (Dice/Jaccard for non-seed)
* (7) Compute the minimal phylogenetic distance between two groups (compare all indiv. members)
* (8) Retain protein families that match content variety + min grp distance specifications
* (9) Num of operon groups that are retained from this process describes stability
*/
//(1) + (2) Create operon groups
LinkedList<OperonCluster> OperonGroups = SegregateTrajectory(OT);
//(3) - (5) Filtering by operon group properties
LinkedList<OperonCluster> FilteredOperonGroups = new LinkedList<OperonCluster>();
for (OperonCluster OC : OperonGroups){
if (OC.Operons.size() >= BP.MinGrpMemSize &&
OC.OperonSize >= BP.MinOpSize &&
OC.MaxInternalDist <= BP.MaxSameGrpPhyloDist){
FilteredOperonGroups.add(OC);
}
}
//(6) - (8) Compare cluster groups + filter appropriately
//Initialize output
OperonStabilityReport OSR = new OperonStabilityReport(OT.ClusterID);
//Compare + build output report
for (OperonCluster OC1 : FilteredOperonGroups){
for (OperonCluster OC2 : FilteredOperonGroups){
if (!OC1.equals(OC2)){
//Build list
LinkedList<OperonCluster> OCL = new LinkedList<OperonCluster>();
OCL.add(OC1);
OCL.add(OC2);
Collections.sort(OCL, new OCSorter());
//System.out.println("Breakpoint!");
//evaluate distances
double PhyDist = DetermineMinDist(OCL.get(0).Organisms,OCL.get(1).Organisms);
double ContentDist = ContentDissimilarity(OCL.get(0), OCL.get(1));
//write if the filtering matches
if (PhyDist >= BP.MinDiffGrpPhyloDist &&
ContentDist <= BP.MaxContentDiss){
//add stability relationships + components
OSR.addOperonClusters(OCL);
}
}
}
}
//(9) Return output
return OSR;
}
//determine the maximum distance between elements in a list of organisms
public double DetermineMaxDist(LinkedList<String> Organisms){
//Initialize
double MaxDist = -1;
for (String s1 : Organisms){
for (String s2: Organisms){
if (!s1.equals(s2)){
//retrieve value
LinkedList<String> OrgList = new LinkedList<String>();
OrgList.add(s1);
OrgList.add(s2);
Collections.sort(OrgList);
double d = PhyDistHash.get(OrgList);
//compare + possibly adjust max
if (d > MaxDist){
MaxDist = d;
}
}
}
}
//return updated value
return MaxDist;
}
//determine the minimum distance between elements in two separate lists of organisms
public double DetermineMinDist(LinkedList<String> L1, LinkedList<String> L2){
//Initialize
double MinDist = Double.MAX_VALUE; //sentinel: larger than any real distance
for (String s1 : L1){
for (String s2: L2){
if (!s1.equals(s2)){
//retrieve value
LinkedList<String> OrgList = new LinkedList<String>();
OrgList.add(s1);
OrgList.add(s2);
Collections.sort(OrgList);
double d = PhyDistHash.get(OrgList);
//compare + possibly adjust max
if (d < MinDist){
MinDist = d;
}
}
}
}
//return updated value
return MinDist;
}
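//Convenience lookup for a single pairwise distance. The loops above assume
//every organism pair is present in PhyDistHash and would throw a
//NullPointerException on a missing key; this sketch returns -1.0 instead.
public double LookupPhyDist(String Org1, String Org2){
LinkedList<String> OrgList = new LinkedList<String>();
OrgList.add(Org1);
OrgList.add(Org2);
Collections.sort(OrgList);
Double d = PhyDistHash.get(OrgList);
return (d == null) ? -1.0 : d;
}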
//determine the content dissimilarity
public double ContentDissimilarity(OperonCluster OC1, OperonCluster OC2){
//at present: just evaluate by protein content using generalized Dice/Jaccard
CustomDissimilarity CD = new CustomDissimilarity();
double Dice = CD.GeneralizedDiceOrJaccard(OC1.ClustersFeatured, OC2.ClustersFeatured, true, "Dice");
//return dissimilarity
return Dice;
}
//segregate a single operon trajectory into multiple clusters
public LinkedList<OperonCluster> SegregateTrajectory(OperonTrajectory OT){
//initialize hash
LinkedHashMap<LinkedList<Integer>,OperonCluster> Segregation
= new LinkedHashMap<LinkedList<Integer>, OperonCluster>();
//initialize sorting number
int SortingNumber = 0;
//(1) Segregate operons into appropriate groups, using species-amalgamated operons
for (String s : OT.AmalgamatedOperons.keySet()){
LinkedList<GenomicElement> L = OT.AmalgamatedOperons.get(s);
// determine featured protein families
LinkedList<Integer> FamClust = new LinkedList<Integer>();
//an operon
for (GenomicElement E : L){
FamClust.add(E.getClusterID());
}
//build the key for the hash
Collections.sort(FamClust);
OperonCluster OC;
if (Segregation.get(FamClust) != null){
OC = Segregation.get(FamClust);
} else {
OC = new OperonCluster();
SortingNumber++;
}
//adjust contents + store in hash
OC.SeedCluster = OT.ClusterID;
OC.SortingNumber = SortingNumber;
OC.addOrg(s);
OC.Operons.add(L);
OC.addClustersFeatured(FamClust);
Segregation.put(FamClust, OC);
}
//(2) Calculate features of each operon cluster.
LinkedList<OperonCluster> OpGroups = new LinkedList<OperonCluster>();
//iterate through each, compute values, store in new list
for (OperonCluster OC : Segregation.values()){
OC.OperonSize = OC.ClustersFeatured.size();
OC.MaxInternalDist = DetermineMaxDist(OC.Organisms);
OpGroups.add(OC);
}
//return statement
return OpGroups;
}
//segregate a single operon trajectory into multiple clusters - no amalg
public LinkedList<OperonCluster> SegregateTrajectoryNoAmalg(OperonTrajectory OT){
//initialize hash
LinkedHashMap<LinkedList<Integer>,OperonCluster> Segregation
= new LinkedHashMap<LinkedList<Integer>, OperonCluster>();
//initialize sorting number
int SortingNumber = 0;
//(1) Segregate operons into appropriate groups
for (String s : OT.TrajectoryHash.keySet()){
for (LinkedList<GenomicElement> L : OT.TrajectoryHash.get(s)){
// determine featured protein families
LinkedList<Integer> FamClust = new LinkedList<Integer>();
//an operon
for (GenomicElement E : L){
FamClust.add(E.getClusterID());
}
//build the key for the hash
Collections.sort(FamClust);
OperonCluster OC;
if (Segregation.get(FamClust) != null){
OC = Segregation.get(FamClust);
} else {
OC = new OperonCluster();
SortingNumber++;
}
//adjust contents + store in hash
OC.SeedCluster = OT.ClusterID;
OC.SortingNumber = SortingNumber;
OC.addOrg(s);
OC.Operons.add(L);
OC.addClustersFeatured(FamClust);
Segregation.put(FamClust, OC);
}
}
//(2) Calculate features of each operon cluster.
LinkedList<OperonCluster> OpGroups = new LinkedList<OperonCluster>();
//iterate through each, compute values, store in new list
for (OperonCluster OC : Segregation.values()){
OC.OperonSize = OC.ClustersFeatured.size();
OC.MaxInternalDist = DetermineMaxDist(OC.Organisms);
OpGroups.add(OC);
}
//return statement
return OpGroups;
}
//amalgamate a set of clusters into a new set of clusters
public LinkedList<OperonCluster> GeneOrderSetAmalgamate(LinkedList<OperonCluster> UnfilteredSet){
//Initialize output
LinkedList<OperonCluster> FilteredSet = new LinkedList<OperonCluster>();
//Remove single sets
for (OperonCluster OC : UnfilteredSet){
if (OC.OperonSize > 0){
FilteredSet.add(OC);
}
}
//Iterate amalgamation protocol
//Initialize - try to amalgamate
boolean FinishedAmalgamation = false;
while (!FinishedAmalgamation){
//re-set place-holder set, and re-initialize filtered set
LinkedList<OperonCluster> PlaceHolder = FilteredSet;
FilteredSet = new LinkedList<OperonCluster>();
//check for 2-element overlaps
//check every operon cluster...
for (int i = 0; i < PlaceHolder.size(); i++){
//retrieve one
OperonCluster OC1 = PlaceHolder.get(i);
//Check number of amalgamations
int AmalgCounter = 0;
//... against every other operon cluster
for (int j = i+1; j < PlaceHolder.size(); j++){
//retrieve the other
OperonCluster OC2 = PlaceHolder.get(j);
//check for at least two common elements, if so, schedule for amalgamation
}
}
}
//return output
return FilteredSet;
}
//build 2-gene trajectories - for use with gene order set, etc
public void BuildDoubleTrajectories(){
//TODO: not yet implemented - see Single2DoubleTrajectory for the per-trajectory split
}
//split a single trajectory into a set of double trajectories
public LinkedList<DoubleOperonTrajectory> Single2DoubleTrajectory(OperonTrajectory OT){
//Initialize output
LinkedList<DoubleOperonTrajectory> SplitTrajectory = new LinkedList<DoubleOperonTrajectory>();
//create a hash map -> data for trajectories
LinkedHashMap<LinkedList<Integer>, LinkedHashMap<String,LinkedList<LinkedList<GenomicElement>>>> Mapping
= new LinkedHashMap<LinkedList<Integer>, LinkedHashMap<String,LinkedList<LinkedList<GenomicElement>>>>();
//Initialize empty hash maps
//Iterate through all species featured
for (String s : OT.TrajectoryHash.keySet()){
//all operons from species s
LinkedList<LinkedList<GenomicElement>> OpSet = OT.TrajectoryHash.get(s);
//every individual operon
for (LinkedList<GenomicElement> L : OpSet){
//keep track of clusters featured
LinkedList<Integer> ClustersFeatured = new LinkedList<Integer>();
//TODO: populate ClustersFeatured and Mapping with each 2-gene sub-trajectory;
//this splitting logic is not yet implemented, so an empty list is returned below.
}
}
//return output
return SplitTrajectory;
}
// ===== Export ======== //
//Create a list of operon trajectories appropriate for gene-order analysis
public LinkedList<LinkedList<GenomicElement>> GenerateGeneOrderAppropriateGeneSets(LinkedHashMap<Integer, OperonTrajectory> Trajectories){
/*
* Not all operon trajectories have the potential to exhibit an interesting change
* in gene order. Trajectories should be filtered and split so that all have at least two common
* elements - only if there are two elements can there be a relative change in gene order.
*
* This may mean that trajectories are divided up into multiple groups. That's okay -
* groups are stored based on lists of genes, so there is no danger of naming confusion.
*
* How to properly compare a set of component operons should be built into the JCE software:
* for example, comparisons of "before" and "after" gene order must account for strand
* commonality: a reversal in the order of the genes in an operon reflects a
* change in gene order iff the strand is the same in both operons.
*
* Algorithm:
* (1) For each operon trajectory, divide into non-overlapping clusters based on at least 2 common genes
* (2) Each non-overlapping cluster is a single context set group.
* (3) Store all groups in a master list, checking for overlaps.
*/
//Initialize output
LinkedList<LinkedList<GenomicElement>> NonOverlappingGeneGroups =
new LinkedList<LinkedList<GenomicElement>>();
//Iterate through each trajectory
for (Integer x : Trajectories.keySet()){
//Retrieve trajectory
OperonTrajectory OT = Trajectories.get(x);
//split all operons into groups
LinkedList<OperonCluster> IsolatedGroups = SegregateTrajectoryNoAmalg(OT);
//build amalgamated groups from these
LinkedList<OperonCluster> AmalgamatedGroups = GeneOrderSetAmalgamate(IsolatedGroups);
//TODO: fold AmalgamatedGroups into NonOverlappingGeneGroups (step 3 above);
//at present the output list is returned empty.
}
//return completed list
return NonOverlappingGeneGroups;
}
//Export a whole set of operon stats sorted by several different variables
public void ExportByDifferentVariables(String BaseFile, boolean IncludeSingletons){
//Retrieve list of trajectories
LinkedList<OperonTrajectory> L = new LinkedList<OperonTrajectory>(Trajectories.values());
//sort + export
//(1) by NSR
Collections.sort(L, new SortbyOperonicity());
String NSRFile = BaseFile + "_byOperonicity.txt";
ExportTrajectoryStatistics(NSRFile,L,IncludeSingletons);
//(2) by Novelty
Collections.sort(L, new SortbyVariety());
String NoveltyFile = BaseFile + "_byVariety.txt";
ExportTrajectoryStatistics(NoveltyFile,L,IncludeSingletons);
//(3) by OrgsFeatured
Collections.sort(L, new SortbyOrgsFeatured());
String OrgsFeaturedFile = BaseFile + "_byOrgsFeatured.txt";
ExportTrajectoryStatistics(OrgsFeaturedFile,L,IncludeSingletons);
//(4) NSR, Novelty
Collections.sort(L, new SortbyOperonicityThenVariety());
String NSR_then_NoveltyFile = BaseFile + "_byOperonicity_then_Variety.txt";
ExportTrajectoryStatistics(NSR_then_NoveltyFile,L,IncludeSingletons);
//(5) Novelty, NSR
Collections.sort(L, new SortbyVarietyThenOperonicity());
String Novelty_then_NSRFile = BaseFile + "_byVariety_then_Operonicity.txt";
ExportTrajectoryStatistics(Novelty_then_NSRFile,L,IncludeSingletons);
//(6) by evolutionary rate
Collections.sort(L, new SortbyEvoRate());
String EvoRateFile = BaseFile + "_byRate.txt";
ExportTrajectoryStatistics(EvoRateFile,L,IncludeSingletons);
//output message
System.out.println("Files Successfully Exported!");
}
//Export a single set of operon trajectory statistics
public void ExportTrajectoryStatistics(String FileName, LinkedList<OperonTrajectory> SortedTrajectories, boolean IncludeSingletons){
try {
//initialize file writer
BufferedWriter bw = new BufferedWriter(new FileWriter(FileName));
//initialize header + write to file
String Header = "cluster_num\tnum_orgs\toperonicity\tvariety\tevolutionary_rate\n";
bw.write(Header);
bw.flush();
//enumerate through genomes
for (OperonTrajectory OT : SortedTrajectories){
//build line
String ln = String.valueOf(OT.ClusterID) +"\t"
+ String.valueOf(OT.OrgsFeatured) + "\t"
+ String.valueOf(OT.Operonicity) + "\t"
+ String.valueOf(OT.Variety) + "\t"
+ String.valueOf(OT.EvoRate) + "\n";
//write line to file
if (IncludeSingletons || !OT.AlwaysASingleGene){
bw.write(ln);
bw.flush();
}
}
bw.close();
} catch (Exception ex){
ex.printStackTrace();
}
}
//Export a query set of trajectories, with options to vary operonicity + variety (towards highly conserved)
public void ExportQuerySet(String QuerySetFile, LinkedHashMap<Integer,OperonTrajectory> Trajectories, Double MinOperonicity, Double MaxNovelty, int MinOrgsFeatured){
try {
//open file stream
BufferedWriter bw = new BufferedWriter(new FileWriter(QuerySetFile));
//initialize counter.
int Counter = 0;
//export all appropriate trajectories
for (Integer x : Trajectories.keySet()){
//retrieve trajectory
OperonTrajectory OT = Trajectories.get(x);
//check parameters
if (OT.Variety <= MaxNovelty &&
OT.Operonicity >= MinOperonicity &&
OT.OrgsFeatured >= MinOrgsFeatured){
//build string, write and export
String ln = x + "\n";
bw.write(ln);
bw.flush();
//increment counter
Counter++;
}
}
//close file stream
bw.close();
//output message
System.out.println("Exported query set containing " + Counter + " trajectories.");
} catch (Exception ex){
ex.printStackTrace();
}
}
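//Usage sketch (hypothetical thresholds): export trajectories that are mostly
//operonic (>= 80% non-singleton instances), low-variety, and found in at
//least 15 organisms.
// ExportQuerySet("QuerySet.txt", Trajectories, 0.8, 0.2, 15);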
//Export a set of pairwise gene-order queries
public void ExportGeneOrderAnalysisQuerySet(String QuerySetFile, LinkedHashMap<Integer,OperonTrajectory> Trajectories, Double MinOperonicity){
//Initialize a list of query-pairs
LinkedList<LinkedList<Integer>> QueryPairs = new LinkedList<LinkedList<Integer>>();
//iterate through all values
for (Integer x : Trajectories.keySet()){
//retrieve trajectory
OperonTrajectory OT = Trajectories.get(x);
//proceed if appropriate
if (OT.Operonicity >= MinOperonicity){
//create each pair
for (Integer y : OT.OtherClusters){
//re-format pair as sorted linked list
LinkedList<Integer> GenePair = new LinkedList<Integer>();
GenePair.add(x);
GenePair.add(y);
Collections.sort(GenePair);
//add list to set of all pairs
if (!QueryPairs.contains(GenePair)){
QueryPairs.add(GenePair);
}
}
}
//output message.
if (x%100 == 0){
System.out.println("Built Query Pairs for " + x +"/5276 operon trajectories.");
}
}
//System.out.println("Sorting!");
//Sort the list
Collections.sort(QueryPairs, new SortListOfPairs());
//export to file
try {
//open file stream
BufferedWriter bw = new BufferedWriter(new FileWriter(QuerySetFile));
//export each query pair
for (LinkedList<Integer> L : QueryPairs){
//build string
String str = String.valueOf(L.get(0)) + " $$ " + String.valueOf(L.get(1)) + "\n";
//write to file stream
bw.write(str);
bw.flush();
}
//close file stream
bw.close();
} catch (IOException e) {
e.printStackTrace();
}
}
//Export a set of n-wise identical-content queries
public void ExportIdenticalContentQuerySet(String QuerySetFile, LinkedHashMap<Integer,OperonTrajectory> Trajectories, boolean IncludeSingles){
//Initialize a list of query-pairs
LinkedList<LinkedList<Integer>> QueryGroups = new LinkedList<LinkedList<Integer>>();
//iterate through all values
for (Integer x : Trajectories.keySet()){
//retrieve trajectory
OperonTrajectory OT = Trajectories.get(x);
//determine all non-overlapping groups
LinkedList<OperonCluster> Clusters = SegregateTrajectoryNoAmalg(OT);
//iterate through + create groups
for (OperonCluster OC : Clusters){
//create group
LinkedList<Integer> ClusterGrp = new LinkedList<Integer>();
//add this cluster to the list
ClusterGrp.add(x);
//build list
for (Object o : OC.ClustersFeatured){
Integer objint = (Integer) o;
ClusterGrp.add(objint);
}
//arrange in ascending order
Collections.sort(ClusterGrp);
//add list to set of all pairs
if (!QueryGroups.contains(ClusterGrp)
&& OC.Operons.size() > 1
&& (IncludeSingles || ClusterGrp.size() > 1)){ //either include singles, or not
QueryGroups.add(ClusterGrp);
}
}
//output message.
if (x%100 == 0){
System.out.println("Built Query Groups for " + x +"/5276 operon trajectories.");
}
}
//Sort the list
Collections.sort(QueryGroups, new SortListOfGroups());
//export to file
try {
//open file stream
BufferedWriter bw = new BufferedWriter(new FileWriter(QuerySetFile));
//export each query pair
for (LinkedList<Integer> L : QueryGroups){
//initialize an index counter
int IndexCounter = 0;
//initialize string
String str = "&&only ";
while (IndexCounter < L.size()) {
//add next index
str = str + String.valueOf(L.get(IndexCounter));
//anticipate additional genes, if necessary
if (IndexCounter+1 < L.size()){
str = str + " $$ ";
}
//increment counter
IndexCounter++;
}
//add new line
str = str + "\n";
//write to file stream
bw.write(str);
bw.flush();
}
//close file stream
bw.close();
} catch (IOException e) {
e.printStackTrace();
}
}
//Export the set of adjacent gene pairs that are represented in all operon topologies
public void ExportAdjacentGenePairs(String QuerySetFile, LinkedHashMap<Integer, OperonTrajectory> Trajectories){
//Initialize a list of query-pairs
LinkedList<LinkedList<Integer>> QueryGroups = new LinkedList<LinkedList<Integer>>();
//iterate through all values
for (Integer x : Trajectories.keySet()){
//retrieve trajectory
OperonTrajectory OT = Trajectories.get(x);
//determine all non-overlapping groups
LinkedList<OperonCluster> Clusters = SegregateTrajectoryNoAmalg(OT);
//iterate through + create groups
for (OperonCluster OC : Clusters){
//check every operon instance
for (LinkedList<GenomicElement> L : OC.Operons){
//iterate through operon
for (int i = 0; i < L.size()-1; i++){
//note every pair
LinkedList<Integer> ClusterGrp = new LinkedList<Integer>();
ClusterGrp.add(L.get(i).getClusterID());
ClusterGrp.add(L.get(i+1).getClusterID());
//sort + add to set
Collections.sort(ClusterGrp);
if (!QueryGroups.contains(ClusterGrp)){
QueryGroups.add(ClusterGrp);
}
}
}
}
//output message.
if (x%100 == 0){
System.out.println("Built Query Groups for " + x +"/5276 operon trajectories.");
}
}
//Sort the list
Collections.sort(QueryGroups, new SortListOfPairs());
//export to file
try {
//open file stream
BufferedWriter bw = new BufferedWriter(new FileWriter(QuerySetFile));
//export each query pair
for (LinkedList<Integer> L : QueryGroups){
//create string
String str = String.valueOf(L.get(0)) + " ; " + String.valueOf(L.get(1)) + "\n";
//write to file stream
bw.write(str);
bw.flush();
}
//close file stream
bw.close();
} catch (IOException e) {
e.printStackTrace();
}
}
//Export "clear" cases
public void ExportClearSingleGeneModifications(String file){
//determine counts
int OnlyAppend = 0;
int OnlyPrepend = 0;
int OnlyInsertion = 0;
//count multiple trajectories
int MultipleTrajectoryCount = 0;
int ClearExampleCount = 0;
int NoExamplesCount = 0;
try {
//open file stream
BufferedWriter bw = new BufferedWriter(new FileWriter(file));
//Iterate through all trajectories
for (OperonTrajectory OT : Trajectories.values()){
//initialize counter
int TypeCounter = 0;
//initialize string
String str = String.valueOf(OT.ClusterID) + "\t";
//prepend only
if (OT.isPrepend && !OT.isInsertion && !OT.isAppend){
OnlyPrepend++;
str = str + "Prepend\n";
bw.write(str);
bw.flush();
//insertion only
} else if (!OT.isPrepend && OT.isInsertion && !OT.isAppend){
OnlyInsertion++;
str = str + "Insertion\n";
bw.write(str);
bw.flush();
//append only
} else if (!OT.isPrepend && !OT.isInsertion && OT.isAppend){
OnlyAppend++;
str = str + "Append\n";
bw.write(str);
bw.flush();
}
if (OT.isPrepend){
TypeCounter++;
}
if (OT.isAppend){
TypeCounter++;
}
if (OT.isInsertion){
TypeCounter++;
}
// 2 or more: it's a multi
if (TypeCounter >= 2){
MultipleTrajectoryCount++;
//just 1: it's a clear case
} else if (TypeCounter == 1){
ClearExampleCount++;
//zero: no clear examples
} else {
NoExamplesCount++;
}
}
//display results
System.out.println("Total: " + (OnlyPrepend+OnlyInsertion+OnlyAppend)
+ " Prepend: " + OnlyPrepend
+ " Insertion: " + OnlyInsertion
+ " Append: " + OnlyAppend
);
//new display
System.out.println("New Checks - All Trajectories:" + Trajectories.values().size());
System.out.println("New Checks - Multiple Trajectories:" + MultipleTrajectoryCount);
System.out.println("New Checks - Clear Examples: " + ClearExampleCount);
System.out.println("new Checks - No Examples: " + NoExamplesCount);
//close file stream
bw.close();
} catch (Exception ex){
ex.printStackTrace();
}
}
// ===== Sorting Classes ======== //
//sort by operonicity (NSR), descending
public class SortbyOperonicity implements Comparator<OperonTrajectory>{
@Override
public int compare(OperonTrajectory o1, OperonTrajectory o2) {
if (o1.Operonicity<o2.Operonicity) return 1;
if (o1.Operonicity>o2.Operonicity) return -1;
//tie: fall back to cluster ID (avoids relying on == for boxed Double fields)
return o1.ClusterID-o2.ClusterID;
}
}
//sort by novelty
public class SortbyVariety implements Comparator<OperonTrajectory>{
@Override
public int compare(OperonTrajectory o1, OperonTrajectory o2) {
if (o1.Variety<o2.Variety) return -1;
if (o1.Variety>o2.Variety) return 1;
//tie: fall back to cluster ID
return o1.ClusterID - o2.ClusterID;
}
}
//sort by number of orgs
public class SortbyOrgsFeatured implements Comparator<OperonTrajectory>{
@Override
public int compare(OperonTrajectory o1, OperonTrajectory o2) {
if (o1.OrgsFeatured != o2.OrgsFeatured){
return -1*(o1.OrgsFeatured-o2.OrgsFeatured);
} else {
return -1*(o1.ClusterID - o2.ClusterID);
}
}
}
//NSR, novelty
public class SortbyOperonicityThenVariety implements Comparator<OperonTrajectory>{
@Override
public int compare(OperonTrajectory o1, OperonTrajectory o2) {
if (o1.Operonicity<o2.Operonicity) return 1;
if (o1.Operonicity>o2.Operonicity) return -1;
//operonicity ties: sort by variety, then by cluster ID
if (o1.Variety<o2.Variety) return -1;
if (o1.Variety>o2.Variety) return 1;
return o1.ClusterID - o2.ClusterID;
}
}
//Novelty (variety), then NSR (operonicity)
public class SortbyVarietyThenOperonicity implements Comparator<OperonTrajectory>{
@Override
public int compare(OperonTrajectory o1, OperonTrajectory o2) {
if (o1.Variety<o2.Variety) return -1;
if (o1.Variety>o2.Variety) return 1;
//variety ties: sort by operonicity (descending), then cluster ID
if (o1.Operonicity<o2.Operonicity) return 1;
if (o1.Operonicity>o2.Operonicity) return -1;
return o1.ClusterID - o2.ClusterID;
}
}
public static class SortbyEvoRate implements Comparator<OperonTrajectory>{
@Override
public int compare(OperonTrajectory o1, OperonTrajectory o2) {
if (o1.EvoRate<o2.EvoRate) return -1;
if (o1.EvoRate>o2.EvoRate) return 1;
//tie: fall back to cluster ID
return o1.ClusterID - o2.ClusterID;
}
}
public class OCSorter implements Comparator<OperonCluster>{
@Override
public int compare(OperonCluster o1, OperonCluster o2) {
return o1.SortingNumber-o2.SortingNumber;
}
}
public static class SortListOfPairs implements Comparator<LinkedList<Integer>>{
@Override
public int compare(LinkedList<Integer> o1, LinkedList<Integer> o2) {
try {
//compare by value: Integer == tests reference identity outside the cached range
if (o1.get(0).equals(o2.get(0))){
return o1.get(1) - o2.get(1);
} else {
return o1.get(0) - o2.get(0);
}
} catch (Exception ex){
return 0;
}
}
}
public class SortListOfGroups implements Comparator<LinkedList<Integer>>{
@Override
public int compare(LinkedList<Integer> o1, LinkedList<Integer> o2) {
//march down list, while elements still around
int ReturnVal = 0;
int ElementCounter = 0;
//sort the list for as long as possible.
while(o1.size() > ElementCounter && o2.size() > ElementCounter){
if (!o1.get(ElementCounter).equals(o2.get(ElementCounter))){ //value comparison, not reference
ReturnVal = o1.get(ElementCounter) - o2.get(ElementCounter);
break;
} else{
ElementCounter++;
}
}
//if one group is a strict prefix of the other, the shorter group sorts first
if (ReturnVal == 0){
ReturnVal = o1.size() - o2.size();
}
//return the determined value!
return ReturnVal;
}
}
// ===== Deprecated ======== //
//DEPRECATED: Export trajectories as context set
public void ExportTrajectoriesAsContextSet(String ContextSetFile, boolean OperonsOnly, LinkedHashMap<Integer, OperonTrajectory> Trajectories){
try {
//file writing
BufferedWriter bw = new BufferedWriter(new FileWriter(ContextSetFile));
//Initialize a hash set of already-exported keys (fast duplicate lookup)
HashSet<String> Lines2Export = new HashSet<String>();
//Initialize a line + counter
String Line = "";
String LineKey = "";
int ProcessCounter = 0;
for (Integer x : Trajectories.keySet()){
//retrieve trajectory
OperonTrajectory OT = Trajectories.get(x);
//option to only export operons
if (!OperonsOnly || !OT.AlwaysASingleGene){
//increment counter
ProcessCounter++;
//for each organism's gene instances in the trajectory
for (String s : OT.TrajectoryHash.keySet()){
//Export amalgamated set
LinkedList<GenomicElement> L = OT.AmalgamatedOperons.get(s);
for (GenomicElement E : L){
//build line
//key - the data itself
LineKey = s + "\t"
+ E.getContig() + "\t"
+ E.getStart() + "\t"
+ E.getStop() + "\t";
//the line itself in context set file
Line = LineKey + x + "\n";
//if this data point has not yet been exported, export.
if (!Lines2Export.contains(LineKey)){
//store key
Lines2Export.add(LineKey);
//write line to file
bw.write(Line);
bw.flush();
}
}
}
}
//output message.
if (ProcessCounter%100 == 0){
System.out.println("Exported " + ProcessCounter +"/"+ Trajectories.size() +" operon trajectories.");
}
}
//close file writer
bw.close();
//last message
System.out.println("Export complete!");
} catch (Exception ex){
ex.printStackTrace();
}
}
}