package OperonEvolutionInHalos; import genomeObjects.AnnotatedGenome; import genomeObjects.ContextSet; import genomeObjects.GenomicElement; import genomeObjects.OrganismSet; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedList; import operonClustering.CustomDissimilarity; //An object to contain the operons in an organism. public class OperonSet { //Fields //source data structure public OrganismSet OS; //Initial import data structures public LinkedHashMap<String,HashMap<Integer,LinkedList<GenomicElement>>> OperonHash; //Mapping of distances to phylogeny public LinkedHashMap<LinkedList<String>,Double> PhyDistHash; public Double MaxDist = 0.0; public static final String PhyDistHashFile = "/Users/phillipseitzer/Dropbox/OperonEvolutionInHalophiles/Phylogeny/halo.phy_phyml_sh.txt"; //Trajectories public LinkedHashMap<Integer, OperonTrajectory> Trajectories; public LinkedHashMap<LinkedList<Integer>, DoubleOperonTrajectory> DoubleTrajectories; public int RoundingConstant = 10000; //constructor for direct loading of OS context to new object public OperonSet(OrganismSet oS, String CSName){ //note data this.OS = oS; //Initialize hash OperonHash = new LinkedHashMap<String,HashMap<Integer,LinkedList<GenomicElement>>>(); //build hash for (String s : OS.getSpecies().keySet()){ //retrieve genome AnnotatedGenome AG = OS.getSpecies().get(s); //retrieve map LinkedList<ContextSet> GroupNames = AG.getGroupings(); //add data for (ContextSet s1: GroupNames){ if (s1.getName().equals(CSName)){ OperonHash.put(s, s1.getContextMapping()); break; } } } //output message System.out.println("Created set of basic operons"); } //dummy constructor public OperonSet(){ } // ====================== // // ===== Methods ======== // // ====================== // // ===== Creation ======= // //re-organize hash into list of trajectories public void BuildOperonTrajectories(){ /* * (1) Creates a mapping of cluster ID number to cross-species operon trajectories. * (2) Simultaneously, basic statistics about the operon are also noted. * * Cluster ID has to be found in at least 10 different organisms (<= 5276) * Cluster may not have a total number of instances greater than 100 (>= 272) * */ //Initialize trajectory mapping + associated statistics Trajectories = new LinkedHashMap<Integer, OperonTrajectory>(); //Counter variable int TrajectoryCounter = 0; //check all clusters //for (int i = 272; i <= 5276; i++){ //the major set for (int i = 5200; i <= 5300; i++){ //for testing //Initialize this operon trajectory OperonTrajectory OT = new OperonTrajectory(); //Initialize the map for this cluster ID LinkedHashMap<String, LinkedList<LinkedList<GenomicElement>>> Trajectory = new LinkedHashMap<String, LinkedList<LinkedList<GenomicElement>>>(); //Initialize a mapping to the operon data LinkedHashMap<String, LinkedList<OperonData>> OpDataTraj = new LinkedHashMap<String, LinkedList<OperonData>>(); //initialize organism counter int OrgCounter = 0; int TotalOpCounter = 0; int NumNonSingle = 0; LinkedList<Integer> Clusts = new LinkedList<Integer>(); LinkedList<Integer> Genes = new LinkedList<Integer>(); //check operons in each organism for (String s : OperonHash.keySet()){ //recover operon mapping HashMap<Integer, LinkedList<GenomicElement>> Operons = OperonHash.get(s); //Initialize a list of operons to include LinkedList<LinkedList<GenomicElement>> OrgList = new LinkedList<LinkedList<GenomicElement>>(); LinkedList<OperonData> OpDataList = new LinkedList<OperonData>(); //check each and every operon in the mapping for (Integer x : Operons.keySet()){ //default: operon does not match boolean OperonContainsCluster = false; //recover genes in each operon LinkedList<GenomicElement> L = Operons.get(x); //check each gene for cluster ID for (GenomicElement E : L){ if (E.getClusterID() == i){ OperonContainsCluster = true; break; } } //if the operon contains the cluster, add to list if (OperonContainsCluster){ //increment counter TotalOpCounter++; //note if this "operon" is really just a singleton gene if (L.size() > 1){ NumNonSingle++; } //Initialize a new operon data object OperonData OD = new OperonData(); //add all appropriate cluster ids to the list of cluster ids for (GenomicElement E : L){ //exclude self if (E.getClusterID() != i){ //list of clusters if (!Clusts.contains(E.getClusterID())){ Clusts.add(E.getClusterID()); } //all genes - so, include every instance Genes.add(E.getClusterID()); } //adjust operon data //adjust operon start coordinate if (E.getStart() < OD.Start){ OD.Start = E.getStart(); } //adjust operon end coordinate if (E.getStop() > OD.Stop){ OD.Stop = E.getStop(); } //strand OD.TheStrand = E.getStrand(); //contig OD.Contig = E.getContig(); } //add data to lists OrgList.add(L); OpDataList.add(OD); } } //add these to the trajectory mapping + inc org counter if (OrgList.size() != 0){ Trajectory.put(s, OrgList); OpDataTraj.put(s, OpDataList); OrgCounter++; } } //System.out.println("Breakpoint!"); //Write the entire trajectory to the hash map, if appropriate. if (OrgCounter >= 10){ // ==== Add data to trajectory ====== // //(2) ratio of non-single gene 'operons' Double RatioNonSingle = (double) NumNonSingle / (double) TotalOpCounter; RatioNonSingle = (double) Math.round(RoundingConstant*RatioNonSingle)/RoundingConstant; //(3) total number of additional protein families included with cluster across set Double RatioNovel; if (Genes.size() != 0){ RatioNovel = (double) Clusts.size() / (double) Genes.size(); } else{ //genes that are always single copy don't mean much in our discussion of operon evolution. RatioNovel = -1.0; OT.AlwaysASingleGene = true; } RatioNovel = (double) Math.round(RoundingConstant*RatioNovel)/RoundingConstant; //associate data OT.ClusterID = i; OT.OrgsFeatured = OrgCounter; OT.Operonicity = RatioNonSingle; OT.Variety = RatioNovel; OT.TrajectoryHash = Trajectory; OT.amalgamate(); //puts all ops in same organisms together OT.OperonHash = OpDataTraj; OT.OtherClusters = Clusts; //all other non-seed types of genes featured in the trajectory //Determine overall phylogenetic spread of protein family LinkedList<String> ListOfOrganisms = new LinkedList<String>(OT.OperonHash.keySet()); OT.ClusterPhylogeneticRange = DetermineMaxDist(ListOfOrganisms); //Segregate trajectory - intensive, not always necessary //Important: determine if these should be amalgamated or not! //OT.OperonGroups = SegregateTrajectory(OT); OT.OperonGroups = this.SegregateTrajectoryNoAmalg(OT); //compute evo rate - open to re-analysis OT.computeEvoRate(); //classify detected single gene modification events OT.predictSingleGeneOperonModifications(); // //debugging - display // System.out.println(OT.ClusterID + ":" + // " Prepend: " + OT.isPrepend + // " Insertion: " + OT.isInsertion + // " Append: " + OT.isAppend); //add to map + inc counter Trajectories.put(i,OT); TrajectoryCounter++; } //output message. if (i%100 == 0){ System.out.println("Processed " + i +"/5276 operon trajectories."); // System.out.println("Present in: " + Trajectories.get(i).OrgsFeatured); // System.out.println("Non-singleton ratio: " + Trajectories.get(i).NonsingletonRatio); // System.out.println("Novel cluster ratio: " + Trajectories.get(i).Novelty); } } //final output message System.out.println(TrajectoryCounter + " operon trajectories determined."); } //Calculate for disagreements at lowest level public void AddLowestLevelPhyloDisagreement(OperonTrajectory OT, double PhyloMargin, boolean ignoreSingleGenes){ /* * This algorithm checks the organismic sources of operons * of identical topology, and identifies cases where a more * distant relative exists that is not found in the group. * * In that case, the operon is said not to agree exactly with * the phylogeny. * * If the "distance relative" is within a provided margin of * error, this offender can be discarded. * */ //default: this operon follows the phylogeny boolean FollowsPhylogeny = true; //only need to perform comparison if no alternative operon topologies featured if (OT.OperonGroups.size() > 1){ for (OperonCluster OC1 : OT.OperonGroups){ //if an operon cluster contains only one organism, //no intra-cluster divergence distance to measure. if (OC1.Operons.size() != 1){ //optionally ignore single gene topology if (!ignoreSingleGenes || (ignoreSingleGenes && OC1.OperonSize>1)){ //find the smallest inter-cluster distance double MinDist = 999; //compare this cluster to every other cluster for (OperonCluster OC2: OT.OperonGroups){ //don't compare same if (!OC1.equals(OC2)){ //optionally, ignore single gene topology if (!ignoreSingleGenes || (ignoreSingleGenes && OC2.OperonSize>1)){ //compare double NewDist = DetermineMinDist(OC1.Organisms,OC2.Organisms); //update minimum if (NewDist < MinDist){ MinDist = NewDist; } } } } //compare internal phylo distances to smallest external distance if (OC1.MaxInternalDist - PhyloMargin > MinDist){ FollowsPhylogeny = false; break; } } } } } //update operon trajectory field OT.AgreesWithPhylogenyAtLowestLevel = FollowsPhylogeny; } //calculate cassette for each trajectory: measure of operonic dispersion across species public OperonTrajectory BuildCODFromBasic(OperonTrajectory Basic_OT, CODParameters CP){ /* * a Cassette is a grouping encompassing local dispersion of genes, across species. * * Specifically, genes that are ever predicted to be in an operon in any species are * searched nearby to the operon in species where they are not found. * * Same strand may be required, or not. * * It is NOT AN OPERON PREDICTON APPROACH, but rather, a technique to examine local * dispersion. BTW by setting the "local" parameter very high, this could effectively * encompass the whole genome. */ //Initialize output OperonTrajectory Cassette_OT = new OperonTrajectory(); //this analysis is only relevant to actual operons. if (!Basic_OT.AlwaysASingleGene){ //Transfer Identification information Cassette_OT.ClusterID = Basic_OT.ClusterID; Cassette_OT.OtherClusters = Basic_OT.OtherClusters; Cassette_OT.OrgsFeatured = Basic_OT.OrgsFeatured; //Initialize (important) gene mapping LinkedHashMap<String, LinkedList<LinkedList<GenomicElement>>> CassetteTrajectoryHash = new LinkedHashMap<String, LinkedList<LinkedList<GenomicElement>>>(); //Find new genomic mapping for (String s : Basic_OT.OperonHash.keySet()){ //Retrieve initial lists LinkedList<OperonData> DataList = Basic_OT.OperonHash.get(s); //Initialize output LinkedList<LinkedList<GenomicElement>> OrgList = new LinkedList<LinkedList<GenomicElement>>(); //check each operon in the list for (OperonData OD : DataList){ //Initialize a list of genes LinkedList<GenomicElement> L = new LinkedList<GenomicElement>(); //a flag for starting writing boolean StartedWriting = false; //check against genes in original organism AnnotatedGenome AG = OS.getSpecies().get(s); //check all elements for (GenomicElement E : AG.getElements()){ //match same contig, collect all potentially interesting genes if (E.getContig().equals(OD.Contig)){ //approaching from the left if ((E.getCenter() + CP.RangeAroundOperon >= OD.Start) && (E.getClusterID() == Cassette_OT.ClusterID || Cassette_OT.OtherClusters.contains(E.getClusterID()))){ //First: flag is on StartedWriting = true; //record the gene, for later processing L.add(E); } //passing through if (StartedWriting){ //within range if ((E.getCenter() - CP.RangeAroundOperon <= OD.Stop) && (E.getClusterID() == Cassette_OT.ClusterID || Cassette_OT.OtherClusters.contains(E.getClusterID()))){ //record gene, for later processing L.add(E); } else { //stop searching through genes - all candidates have been noted. break; } } } } //post-processing of list if (CP.RequireSameStrand){ OrgList.add(L); } else { LinkedList<GenomicElement> StrandCorrected = new LinkedList<GenomicElement>(); for (GenomicElement E : L){ if (E.getStrand().equals(OD.TheStrand)){ StrandCorrected.add(E); } } OrgList.add(StrandCorrected); } } //add these to the trajectory mapping if (OrgList.size() != 0){ CassetteTrajectoryHash.put(s, OrgList); } } //add values Cassette_OT.TrajectoryHash = CassetteTrajectoryHash; } else { Cassette_OT = Basic_OT; } //output message. //System.out.println("Cassette trajectory determined for " + Cassette_OT.ClusterID + "."); //return statement return Cassette_OT; } // ---- PhyloDist Mapping ----- // //Import phylogenetic mapping info public void BuildPhylogeneticDistanceMapping(){ //Initialize output //LinkedHashMap<String,Integer> ComparisonHash = new LinkedHashMap<String,Integer>(); //Initialize hash map PhyDistHash = new LinkedHashMap<LinkedList<String>,Double>(); //First, load in file try { //open file stream BufferedReader br = new BufferedReader(new FileReader(PhyDistHashFile)); String Line = null; //for comparisons //int ComparisonCounter = 0; //read through lines while ((Line = br.readLine()) != null) { //split by tabs String[] L = Line.split("\t"); //check matches boolean OneMatch = false; boolean TwoMatch = false; //org names String Org1 = ""; String Org2 = ""; //check first entry for (String s : OS.getSpecies().keySet()){ if (L[0].equals(s)){ OneMatch = true; Org1 = s; break; } } //check second entry for (String s : OS.getSpecies().keySet()){ if (L[1].equals(s)){ TwoMatch = true; Org2 = s; break; } } if (OneMatch && TwoMatch){ //build key LinkedList<String> OrgRelation = new LinkedList<String>(); OrgRelation.add(Org1); OrgRelation.add(Org2); Collections.sort(OrgRelation); //build value double Dist = Double.parseDouble(L[3]); //update maximum distance if (Dist > MaxDist){ MaxDist = Dist; } //write to hash PhyDistHash.put(OrgRelation, Dist); } //DEBUGGING // //only mappings between organisms we care about // if (OneMatch && TwoMatch){ // //System.out.println(Line); // ComparisonCounter++; // // //store these comparisons in the comparison hash // // //org 1 // if (ComparisonHash.get(Org1) != null){ // int Val = ComparisonHash.get(Org1); // Val++; // ComparisonHash.put(Org1, Val); // } else { // ComparisonHash.put(Org1, 1); // } // // //org 2 // if (ComparisonHash.get(Org2) != null){ // int Val = ComparisonHash.get(Org2); // Val++; // ComparisonHash.put(Org2, Val); // } else { // ComparisonHash.put(Org2, 1); // } // } } //print summary System.out.println(PhyDistHash.size() + " phylogenetic distances mapped."); //3160 = 79*80/2, lower triangle of dissimilarity matrix //close file stream br.close(); } catch (Exception ex){ ex.printStackTrace(); } //this was for debugging purposes //return ComparisonHash; } //Build a generic histogram public PhyloHistData BuildGenericDistanceMapping(String FileName){ //Initialize output PhyloHistData PHD = new PhyloHistData(); //Initialize hash map LinkedHashMap<LinkedList<String>,Double> PhyDistHash = new LinkedHashMap<LinkedList<String>,Double>(); Double MaxDist = -1.0; //First, load in file try { //open file stream BufferedReader br = new BufferedReader(new FileReader(FileName)); String Line = null; //read through lines while ((Line = br.readLine()) != null) { //split by tabs String[] L = Line.split("\t"); //updated - don't try to filter out entries (3/17/2014) String Org1 = L[0].trim(); String Org2 = L[1].trim(); //build key LinkedList<String> OrgRelation = new LinkedList<String>(); OrgRelation.add(Org1); OrgRelation.add(Org2); Collections.sort(OrgRelation); //build value double Dist = Double.parseDouble(L[2]); //update maximum distance if (Dist > MaxDist){ MaxDist = Dist; } //write to hash PhyDistHash.put(OrgRelation, Dist); } //print summary System.out.println(PhyDistHash.size() + " phylogenetic distances mapped."); //close file stream br.close(); //record data in structure PHD.MaxDist = MaxDist; PHD.PhyDistHash = PhyDistHash; } catch (Exception ex){ ex.printStackTrace(); } //return data return PHD; } //Create a histogram of distances public void PhyloHist(String HistDataFile, int bins, Double MaxDist, LinkedHashMap<LinkedList<String>,Double> PhyDistHash){ //based on number of bins, determine ranges appropriate for bins. Double separationUnit = MaxDist/(double) bins; //counts LinkedHashMap<LinkedList<Double>,Integer> Counts = new LinkedHashMap<LinkedList<Double>, Integer>(); //Initialize counts for (int i = 0; i < bins; i++){ //create range LinkedList<Double> Range = new LinkedList<Double>(); Range.add((double) i * separationUnit); Range.add((double)(i+1)*separationUnit); //add to counts Counts.put(Range, 0); } //Iterate through hash map, and store in appropriate bin. for (Double d : PhyDistHash.values()){ //find appropriate range for (LinkedList<Double> L : Counts.keySet()){ //when the appropriate range is found, update counts + break out of inner loop. if (d >= L.get(0) && d <= L.get(1)){ int count = Counts.get(L); count++; Counts.put(L, count); break; } } } //write to file try { //create file writer BufferedWriter bw = new BufferedWriter(new FileWriter(HistDataFile)); //display data int counter = 0; for (LinkedList<Double> L : Counts.keySet()){ counter++; int Val = Counts.get(L); Double Avg = 0.5*(L.get(0)+L.get(1)); bw.write(String.valueOf(Avg) + "\t" + String.valueOf(Val) + "\n"); bw.flush(); //System.out.println("bin " + counter + ": "+ L.get(0) + "-" + L.get(1) +":" + Val); } // System.out.println("Bins: " + Counts.size()); // System.out.println("Max: " + MaxDist); //close file stream bw.close(); } catch (Exception ex){ ex.printStackTrace(); } } // ------- Data Creation -------- // //Create a set of CODs public LinkedHashMap<Integer, OperonTrajectory> CreateCODSet(CODParameters COD){ //Initialize output LinkedHashMap<Integer, OperonTrajectory> CODset = new LinkedHashMap<Integer, OperonTrajectory>(); for (Integer x : Trajectories.keySet()){ OperonTrajectory Basic = Trajectories.get(x); CODset.put(x, BuildCODFromBasic(Basic,COD)); if (x%100 ==0){ System.out.println("Computed " + x + "/" + Trajectories.size() + " CODs."); } } //return return CODset; } //Create a list of multistable operon topologies public LinkedList<Integer> FindMultiStableOperonTopologies(BistableParams BP){ /* * Algorithm: * (1) Segment each operon trajectory into non-overlapping clusters of identical * protein family content * (2) Filter set by minimum number of operonic members in each group * (3) Filter set by minimum size requirement for operons * (4) For each operon, determine the range of phylogenetic variety * (5) Filter set by minimum internal range of phylogenetic variety * (6) Compute gene content variety between each group (Dice/Jaccard for non-seed) * (7) Compute the minimal phylogenetic distance between two groups (compare all indiv. members) * (8) Retain protein families that match content variety + min grp distance specifications * (9) Num of operon groups that are retained from this process describes stability */ //Initialize output LinkedList<Integer> MultiStableTrajectories = new LinkedList<Integer>(); for (OperonTrajectory OT : Trajectories.values()){ //only check actual operons if (!OT.AlwaysASingleGene){ //generate report OperonStabilityReport OSR = MultiStableOperonTopology(BP,OT); if (OSR.MultiStable){ MultiStableTrajectories.add(OSR.ClusterID); System.out.println("Multistable trajectory discovered: " + OT.ClusterID + "."); } } } //return output return MultiStableTrajectories; } //Single multistable trajectory public OperonStabilityReport MultiStableOperonTopology(BistableParams BP, OperonTrajectory OT){ /* * Algorithm: * (1) Segment operon trajectory into non-overlapping clusters of identical * protein family content * (2) For each operon, determine the range of phylogenetic variety * (3) Filter set by minimum number of operonic members in each group * (4) Filter set by minimum size requirement for operons * (5) Filter set by maximum internal range of phylogenetic variety * (6) Compute gene content variety between each group (Dice/Jaccard for non-seed) * (7) Compute the minimal phylogenetic distance between two groups (compare all indiv. members) * (8) Retain protein families that match content variety + min grp distance specifications * (9) Num of operon groups that are retained from this process describes stability */ //(1) + (2) Create operon groups LinkedList<OperonCluster> OperonGroups = SegregateTrajectory(OT); //(3) - (5) Filtering by operon group properties LinkedList<OperonCluster> FilteredOperonGroups = new LinkedList<OperonCluster>(); for (OperonCluster OC : OperonGroups){ if (OC.Operons.size() >= BP.MinGrpMemSize && OC.OperonSize >= BP.MinOpSize && OC.MaxInternalDist <= BP.MaxSameGrpPhyloDist){ FilteredOperonGroups.add(OC); } } //(6) - (8) Compare cluster groups + filter appropriately //Initialize output OperonStabilityReport OSR = new OperonStabilityReport(OT.ClusterID); //Compare + build output report for (OperonCluster OC1 : FilteredOperonGroups){ for (OperonCluster OC2 : FilteredOperonGroups){ if (!OC1.equals(OC2)){ //Build list LinkedList<OperonCluster> OCL = new LinkedList<OperonCluster>(); OCL.add(OC1); OCL.add(OC2); Collections.sort(OCL, new OCSorter()); //System.out.println("Breakpoint!"); //evaluate distances double PhyDist = DetermineMinDist(OCL.get(0).Organisms,OCL.get(1).Organisms); double ContentDist = ContentDissimilarity(OCL.get(0), OCL.get(1)); //write if the filtering matches if (PhyDist >= BP.MinDiffGrpPhyloDist && ContentDist <= BP.MaxContentDiss){ //add stability relationships + components OSR.addOperonClusters(OCL); } } } } //(9) Return output return OSR; } //determine the maximum distance between elements in a list of organisms public double DetermineMaxDist(LinkedList<String> Organisms){ //Initialize double MaxDist = -1; for (String s1 : Organisms){ for (String s2: Organisms){ if (!s1.equals(s2)){ //retrieve value LinkedList<String> OrgList = new LinkedList<String>(); OrgList.add(s1); OrgList.add(s2); Collections.sort(OrgList); double d = PhyDistHash.get(OrgList); //compare + possibly adjust max if (d > MaxDist){ MaxDist = d; } } } } //return updated value return MaxDist; } //determine the minimum distance between elements in two separate lists of organisms public double DetermineMinDist(LinkedList<String> L1, LinkedList<String> L2){ //Initialize double MinDist = 9999; for (String s1 : L1){ for (String s2: L2){ if (!s1.equals(s2)){ //retrieve value LinkedList<String> OrgList = new LinkedList<String>(); OrgList.add(s1); OrgList.add(s2); Collections.sort(OrgList); double d = PhyDistHash.get(OrgList); //compare + possibly adjust max if (d < MinDist){ MinDist = d; } } } } //return updated value return MinDist; } //determine the content dissimilarity public double ContentDissimilarity(OperonCluster OC1, OperonCluster OC2){ //at present: just evaluate by protein content using generalize dice/jaccard CustomDissimilarity CD = new CustomDissimilarity(); double Dice = CD.GeneralizedDiceOrJaccard(OC1.ClustersFeatured, OC2.ClustersFeatured, true, "Dice"); //return dissimilarity return Dice; } //segregate a single operon trajectory into multiple clusters public LinkedList<OperonCluster> SegregateTrajectory(OperonTrajectory OT){ //initialize hash LinkedHashMap<LinkedList<Integer>,OperonCluster> Segregation = new LinkedHashMap<LinkedList<Integer>, OperonCluster>(); //initialize sorting number int SortingNumber = 0; // //(1) Segregate operons into appropriate groups // for (String s : OT.TrajectoryHash.keySet()){ // for (LinkedList<GenomicElement> L : OT.TrajectoryHash.get(s)){ // // // determine featured protein families // LinkedList<Integer> FamClust = new LinkedList<Integer>(); // // //an operon // for (GenomicElement E : L){ // FamClust.add(E.getClusterID()); // } // // //build the key for the hash // Collections.sort(FamClust); // // OperonCluster OC; // if (Segregation.get(FamClust) != null){ // OC = Segregation.get(FamClust); // } else { // OC = new OperonCluster(); // SortingNumber++; // } // // //adjust contents + store in hash // OC.SeedCluster = OT.ClusterID; // OC.SortingNumber = SortingNumber; // OC.addOrg(s); // OC.Operons.add(L); // OC.addClustersFeatured(FamClust); // // Segregation.put(FamClust, OC); // } // } //(1) UPDATE - use species-amalgamated operons instead of separate operons for (String s : OT.AmalgamatedOperons.keySet()){ LinkedList<GenomicElement> L = OT.AmalgamatedOperons.get(s); // determine featured protein families LinkedList<Integer> FamClust = new LinkedList<Integer>(); //an operon for (GenomicElement E : L){ FamClust.add(E.getClusterID()); } //build the key for the hash Collections.sort(FamClust); OperonCluster OC; if (Segregation.get(FamClust) != null){ OC = Segregation.get(FamClust); } else { OC = new OperonCluster(); SortingNumber++; } //adjust contents + store in hash OC.SeedCluster = OT.ClusterID; OC.SortingNumber = SortingNumber; OC.addOrg(s); OC.Operons.add(L); OC.addClustersFeatured(FamClust); Segregation.put(FamClust, OC); } //(2) Calculate features of each operon cluster. LinkedList<OperonCluster> OpGroups = new LinkedList<OperonCluster>(); //iterate through each, compute values, store in new list for (OperonCluster OC : Segregation.values()){ OC.OperonSize = OC.ClustersFeatured.size(); OC.MaxInternalDist = DetermineMaxDist(OC.Organisms); OpGroups.add(OC); } //return statement return OpGroups; } //segregate a single operon trajectory into multiple clusters - no amalg public LinkedList<OperonCluster> SegregateTrajectoryNoAmalg(OperonTrajectory OT){ //initialize hash LinkedHashMap<LinkedList<Integer>,OperonCluster> Segregation = new LinkedHashMap<LinkedList<Integer>, OperonCluster>(); //initialize sorting number int SortingNumber = 0; //(1) Segregate operons into appropriate groups for (String s : OT.TrajectoryHash.keySet()){ for (LinkedList<GenomicElement> L : OT.TrajectoryHash.get(s)){ // determine featured protein families LinkedList<Integer> FamClust = new LinkedList<Integer>(); //an operon for (GenomicElement E : L){ FamClust.add(E.getClusterID()); } //build the key for the hash Collections.sort(FamClust); OperonCluster OC; if (Segregation.get(FamClust) != null){ OC = Segregation.get(FamClust); } else { OC = new OperonCluster(); SortingNumber++; } //adjust contents + store in hash OC.SeedCluster = OT.ClusterID; OC.SortingNumber = SortingNumber; OC.addOrg(s); OC.Operons.add(L); OC.addClustersFeatured(FamClust); Segregation.put(FamClust, OC); } } //(2) Calculate features of each operon cluster. LinkedList<OperonCluster> OpGroups = new LinkedList<OperonCluster>(); //iterate through each, compute values, store in new list for (OperonCluster OC : Segregation.values()){ OC.OperonSize = OC.ClustersFeatured.size(); OC.MaxInternalDist = DetermineMaxDist(OC.Organisms); OpGroups.add(OC); } //print statement //System.out.println("Breakpoint!"); //return statement return OpGroups; } //amalgamate a set of clusters into a new set of clusters public LinkedList<OperonCluster> GeneOrderSetAmalgamate(LinkedList<OperonCluster> UnfilteredSet){ //Initialize output LinkedList<OperonCluster> FilteredSet = new LinkedList<OperonCluster>(); //Remove single sets for (OperonCluster OC : UnfilteredSet){ if (OC.OperonSize > 0){ FilteredSet.add(OC); } } //Iterate amalgamation protocol //Initialize - try to amalgamate boolean FinishedAmalgamation = false; while (!FinishedAmalgamation){ //re-set place-holder set, and re-initialize filtered set LinkedList<OperonCluster> PlaceHolder = FilteredSet; FilteredSet = new LinkedList<OperonCluster>(); //check for 2-element overlaps //check every operon cluster... for (int i = 0; i < PlaceHolder.size(); i++){ //retrieve one OperonCluster OC1 = PlaceHolder.get(i); //Check number of amalgamations int AmalgCounter = 0; //... against every other operon cluster for (int j = i+1; j < PlaceHolder.size(); j++){ //retrieve the other OperonCluster OC2 = PlaceHolder.get(j); //check for at least two common elements, if so, schedule for amalgamation } } } //return output return FilteredSet; } //build 2-gene trajectories - for use with gene order set, etc public void BuildDoubleTrajectories(){ } //split a single trajectory into a set of double trajectories public LinkedList<DoubleOperonTrajectory> Single2DoubleTrajectory(OperonTrajectory OT){ //Initialize output LinkedList<DoubleOperonTrajectory> SplitTrajectory = new LinkedList<DoubleOperonTrajectory>(); //create a hash map -> data for trajectories LinkedHashMap<LinkedList<Integer>, LinkedHashMap<String,LinkedList<LinkedList<GenomicElement>>>> Mapping = new LinkedHashMap<LinkedList<Integer>, LinkedHashMap<String,LinkedList<LinkedList<GenomicElement>>>>(); //Initialize empty hash maps //Iterate through all species featured for (String s : OT.TrajectoryHash.keySet()){ //all operons from speices s LinkedList<LinkedList<GenomicElement>> OpSet = OT.TrajectoryHash.get(s); //every individual operon for (LinkedList<GenomicElement> L : OpSet){ //keep track of clusters featured LinkedList<Integer> ClustersFeatured = new LinkedList<Integer>(); } } //return output return SplitTrajectory; } // ===== Export ======== // //Create a list of operon trajectories appropriate for gene-order analysis public LinkedList<LinkedList<GenomicElement>> GenerateGeneOrderAppropriateGeneSets(LinkedHashMap<Integer, OperonTrajectory> Trajectories){ /* * Not all operon trajectories have the potential to exhibit an interesting change * in gene order. Trajectories should be filtered+split so that all have at least two common * elements - only if there are two elements can there be a relative change in gene order. * * This may mean that trajectories are divided up into multiple groups. That's okay - * groups are stored based on lists of genes, so no danger at naming confusion * * How to properly compare a set of component operons should be built into the JCE software: * For example, assessing changes in "before" and "after" needs to be compared considering strand * commonality: in other words, a reverse in order of the genes in an operon reflects a * change in gene order iff the strand is the same in both operons. * * Algorithm: * (1) For each operon trajectory, divide into non-overlapping clusters based on at least 2 common genes * (2) Each non-overlapping cluster is a single context set group. * (3) Store all groups in a master list, checking for overlaps. * (4) */ //Inititalize output LinkedList<LinkedList<GenomicElement>> NonOverlappingGeneGroups = new LinkedList<LinkedList<GenomicElement>>(); //Iterate through each trajectory for (Integer x : Trajectories.keySet()){ //Retrieve trajectory OperonTrajectory OT = Trajectories.get(x); //split all operons into groups LinkedList<OperonCluster> IsolatedGroups = SegregateTrajectoryNoAmalg(OT); //build amalgamated groups from these LinkedList<OperonCluster> AmalgamatedGroups = GeneOrderSetAmalgamate(IsolatedGroups); } //return completed hash map return NonOverlappingGeneGroups; } //Export a whole set of operon stats sorted by several different variables public void ExportByDifferentVariables(String BaseFile, boolean IncludeSingletons){ //Retrieve list of trajectories LinkedList<OperonTrajectory> L = new LinkedList<OperonTrajectory>(Trajectories.values()); //sort + export //(1) by NSR Collections.sort(L, new SortbyOperonicity()); String NSRFile = BaseFile + "_byOperonicity.txt"; ExportTrajectoryStatistics(NSRFile,L,IncludeSingletons); //(2) by Novelty Collections.sort(L, new SortbyVariety()); String NoveltyFile = BaseFile + "_byVariety.txt"; ExportTrajectoryStatistics(NoveltyFile,L,IncludeSingletons); //(3) by OrgsFeatured Collections.sort(L, new SortbyOrgsFeatured()); String OrgsFeaturedFile = BaseFile + "_byOrgsFeatured.txt"; ExportTrajectoryStatistics(OrgsFeaturedFile,L,IncludeSingletons); //(4) NSR, Novelty Collections.sort(L, new SortbyOperonicityThenVariety()); String NSR_then_NoveltyFile = BaseFile + "_byOperonicity_then_Variety.txt"; ExportTrajectoryStatistics(NSR_then_NoveltyFile,L,IncludeSingletons); //(5) Novelty, NSR Collections.sort(L, new SortbyVarietyThenOperonicity()); String Novelty_then_NSRFile = BaseFile + "_byNVariety_then_Operonicity.txt"; ExportTrajectoryStatistics(Novelty_then_NSRFile,L,IncludeSingletons); //(6) by evolutionary rate Collections.sort(L, new SortbyEvoRate()); String EvoRateFile = BaseFile + "_byRate.txt"; ExportTrajectoryStatistics(EvoRateFile,L,IncludeSingletons); //output message System.out.println("Files Successfully Exported!"); } //Export a single set of operon trajectory statistics public void ExportTrajectoryStatistics(String FileName, LinkedList<OperonTrajectory> SortedTrajectories, boolean IncludeSingletons){ try { //initialize file writer BufferedWriter bw = new BufferedWriter(new FileWriter(FileName)); //initialize header + write to file String Header = "cluster_num\tnum_orgs\toperonicity\tvariety\tevolutionary_rate\n"; bw.write(Header); bw.flush(); //enumerate through genomes for (OperonTrajectory OT : SortedTrajectories){ //build line String ln = String.valueOf(OT.ClusterID) +"\t" + String.valueOf(OT.OrgsFeatured) + "\t" + String.valueOf(OT.Operonicity) + "\t" + String.valueOf(OT.Variety) + "\t" + String.valueOf(OT.EvoRate) + "\n"; //write line to file if (IncludeSingletons || (!IncludeSingletons && !OT.AlwaysASingleGene)){ bw.write(ln); bw.flush(); } } bw.close(); } catch (Exception ex){ ex.printStackTrace(); } } //export a query set, with options to vary operonicity + novelty (towards highly conserved) //Export a query set of trajectories public void ExportQuerySet(String QuerySetFile, LinkedHashMap<Integer,OperonTrajectory> Trajectories, Double MinOperonicity, Double MaxNovelty, int MinOrgsFeatured){ try { //open file stream BufferedWriter bw = new BufferedWriter(new FileWriter(QuerySetFile)); //initialize counter. int Counter = 0; //export all appropriate trajectories for (Integer x : Trajectories.keySet()){ //retrieve trajectory OperonTrajectory OT = Trajectories.get(x); //check parameters if (OT.Variety <= MaxNovelty && OT.Operonicity >= MinOperonicity && OT.OrgsFeatured >= MinOrgsFeatured){ //build string, write and export String ln = x + "\n"; bw.write(ln); bw.flush(); //increment counter Counter++; } } //close file stream bw.close(); //output message System.out.println("Exported query set containing " + Counter + " trajectories."); } catch (Exception ex){ ex.printStackTrace(); } } //Export a set of pairwise searches appropriate for export public void ExportGeneOrderAnalysisQuerySet(String QuerySetFile, LinkedHashMap<Integer,OperonTrajectory> Trajectories, Double MinOperonicity){ //Initialize a list of query-pairs LinkedList<LinkedList<Integer>> QueryPairs = new LinkedList<LinkedList<Integer>>(); //iterate through all values for (Integer x : Trajectories.keySet()){ //retrieve trajectory OperonTrajectory OT = Trajectories.get(x); //proceed if appropriate if (OT.Operonicity >= MinOperonicity){ //create each pair for (Integer y : OT.OtherClusters){ //re-format pair as sorted linked list LinkedList<Integer> GenePair = new LinkedList<Integer>(); GenePair.add(x); GenePair.add(y); Collections.sort(GenePair); //add list to set of all pairs if (!QueryPairs.contains(GenePair)){ QueryPairs.add(GenePair); } } } //output message. if (x%100 == 0){ System.out.println("Built Query Pairs for " + x +"/5276 operon trajectories."); } } //System.out.println("Sorting!"); //Sort the list Collections.sort(QueryPairs, new SortListOfPairs()); //export to file try { //open file stream BufferedWriter bw = new BufferedWriter(new FileWriter(QuerySetFile)); //export each query pair for (LinkedList<Integer> L : QueryPairs){ //build string String str = String.valueOf(L.get(0)) + " $$ " + String.valueOf(L.get(1)) + "\n"; //write to file stream bw.write(str); bw.flush(); } //close file stream bw.close(); } catch (IOException e) { e.printStackTrace(); } } //Export a set of n-wise searches appropriate for export public void ExportIdenticalContentQuerySet(String QuerySetFile, LinkedHashMap<Integer,OperonTrajectory> Trajectories, boolean IncludeSingles){ //Initialize a list of query-pairs LinkedList<LinkedList<Integer>> QueryGroups = new LinkedList<LinkedList<Integer>>(); //iterate through all values for (Integer x : Trajectories.keySet()){ //retrieve trajectory OperonTrajectory OT = Trajectories.get(x); //determine all non-overlapping groups LinkedList<OperonCluster> Clusters = SegregateTrajectoryNoAmalg(OT); //iterate through + create groups for (OperonCluster OC : Clusters){ //create group LinkedList<Integer> ClusterGrp = new LinkedList<Integer>(); //add this cluster to the list ClusterGrp.add(x); //build list for (Object o : OC.ClustersFeatured){ Integer objint = (Integer) o; ClusterGrp.add(objint); } //arrange in ascending order Collections.sort(ClusterGrp); //add list to set of all pairs if ((!QueryGroups.contains(ClusterGrp) && ClusterGrp.size() > 0 && OC.Operons.size() > 1) && (IncludeSingles || !IncludeSingles && ClusterGrp.size() > 1) //either include singles, or not ){ QueryGroups.add(ClusterGrp); } } //output message. if (x%100 == 0){ System.out.println("Built Query Groups for " + x +"/5276 operon trajectories."); } } //Sort the list Collections.sort(QueryGroups, new SortListOfGroups()); //export to file try { //open file stream BufferedWriter bw = new BufferedWriter(new FileWriter(QuerySetFile)); //export each query pair for (LinkedList<Integer> L : QueryGroups){ //initialize an index counter int IndexCounter = 0; //initialize string String str = "&&only "; while (IndexCounter < L.size()) { //add next index str = str + String.valueOf(L.get(IndexCounter)); //anticipate additional genes, if necessary if (IndexCounter+1 < L.size()){ str = str + " $$ "; } //increment counter IndexCounter++; } //add new line str = str + "\n"; //write to file stream bw.write(str); bw.flush(); } //close file stream bw.close(); } catch (IOException e) { e.printStackTrace(); } } //Export the set of adjacent gene pairs that are represented in all operon topologies public void ExportAdjacentGenePairs(String QuerySetFile, LinkedHashMap<Integer, OperonTrajectory> Trajectories){ //Initialize a list of query-pairs LinkedList<LinkedList<Integer>> QueryGroups = new LinkedList<LinkedList<Integer>>(); //iterate through all values for (Integer x : Trajectories.keySet()){ //retrieve trajectory OperonTrajectory OT = Trajectories.get(x); //determine all non-overlapping groups LinkedList<OperonCluster> Clusters = SegregateTrajectoryNoAmalg(OT); //iterate through + create groups for (OperonCluster OC : Clusters){ //check every operon instance for (LinkedList<GenomicElement> L : OC.Operons){ //iterate through operon for (int i = 0; i < L.size()-1; i++){ //note every pair LinkedList<Integer> ClusterGrp = new LinkedList<Integer>(); ClusterGrp.add(L.get(i).getClusterID()); ClusterGrp.add(L.get(i+1).getClusterID()); //sort + add to set Collections.sort(ClusterGrp); if (!QueryGroups.contains(ClusterGrp)){ QueryGroups.add(ClusterGrp); } } } } //output message. if (x%100 == 0){ System.out.println("Built Query Groups for " + x +"/5276 operon trajectories."); } } //Sort the list Collections.sort(QueryGroups, new SortListOfPairs()); //export to file try { //open file stream BufferedWriter bw = new BufferedWriter(new FileWriter(QuerySetFile)); //export each query pair for (LinkedList<Integer> L : QueryGroups){ //create string String str = String.valueOf(L.get(0)) + " ; " + String.valueOf(L.get(1)) + "\n"; //write to file stream bw.write(str); bw.flush(); } //close file stream bw.close(); } catch (IOException e) { e.printStackTrace(); } } //Export "clear" cases public void ExportClearSingleGeneModifications(String file){ //determine counts int OnlyAppend = 0; int OnlyPrepend = 0; int OnlyInsertion = 0; //count multiple trajectories int MultipleTrajectoryCount = 0; int ClearExampleCount = 0; int NoExamplesCount = 0; try { //open file stream BufferedWriter bw = new BufferedWriter(new FileWriter(file)); //Iterate through all trajectories for (OperonTrajectory OT : Trajectories.values()){ //initialize counter int TypeCounter = 0; //initialize string String str = String.valueOf(OT.ClusterID) + "\t"; //prepend only if (OT.isPrepend && !OT.isInsertion && !OT.isAppend){ OnlyPrepend++; str = str + "Prepend\n"; bw.write(str); bw.flush(); //insertion only } else if (!OT.isPrepend && OT.isInsertion && !OT.isAppend){ OnlyInsertion++; str = str + "Insertion\n"; bw.write(str); bw.flush(); //append only } else if (!OT.isPrepend && !OT.isInsertion && OT.isAppend){ OnlyAppend++; str = str + "Append\n"; bw.write(str); bw.flush(); } if (OT.isPrepend){ TypeCounter++; } if (OT.isAppend){ TypeCounter++; } if (OT.isInsertion){ TypeCounter++; } // 2 or more: it's a multi if (TypeCounter >= 2){ MultipleTrajectoryCount++; //just 1: it's a clear case } else if (TypeCounter == 1){ ClearExampleCount++; //zero: no clear examples } else { NoExamplesCount++; } } //display results System.out.println("Total: " + (OnlyPrepend+OnlyInsertion+OnlyAppend) + " Prepend: " + OnlyPrepend + " Insertion: " + OnlyInsertion + " Append: " + OnlyAppend ); //new display System.out.println("New Checks - All Trajectories:" + Trajectories.values().size()); System.out.println("New Checks - Multiple Trajectories:" + MultipleTrajectoryCount); System.out.println("New Checks - Clear Examples: " + ClearExampleCount); System.out.println("new Checks - No Examples: " + NoExamplesCount); //close file stream bw.close(); } catch (Exception ex){ ex.printStackTrace(); } } // ===== Sorting Classes ====== // //sort by NSR // ===== Sorting Classes ======== // public class SortbyOperonicity implements Comparator<OperonTrajectory>{ @Override public int compare(OperonTrajectory o1, OperonTrajectory o2) { if (o1.Operonicity<o2.Operonicity) return 1; if (o1.Operonicity>o2.Operonicity) return -1; if (o1.Operonicity==o2.Operonicity){ return o1.ClusterID-o2.ClusterID; } return 0; } } //sort by novelty public class SortbyVariety implements Comparator<OperonTrajectory>{ @Override public int compare(OperonTrajectory o1, OperonTrajectory o2) { if (o1.Variety<o2.Variety) return -1; if (o1.Variety>o2.Variety) return 1; if (o1.Variety == o2.Variety){ return o1.ClusterID - o2.ClusterID; } return 0; } } //sort by number of orgs public class SortbyOrgsFeatured implements Comparator<OperonTrajectory>{ @Override public int compare(OperonTrajectory o1, OperonTrajectory o2) { if (o1.OrgsFeatured != o2.OrgsFeatured){ return -1*(o1.OrgsFeatured-o2.OrgsFeatured); } else { return -1*(o1.ClusterID - o2.ClusterID); } } } //NSR, novelty public class SortbyOperonicityThenVariety implements Comparator<OperonTrajectory>{ @Override public int compare(OperonTrajectory o1, OperonTrajectory o2) { if (o1.Operonicity<o2.Operonicity){ return 1; } if (o1.Operonicity>o2.Operonicity){ return -1; } if (o1.Operonicity==o2.Operonicity){ if (o1.Variety<o2.Variety){ return -1; } if (o1.Variety>o2.Variety){ return 1; } if (o1.Variety == o2.Variety){ return o1.ClusterID - o2.ClusterID; } } return 0; } } //Novelty, NSR //NSR, novelty public class SortbyVarietyThenOperonicity implements Comparator<OperonTrajectory>{ @Override public int compare(OperonTrajectory o1, OperonTrajectory o2) { if (o1.Variety<o2.Variety) return -1; if (o1.Variety>o2.Variety) return 1; if (o1.Variety == o2.Variety){ if (o1.Operonicity<o2.Operonicity) return 1; if (o1.Operonicity>o2.Operonicity) return -1; if (o1.Operonicity==o2.Operonicity){ return o1.ClusterID - o2.ClusterID; } } return 0; } } public static class SortbyEvoRate implements Comparator<OperonTrajectory>{ @Override public int compare(OperonTrajectory o1, OperonTrajectory o2) { if (o1.EvoRate<o2.EvoRate) return -1; if (o1.EvoRate>o2.EvoRate) return 1; if (o1.EvoRate == o2.EvoRate){ return o1.ClusterID - o2.ClusterID; } return 0; } } public class OCSorter implements Comparator<OperonCluster>{ @Override public int compare(OperonCluster o1, OperonCluster o2) { return o1.SortingNumber-o2.SortingNumber; } } public static class SortListOfPairs implements Comparator<LinkedList<Integer>>{ @Override public int compare(LinkedList<Integer> o1, LinkedList<Integer> o2) { try { if (o1.get(0) == o2.get(0)){ return o1.get(1) - o2.get(1); } else { return o1.get(0) - o2.get(0); } } catch (Exception ex){ return 0; } } } public class SortListOfGroups implements Comparator<LinkedList<Integer>>{ @Override public int compare(LinkedList<Integer> o1, LinkedList<Integer> o2) { //march down list, while elements still around int ReturnVal = 0; int ElementCounter = 0; //sort the list for as long as possible. while(o1.size() > ElementCounter && o2.size() > ElementCounter){ if (o1.get(ElementCounter) != o2.get(ElementCounter)){ ReturnVal = o1.get(ElementCounter) - o2.get(ElementCounter); break; } else{ ElementCounter++; } } //return the determined value! return ReturnVal; } } // ===== Deprecated ======== // //DEPRECATED //Export trajectories as context set public void ExportTrajectoriesAsContextSet(String ContextSetFile, boolean OperonsOnly, LinkedHashMap<Integer, OperonTrajectory> Trajectories){ try { //file writing BufferedWriter bw = new BufferedWriter(new FileWriter(ContextSetFile)); //Initialize a hash set of string LinkedList<String> Lines2Export = new LinkedList<String>(); //Initialize a line + counter String Line = ""; String LineKey = ""; int ProcessCounter = 0; for (Integer x : Trajectories.keySet()){ //retrieve trajectory OperonTrajectory OT = Trajectories.get(x); //option to only export operons if (!OperonsOnly || (OperonsOnly && !OT.AlwaysASingleGene)){ //increment counter ProcessCounter++; //for each organism's gene instances in the trajectory for (String s : OT.TrajectoryHash.keySet()){ //Export amalgamated set LinkedList<GenomicElement> L = OT.AmalgamatedOperons.get(s); for (GenomicElement E : L){ //build line //key - the data itself LineKey = s + "\t" + E.getContig() + "\t" + E.getStart() + "\t" + E.getStop() + "\t"; //the line itself in context set file Line = LineKey + x + "\n"; //if this data point has not yet been exported, export. if (!Lines2Export.contains(LineKey)){ //store key Lines2Export.add(LineKey); //write line to file bw.write(Line); bw.flush(); } } } } //output message. if (ProcessCounter%100 == 0){ System.out.println("Exported " + ProcessCounter +"/"+ Trajectories.size() +" operon trajectories."); } } //close file writer bw.close(); //last message System.out.println("Export complete!"); } catch (Exception ex){ ex.printStackTrace(); } } }