package operonClustering; import genomeObjects.GenomicElement; import genomeObjects.GenomicElementAndQueryMatch; import java.awt.Point; import java.io.Serializable; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.LinkedList; import org.biojava3.core.sequence.Strand; import moduls.frm.children.GapPoint; import moduls.frm.children.GapPointMapping; public class CustomDissimilarity implements Serializable { /** * */ private static final long serialVersionUID = 629102024437887278L; //Fields //General private String Name; private String AmalgamationType; private LinkedList<String> Factors; private Double ImportanceFraction; //Factor 1: Presence/absence of common genes private String CGCompareType; private boolean CGDuplicatesUnique; private double CGWeight; private int CGImportance; //Factor 2: Presence/absence of common motifs private LinkedList<String> CMMotifNames; private String CMCompareType; private boolean CMDuplicatesUnique; private double CMWeight; private int CMImportance; public boolean CMExcludeOperonHead; public boolean CMExcludeOperonTail; //Factor 3: Gene order private String GOCompareType; private boolean HeadPos; private boolean PairOrd; private boolean LinearOrd; private double RelWeightHeadPos; private double RelWeightPairOrd; private double RelWeightLinearOrd; private double GOWeight; private int GOImportance; //Factor 4: Intragenic Gap Sizes private GapPointMapping GPM; private double GGWeight; private int GGImportance; //Factor 5: Changes in strandedness private boolean SSIndividualGenes; private boolean SSWholeGroup; private double SSRelWeightIndGenes; private double SSRelWeightWholeGroup; private double SSWeight; private int SSImportance; //extra public boolean DisplayMsgAboutInternalMotif = false; //Constructor public CustomDissimilarity(String name2,String amalgamationType2,LinkedList<String> factors2, double ImpFactor, String cGCompareType2,boolean cGDuplicatesUnique2,double cGWeight2,int cGImportance2, LinkedList<String> cMMotifNames2,String cMCompareType2,boolean cMDuplicatesUnique2, boolean cMExcludeOperonHead, boolean cMExcludeOperonTail, double cMWeight2,int cMImportance2, boolean HeadPos, boolean PairOrd, boolean LinearOrd, double HeadPoswt, double PairOrdwt, double LinearOrdwt, double gOWeight2, int gOImportance2, GapPointMapping gapSizeDissMapping2, double gGWeight2,int gGImportance2, boolean individualGenes2, boolean wholeGroup2, double relWeightIndGenes2, double relWeightWholeGroup2, double sSWeight2, int sSImportance2){ //parameters //general. this.setName(name2); this.Factors = factors2; this.AmalgamationType = amalgamationType2; this.ImportanceFraction = ImpFactor; //factor 1: common genes this.CGCompareType = cGCompareType2; this.CGDuplicatesUnique = cGDuplicatesUnique2; this.CGImportance = cGImportance2; this.CGWeight = cGWeight2; //factor 2: common motifs this.CMMotifNames = cMMotifNames2; this.CMCompareType = cMCompareType2; this.CMDuplicatesUnique = cMDuplicatesUnique2; this.CMExcludeOperonHead = cMExcludeOperonHead; this.CMExcludeOperonTail = cMExcludeOperonTail; //excluding both the head and tail is meaningless, so ignore this possibility. if (CMExcludeOperonHead && CMExcludeOperonTail){ CMExcludeOperonHead = false; CMExcludeOperonTail = false; } this.CMImportance = cMImportance2; this.CMWeight = cMWeight2; //factor 3: gene order this.HeadPos = HeadPos; this.PairOrd = PairOrd; this.LinearOrd = LinearOrd; this.RelWeightHeadPos = HeadPoswt; this.RelWeightPairOrd = PairOrdwt; this.RelWeightLinearOrd = LinearOrdwt; this.GOImportance = gOImportance2; this.GOWeight = gOWeight2; //factor 4: gene gaps this.GPM = gapSizeDissMapping2; this.GGImportance = gGImportance2; this.GGWeight = gGWeight2; //factor 5: Strandedness this.SSIndividualGenes = individualGenes2; this.SSWholeGroup = wholeGroup2; this.SSRelWeightIndGenes = relWeightIndGenes2; this.SSRelWeightWholeGroup = relWeightWholeGroup2; this.SSImportance = sSImportance2; this.SSWeight = sSWeight2; } //dummy constructor public CustomDissimilarity(){ } // -------- Compute Dissimilarity -------------------------// // -------- General ----------// //Generalized Dice/Jaccard public double GeneralizedDiceOrJaccard(LinkedList<Object> O1, LinkedList<Object> O2, boolean TreatDuplicatesAsUnique, String Type){ //Initialize values double Dissimilarity = 0; double NumIntersecting = 0; double SizeO1; double SizeO2; double SizeUnion = 0; //Hash Sets HashSet<Object> O1Hash = new HashSet<Object>(O1); HashSet<Object> O2Hash = new HashSet<Object>(O2); HashSet<Object> IntersectionHash = new HashSet<Object>(O1); HashSet<Object> UnionHash = new HashSet<Object>(O1); IntersectionHash.retainAll(O2Hash); UnionHash.addAll(O2Hash); if (TreatDuplicatesAsUnique){ SizeO1 = O1.size(); SizeO2 = O2.size(); //Find all intersecting types, and find the number that intersect. for (Object O : IntersectionHash){ NumIntersecting = NumIntersecting + Math.min(Collections.frequency(O1, O), Collections.frequency(O2, O)); } //compute union SizeUnion = SizeO1 + SizeO2 - NumIntersecting; } else { SizeO1 = O1Hash.size(); SizeO2 = O2Hash.size(); NumIntersecting = IntersectionHash.size(); SizeUnion = UnionHash.size(); } //compute dissimilarity from computed size / union / intersection values if (Type.equals("Dice")){ if (!((SizeO1 == 0) && (SizeO2 == 0))){ Dissimilarity = 1-(2*NumIntersecting)/(SizeO1+SizeO2); } else { //divide by zero case Dissimilarity = 0; } } else { //Jaccard if (SizeUnion != 0) { Dissimilarity = 1-(NumIntersecting/SizeUnion); } else { //divide by zero case Dissimilarity = 0; } } //debugging // if (O1.contains("BOP") && O1.contains("Promoter") && // O2.contains("BOP") && O2.contains("Promoter")){ // // System.out.println("O1 Objects:"); // for (Object o : O1){ // System.out.println(o.toString()); // } // System.out.println("O2 Objects:"); // for (Object o : O2){ // System.out.println(o.toString()); // } // // System.out.println("|O1|: " + SizeO1); // System.out.println("|O2|: " + SizeO2); // System.out.println("|O1 A O2|: " + NumIntersecting); // System.out.println("Diss: " + Dissimilarity); // } return Dissimilarity; } //Head position based dissimilarity (GO) public double HeadPosDiss(LinkedList<Object> O1Values, LinkedList<Object> O2Values, boolean isStrandFlipped){ double HeadPosDissimilarity = 0.0; /* * Algorithm: * (1) determine # common elements (dup. unique) * (2) Set first as pivot, head = start of first list * (3) count common positions from second forwards * (4) count common positions from second reversed * (5) retain higher count * (6) diss = 1 - (higher count) / (# common elements [dup unique]) */ //re-sizing - O1 Values must always be larger if (O1Values.size() < O2Values.size()){ LinkedList<Object> Temp = O1Values; O1Values = O2Values; O2Values = Temp; } //(1) common elements int NumIntersecting = 0; HashSet<Object> O2Hash = new HashSet<Object>(O2Values); HashSet<Object> IntersectionHash = new HashSet<Object>(O1Values); IntersectionHash.retainAll(O2Hash); //Find all intersecting types, and find the number that intersect. for (Object O : IntersectionHash){ NumIntersecting = NumIntersecting + Math.min(Collections.frequency(O1Values, O), Collections.frequency(O2Values, O)); } //(2)-(5) Counts int FwdCount = 0; int RevCount = 0; int MaxCount = 0; for (int i = 0; i < O2Values.size(); i++){ if (O2Values.get(i).equals(O1Values.get(i))){ FwdCount++; } if (O2Values.get(i).equals(O1Values.get(O1Values.size()-1-i))){ RevCount++; } } //old way - benefit of doubt //MaxCount = Math.max(FwdCount, RevCount); //new way - based on supplied strand if (isStrandFlipped){ MaxCount = RevCount; } else { MaxCount = FwdCount; } //(6) Compute dissimilarity if (NumIntersecting != 0){ HeadPosDissimilarity = 1 - ((double)MaxCount/(double)NumIntersecting); } else { HeadPosDissimilarity = 0; } return HeadPosDissimilarity; } //Pair ordering based dissimilarity (GO) public double PairOrdDiss(LinkedList<Object> O1Values, LinkedList<Object> O2Values, boolean isStrandFlipped){ double PairOrdDissimilarity = 0; /* * Algorithm: * (1) Build O1 Adjacencies * (2) Build O2 Adjacencies + O2 reverse adjacencies * (3) Count common, and take higher */ //Initialize adjacencies LinkedList<LinkedList<Object>> O1Adjacencies = new LinkedList<LinkedList<Object>>(); LinkedList<LinkedList<Object>> O2AdjacenciesFwd = new LinkedList<LinkedList<Object>>(); LinkedList<LinkedList<Object>> O2AdjacenciesRev = new LinkedList<LinkedList<Object>>(); //Build adjacencies for (int i = 0; i <O1Values.size()-1; i++){ LinkedList<Object> SingleAdjacency = new LinkedList<Object>(); SingleAdjacency.add(O1Values.get(i)); SingleAdjacency.add(O1Values.get(i+1)); O1Adjacencies.add(SingleAdjacency); } for (int i = 0; i <O2Values.size()-1; i++){ LinkedList<Object> FwdAdjacency = new LinkedList<Object>(); FwdAdjacency.add(O2Values.get(i)); FwdAdjacency.add(O2Values.get(i+1)); O2AdjacenciesFwd.add(FwdAdjacency); } for (int i = O2Values.size()-2; i >= 0; i--){ LinkedList<Object> RevAdjacency = new LinkedList<Object>(); RevAdjacency.add(O2Values.get(i+1)); RevAdjacency.add(O2Values.get(i)); O2AdjacenciesRev.add(RevAdjacency); } //find intersection O2AdjacenciesFwd.retainAll(O1Adjacencies); O2AdjacenciesRev.retainAll(O1Adjacencies); //only consider the //compute dissimilarity int SmallerSize = Math.min(O1Values.size(), O2Values.size()); //old way - don't consider pre-determined strandedness //int MostAdjacencies = Math.max(O2AdjacenciesFwd.size(), O2AdjacenciesRev.size()); //new way: only if the strand is explicitly reported as flipped, consider the reverse int MostAdjacencies = -1; if (isStrandFlipped){ MostAdjacencies = O2AdjacenciesRev.size(); } else { MostAdjacencies = O2AdjacenciesFwd.size(); } if (SmallerSize > 1){ PairOrdDissimilarity = 1 - ((double)MostAdjacencies/(double)(SmallerSize-1)); } else { PairOrdDissimilarity = 0; } return PairOrdDissimilarity; } //Dissimilarity by overall shape (tolerate insertions/deletions) (GO) public double LinearOrderDiss(LinkedList<Object> O1Values, LinkedList<Object> O2Values, boolean isStrandFlipped){ /* * Algorithm: * (1) Build O2 in reverse order * (2) Determine list of elements common to both sets, in the order they occur * (3) check for disagreements in ordering * * Current state - duplicates pose problems * if there is an unequal number of duplicates, very hard to compare * rather, just count up until the maximum number of common duplicates * this is likely only a big problem when there are a very different number * of duplicates. Something to think about / worry about for the future. */ /* * Debugging * */ //initialize output double LinearOrderDissimilarity = 0.0; boolean MultipleElementsExist = false; boolean DuplicatesExist = false; //build reverse list LinkedList<Object> O2ValuesRev = new LinkedList<Object>(); for (int i = O2Values.size()-1; i >=0; i--){ O2ValuesRev.add(O2Values.get(i)); } //build hash of counts - determine the number of elements appropriate LinkedHashMap<Object, Integer> CommonValuesCounts = new LinkedHashMap<Object, Integer>(); for (Object O : O1Values){ if (O2Values.contains(O)){ int Count = 1; if (CommonValuesCounts.get(O) != null){ Count = CommonValuesCounts.get(O); Count++; DuplicatesExist = true; } CommonValuesCounts.put(O,Count); } } //a re-ordering can only possibly occur if multiple elements exist. if (CommonValuesCounts.size() > 1){ MultipleElementsExist = true; } //common elements from each set LinkedList<Object> O1CommonValues = new LinkedList<Object>(); LinkedList<Object> O2CommonValues = new LinkedList<Object>(); LinkedList<Object> O2RevCommonValues = new LinkedList<Object>(); //keep track of the number of common values in each list LinkedHashMap<Object, Integer> O1CommonCounts = new LinkedHashMap<Object, Integer>(); LinkedHashMap<Object, Integer> O2CommonCounts = new LinkedHashMap<Object, Integer>(); LinkedHashMap<Object, Integer> O2RevCommonCounts = new LinkedHashMap<Object, Integer>(); //build common elements lists //O1 List for (Object O : O1Values){ if (O2Values.contains(O)){ //check current count with master list, and update hash. int Count = 1; if (O1CommonCounts.get(O) != null){ Count = O1CommonCounts.get(O); Count++; } O1CommonCounts.put(O,Count); //optionally add this to the list. if (Count <= CommonValuesCounts.get(O)){ O1CommonValues.add(O); } } } //O2 List for (Object O : O2Values){ if (O1Values.contains(O)){ //check current count with master list, and update hash. int Count = 1; if (O2CommonCounts.get(O) != null){ Count = O2CommonCounts.get(O); Count++; } O2CommonCounts.put(O,Count); //optionally add this to the list. if (Count <= CommonValuesCounts.get(O)){ O2CommonValues.add(O); } } } //O2Rev List for (Object O : O2ValuesRev){ if (O1Values.contains(O)){ //check current count with master list, and update hash. int Count = 1; if (O2RevCommonCounts.get(O) != null){ Count = O2RevCommonCounts.get(O); Count++; } O2RevCommonCounts.put(O,Count); //optionally add this to the list. if (Count <= CommonValuesCounts.get(O)){ O2RevCommonValues.add(O); } } } // //debugging. // if (O1Values.size() == 3 && O2Values.size() == 3 // && O1Values.contains(15) && O1Values.contains(309) // && O2Values.contains(15) && O2Values.contains(309) // ){ // System.out.println("Breakpoint!"); // } //if the first list agrees with either of the other lists, record no change in gene order. //otherwise, maximum dissimilarity. if (O1CommonValues.equals(O2CommonValues) || (O1CommonValues.equals(O2RevCommonValues) && isStrandFlipped)){ LinearOrderDissimilarity = 0.0; //Tricky balance between 0 transitivity and segregating single-gene commonalities // //Solution: need to eliminate 0 transitivity. //To eliminate single-gene commonalities, //create a better context set. // //if only one element exists in common, need to break 0 transitivities. // if (O1CommonValues.size()==1){ // LinearOrderDissimilarity = 1.0; // } //if there is only one kind of common element, no way to talk about re-ordering if (!MultipleElementsExist){ LinearOrderDissimilarity = 1.0; } } else{ LinearOrderDissimilarity = 1.0; } //return output return LinearOrderDissimilarity; } // ------- Factors -----------// //Common Genes public double CGDissimilarity(LinkedList<GenomicElementAndQueryMatch> O1, LinkedList<GenomicElementAndQueryMatch> O2, String Type){ double Dissimilarity = 0; LinkedList<Object> O1Values = new LinkedList<Object>(); LinkedList<Object> O2Values = new LinkedList<Object>(); //determine appropriate data types if (Type.equals("annotation")){ //add elements for (GenomicElementAndQueryMatch E: O1){ O1Values.add(E.getE().getAnnotation().toUpperCase()); } for (GenomicElementAndQueryMatch E: O2){ O2Values.add(E.getE().getAnnotation().toUpperCase()); } } else { int NegativeCounter = -10; //add elements //if clusterID = 0, this is really probably unique, treat all cluster == 0 as unique sets. for (GenomicElementAndQueryMatch E: O1){ if (E.getE().getClusterID() == 0){ NegativeCounter--; O1Values.add(NegativeCounter); } else { O1Values.add(E.getE().getClusterID()); } } for (GenomicElementAndQueryMatch E: O2){ if (E.getE().getClusterID() == 0){ NegativeCounter--; O2Values.add(NegativeCounter); } else { O2Values.add(E.getE().getClusterID()); } } } //pass into method Dissimilarity = GeneralizedDiceOrJaccard(O1Values,O2Values,CGDuplicatesUnique,CGCompareType); //Maximum dissimilarity is 1, minimum is 0. if (Dissimilarity > 1){ Dissimilarity = 1; } else if (Dissimilarity <= 0){ Dissimilarity = 0; } return Dissimilarity; } //Common Motifs public double CMDissimilarity(LinkedList<GenomicElementAndQueryMatch> O1, LinkedList<GenomicElementAndQueryMatch> O2, String Type){ double Dissimilarity = 0; /* * Approach: Find all common gene pairs, and assess all associated motifs, * using dice or jaccard. Take average of all individual common gene motif-related * dissimilarities. * * In the case of multiple duplicate common genes existing between gene groupings, * compute all possible pairwise dissimilarities between genes, and use the * Munkres-Hungarian algorithm to determine the assignments that minimize the sum * of all dissimilarities. * * If there is only one common element across sets (the most common case), then * the matrix is not very interesting and the Hungarian algorithm just returns the element. */ // //debugging // boolean Breakit = false; // for (GenomicElementAndQueryMatch GandE : O1){ // if (GandE.getE().getStart() == 348280 && // GandE.getE().getStop() == 349017){ // Breakit = true; // } // } // if (Breakit){ // for (GenomicElementAndQueryMatch GandE2 : O2){ // if (GandE2.getE().getStart() == 65555 && // GandE2.getE().getStop() == 66292){ // System.out.println("Breakpoint!"); // } // } // } if (Type.equals("annotation")){ //First, isolate all common types. //initialize lists ArrayList<String> O1Values = new ArrayList<String>(); ArrayList<String> O2Values = new ArrayList<String>(); //add elements for (GenomicElementAndQueryMatch E: O1){ O1Values.add(E.getE().getAnnotation().toUpperCase()); } for (GenomicElementAndQueryMatch E: O2){ O2Values.add(E.getE().getAnnotation().toUpperCase()); } //determine intersection of common genes (by annotation) HashSet<String> O1Hash = new HashSet<String>(O1Values); HashSet<String> O2Hash = new HashSet<String>(O2Values); HashSet<String> Intersection = new HashSet<String>(O1Values); Intersection.retainAll(O2Hash); Intersection.retainAll(O1Hash); //Sum of Dissimilarities double SumDissimilarity = 0; //determine sum of dissimilarities for (String s : Intersection){ //find all instances in each set LinkedList<GenomicElementAndQueryMatch> InstancesIn1 = new LinkedList<GenomicElementAndQueryMatch>(); for (GenomicElementAndQueryMatch E: O1){ if (E.getE().getAnnotation().toUpperCase().equals(s)){ InstancesIn1.add(E); } } LinkedList<GenomicElementAndQueryMatch> InstancesIn2 = new LinkedList<GenomicElementAndQueryMatch>(); for (GenomicElementAndQueryMatch E: O2){ if (E.getE().getAnnotation().toUpperCase().equals(s)){ InstancesIn2.add(E); } } //compute motif dissimilarity of all pairwise, to determine best matching set //double[][] MotifComparisons = new double[InstancesIn1.size()][InstancesIn2.size()]; //make the matrix square, if it needs to be. int MatrixSize = Math.max(InstancesIn1.size(), InstancesIn2.size()); double[][] MotifComparisons = new double[MatrixSize][MatrixSize]; for (int i = 0; i < MatrixSize; i++){ for (int j = 0; j < MatrixSize; j++){ MotifComparisons[i][j] = 0; } } for (int i = 0; i < InstancesIn1.size(); i++){ for (int j = 0; j < InstancesIn2.size(); j++){ //retrieve all motifs LinkedList<Object> MotifsIn1 = InstancesIn1.get(i).getE().getAssociatedMotifsAsObjects(CMMotifNames); LinkedList<Object> MotifsIn2 = InstancesIn2.get(j).getE().getAssociatedMotifsAsObjects(CMMotifNames); //fill in array MotifComparisons[i][j] = this.GeneralizedDiceOrJaccard(MotifsIn1, MotifsIn2, CMDuplicatesUnique, CMCompareType); // if (MotifsIn1.contains("BOP") && MotifsIn1.contains("Promoter") && // MotifsIn2.contains("BOP") && MotifsIn2.contains("Promoter")){ // System.out.println("Diss: " + MotifComparisons[i][j]); // } } } //Call Hungarian algorithm, sum dissimilarities, add to running total SumDissimilarity = SumDissimilarity + (HungarianAlgorithm.JCEAssignment(MotifComparisons) / (double) Math.min(InstancesIn1.size(), InstancesIn2.size())); } if (Intersection.size() > 0){ //Normalize Dissimilarity = SumDissimilarity / Intersection.size(); } } else { //Common Cluster ID //initialize lists ArrayList<Integer> O1Values = new ArrayList<Integer>(); ArrayList<Integer> O2Values = new ArrayList<Integer>(); int NegativeCounter = -10; //add elements //if clusterID = 0, this is really probably unique, treat all cluster == 0 as unique sets. for (GenomicElementAndQueryMatch E: O1){ if (E.getE().getClusterID() == 0){ NegativeCounter--; O1Values.add(NegativeCounter); } else { O1Values.add(E.getE().getClusterID()); } } for (GenomicElementAndQueryMatch E: O2){ if (E.getE().getClusterID() == 0){ NegativeCounter--; O2Values.add(NegativeCounter); } else { O2Values.add(E.getE().getClusterID()); } } //determine intersection of common genes (by cluster ID) HashSet<Integer> O1Hash = new HashSet<Integer>(O1Values); HashSet<Integer> O2Hash = new HashSet<Integer>(O2Values); HashSet<Integer> Intersection = new HashSet<Integer>(O1Values); Intersection.retainAll(O2Hash); Intersection.retainAll(O1Hash); //Sum of Dissimilarities double SumDissimilarity = 0; //determine sum of dissimilarities for (Integer s : Intersection){ //find all instances in each set LinkedList<GenomicElementAndQueryMatch> InstancesIn1 = new LinkedList<GenomicElementAndQueryMatch>(); for (GenomicElementAndQueryMatch E: O1){ if (E.getE().getClusterID() == s){ InstancesIn1.add(E); } } LinkedList<GenomicElementAndQueryMatch> InstancesIn2 = new LinkedList<GenomicElementAndQueryMatch>(); for (GenomicElementAndQueryMatch E: O2){ if (E.getE().getClusterID() == s){ InstancesIn2.add(E); } } //compute motif dissimilarity of all pairwise, to determine best matching set //double[][] MotifComparisons = new double[InstancesIn1.size()][InstancesIn2.size()]; //make the matrix square, if it needs to be. int MatrixSize = Math.max(InstancesIn1.size(), InstancesIn2.size()); double[][] MotifComparisons = new double[MatrixSize][MatrixSize]; for (int i = 0; i < MatrixSize; i++){ for (int j = 0; j < MatrixSize; j++){ MotifComparisons[i][j] = 0; } } for (int i = 0; i < InstancesIn1.size(); i++){ for (int j = 0; j < InstancesIn2.size(); j++){ //retrieve all motifs LinkedList<Object> MotifsIn1 = InstancesIn1.get(i).getE().getAssociatedMotifsAsObjects(CMMotifNames); LinkedList<Object> MotifsIn2 = InstancesIn2.get(j).getE().getAssociatedMotifsAsObjects(CMMotifNames); //fill in array MotifComparisons[i][j] = this.GeneralizedDiceOrJaccard(MotifsIn1, MotifsIn2, CMDuplicatesUnique, CMCompareType); } } // //debugging // for (int i = 0; i < MatrixSize; i++){ // String Line = ""; // for (int j =0; j <MatrixSize; j++){ // Line = Line + " " + MotifComparisons[i][j]; // } // Line = Line + ";"; // System.out.println(Line); // } //Call Hungarian algorithm, sum dissimilarities, add to running total SumDissimilarity = SumDissimilarity + (HungarianAlgorithm.JCEAssignment(MotifComparisons) / (double) Math.min(InstancesIn1.size(), InstancesIn2.size())); } if (Intersection.size() > 0){ //Normalize Dissimilarity = SumDissimilarity / Intersection.size(); } } //adjust dissimilarity before returning if (Dissimilarity > 1){ Dissimilarity = 1; } else if (Dissimilarity < 0){ Dissimilarity = 0; } return Dissimilarity; } //Gene Order public double GODissimilarity(LinkedList<GenomicElementAndQueryMatch> O1, LinkedList<GenomicElementAndQueryMatch> O2, String Type){ //Initialize output double Dissimilarity = 0; double HeadPosDissimilarity = 0; double PairOrdDissimilarity = 0; double LinearOrdDissimilarity = 0; //Strand counts int StrandMatches = 0; //attempt to detect a complete strand switching LinkedList<Object> O1Values = new LinkedList<Object>(); LinkedList<Object> O2Values = new LinkedList<Object>(); //determine appropriate data types if (Type.equals("annotation")){ //add elements for (GenomicElementAndQueryMatch E: O1){ O1Values.add(E.getE().getAnnotation().toUpperCase()); } for (GenomicElementAndQueryMatch E: O2){ O2Values.add(E.getE().getAnnotation().toUpperCase()); } } else { int NegativeCounter = -10; //add elements //if clusterID = 0, this is really probably unique, treat all cluster == 0 as unique sets. for (GenomicElementAndQueryMatch E: O1){ if (E.getE().getClusterID() == 0){ NegativeCounter--; O1Values.add(NegativeCounter); } else { O1Values.add(E.getE().getClusterID()); } } for (GenomicElementAndQueryMatch E: O2){ if (E.getE().getClusterID() == 0){ NegativeCounter--; O2Values.add(NegativeCounter); } else { O2Values.add(E.getE().getClusterID()); } } } //(1) common elements int NumIntersecting = 0; HashSet<Object> O2Hash = new HashSet<Object>(O2Values); HashSet<Object> IntersectionHash = new HashSet<Object>(O1Values); IntersectionHash.retainAll(O2Hash); //Find all intersecting types, and find the number that intersect. for (Object O : IntersectionHash){ NumIntersecting = NumIntersecting + Math.min(Collections.frequency(O1Values, O), Collections.frequency(O2Values, O)); } //determine if the total strand has flipped boolean isStrandFlipped = TotalGroupStrandFlip(O1,O2,Type); //Determine relative weights double TotalRelativeWeights = 0; if (HeadPos){ //increment total weights contribution TotalRelativeWeights = TotalRelativeWeights + RelWeightHeadPos; //Compute head position contribution HeadPosDissimilarity = this.HeadPosDiss(O1Values, O2Values, isStrandFlipped); } if (PairOrd){ //increment total weights contribution TotalRelativeWeights = TotalRelativeWeights + RelWeightPairOrd; //Compute number of common pairs contribution PairOrdDissimilarity = this.PairOrdDiss(O1Values, O2Values, isStrandFlipped); } if (LinearOrd){ //increment total weights contribution TotalRelativeWeights = TotalRelativeWeights + RelWeightLinearOrd; //Compute the dissimilarity from this factor LinearOrdDissimilarity = this.LinearOrderDiss(O1Values, O2Values, isStrandFlipped); } //Amalgamate into dissimilarity Dissimilarity = (RelWeightHeadPos/TotalRelativeWeights) * HeadPosDissimilarity + (RelWeightPairOrd/TotalRelativeWeights) * PairOrdDissimilarity + (RelWeightLinearOrd/TotalRelativeWeights) * LinearOrdDissimilarity; //Maximum dissimilarity is 1, minimum is 0. if (Dissimilarity > 1){ Dissimilarity = 1; } else if (Dissimilarity <= 0){ Dissimilarity = 0; } return Dissimilarity; } //Gene Gaps public double GGDissimilarity(LinkedList<GenomicElementAndQueryMatch> O1, LinkedList<GenomicElementAndQueryMatch> O2, String Type){ // //breakpoint! // for (GenomicElementAndQueryMatch GandE : O1){ // if (GandE.getE().getStart() == 192504 && // GandE.getE().getStop() == 193139){ // System.out.println("breakpoint!"); // } // } //determine if the total strand has flipped boolean isStrandFlipped = TotalGroupStrandFlip(O1,O2,Type); //only for customized DisplayMsgAboutInternalMotif = false; //re-sizing - O1 Values must always be larger if (O1.size() < O2.size()){ LinkedList<GenomicElementAndQueryMatch> Temp = O1; O1 = O2; O2 = Temp; } //Initialize Dissimilarity double Dissimilarity = 0.0; //Initialize adjacencies LinkedList<LinkedList<GenomicElementAndQueryMatch>> O1Adjacencies = new LinkedList<LinkedList<GenomicElementAndQueryMatch>>(); LinkedList<LinkedList<GenomicElementAndQueryMatch>> O2AdjacenciesFwd = new LinkedList<LinkedList<GenomicElementAndQueryMatch>>(); LinkedList<LinkedList<GenomicElementAndQueryMatch>> O2AdjacenciesRev = new LinkedList<LinkedList<GenomicElementAndQueryMatch>>(); //Build adjacencies for (int i = 0; i < O1.size()-1; i++){ LinkedList<GenomicElementAndQueryMatch> SingleAdjacency = new LinkedList<GenomicElementAndQueryMatch>(); SingleAdjacency.add(O1.get(i)); SingleAdjacency.add(O1.get(i+1)); O1Adjacencies.add(SingleAdjacency); } for (int i = 0; i <O2.size()-1; i++){ LinkedList<GenomicElementAndQueryMatch> FwdAdjacency = new LinkedList<GenomicElementAndQueryMatch>(); FwdAdjacency.add(O2.get(i)); FwdAdjacency.add(O2.get(i+1)); O2AdjacenciesFwd.add(FwdAdjacency); } for (int i = O2.size()-2; i >= 0; i--){ LinkedList<GenomicElementAndQueryMatch> RevAdjacency = new LinkedList<GenomicElementAndQueryMatch>(); RevAdjacency.add(O2.get(i+1)); RevAdjacency.add(O2.get(i)); O2AdjacenciesRev.add(RevAdjacency); } //initialize dissimilarities double ForwardDissimilarity = 0; double ReverseDissimilarity = 0; int FwdMatch = 0; int RevMatch = 0; //When multiple homologous pairs present, pick the smallest one. LinkedHashMap<LinkedList<Object>, Integer> FwdAdjHash = new LinkedHashMap<LinkedList<Object>, Integer>(); LinkedHashMap<LinkedList<Object>, Integer> RevAdjHash = new LinkedHashMap<LinkedList<Object>, Integer>(); //Walk along adjacencies. for (LinkedList<GenomicElementAndQueryMatch> Adjacency : O1Adjacencies){ //check forward for (LinkedList<GenomicElementAndQueryMatch> FwdAdj : O2AdjacenciesFwd){ boolean EquivalentAdjacency = false; if (Type.equals("annotation")){ if (Adjacency.get(0).getE().getAnnotation().toUpperCase() .equals(FwdAdj.get(0).getE().getAnnotation().toUpperCase()) && //annotation match Adjacency.get(1).getE().getAnnotation().toUpperCase() .equals(FwdAdj.get(1).getE().getAnnotation().toUpperCase()) && //annotation match Adjacency.get(0).getE().getContig().equals(Adjacency.get(1).getE().getContig()) && //internal contig match FwdAdj.get(0).getE().getContig().equals(FwdAdj.get(1).getE().getContig())){ //internal contig match EquivalentAdjacency = true; FwdMatch++; } } else { if (Adjacency.get(0).getE().getClusterID() == FwdAdj.get(0).getE().getClusterID() && //cluster ID match Adjacency.get(1).getE().getClusterID() == FwdAdj.get(1).getE().getClusterID() && //cluster ID match Adjacency.get(0).getE().getContig().equals(Adjacency.get(1).getE().getContig()) && //internal contig match FwdAdj.get(0).getE().getContig().equals(FwdAdj.get(1).getE().getContig())){ //internal contig match EquivalentAdjacency = true; FwdMatch++; } } //If the two adjacencies are equivalent, compute a gap penalty if (EquivalentAdjacency){ //gap computation int gap1 = Adjacency.get(1).getE().getStart() - Adjacency.get(0).getE().getStop(); int gap2 = FwdAdj.get(1).getE().getStart() - FwdAdj.get(0).getE().getStop(); int gapDiff = Math.abs(gap2-gap1); double gapDissimilarity = 0; //determine gap dissimilarity if (gapDiff > this.GPM.MaxGapLimit){ gapDissimilarity = this.GPM.MaxDissimilarity; //only for customized //if (!DisplayMsgAboutInternalMotif) // DisplayMsgAboutInternalMotif = CheckForInternalPromoter(Adjacency,FwdAdj); } else if (gapDiff > this.GPM.MinGaplimit){ gapDissimilarity = this.GPM.Mapping.get(gapDiff); } //add to running total ForwardDissimilarity = ForwardDissimilarity + gapDissimilarity; //reset adjacency EquivalentAdjacency = false; //add to hash map LinkedList<Object> ObjKey; if (Type.equals("cluster")){ LinkedList<Integer> Key = new LinkedList<Integer>(); Key.add(Adjacency.get(1).getE().getClusterID()); Key.add(Adjacency.get(0).getE().getClusterID()); Collections.sort(Key); ObjKey = new LinkedList<Object>(Key); } else { LinkedList<String> Key = new LinkedList<String>(); Key.add(Adjacency.get(1).getE().getAnnotation().toUpperCase()); Key.add(Adjacency.get(0).getE().getAnnotation().toUpperCase()); Collections.sort(Key); ObjKey = new LinkedList<Object>(Key); } if (FwdAdjHash.get(ObjKey) != null){ int StoredGapDiff = FwdAdjHash.get(ObjKey); if (gapDiff < StoredGapDiff){ StoredGapDiff = gapDiff; } FwdAdjHash.put(ObjKey, StoredGapDiff); } else{ FwdAdjHash.put(ObjKey, gapDiff); } } } //check reverse for (LinkedList<GenomicElementAndQueryMatch> RevAdj : O2AdjacenciesRev){ boolean EquivalentAdjacencyRev = false; if (Type.equals("annotation")){ if (Adjacency.get(0).getE().getAnnotation().toUpperCase() .equals(RevAdj.get(0).getE().getAnnotation().toUpperCase()) && //annotation match Adjacency.get(1).getE().getAnnotation().toUpperCase() .equals(RevAdj.get(1).getE().getAnnotation().toUpperCase()) && //annotation match Adjacency.get(0).getE().getContig().equals(Adjacency.get(1).getE().getContig()) && //internal contig match RevAdj.get(0).getE().getContig().equals(RevAdj.get(1).getE().getContig())){ //internal contig match EquivalentAdjacencyRev = true; RevMatch++; } } else { if (Adjacency.get(0).getE().getClusterID() == RevAdj.get(0).getE().getClusterID() && //cluster ID match Adjacency.get(1).getE().getClusterID() == RevAdj.get(1).getE().getClusterID() && //cluster ID match Adjacency.get(0).getE().getContig().equals(Adjacency.get(1).getE().getContig()) && //internal contig match RevAdj.get(0).getE().getContig().equals(RevAdj.get(1).getE().getContig())){ //internal contig match EquivalentAdjacencyRev = true; RevMatch++; } } //If the two adjacencies are equivalent, compute a gap penalty if (EquivalentAdjacencyRev){ //gap computation int gap1 = Adjacency.get(1).getE().getStart() - Adjacency.get(0).getE().getStop(); int gap2 = RevAdj.get(0).getE().getStart() - RevAdj.get(1).getE().getStop(); int gapDiff = Math.abs(gap2-gap1); double gapDissimilarity = 0; //determine gap dissimilarity if (gapDiff > this.GPM.MaxGapLimit){ gapDissimilarity = this.GPM.MaxDissimilarity; //only for customized //if (!DisplayMsgAboutInternalMotif) // DisplayMsgAboutInternalMotif = CheckForInternalPromoter(Adjacency,RevAdj); } else if (gapDiff > this.GPM.MinGaplimit){ gapDissimilarity = this.GPM.Mapping.get(gapDiff); } //add to running total ReverseDissimilarity = ReverseDissimilarity + gapDissimilarity; //reset adjacency EquivalentAdjacencyRev = false; //add to hash map LinkedList<Object> ObjKey; if (Type.equals("cluster")){ LinkedList<Integer> Key = new LinkedList<Integer>(); Key.add(Adjacency.get(1).getE().getClusterID()); Key.add(Adjacency.get(0).getE().getClusterID()); Collections.sort(Key); ObjKey = new LinkedList<Object>(Key); } else { LinkedList<String> Key = new LinkedList<String>(); Key.add(Adjacency.get(1).getE().getAnnotation().toUpperCase()); Key.add(Adjacency.get(0).getE().getAnnotation().toUpperCase()); Collections.sort(Key); ObjKey = new LinkedList<Object>(Key); } if (RevAdjHash.get(ObjKey) != null){ int StoredGapDiff = RevAdjHash.get(ObjKey); if (gapDiff < StoredGapDiff){ StoredGapDiff = gapDiff; } RevAdjHash.put(ObjKey, StoredGapDiff); } else{ RevAdjHash.put(ObjKey,gapDiff); } } } } //NEW METHOD Jan 13, 2014 //build dissimilarities from hash maps //forward ForwardDissimilarity = 0.0; for (Integer x : FwdAdjHash.values()){ if (x > this.GPM.MaxGapLimit){ ForwardDissimilarity = ForwardDissimilarity + this.GPM.MaxDissimilarity; } else { ForwardDissimilarity = ForwardDissimilarity + this.GPM.Mapping.get(x); } } //reverse ReverseDissimilarity = 0.0; for (Integer x : RevAdjHash.values()){ if (x > this.GPM.MaxGapLimit){ ReverseDissimilarity = ReverseDissimilarity + this.GPM.MaxDissimilarity; } else { ReverseDissimilarity = ReverseDissimilarity + this.GPM.Mapping.get(x); } } //The proper orientation is the one with more common adjacent pairs with O1. // if (FwdMatch >= RevMatch){ // Dissimilarity = ForwardDissimilarity; // } else { // Dissimilarity = ReverseDissimilarity; // } if (!isStrandFlipped){ Dissimilarity = ForwardDissimilarity; } else { Dissimilarity = ReverseDissimilarity; } //Maximum dissimilarity is 1, minimum is 0. if (Dissimilarity > 1){ Dissimilarity = 1; } else if (Dissimilarity <= 0){ Dissimilarity = 0; } //debugging //System.out.println("Dissimilarity: " + Dissimilarity); return Dissimilarity; } //Strandedness public double SSDissimilarity(LinkedList<GenomicElementAndQueryMatch> O1, LinkedList<GenomicElementAndQueryMatch> O2, String Type){ //Initialize output double Dissimilarity = 0; double IndividualDissimilarity = 0; double WholeGroupDissimilarity = 0; double TotalRelativeWeights = 0; //Create special sets for counts LinkedList<GenomicElementAndQueryMatch> O1Counts = new LinkedList<GenomicElementAndQueryMatch>(); LinkedList<GenomicElementAndQueryMatch> O2Counts = new LinkedList<GenomicElementAndQueryMatch>(); //Add elements found in both sets. boolean RetainElement; //from O1 for (GenomicElementAndQueryMatch GandE1 : O1){ RetainElement = false; for (GenomicElementAndQueryMatch GandE2 : O2){ if (Type.equals("annotation")){ if (GandE1.getE().getAnnotation().toUpperCase().equals(GandE2.getE().getAnnotation().toUpperCase())){ RetainElement = true; break; } } else { if (GandE1.getE().getClusterID() == GandE2.getE().getClusterID()){ RetainElement = true; break; } } } //add element elements if (RetainElement){ O1Counts.add(GandE1); } } //from O2 for (GenomicElementAndQueryMatch GandE2 : O2){ RetainElement = false; for (GenomicElementAndQueryMatch GandE1 : O1){ if (GandE2.getE().getAnnotation().toUpperCase().equals(GandE1.getE().getAnnotation().toUpperCase())){ RetainElement = true; break; } else { if (GandE2.getE().getClusterID() == GandE1.getE().getClusterID()){ RetainElement = true; break; } } } //remove elements if (RetainElement){ O2Counts.add(GandE2); } } boolean MatchingElement; int CommonTotal; int CommonSameStrand = 0; int CommonDiffStrand = 0; //Create special sets for strand-matching counts LinkedList<GenomicElementAndQueryMatch> O1CountsM = new LinkedList<GenomicElementAndQueryMatch>(); LinkedList<GenomicElementAndQueryMatch> O2CountsM = new LinkedList<GenomicElementAndQueryMatch>(); LinkedList<GenomicElementAndQueryMatch> O1CountsN = new LinkedList<GenomicElementAndQueryMatch>(); LinkedList<GenomicElementAndQueryMatch> O2CountsN = new LinkedList<GenomicElementAndQueryMatch>(); //sort O1Counts appropriately for (GenomicElementAndQueryMatch GandE1 : O1Counts){ //find a common element in O2Counts that matches, if possible MatchingElement = false; for (GenomicElementAndQueryMatch GandE2 : O2Counts){ if (Type.equals("annotation")){ if (GandE1.getE().getAnnotation().toUpperCase().equals(GandE2.getE().getAnnotation().toUpperCase()) && GandE1.getE().getStrand().equals(GandE2.getE().getStrand()) && !O1CountsM.contains(GandE1)){ MatchingElement = true; break; } } else { if (GandE1.getE().getClusterID() == GandE2.getE().getClusterID() && GandE1.getE().getStrand().equals(GandE2.getE().getStrand()) && !O1CountsM.contains(GandE1)){ MatchingElement = true; break; } } } // Update counts, and remove from list if (MatchingElement){ O1CountsM.add(GandE1); } else { O1CountsN.add(GandE1); } } //sort O1Counts appropriately for (GenomicElementAndQueryMatch GandE2 : O2Counts){ //find a common element in O2Counts that matches, if possible MatchingElement = false; for (GenomicElementAndQueryMatch GandE1 : O1){ if (Type.equals("annotation")){ if (GandE2.getE().getAnnotation().toUpperCase().equals(GandE1.getE().getAnnotation().toUpperCase()) && GandE2.getE().getStrand().equals(GandE1.getE().getStrand()) && !O2CountsM.contains(GandE2)){ MatchingElement = true; break; } } else { if (GandE2.getE().getClusterID() == GandE1.getE().getClusterID() && GandE2.getE().getStrand().equals(GandE1.getE().getStrand()) && !O2CountsM.contains(GandE2)){ MatchingElement = true; break; } } } // Update counts, and remove from list if (MatchingElement){ O2CountsM.add(GandE2); } else { O2CountsN.add(GandE2); } } //Finalize counts CommonSameStrand = O1CountsM.size(); CommonDiffStrand = O1CountsN.size() + O2CountsN.size(); CommonTotal = CommonSameStrand + CommonDiffStrand; //Dissimilarity for individual changes in strandedness if (SSIndividualGenes){ //Jaccard dissimilarity IndividualDissimilarity = 1.0 - (Math.max((double) CommonSameStrand, (double) CommonDiffStrand) / (double)CommonTotal); //increment total weights contribution TotalRelativeWeights = TotalRelativeWeights + SSRelWeightIndGenes; } if (SSWholeGroup) { //whole group is switched if (CommonDiffStrand > CommonSameStrand ){ WholeGroupDissimilarity = 1.0; } else { WholeGroupDissimilarity = 0.0; } //increment total weights contribution TotalRelativeWeights = TotalRelativeWeights + SSRelWeightWholeGroup; } //compute overall dissimilarity Dissimilarity = (SSRelWeightIndGenes/TotalRelativeWeights) * IndividualDissimilarity + (SSRelWeightWholeGroup/TotalRelativeWeights) * WholeGroupDissimilarity; //adjust dissimilarity if (Dissimilarity > 1){ Dissimilarity = 1; } else if (Dissimilarity < 0){ Dissimilarity = 0; } return Dissimilarity; } //determine if the whole group has flipped or not public boolean TotalGroupStrandFlip(LinkedList<GenomicElementAndQueryMatch> O1, LinkedList<GenomicElementAndQueryMatch> O2, String Type){ //Create special sets for counts LinkedList<GenomicElementAndQueryMatch> O1Counts = new LinkedList<GenomicElementAndQueryMatch>(); LinkedList<GenomicElementAndQueryMatch> O2Counts = new LinkedList<GenomicElementAndQueryMatch>(); //Add elements found in both sets. boolean RetainElement; //from O1 for (GenomicElementAndQueryMatch GandE1 : O1){ RetainElement = false; for (GenomicElementAndQueryMatch GandE2 : O2){ if (Type.equals("annotation")){ if (GandE1.getE().getAnnotation().toUpperCase().equals(GandE2.getE().getAnnotation().toUpperCase())){ RetainElement = true; break; } } else { if (GandE1.getE().getClusterID() == GandE2.getE().getClusterID()){ RetainElement = true; break; } } } //add element elements if (RetainElement){ O1Counts.add(GandE1); } } //from O2 for (GenomicElementAndQueryMatch GandE2 : O2){ RetainElement = false; for (GenomicElementAndQueryMatch GandE1 : O1){ if (GandE2.getE().getAnnotation().toUpperCase().equals(GandE1.getE().getAnnotation().toUpperCase())){ RetainElement = true; break; } else { if (GandE2.getE().getClusterID() == GandE1.getE().getClusterID()){ RetainElement = true; break; } } } //remove elements if (RetainElement){ O2Counts.add(GandE2); } } boolean MatchingElement; int CommonSameStrand = 0; int CommonDiffStrand = 0; //Create special sets for strand-matching counts LinkedList<GenomicElementAndQueryMatch> O1CountsM = new LinkedList<GenomicElementAndQueryMatch>(); LinkedList<GenomicElementAndQueryMatch> O2CountsM = new LinkedList<GenomicElementAndQueryMatch>(); LinkedList<GenomicElementAndQueryMatch> O1CountsN = new LinkedList<GenomicElementAndQueryMatch>(); LinkedList<GenomicElementAndQueryMatch> O2CountsN = new LinkedList<GenomicElementAndQueryMatch>(); //sort O1Counts appropriately for (GenomicElementAndQueryMatch GandE1 : O1Counts){ //find a common element in O2Counts that matches, if possible MatchingElement = false; for (GenomicElementAndQueryMatch GandE2 : O2Counts){ if (Type.equals("annotation")){ if (GandE1.getE().getAnnotation().toUpperCase().equals(GandE2.getE().getAnnotation().toUpperCase()) && GandE1.getE().getStrand().equals(GandE2.getE().getStrand()) && !O1CountsM.contains(GandE1)){ MatchingElement = true; break; } } else { if (GandE1.getE().getClusterID() == GandE2.getE().getClusterID() && GandE1.getE().getStrand().equals(GandE2.getE().getStrand()) && !O1CountsM.contains(GandE1)){ MatchingElement = true; break; } } } // Update counts, and remove from list if (MatchingElement){ O1CountsM.add(GandE1); } else { O1CountsN.add(GandE1); } } //sort O1Counts appropriately for (GenomicElementAndQueryMatch GandE2 : O2Counts){ //find a common element in O2Counts that matches, if possible MatchingElement = false; for (GenomicElementAndQueryMatch GandE1 : O1){ if (Type.equals("annotation")){ if (GandE2.getE().getAnnotation().toUpperCase().equals(GandE1.getE().getAnnotation().toUpperCase()) && GandE2.getE().getStrand().equals(GandE1.getE().getStrand()) && !O2CountsM.contains(GandE2)){ MatchingElement = true; break; } } else { if (GandE2.getE().getClusterID() == GandE1.getE().getClusterID() && GandE2.getE().getStrand().equals(GandE1.getE().getStrand()) && !O2CountsM.contains(GandE2)){ MatchingElement = true; break; } } } // Update counts, and remove from list if (MatchingElement){ O2CountsM.add(GandE2); } else { O2CountsN.add(GandE2); } } //Finalize counts CommonSameStrand = O1CountsM.size(); CommonDiffStrand = O1CountsN.size() + O2CountsN.size(); //whole group is switched if (CommonDiffStrand > CommonSameStrand ){ return true; //strand has flipped } else { return false; // strand has not flipped } } //Total Dissimilarity public double TotalDissimilarity(LinkedList<GenomicElementAndQueryMatch> G1, LinkedList<GenomicElementAndQueryMatch> G2, String T){ //refactor, if appropriate //Linear Scale if (AmalgamationType.equals("Linear")){ //determine total weight, to scale by. Double AllProvidedWeights = 0.0; //initialize values Double CGContribution = 0.0; Double CMContribution = 0.0; Double GOContribution = 0.0; Double GGContribution = 0.0; Double SSContribution = 0.0; //Determine Factors if (Factors.contains("CG")){ AllProvidedWeights = AllProvidedWeights + CGWeight; CGContribution = CGDissimilarity(G1,G2,T); } if (Factors.contains("CM")){ //running weights total AllProvidedWeights = AllProvidedWeights + CMWeight; //exclude the head prior to determination if (CMExcludeOperonHead){ //create special dummy lists just to assess the dissimilarity LinkedList<GenomicElementAndQueryMatch> G1mod = new LinkedList<GenomicElementAndQueryMatch>(G1); LinkedList<GenomicElementAndQueryMatch> G2mod = new LinkedList<GenomicElementAndQueryMatch>(G2); //exclude the head of the operon, if appropriate //first list if (G1mod.size() > 1){ if (G1mod.get(0).getE().getStrand().equals(Strand.POSITIVE)){ G1mod.remove(0); } else { G1mod.remove(G1mod.size()-1); } } //second list if (G2mod.size() > 1){ if (G2mod.get(0).getE().getStrand().equals(Strand.POSITIVE)){ G2mod.remove(0); } else { G2mod.remove(G2mod.size()-1); } } CMContribution = CMDissimilarity(G1mod,G2mod,T); } else if (CMExcludeOperonTail){ //create special dummy lists just to assess the dissimilarity LinkedList<GenomicElementAndQueryMatch> G1mod = new LinkedList<GenomicElementAndQueryMatch>(); LinkedList<GenomicElementAndQueryMatch> G2mod = new LinkedList<GenomicElementAndQueryMatch>(); //first list if (G1.size() > 1){ if (G1.get(0).getE().getStrand().equals(Strand.POSITIVE)){ G1mod.add(G1.get(0)); } else { G1mod.add(G1.get(G1.size()-1)); } } //second list //first list if (G2.size() > 1){ if (G2.get(0).getE().getStrand().equals(Strand.POSITIVE)){ G2mod.add(G2.get(0)); } else { G2mod.add(G2.get(G2.size()-1)); } } CMContribution = CMDissimilarity(G1mod,G2mod,T); } else { CMContribution = CMDissimilarity(G1,G2,T); } } if (Factors.contains("GO")){ AllProvidedWeights = AllProvidedWeights + GOWeight; GOContribution = GODissimilarity(G1,G2,T); } if (Factors.contains("GG")){ AllProvidedWeights = AllProvidedWeights + GGWeight; GGContribution = GGDissimilarity(G1,G2,T); } if (Factors.contains("SS")){ AllProvidedWeights = AllProvidedWeights + SSWeight; SSContribution = SSDissimilarity(G1,G2,T); } //weighted average. double TotalContribution = (CGWeight/AllProvidedWeights) * CGContribution + (CMWeight/AllProvidedWeights) * CMContribution + (GOWeight/AllProvidedWeights) * GOContribution + (GGWeight/AllProvidedWeights) * GGContribution + (SSWeight/AllProvidedWeights) * SSContribution; /* * Modification: empty set comparisons should either yield maximum dissimilarity * (empty set versus non-empty set) or maximum similarity (empty set vs. empty set). */ if (G1.size() == 0){ //double empty set case (empty G1, empty G2) if (G2.size() == 0){ TotalContribution = 0.0; //empty set versus non-empty set case I (empty G1, non-empty G2) } else { TotalContribution = 1.0; } } else { //other empty set versus non-empty case II (non-empty G1, empty G2) if (G2.size() == 0){ TotalContribution = 1.0; } } //return modified (if necessary) contribution return TotalContribution; } else { /* * Scale-hierarchy ensures that a dissimilarity contribution of lower importance * never overtakes a contribution of higher importance. * * Individual dissimilarities are computed, and factors are truncated at their maximum * allowed fraction of the next ranking: * * Dissimilarity From Lower Importance * <= (Dissimilarity from Higher Importance * Importance Fraction) */ //initialize values Double TotalContribution = 0.0; Double CGContribution = 0.0; Double CMContribution = 0.0; Double GOContribution = 0.0; Double GGContribution = 0.0; Double SSContribution = 0.0; LinkedList<ImportanceMapping> ImpMapping = new LinkedList<ImportanceMapping>(); //Determine Factors if (Factors.contains("CG")){ CGContribution = CGDissimilarity(G1,G2,T); ImportanceMapping IM = new ImportanceMapping(); IM.FactorType = "CG"; IM.Dissimilarity = CGContribution; IM.Importance = CGImportance; ImpMapping.add(IM); } if (Factors.contains("CM")){ //exclude the head prior to determination if (CMExcludeOperonHead){ //create special dummy lists just to assess the dissimilarity LinkedList<GenomicElementAndQueryMatch> G1mod = new LinkedList<GenomicElementAndQueryMatch>(G1); LinkedList<GenomicElementAndQueryMatch> G2mod = new LinkedList<GenomicElementAndQueryMatch>(G2); //exclude the head of the operon, if appropriate //first list if (G1mod.size() > 1){ if (G1mod.get(0).getE().getStrand().equals(Strand.POSITIVE)){ G1mod.remove(0); } else { G1mod.remove(G1mod.size()-1); } } //second list if (G2mod.size() > 1){ if (G2mod.get(0).getE().getStrand().equals(Strand.POSITIVE)){ G2mod.remove(0); } else { G2mod.remove(G2mod.size()-1); } } CMContribution = CMDissimilarity(G1mod,G2mod,T); } else if (CMExcludeOperonTail){ //create special dummy lists just to assess the dissimilarity LinkedList<GenomicElementAndQueryMatch> G1mod = new LinkedList<GenomicElementAndQueryMatch>(); LinkedList<GenomicElementAndQueryMatch> G2mod = new LinkedList<GenomicElementAndQueryMatch>(); //first list if (G1.size() > 1){ if (G1.get(0).getE().getStrand().equals(Strand.POSITIVE)){ G1mod.add(G1.get(0)); } else { G1mod.add(G1.get(G1.size()-1)); } } //second list //first list if (G2.size() > 1){ if (G2.get(0).getE().getStrand().equals(Strand.POSITIVE)){ G2mod.add(G2.get(0)); } else { G2mod.add(G2.get(G2.size()-1)); } } CMContribution = CMDissimilarity(G1mod,G2mod,T); } else { CMContribution = CMDissimilarity(G1,G2,T); } ImportanceMapping IM = new ImportanceMapping(); IM.FactorType = "CM"; IM.Dissimilarity = CMContribution; IM.Importance = CMImportance; ImpMapping.add(IM); } if (Factors.contains("GO")){ GOContribution = GODissimilarity(G1,G2,T); ImportanceMapping IM = new ImportanceMapping(); IM.FactorType = "GO"; IM.Dissimilarity = GOContribution; IM.Importance = GOImportance; ImpMapping.add(IM); } if (Factors.contains("GG")){ GGContribution = GGDissimilarity(G1,G2,T); ImportanceMapping IM = new ImportanceMapping(); IM.FactorType = "GG"; IM.Dissimilarity = GGContribution; IM.Importance = GGImportance; ImpMapping.add(IM); } if (Factors.contains("SS")){ SSContribution = SSDissimilarity(G1,G2,T); ImportanceMapping IM = new ImportanceMapping(); IM.FactorType = "SS"; IM.Dissimilarity = SSContribution; IM.Importance = SSImportance; ImpMapping.add(IM); } //sort importance mapping Collections.sort(ImpMapping, new IMComparator()); if (ImpMapping.size() > 1){ TotalContribution = ImpMapping.get(0).Dissimilarity; for (int i = 0; i < ImpMapping.size()-1; i++){ //check for different levels if (ImpMapping.get(i).Importance != ImpMapping.get(i+1).Importance){ if (ImpMapping.get(i).Dissimilarity * ImportanceFraction < ImpMapping.get(i+1).Dissimilarity){ ImpMapping.get(i+1).Dissimilarity = ImpMapping.get(i).Dissimilarity * ImportanceFraction; } TotalContribution = TotalContribution + ImpMapping.get(i+1).Dissimilarity; } } //adjust contribution by size TotalContribution = TotalContribution / (double)ImpMapping.size(); } else { TotalContribution = ImpMapping.getFirst().Dissimilarity; } /* * Modification: empty set comparisons should either yield maximum dissimilarity * (empty set versus non-empty set) or maximum similarity (empty set vs. empty set). */ if (G1.size() == 0){ //double empty set case (empty G1, empty G2) if (G2.size() == 0){ TotalContribution = 0.0; //empty set versus non-empty set case I (empty G1, non-empty G2) } else { TotalContribution = 1.0; } } else { //other empty set versus non-empty case II (non-empty G1, empty G2) if (G2.size() == 0){ TotalContribution = 1.0; } } return TotalContribution; } } public String getName() { return Name; } public void setName(String name) { Name = name; } public Double getImportanceFraction() { return ImportanceFraction; } public void setImportanceFraction(Double importanceFraction) { ImportanceFraction = importanceFraction; } // ------- Extra methods -----------// //a customized method for use only for 80 halophiles/operon search stuff. public boolean CheckForInternalPromoter(LinkedList<GenomicElementAndQueryMatch> Adj1, LinkedList<GenomicElementAndQueryMatch> Adj2){ //initialize return value boolean NewInternalMotifInGap = false; //Adj 1 derives from a (+)-stranded operon if (Adj1.get(0).getE().getStrand().equals(Strand.POSITIVE)){ //Adj 2 derives from a (+)-stranded operon if (Adj2.get(0).getE().getStrand().equals(Strand.POSITIVE)){ //Adj1 is the case with the widened gap. if (Adj1.get(1).getE().getStart()-Adj1.get(0).getE().getStop() > Adj2.get(1).getE().getStart()-Adj2.get(0).getE().getStop()){ //The widened gene has more promoter motifs than the non-widened. if (Adj1.get(1).getE().getAssociatedMotifs().size() > Adj2.get(1).getE().getAssociatedMotifs().size()){ NewInternalMotifInGap = true; } //Adj2 is the case with widened gap. } else{ //The widened gene has more promoter motifs than the non-widened. if (Adj1.get(1).getE().getAssociatedMotifs().size() < Adj2.get(1).getE().getAssociatedMotifs().size()){ NewInternalMotifInGap = true; } } //Adj 2 derives from a (-)-stranded operon } else { //Adj1 is the case with the widened gap. if (Adj1.get(1).getE().getStart()-Adj1.get(0).getE().getStop() > Adj2.get(1).getE().getStart()-Adj2.get(0).getE().getStop()){ //The widened gene has more promoter motifs than the non-widened. if (Adj1.get(1).getE().getAssociatedMotifs().size() > Adj2.get(0).getE().getAssociatedMotifs().size()){ NewInternalMotifInGap = true; } //Adj2 is the case with widened gap. } else{ //The widened gene has more promoter motifs than the non-widened. if (Adj1.get(1).getE().getAssociatedMotifs().size() < Adj2.get(0).getE().getAssociatedMotifs().size()){ NewInternalMotifInGap = true; } } } //Adj 1 derives from a (-)-stranded operon } else { //Adj 2 derives from a (+)-stranded operon if (Adj2.get(0).getE().getStrand().equals(Strand.POSITIVE)){ //Adj1 is the case with the widened gap. if (Adj1.get(1).getE().getStart()-Adj1.get(0).getE().getStop() > Adj2.get(1).getE().getStart()-Adj2.get(0).getE().getStop()){ //The widened gene has more promoter motifs than the non-widened. if (Adj1.get(0).getE().getAssociatedMotifs().size() > Adj2.get(1).getE().getAssociatedMotifs().size()){ NewInternalMotifInGap = true; } //Adj2 is the case with widened gap. } else{ //The widened gene has more promoter motifs than the non-widened. if (Adj1.get(0).getE().getAssociatedMotifs().size() < Adj2.get(1).getE().getAssociatedMotifs().size()){ NewInternalMotifInGap = true; } } //Adj 2 derives from a (-)-stranded operon } else { //Adj1 is the case with the widened gap. if (Adj1.get(1).getE().getStart()-Adj1.get(0).getE().getStop() > Adj2.get(1).getE().getStart()-Adj2.get(0).getE().getStop()){ //The widened gene has more promoter motifs than the non-widened. if (Adj1.get(0).getE().getAssociatedMotifs().size() > Adj2.get(0).getE().getAssociatedMotifs().size()){ NewInternalMotifInGap = true; } //Adj2 is the case with widened gap. } else{ //The widened gene has more promoter motifs than the non-widened. if (Adj1.get(0).getE().getAssociatedMotifs().size() < Adj2.get(0).getE().getAssociatedMotifs().size()){ NewInternalMotifInGap = true; } } } } //return the value return NewInternalMotifInGap; } //new class public class ImportanceMapping { public String FactorType; public double Dissimilarity; public int Importance; } public class IMComparator implements Comparator<ImportanceMapping>{ @Override public int compare(ImportanceMapping IM1, ImportanceMapping IM2) { return IM1.Importance - IM2.Importance; } } }