package operonClustering; import genomeObjects.GenomicElementAndQueryMatch; import java.util.ArrayList; import java.util.LinkedList; import org.biojava3.core.sequence.Strand; public class MovingDistancesbyAnnotation implements OperonDissimilarityMeasure { //distance-related parameters double MismatchPenalty = 0.2; int GenomicRearrangementOrInsertion = 200; double GenomicRearrangementOrInsertionPenalty = 0.2; int ProteinBindingSiteChange = 25; double ProteinBindingSiteChangePenalty = 0.05; int TranscriptionUnitChange = 10; double TranscriptionUnitChangePenalty = 0.02; int CloselyPackedGenesDistance = 10; @Override public double computeDissimilarity(LinkedList<GenomicElementAndQueryMatch> O1, LinkedList<GenomicElementAndQueryMatch> O2) { //initialize lists ArrayList<String> O1Annotations = new ArrayList<String>(); ArrayList<String> O2Annotations = new ArrayList<String>(); //add elements for (GenomicElementAndQueryMatch E: O1){ O1Annotations.add(E.getE().getAnnotation().toUpperCase()); } for (GenomicElementAndQueryMatch E: O2){ O2Annotations.add(E.getE().getAnnotation().toUpperCase()); } //Determine numbers of elements at various points double NumIntersecting = 0; double Op1Unique = 0; double Op2Unique = 0; for (int i = 0; i <O1Annotations.size(); i++){ if (O2Annotations.contains(O1Annotations.get(i))){ NumIntersecting++; } else { Op1Unique++; } } for (int i = 0; i < O2Annotations.size(); i++){ if (O1Annotations.contains(O2Annotations.get(i)) == false){ Op2Unique++; } } //only compute distances for cases where the same number of elements are relevant if (Op1Unique == 0 && Op2Unique == 0 && O1.size() == O2.size()){ //determine strandedness of each context set //determine reverse or straight genomic order for O2 int QueryStrandPlusO1 = 0; int QueryStrandMinusO1 = 0; int QueryStrandPlusO2 = 0; int QueryStrandMinusO2 = 0; for (GenomicElementAndQueryMatch e : O1){ if (e.isQueryMatch()){ if (e.getE().getStrand().equals(Strand.POSITIVE)){ QueryStrandPlusO1++; } else { QueryStrandMinusO1++; } } } for (GenomicElementAndQueryMatch e : O2){ if (e.isQueryMatch()){ if (e.getE().getStrand().equals(Strand.POSITIVE)){ QueryStrandPlusO2++; } else { QueryStrandMinusO2++; } } } //if strand = +, put in genomic order //if strand = -, put into reverse genomic order if (QueryStrandPlusO1 >= QueryStrandMinusO1){ for (int i = 0; i < O1.size()-1; i++){ for (int j = 0; j < O1.size()-1; j++){ if (O1.get(j).getE().getStart() > O1.get(j+1).getE().getStart()){ //switch elements GenomicElementAndQueryMatch tempE = O1.get(j+1); O1.set(j+1, O1.get(j)); O1.set(j, tempE); } } } } else { for (int i = 0; i < O1.size()-1; i++){ for (int j = 0; j < O1.size()-1; j++){ if (O1.get(j).getE().getStart() < O1.get(j+1).getE().getStart()){ //switch elements GenomicElementAndQueryMatch tempE = O1.get(j+1); O1.set(j+1, O1.get(j)); O1.set(j, tempE); } } } } //disagreement in strandedness of elements if (QueryStrandPlusO2 >= QueryStrandMinusO2){ for (int i = 0; i < O2.size()-1; i++){ for (int j = 0; j < O2.size()-1; j++){ if (O2.get(j).getE().getStart() > O2.get(j+1).getE().getStart()){ //switch elements GenomicElementAndQueryMatch tempE = O2.get(j+1); O2.set(j+1, O2.get(j)); O2.set(j, tempE); } } } } else { for (int i = 0; i < O2.size()-1; i++){ for (int j = 0; j < O2.size()-1; j++){ if (O2.get(j).getE().getStart() < O2.get(j+1).getE().getStart()){ //switch elements GenomicElementAndQueryMatch tempE = O2.get(j+1); O2.set(j+1, O2.get(j)); O2.set(j, tempE); } } } } //Any inversions among elements? int MisMatches = 0; for (int i = 0; i <O1.size(); i++){ if (O1.get(i).getE().getGeneID() != O2.get(i).getE().getGeneID()){ MisMatches++; } } //if no mismatches, check protein binding site changes if (MisMatches == 0){ double ChangesInDistance = 0.0; for (int i = 0; i <O1.size()-1; i++){ double dist1 = 0.0; double dist2 = 0.0; //make sure to compare the correct quantities - depending on the strandedness of the CS. if (QueryStrandPlusO1 >= QueryStrandMinusO1){ dist1 = Math.abs((double)(O1.get(i+1).getE().getStart()-O1.get(i).getE().getStop())); if (QueryStrandPlusO2 >= QueryStrandMinusO2){ dist2 = Math.abs((double)(O2.get(i+1).getE().getStart()-O2.get(i).getE().getStop())); } else { dist2 = Math.abs((double)(O2.get(i).getE().getStart()-O2.get(i+1).getE().getStop())); } } else { dist1 = Math.abs((double)(O1.get(i).getE().getStart()-O1.get(i+1).getE().getStop())); if (QueryStrandPlusO2 >= QueryStrandMinusO2){ dist2 = Math.abs((double)(O2.get(i+1).getE().getStart()-O2.get(i).getE().getStop())); } else { dist2 = Math.abs((double)(O2.get(i).getE().getStart()-O2.get(i+1).getE().getStop())); } } //large change : genomic rearrangement or insertion event //intermediate: gain/loss of protein binding site/regulatory site //small: transcription unit rearrangement if (Math.abs(dist1-dist2) > GenomicRearrangementOrInsertion){ //large change ChangesInDistance = ChangesInDistance + GenomicRearrangementOrInsertionPenalty; } else if (Math.abs(dist1-dist2) > ProteinBindingSiteChange){ //intermediate change ChangesInDistance = ChangesInDistance + ProteinBindingSiteChangePenalty; } else if (Math.abs(dist1-dist2) > TranscriptionUnitChange && //small change ((dist1 < CloselyPackedGenesDistance) || (dist2 < CloselyPackedGenesDistance))){ ChangesInDistance = ChangesInDistance + TranscriptionUnitChangePenalty; } } if (ChangesInDistance > 1){ return 1; } else { return ChangesInDistance; } } else { //mismatches override dissimilarity differences if (MisMatches * MismatchPenalty > 1){ return 1; } else { return MisMatches * MismatchPenalty; } } } else { //if there is some difference in elements, these are maximally distance from each other return 1; } } }