package genomeObjects;

import haloGUI.GBKFieldMapping;
import java.io.*;
import java.net.URL;
import java.text.Collator;
import java.util.*;
import java.util.Map.Entry;
import javax.swing.JOptionPane;
import org.biojava3.core.sequence.DNASequence;
import org.biojava3.core.sequence.ProteinSequence;
import org.biojava3.core.sequence.RNASequence;
import org.biojava3.core.sequence.Strand;
import org.biojava3.core.sequence.io.FastaReaderHelper;

//A single annotated genome: its genomic elements (genes, signal sequences, ...),
//pre-computed groupings (context sets), and bookkeeping for the backing
//sequence/annotation files. Populated from GFF or GenBank flat files.
public class AnnotatedGenome implements Serializable {

	/**
	 * Serialization version for this class (instances are persisted).
	 */
	private static final long serialVersionUID = -7721895130219179915L;

	//Fields
	private String Genus;                                     //-Biological-organization-------------
	private String Species;                                   //
	private LinkedList<GenomicElement> Elements;              //-Genes, SigSeqs, and groups of genes--
	private LinkedList<MotifGroup> Motifs                     //
			= new LinkedList<MotifGroup>();                   //
	private LinkedList<ContextSet> Groupings
			= new LinkedList<ContextSet>();                   //-Predicted Groupings-----------------
	private File GenomeFile;                                  //-Associated genome file --------------
	private String GenomeSequenceFile;                        // path or URL of the FASTA sequence
	private boolean SeqsFromFile;                             // true: read sequence from disk; false: from URL
	private boolean TryToComputeOperons;
	private LinkedList<String> FeatureIncludeTypes;           //-Types of data worth importing/processing
	private LinkedList<String> FeatureDisplayTypes;
	private boolean AGClustersLoaded = false;                 // true once homology cluster IDs were imported
	private String TextDescription = "";                      //-Info about the genome
	private String GenbankID;
	private Integer LargestCluster = 0;                       // largest homology cluster ID seen so far
	private GBKFieldMapping GFM;                              // which GenBank qualifiers map to which fields
	private LinkedHashMap<String, Integer> ContigEnds = new LinkedHashMap<String, Integer>();

	// ----------------------- Construction ------------------------//

	//Constructor
	public AnnotatedGenome() {
		super();
	}

	//import annotated elements from a .GFF file.
//Parse a tab-separated GFF file into GenomicElements, keeping only the
//feature types listed in FeatureIncludeTypes/FeatureDisplayTypes, and
//collect per-type counts plus the set of contig names for the summary text.
public void importFromGFFFile(String filename){
	//define a null linked list
	LinkedList<GenomicElement> Elements = new LinkedList<GenomicElement>();
	try{
		//import buffered reader
		BufferedReader br = new BufferedReader(new FileReader(filename));
		String Line = null;
		int Counter = 0;   // running element ID (1-based, counts retained + skipped data lines)

		//Information for statistics - type counts
		LinkedHashMap<String, Integer> Counts = new LinkedHashMap<String, Integer>();
		HashSet<String> ContigCount = new HashSet<String>();

		while((Line = br.readLine()) != null){
			//System.out.println(Line);
			//ignore commented lines
			if (!Line.startsWith("#") && !Line.isEmpty()){

				//increment Counter
				Counter++;

				//import each line of the .gff file
				String ImportedLine[] = Line.split("\t");

				//GFF lines must contain at least 9 tab-separated fields
				if (ImportedLine.length < 9){
					throw new Exception();
				}

				//check and see if this element should be retained at all
				//check include types
				boolean RetainElement = false;
				for (String s : this.FeatureIncludeTypes){
					if (ImportedLine[2].trim().contentEquals(s)){
						RetainElement = true;
						break;
					}
				}

				//if this fails, check for display types
				if (!RetainElement){
					for (String s : this.FeatureDisplayTypes){
						if (ImportedLine[2].trim().contentEquals(s)){
							RetainElement = true;
							break;
						}
					}
				}

				//add this element to the list, if necessary
				if (RetainElement){
					//if a line or two are not formatted correctly, just ignore these lines.
					try {
						//create a new element
						GenomicElement E = new GenomicElement();

						//set appropriate fields of this genomic element with inputs achieved from the GFF file
						E.setContig(ImportedLine[0]);
						E.setType(ImportedLine[2]);
						E.setStart(Integer.parseInt(ImportedLine[3]));
						E.setStop(Integer.parseInt(ImportedLine[4]));
						E.setElementID(Counter);
						E.DetermineCenter();

						//Strand (field 7, index 6): accept either numeric form
						//(1 => +, anything else => -) or the standard "+"/"-".
						try {
							if(Integer.parseInt(ImportedLine[6])==1){
								E.setStrand(Strand.POSITIVE);
							}else{
								E.setStrand(Strand.NEGATIVE);
							}
						} catch (Exception ex) {
							if (ImportedLine[6].contentEquals("+")){
								E.setStrand(Strand.POSITIVE);
							} else {
								E.setStrand(Strand.NEGATIVE);
							}
						}

						//set annotation
						E.setAnnotation(ImportedLine[8]);

						//add gene IDs + homology clusters, if available
						//(non-standard extended columns: 10 = cluster ID, 11 = gene ID)
						if (ImportedLine.length > 9){
							int ClustID = Integer.parseInt(ImportedLine[9]);
							E.setClusterID(ClustID);
							if (ClustID > LargestCluster){
								LargestCluster = ClustID;
							}
							this.AGClustersLoaded = true;
							//System.out.println("Set!");
							if (ImportedLine.length > 10){
								E.setGeneID(ImportedLine[10]);
							}
							//System.out.println("Largest: " + LargestCluster);
						}

						//add to list, if it doesn't already exist.
						Elements.add(E);

						//add contig ends: track the largest stop coordinate seen per contig
						if (ContigEnds.get(E.getContig()) != null){
							if (ContigEnds.get(E.getContig()) < E.getStop()){
								ContigEnds.put(E.getContig(), E.getStop());
							}
						} else {
							ContigEnds.put(E.getContig(), E.getStop());
						}

						//Record counts of types
						if (Counts.get(E.getType()) != null){
							int OldCount = Counts.get(E.getType());
							Counts.put(E.getType(),(OldCount+1));
						} else {
							Counts.put(E.getType(), 1);
						}

						//Record counts of contigs
						ContigCount.add(E.getContig());

					} catch (Exception ex) {
						//malformed single line: report and continue with the rest of the file
						ex.printStackTrace();
					}
				}
			}
		}

		//Convert feature counts to string, for display.
//Number of contigs / plasmids / chromosomes TextDescription = "Sequences (" + String.valueOf(ContigCount.size()) + "):\n"; for (String s : ContigCount){ TextDescription = TextDescription + s + "\n"; } //Feature tabulation TextDescription = TextDescription + "\nFeature Types (" + String.valueOf(Counts.values().size()) + "):\n"; for (String s : Counts.keySet()){ TextDescription = TextDescription + s + " (" + String.valueOf(Counts.get(s)) + ")\n"; } //close file stream br.close(); }catch(Exception ex){ //ex.printStackTrace(); //System.out.println("fail!"); //System.exit(1); System.err.println("File format error! Please re-format and try again."); } //sort elements Collections.sort(Elements, new GenomicElementComparator()); //set elements to newly parsed, sorted, and redundancy-filtered elements. this.Elements = removeRedundantElements(Elements); //set elements to the newly parsed elements. //this.Elements = Elements; //System.out.println(LargestCluster); } //import annotated elements from a .GBK file. public void importFromGBKFile(String filename){ //call reader! try { //create a buffered reader to read the sequence file specified by args[0] BufferedReader br = new BufferedReader(new FileReader(filename)); //call the reader! importFromGBKReader(br); } catch (Exception ex) { ex.printStackTrace(); } } //import annotated elements streamed in from .GBK website. public void importFromGBKReader(BufferedReader br){ //Information for statistics - type counts LinkedHashMap<String, Integer> Counts = new LinkedHashMap<String, Integer>(); HashSet<String> ContigCount = new HashSet<String>(); String Line = null; boolean ReadFeatures = false; boolean NewFeature = false; boolean DescriptiveInfo = false; //Fields for genomic features. String ContigName = ""; String TypeName = ""; GenomicElement E = new GenomicElement(); String LocusTag = ""; boolean WritingProduct = false; boolean WritingTranslation = false; boolean InAnIgnoreFeature = false; //define types for import. 
LinkedList<String> Types = new LinkedList<String>(); Types.addAll(FeatureIncludeTypes); Types.addAll(FeatureDisplayTypes); //prepare list for addition Elements = new LinkedList<GenomicElement>(); try { while ((Line = br.readLine()) != null){ //trim the line to remove white space. Line = Line.trim(); //System.out.println(Line); //System.out.println(Line); String[] L = Line.split("\\s+"); //new contig if (Line.startsWith("LOCUS")){ ContigName = L[1]; ContigCount.add(ContigName); try { ContigEnds.put(ContigName, Integer.parseInt(L[2])); } catch (Exception ex){} DescriptiveInfo = true; } //read lines for features if (ReadFeatures){ //check if line is a new feature for (String s : Types){ //System.out.println(s); if (Line.startsWith(s) && !WritingProduct && !WritingTranslation && L[0].equals(s)){ if (L.length == 2){ if (L[1].contains("..")){ NewFeature = true; InAnIgnoreFeature = false; TypeName = s; break; } } } } //line is a new feature if (NewFeature){ //write previous feature if (E != null){ if (E.getType() != null){ Elements.add(E); //Record counts of types if (Counts.get(E.getType()) != null){ int OldCount = Counts.get(E.getType()); Counts.put(E.getType(),(OldCount+1)); } else { Counts.put(E.getType(), 1); } } } //create new feature E = new GenomicElement(); NewFeature = false; //reset switches WritingProduct = false; WritingTranslation = false; //type info E.setType(TypeName); E.setContig(ContigName); //fwd or reverse strand if (L[1].contains("complement")){ //completely assembled or not if (L[1].contains("join")){ //complement(join(729725..730909,730913..731044)) String[] X = ((String) L[1].trim().subSequence(16,L[1].length()-2)).split("\\.."); if (X[0].contains(">") || X[0].contains("<")){ X[0] = X[0].substring(1); } if (X[X.length-1].contains(">") || X[1].contains("<")){ X[X.length-1] = X[X.length-1].substring(1); } E.setStart(Integer.parseInt(X[0])); E.setStop(Integer.parseInt(X[X.length-1])); E.setStrand(Strand.NEGATIVE); E.DetermineCenter(); //no join } 
else { String[] X = ((String) L[1].trim().subSequence(11,L[1].length()-1)).split("\\.."); if (X[0].contains(">") || X[0].contains("<")){ X[0] = X[0].substring(1); } if (X[1].contains(">") || X[1].contains("<")){ X[1] = X[1].substring(1); } //Start and stop E.setStart(Integer.parseInt(X[0])); E.setStop(Integer.parseInt(X[1])); E.setStrand(Strand.NEGATIVE); E.DetermineCenter(); } } else { //join if (L[1].contains("join")){ String[] X = ((String) L[1].trim().subSequence(5,L[1].length()-1)).split("\\.."); if (X[0].contains(">") || X[0].contains("<")){ X[0] = X[0].substring(1); } if (X[X.length-1].contains(">") || X[X.length-1].contains("<")){ X[X.length-1] = X[X.length-1].substring(1); } E.setStart(Integer.parseInt(X[0])); E.setStop(Integer.parseInt(X[X.length-1])); E.setStrand(Strand.POSITIVE); E.DetermineCenter(); //no join } else { String[] X = L[1].trim().split("\\.."); if (X[0].contains(">") || X[0].contains("<")){ X[0] = X[0].substring(1); } if (X[1].contains(">") || X[1].contains("<")){ X[1] = X[X.length-1].substring(1); } E.setStart(Integer.parseInt(X[0])); E.setStop(Integer.parseInt(X[1])); E.setStrand(Strand.POSITIVE); E.DetermineCenter(); } } //line is not a new feature } else { //not a new feature NewFeature = false; //Is this feature one to be ignored? if (L.length == 2){ if (L[1].contains("..")){ InAnIgnoreFeature = true; } } } //add to an existing feature if (!NewFeature && !InAnIgnoreFeature){ //check if currently writing things, first if(WritingProduct){ //add the current line. String UpdatedAnnotation = E.getAnnotation() + " " + Line; E.setAnnotation(UpdatedAnnotation); //if a quotation mark is the last character, this is the end of writing product. 
						if (Line.substring(Line.length()-1).equals("\"")){
							WritingProduct = false;
						}

					} else if (WritingTranslation){

						//last line in translation: drop the closing quote
						if (Line.substring(Line.length()-1).equals("\"")){
							String UpdatedTranslation = E.getTranslation() + Line.substring(0,Line.length()-1);
							E.setTranslation(UpdatedTranslation);
							WritingTranslation = false;
						} else {
							String UpdatedTranslation = E.getTranslation() + Line;
							E.setTranslation(UpdatedTranslation);
						}

					//not writing anything - possibly open things up
					} else {

						//start product (qualifier tag configured in GFM.Annotation)
						if (L[0].startsWith(GFM.Annotation)){
							WritingProduct = true;
							E.setAnnotation(Line.substring(1));

							//if a quotation mark is the last character, this is the end of writing product.
							if (Line.substring(Line.length()-1).equals("\"")){
								WritingProduct = false;
							}

						//start translation ("/translation=\"" prefix is 14 characters)
						} else if (GFM.GetTranslation && L[0].startsWith("/translation=")){
							WritingTranslation = true;

							//short translation - ends in quote (single-line value)
							if (Line.substring(Line.length()-1).equals("\"")){
								E.setTranslation((String) Line.substring(14, Line.length()-1));
								WritingTranslation = false;

							//normal translation - extends multiple lines
							} else {
								E.setTranslation(Line.substring(14));
								WritingTranslation = true;
							}

						//attempt to parse cluster tag: first whitespace-separated token that
						//starts with "COG"; the numeric suffix becomes the cluster ID.
						} else if (GFM.GetCluster && L[0].startsWith(GFM.GetClusterTag)){
							String Info = Line.substring(GFM.GetClusterTag.length());
							Info = Info.replaceAll("\"", "");
							String[] InfoSplit = Info.split("\\s+");
							for (String s : InfoSplit){
								if (s.startsWith("COG")){
									try{
										E.setClusterID(Integer.parseInt(s.substring(3)));
										break;
									}catch (Exception ex){}
								}
							}

						//add gene ID (qualifier tag configured in GFM.GeneID), quotes stripped
						} else if (L[0].startsWith(GFM.GeneID)){
							try {
								String GIDNoQuotes = Line.substring(GFM.GeneID.length()).replaceAll("\"", "");
								E.setGeneID(GIDNoQuotes);
							} catch (Exception ex) {
							}
						}
					}
				}
			} else {
				if (DescriptiveInfo){
					//Add introductory info to the text description.
					if (!(TextDescription).equals("")){
						TextDescription = TextDescription + "\n" + Line;
					}
					else{
						TextDescription = Line;
					}
				}
			}

			//turn on feature-reading
			if (Line.startsWith("FEATURES")){
				ReadFeatures = true;
			}

			//turn off feature-reading
			if (Line.startsWith("BASE COUNT")){
				DescriptiveInfo = false;
				ReadFeatures = false;
			}
		}

		//add last element.
		if (!Elements.contains(E)){
			Elements.add(E);
		}

	} catch (NumberFormatException e) {
		e.printStackTrace();
	} catch (IOException e) {
		e.printStackTrace();
	}

	//Convert feature counts to string, for display.

	//Number of contigs / plasmids / chromosomes
	TextDescription = TextDescription +"\n\nSequences (" + String.valueOf(ContigCount.size()) + "):\n";
	for (String s : ContigCount){
		TextDescription = TextDescription + s + "\n";
	}

	//Feature tabulation
	TextDescription = TextDescription + "\nFeature Types (" + String.valueOf(Counts.values().size()) + "):\n";
	for (String s : Counts.keySet()){
		TextDescription = TextDescription + s + " (" + String.valueOf(Counts.get(s)) + ")\n";
	}

	//close opened stream.
	try {
		br.close();
	} catch (Exception ex) {
		//ex.printStackTrace();
	}

	//sort elements
	Collections.sort(Elements, new GenomicElementComparator());

	//remove redundant elements
	this.Elements = removeRedundantElements(Elements);
}

//remove redundant elements
//Collapses runs of consecutive duplicates (same contig/start/stop/strand/type)
//in an already-sorted list, keeping one representative per run.
public LinkedList<GenomicElement> removeRedundantElements(LinkedList<GenomicElement> InitialElements){

	//Initialize output
	LinkedList<GenomicElement> OutputElements = new LinkedList<GenomicElement>();

	if (InitialElements.size() > 1){

		//initial comparison element
		GenomicElement ECompare = InitialElements.get(0);

		//check all remaining (move through list)
		for (int i = 1; i < InitialElements.size(); i++){

			//a new element for comparison
			GenomicElement ENew = InitialElements.get(i);

			//if all of these are similar, no need to write this element.
if (ECompare.getContig().equals(ENew.getContig()) && ECompare.getStart() == ENew.getStart() && ECompare.getStop() == ENew.getStop() && ECompare.getStrand().equals(ENew.getStrand()) && ECompare.getType().equals(ENew.getType())){ } else { OutputElements.add(ECompare); } ECompare = ENew; } //last gene in file GenomicElement ENew = OutputElements.getLast(); //if all of these are similar, no need to write this element. if (ECompare.getContig().equals(ENew.getContig()) && ECompare.getStart() == ENew.getStart() && ECompare.getStop() == ENew.getStop() && ECompare.getStrand().equals(ENew.getStrand()) && ECompare.getType().equals(ENew.getType())){ } else { OutputElements.add(ECompare); } } //return with new set. return OutputElements; } //----------------------- add cluster number -----------------------// //Organism - Gene Name - Cluster Number [OR] Gene Name - Cluster Number [OR] Gene Name public void addClusterNumber(String Annotation, int Clusternumber){ for (GenomicElement E : Elements){ if (E.getAnnotation().toUpperCase().contains(Annotation.toUpperCase().trim())){ E.setClusterID(Clusternumber); } } } //Organism - Contig - Gene Name - Cluster Number public void addClusterNumber(String Contig, String Annotation, int Clusternumber){ for (GenomicElement E : Elements){ if (E.getContig().contentEquals(Contig) && E.getAnnotation().toUpperCase().contains(Annotation.toUpperCase().trim())){ E.setClusterID(Clusternumber); } } } //Organism - Contig - Gene Start - Gene Stop - Cluster Number public void addClusterNumber(String Contig, int Start, int Stop, int Clusternumber){ for (GenomicElement E : Elements){ if (E.getContig().contentEquals(Contig) && E.getStart() == Start && E.getStop() == Stop){ E.setClusterID(Clusternumber); break; } } } //----------------------- Context Set computation ------------------// //single gene context set public void MakeSingleGeneContextSet(String CSName){ //initialize a new context set ContextSet CS = new ContextSet(CSName, "SingleGene"); 
	CS.setPreProcessed(true);
	HashMap<Integer, LinkedList<GenomicElement>> csmap = new HashMap<Integer, LinkedList<GenomicElement>>();

	//iterate through all elements, add each to single-gene context set
	int Counter = 0;
	for (GenomicElement E : this.Elements){
		Counter++;
		LinkedList<GenomicElement> L = new LinkedList<GenomicElement>();
		L.add(E);
		csmap.put(Counter, L);
	}

	//add completed hash map to context set object
	CS.setContextMapping(csmap);

	//add this new context set to the Groupings field.
	if (Groupings == null){
		Groupings = new LinkedList<ContextSet>();
	}
	this.Groupings.add(CS);
}

//Writes one operon-membership file per intergenic-distance threshold (0..300 nt),
//one operon per line as space-separated gene IDs (trailing "m" stripped).
//NOTE(review): output directory is a hard-coded absolute user path — this is a
//developer/analysis utility, not portable as written; consider parameterizing.
public void generateOperonReports(){

	String dir = "/Users/phillipseitzer/UCDavis/OperonEvolutionInHalophiles/NRC1-distance-vs-transcriptomics/OperonStats";

	//generate context sets
	String nameStem = "Dist-";
	for (int i = 0; i <= 300; i++){
		String name = nameStem +String.valueOf(i);
		ContextSet CS = ComputeContextSet(name, i, true);

		String fileName = dir + "/" + name;
		try {
			BufferedWriter bw = new BufferedWriter(new FileWriter(fileName));
			for (LinkedList<GenomicElement> operon : CS.getContextMapping().values()){
				StringBuilder sb = new StringBuilder();
				for (GenomicElement ge : operon){
					String geneId = ge.getGeneID();
					//drop a trailing "m" marker from the gene ID, if present
					if (geneId.endsWith("m")){
						geneId = geneId.substring(0, geneId.length()-1);
					}
					sb.append(geneId);
					sb.append(" ");
				}
				sb.append("\n");
				bw.write(sb.toString());
			}
			bw.close();
		} catch (IOException e) {
			e.printStackTrace();
		}

		System.out.println("Computed context set " + i + "/300.");
	}
}

//estimate contexts based on distance
//Groups consecutive valid elements into "operons": a new group starts at a
//contig change, an excessive intergenic gap (> tolerance nt), or (optionally)
//a strand change. Assumes this.Elements is sorted.
public ContextSet ComputeContextSet(String CSName, int tolerance, boolean RequireSameStrain){

	//initialize a new context set
	ContextSet CS = new ContextSet(CSName, "IntergenicDist");
	CS.setPreProcessed(true);
	HashMap<Integer, LinkedList<GenomicElement>> csmap = new HashMap<Integer, LinkedList<GenomicElement>>();

	// start counter, initialize each operon (as a LL).
	int OperonCounter = 1;
	LinkedList<GenomicElement> LL = new LinkedList<GenomicElement>();

	//examine elements, and put into operons
	//this method assumes that the elements are in order
	for (int i=0; i < Elements.size()-1; i++){

		//check against user-defined set of valid types
		boolean ElementIsValid = false;
		for (String s : this.FeatureIncludeTypes){
			if (Elements.get(i).getType().contentEquals(s)){
				ElementIsValid = true;
				break;
			}
		}

		//require valid type
//		if (Elements.get(i).getType().contentEquals("CDS") ||
//				Elements.get(i).getType().contentEquals("tRNA") ||
//				Elements.get(i).getType().contentEquals("rRNA")){
		if (ElementIsValid){

			//if the element is valid, place into an operon.
			//Comment: technically, a pointer to the element
			LL.add(Elements.get(i));

			//find the next valid type in the list
			boolean validType = false;
			int NextValid = i+1;

			//discover the next valid element in the Elements field.
			while(validType == false){

				//determine if next element is valid (should be included)
				boolean NextElementIsValid = false;
				for (String s : this.FeatureIncludeTypes){
					if (Elements.get(NextValid).getType().contentEquals(s)){
						NextElementIsValid = true;
						break;
					}
				}

				//case: next element is valid
				if (NextElementIsValid){
					validType = true;
				}else if (NextValid < Elements.size()-1) {
					// case: next element is not valid, look further in file
					NextValid++;
				} else {
					//case: there are no more valid elements in the file
					NextValid = -1;
					validType = true;
				}
			}

			//Assuming that there are valid elements to compare against,
			if (NextValid != -1){

				//next element is in a new operon if any of the following are true:
				//(1) different strand, (2) different contig, (3) too far away from current element
				boolean newOperon = false;

				//Comparison blocks - may or may not require the same strain
				//(NOTE(review): "Strain" here evidently means *strand* — confirm)
				if (RequireSameStrain == true) {

					if (Elements.get(i).getStrand() == Strand.POSITIVE){
						if ((Elements.get(NextValid).getStrand() == Strand.NEGATIVE) ||
								(Elements.get(i).getContig().contentEquals(Elements.get(NextValid).getContig())==false) ||
								(Elements.get(NextValid).getStrand() == Strand.POSITIVE &&
								Elements.get(i).getContig().contentEquals(Elements.get(NextValid).getContig()) &&
								Elements.get(NextValid).getStart()-Elements.get(i).getStop() > tolerance)) {

							//strand flip, contig change, or same-strand gap beyond tolerance
							newOperon = true;
						}
					} else {
						if ((Elements.get(NextValid).getStrand() == Strand.POSITIVE) ||
								(Elements.get(i).getContig().contentEquals(Elements.get(NextValid).getContig())==false) ||
								(Elements.get(NextValid).getStrand() == Strand.NEGATIVE &&
								Elements.get(i).getContig().contentEquals(Elements.get(NextValid).getContig()) &&
								Elements.get(NextValid).getStart()-Elements.get(i).getStop() > tolerance)) {

							newOperon = true;
						}
					}

				} else {

					//Only compare contig names and distance, when not considering strain.
					if ((Elements.get(i).getContig().contentEquals(Elements.get(NextValid).getContig())==false) ||
							(Elements.get(i).getContig().contentEquals(Elements.get(NextValid).getContig()) &&
							Elements.get(NextValid).getStart()-Elements.get(i).getStop() > tolerance)) {

						newOperon = true;
					}
				}

				//if the next valid element defines a new operon:
				// store the old operon, reset the LL, increment the operon counter.
				if (newOperon == true){
					csmap.put(OperonCounter, LL);
					LL = new LinkedList<GenomicElement>();
					OperonCounter++;
				}

			//Last element in the file
			} else {

				//place element into an operon, and store the operon in the hash map.
				LL.add(Elements.get(i));
				csmap.put(OperonCounter,LL);
			}
		}
	}

	//add completed hash map
	CS.setContextMapping(csmap);

	//add this new context set to the Groupings field.
	if (Groupings == null){
		Groupings = new LinkedList<ContextSet>();
	}
	this.Groupings.add(CS);

	return CS;
}

//add pre-computed contexts from file
//Each line: contig <TAB> start <TAB> stop <TAB> groupID; groupID 0 means
//"not in any group". Elements are matched back to this.Elements by exact
//contig/start/stop. On a parse failure the user is notified via a dialog.
//NOTE(review): the BufferedReader is never closed on either path — leaks a
//file handle; consider try-with-resources.
public void ImportContextSet(String CSName, String fileName) {

	this.TryToComputeOperons = true;

	try{
		//import buffered reader
		BufferedReader br = new BufferedReader(new FileReader(fileName));
		String Line = null;

		//initialize a new context set
		ContextSet CS = new ContextSet(CSName, "Loaded");
		CS.setPreProcessed(true);
		LinkedHashMap<Integer, LinkedList<GenomicElement>> CSMap = new LinkedHashMap<Integer, LinkedList<GenomicElement>>();

		while((Line = br.readLine()) != null){

			//import line
			String ImportedLine[] = Line.split("\t");

			//if the ID is 0, then skip this entry entirely and move on the next one.
			int Key = Integer.parseInt(ImportedLine[3]);
			if (Key != 0){

				//create new list, if it doesn't already exist
				if (CSMap.get(Key) == null){
					CSMap.put(Key, new LinkedList<GenomicElement>());
				}

				//search through genomes to find the correct element, add to list
				for (GenomicElement e : this.Elements){
					if (e.getContig().equals(ImportedLine[0]) &&
							e.getStart() == Integer.parseInt(ImportedLine[1]) &&
							e.getStop() == Integer.parseInt(ImportedLine[2])){
						CSMap.get(Key).add(e);
						break;
					}
				}
			}
		}

		//add completed mapping to context set
		CS.setContextMapping(CSMap);

		//add this context set to existing context sets.
		if (this.Groupings == null){
			Groupings = new LinkedList<ContextSet>();
		}
		Groupings.add(CS);

	} catch (Exception ex) {
		this.TryToComputeOperons = false;
		String Message = "The Genome Context File " + "\n" + fileName + "\n" + "was improperly formatted. Please re-format this file and try again.";
		JOptionPane.showMessageDialog(null, Message, "Invalid File Format", JOptionPane.ERROR_MESSAGE);
	}
}

//adjust a context set
//Adds the element at (ContigName, Start, Stop) to group Key of the named
//context set, creating the context set and/or the group list on demand.
public void AdjustContextSet(String CSName, String ContigName, int Start, int Stop, int Key){

	ContextSet CS = null;
	boolean AddCStoGroups = true;

	//Find the context set
	for (ContextSet CS1 : Groupings){
		if (CS1.getName().equals(CSName)){
			CS = CS1;
			AddCStoGroups = false;
			break;
		}
	}

	//create it if it doesn't yet exist
	if (CS == null){
		CS = new ContextSet();
		CS.setPreProcessed(true);
		CS.setName(CSName);
	}

	//Retrieve existing mapping, or create new one
	HashMap<Integer, LinkedList<GenomicElement>> CSMap = null;
	if (CS.getContextMapping() != null){
		CSMap = CS.getContextMapping();
	} else {
		CSMap = new HashMap<Integer, LinkedList<GenomicElement>>();
	}

	//add element
	//find appropriate elements
	for (GenomicElement E : Elements){

		//match start, stop, and contig
		if (E.getContig().equals(ContigName) &&
				E.getStart() == Start &&
				E.getStop() == Stop){

			//update list
			if (CSMap.get(Key) != null){
				//if the CSMap already contains the element, don't add again
				if (!CSMap.get(Key).contains(E)){
					CSMap.get(Key).add(E);
				}
			} else {
				LinkedList<GenomicElement> List = new LinkedList<GenomicElement>();
				List.add(E);
				CSMap.put(Key, List);
			}

			//break out of loop
			break;
		}
	}

	//update the hash map
	CS.setContextMapping(CSMap);

	//add the CS to the groupings, if it's brand new.
	if (AddCStoGroups){
		Groupings.add(CS);
	}
}

//----------------------- Sorting ------------------------//

//sort genomic elements by (1) contig name, and within contigs, (2) start position.
//Orders elements by contig name (case-insensitive), then by center coordinate.
public class GenomicElementComparator implements Comparator<GenomicElement> {
	public int compare(GenomicElement E1, GenomicElement E2) {
		int nameCompare = E1.getContig().compareToIgnoreCase(E2.getContig());
		if (nameCompare != 0) {
			return nameCompare;
		} else {
			//return Integer.valueOf(E1.getStart()).compareTo(Integer.valueOf(E2.getStart()));
			return Integer.valueOf(E1.getCenter()).compareTo(Integer.valueOf(E2.getCenter()));
		}
	}
}

//Same ordering as GenomicElementComparator, for element+query-match wrappers.
public static class SortGandEByElements implements Comparator<GenomicElementAndQueryMatch> {
	@Override
	public int compare(GenomicElementAndQueryMatch GandE1, GenomicElementAndQueryMatch GandE2) {
		int nameCompare = GandE1.getE().getContig().compareToIgnoreCase(GandE2.getE().getContig());
		if (nameCompare != 0) {
			return nameCompare;
		} else {
			return Integer.valueOf(GandE1.getE().getCenter())
					.compareTo(Integer.valueOf(GandE2.getE().getCenter()));
		}
	}
}

// ----------------------- Export + Sequence -----------------------//

//return DNA sequence from a .fasta file (by streaming file in)
//Streams the FASTA source (local file or URL per SeqsFromFile) until the
//requested contig header is found, then extracts the 1-based inclusive
//[start, stop] window; reverse-complements for the negative strand.
public String DNASequence(String contig, int start, int stop, Strand strand){

	//initialize and instantiate variable
	String seq = null;

	//stream in until the appropriate line is discovered.
	try {
		BufferedReader br = null;

		//read from file
		if (SeqsFromFile){
			br = new BufferedReader(new FileReader(GenomeSequenceFile));
		} else {
			//read from website
			URL SeqFile = new URL(GenomeSequenceFile);
			InputStream is = SeqFile.openStream();
			br = new BufferedReader(new InputStreamReader(is));
		}

		String Line = null;
		boolean ThisContig = false;       // inside the requested contig's sequence block
		boolean StartedSequence = false;  // have passed the start coordinate
		int ContigSeqBlock = 0;           // total sequence chars consumed before the current line

		while ((Line = br.readLine()) != null){

			//header
			if (Line.startsWith(">")){
				if (Line.contains(contig)){
					ThisContig = true;
				}

			//sequence
			} else if (ThisContig){

				/*
				 * (1) First coordinate in line = ContigSeqBlock + 1
				 * (2)
				 */

				//available coordinate range featured in this line.
				int StartLine = ContigSeqBlock + 1;
				int StopLine = ContigSeqBlock + Line.length();

				//(1) Check for start coordinate, if appropriate
				if (StartedSequence){
					if (stop < StopLine){
						//the line ends in this line - recover sequence + exit.
						seq = seq + (String) Line.subSequence(0, stop - ContigSeqBlock);
						break;
					} else{
						//write all sequence and proceed.
						seq = seq + Line;
					}
				} else {
					//the line starts here. Record the appropriate place.
					if (start >= StartLine && start <= StopLine){
						StartedSequence = true;

						//the string also ends in this line. - recover sequence + exit
						if (stop <= StopLine){
							seq = (String) Line.subSequence((start-1)-ContigSeqBlock,stop-ContigSeqBlock);
							break;

						//start sequence, end later
						} else {
							seq = (String) Line.substring((start-1)-ContigSeqBlock);
						}
					}
				}

				//update the tally of all previous sequence
				ContigSeqBlock = StopLine;
			}
		}

		//close file stream
		br.close();

		//flip, if appropriate
		if (strand.equals(Strand.NEGATIVE)){
			DNASequence d = new DNASequence(seq);
			seq = d.getReverseComplement().getSequenceAsString();
		}

		//return all sequence as upper case string.
seq = seq.toUpperCase(); } catch (Exception ex){ ex.printStackTrace(); } //return statement return seq; } //Export a GFF file with gene IDs + cluster IDs (if applicable) public void ExportExtendedGFFFile(String FileName){ try { //filewriter BufferedWriter bw = new BufferedWriter(new FileWriter(FileName)); String Line; String TheStrand; for (GenomicElement E : this.Elements){ if (E.getStrand().equals(Strand.POSITIVE)){ TheStrand = "1"; } else { TheStrand = "-1"; } //build line Line = E.getContig() + "\tGenBank\t" + String.valueOf(E.getType()) + "\t" + String.valueOf(E.getStart()) + "\t" + String.valueOf(E.getStop()) + "\t+\t" + TheStrand + "\t.\t" + E.getAnnotation() + "\t" + String.valueOf(E.getClusterID()); //possibly add homology cluster if (E.getGeneID() != ""){ Line = Line + "\t" + E.getGeneID(); } Line = Line + "\n"; bw.write(Line); bw.flush(); } bw.close(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } //----------------------- Search/Retrieval ------------------------// //preprocessed == true //return a hashset of gene groupings - annotation public HashSet<LinkedList<GenomicElementAndQueryMatch>> AnnotationMatches(String[] query, String ContextSetName){ //initialize ContextSet CS = new ContextSet(); //determine the correct context set, and make a copy for (ContextSet selectCS : Groupings){ if (selectCS.getName().equals(ContextSetName)){ CS = selectCS; break; } } //System.out.println(this.Species + " " + CS.getName()); //create a tree set to contain individual element matches HashSet<LinkedList<GenomicElementAndQueryMatch>> Hits = new HashSet<LinkedList<GenomicElementAndQueryMatch>>(); boolean AddtheSet; //determine all matches for (LinkedList<GenomicElement> LL: CS.getContextMapping().values()){ //default: do not add the set AddtheSet = false; //initialize the list LinkedList<GenomicElementAndQueryMatch> TheList = new LinkedList<GenomicElementAndQueryMatch>(); //search for all direct matches, and mark them for (int i = 
0; i < LL.size(); i++){ //initialize a new GenomicElementAndQueryMatch GenomicElementAndQueryMatch GandE = new GenomicElementAndQueryMatch(); GandE.setE(LL.get(i)); //check each query for (int j = 0; j < query.length; j++){ //check annotation first if (LL.get(i).getAnnotation().toUpperCase().contains(query[j].trim().toUpperCase())){ AddtheSet = true; GandE.setQueryMatch(true); //next, check gene IDs } else if (LL.get(i).getGeneID().toUpperCase().equals(query[j].trim().toUpperCase())){ AddtheSet = true; GandE.setQueryMatch(true); // no match! } else { GandE.setQueryMatch(false); } } //add this element to the list TheList.add(GandE); } //if even one match was discovered in an LL, add the whole LL. if (AddtheSet == true){ Hits.add(TheList); } } //return HashSet return Hits; } //return a hashset of gene groupings - homology cluster public HashSet<LinkedList<GenomicElementAndQueryMatch>> ClusterMatches(int[] ClusterNumber, String ContextSetName){ //initialize ContextSet CS = new ContextSet(); //determine the correct context set for (ContextSet selectCS : Groupings){ if (selectCS.getName().equals(ContextSetName)){ CS = selectCS; break; } } //create a tree set to contain individual element matches HashSet<LinkedList<GenomicElementAndQueryMatch>> Hits = new HashSet<LinkedList<GenomicElementAndQueryMatch>>(); boolean AddtheSet; //determine all matches for (LinkedList<GenomicElement> LL: CS.getContextMapping().values()){ //initialize the list LinkedList<GenomicElementAndQueryMatch> TheList = new LinkedList<GenomicElementAndQueryMatch>(); //reset value to false AddtheSet = false; //search for all direct matches, and mark them for (int i = 0; i < LL.size(); i++){ //initialize a new GenomicElementAndQueryMatch GenomicElementAndQueryMatch GandE = new GenomicElementAndQueryMatch(); GandE.setE(LL.get(i)); //defaults: do not take GandE.setQueryMatch(false); //check every cluster number, for query match for (int j = 0; j < ClusterNumber.length; j++){ if 
(LL.get(i).getClusterID()==ClusterNumber[j]){
					AddtheSet = true;
					GandE.setQueryMatch(true);
				}
			}

			//add this element to the list
			TheList.add(GandE);
		}

		//if even one match was discovered in an LL, add the whole LL.
		if (AddtheSet == true){
			Hits.add(TheList);
			//System.out.println("added a new set with " + TheList.size() + " genes, from " + LL.size());
		}
	}

	//return HashSet
	return Hits;
}

//preprocessed == false
//return a hashset of gene groupings
//On-the-fly search: no precomputed context set. Queries == null switches to
//cluster-number matching; otherwise annotation/gene-ID matching. Groups are
//assembled around each hit according to the ContextSetDescription protocol.
public HashSet<LinkedList<GenomicElementAndQueryMatch>> MatchesOnTheFly(String[] Queries, int[] ClusterNumbers, ContextSetDescription CSD){

	//create a tree set to contain individual element matches
	HashSet<LinkedList<GenomicElementAndQueryMatch>> Hits = new HashSet<LinkedList<GenomicElementAndQueryMatch>>();

	//determine appropriate form of searches
	boolean IsCluster = false;
	if (Queries == null){
		IsCluster = true;
	}

	//find query match
	boolean QueryMatch = false;

	//group genes together according to the specified gene grouping protocol.
	if (CSD.getType().contentEquals("Range")) {

		//iterate through all elements
		for (int i = 0; i <this.Elements.size(); i++){

			//determine if the element is a query match.
QueryMatch = false; if (IsCluster){ for (int j = 0; j < ClusterNumbers.length; j++){ if (this.Elements.get(i).getClusterID() == ClusterNumbers[j]){ QueryMatch = true; break; } } } else { for (int j = 0; j < Queries.length; j++){ if (this.Elements.get(i).getAnnotation().toUpperCase().contains(Queries[j].trim().toUpperCase())){ QueryMatch = true; break; } else if (this.Elements.get(i).getGeneID().toUpperCase().equals(Queries[j].trim().toUpperCase())){ QueryMatch = true; break; } } } //if it is, extract the appropriate range if (QueryMatch){ //define a new GenomicElementAndQueryMatch LinkedList<GenomicElementAndQueryMatch> LL = new LinkedList<GenomicElementAndQueryMatch>(); GenomicElementAndQueryMatch GandE = new GenomicElementAndQueryMatch(); GandE.setE(this.Elements.get(i)); GandE.setQueryMatch(true); LL.add(GandE); //Center of the query match //int Center = (int)Math.round(0.5*(double)(GandE.getE().getStart()+GandE.getE().getStop())); int Center = GandE.getE().getCenter(); //continue adding genes until sufficient //before genes //int BeforeQuery = Center - this.Elements.get(i).getStart(); int BeforeQuery = Center - this.Elements.get(i).getCenter(); int BeforeCounter = 0; boolean EndOfContig = false; String CurrentContig = this.Elements.get(i).getContig(); while (BeforeQuery <= CSD.getNtRangeBefore() && EndOfContig == false){ BeforeCounter++; GandE = new GenomicElementAndQueryMatch(); //first element in file if (i-BeforeCounter >= 0) { GandE.setE(this.Elements.get(i-BeforeCounter)); GandE.setQueryMatch(false); //BeforeQuery = Center - GandE.getE().getStart(); BeforeQuery = Center - GandE.getE().getCenter(); //check against user-defined set of valid types boolean ElementIsValid = false; for (String s : this.FeatureIncludeTypes){ if (GandE.getE().getType().contentEquals(s)){ ElementIsValid = true; break; } } if (ElementIsValid){ //check for end of contig if (CurrentContig.equals(GandE.getE().getContig())){ if (BeforeQuery < CSD.getNtRangeBefore()){ LL.add(0,GandE); } 
} else { EndOfContig = true; } //when the element is not valid, just skip to the next one. } // else { // EndOfContig = true; // } } else { EndOfContig = true; } } //after genes //int AfterQuery = this.Elements.get(i).getStop() - Center; int AfterQuery = this.Elements.get(i).getCenter() - Center; int AfterCounter = 0; EndOfContig = false; CurrentContig = this.Elements.get(i).getContig(); while (AfterQuery <= CSD.getNtRangeAfter() && EndOfContig == false){ AfterCounter++; GandE = new GenomicElementAndQueryMatch(); //last element in file if (i+AfterCounter < this.Elements.size()){ GandE.setE(this.Elements.get(i+AfterCounter)); GandE.setQueryMatch(false); //AfterQuery = GandE.getE().getStop() - Center; AfterQuery = GandE.getE().getCenter() - Center; //check against user-defined set of valid types boolean ElementIsValid = false; for (String s : this.FeatureIncludeTypes){ if (GandE.getE().getType().contentEquals(s)){ ElementIsValid = true; break; } } if (ElementIsValid){ //check for end of contig if (CurrentContig.equals(GandE.getE().getContig())){ if (AfterQuery < CSD.getNtRangeAfter()){ LL.add(GandE); } } else { EndOfContig = true; } } //try: just skipping over // else { // EndOfContig = true; // } } else { EndOfContig = true; } } //finally, add this to the hit list Hits.add(LL); } } } else if (CSD.getType().contentEquals("GenesAround")) { //iterate through all elements for (int i = 0; i <this.Elements.size(); i++){ //determine if the element is a query match. 
QueryMatch = false; if (IsCluster){ for (int j = 0; j < ClusterNumbers.length; j++){ if (this.Elements.get(i).getClusterID() == ClusterNumbers[j]){ QueryMatch = true; break; } } } else { for (int j = 0; j < Queries.length; j++){ if (this.Elements.get(i).getAnnotation().toUpperCase().contains(Queries[j].trim().toUpperCase())){ QueryMatch = true; break; } else if (this.Elements.get(i).getGeneID().toUpperCase().equals(Queries[j].trim().toUpperCase())){ QueryMatch = true; break; } } } //if it is, extract the appropriate range if (QueryMatch){ //define a new GenomicElementAndQueryMatch LinkedList<GenomicElementAndQueryMatch> LL = new LinkedList<GenomicElementAndQueryMatch>(); GenomicElementAndQueryMatch GandE = new GenomicElementAndQueryMatch(); GandE.setE(this.Elements.get(i)); GandE.setQueryMatch(true); LL.add(GandE); //continue adding genes until sufficient //before genes int BeforeCounter = 0; boolean EndOfContig = false; String CurrentContig = this.Elements.get(i).getContig(); while (BeforeCounter < CSD.getGenesBefore() && EndOfContig == false){ BeforeCounter++; GandE = new GenomicElementAndQueryMatch(); //first element in file if (i-BeforeCounter > 0) { GandE.setE(this.Elements.get(i-BeforeCounter)); GandE.setQueryMatch(false); //check against user-defined set of valid types boolean ElementIsValid = false; for (String s : this.FeatureIncludeTypes){ if (GandE.getE().getType().contentEquals(s)){ ElementIsValid = true; break; } } //only add elements of the appropriate type - otherwise, skip if (ElementIsValid){ //check for end of contig if (CurrentContig.equals(GandE.getE().getContig())){ LL.add(GandE); } else { EndOfContig = true; } } else { EndOfContig = true; } } } //after genes int AfterCounter = 0; EndOfContig = false; CurrentContig = this.Elements.get(i).getContig(); while (AfterCounter < CSD.getGenesAfter() && EndOfContig == false){ AfterCounter++; GandE = new GenomicElementAndQueryMatch(); //last element in file if (i+AfterCounter < this.Elements.size()){ 
GandE.setE(this.Elements.get(i+AfterCounter)); GandE.setQueryMatch(false); //check against user-defined set of valid types boolean ElementIsValid = false; for (String s : this.FeatureIncludeTypes){ if (GandE.getE().getType().contentEquals(s)){ ElementIsValid = true; break; } } //only add elements of the appropriate type - otherwise, skip if (ElementIsValid){ //check for end of contig if (CurrentContig.equals(GandE.getE().getContig())){ LL.add(GandE); } else { EndOfContig = true; } } else { EndOfContig = true; } } } //finally, add this to the hit list Hits.add(LL); } } } else if (CSD.getType().contentEquals("GenesBetween")) { LinkedList<GenomicElement> FirstQueries = new LinkedList<GenomicElement>(); LinkedList<GenomicElement> SecondQueries = new LinkedList<GenomicElement>(); //iterate through all elements, find first + second queries for (int i = 0; i <this.Elements.size(); i++){ //determine if the element is a query match. QueryMatch = false; if (IsCluster){ for (int j = 0; j <ClusterNumbers.length; j++){ if (this.Elements.get(i).getClusterID() == ClusterNumbers[j]){ if (j == 0){ FirstQueries.add(Elements.get(i)); } else { SecondQueries.add(Elements.get(i)); } } } } else { for (int j = 0; j < Queries.length; j++){ if (this.Elements.get(i).getAnnotation().toUpperCase().contains(Queries[j].trim().toUpperCase())){ if (j == 0){ FirstQueries.add(Elements.get(i)); } else { SecondQueries.add(Elements.get(i)); } } else if (this.Elements.get(i).getGeneID().toUpperCase().equals(Queries[j].trim().toUpperCase())){ if (j == 0){ FirstQueries.add(Elements.get(i)); } else { SecondQueries.add(Elements.get(i)); }; } } } } //pairings of genomic element query matches HashSet<LinkedList<GenomicElement>> Pairs = new HashSet<LinkedList<GenomicElement>>(); //find first set matches int ClosestDistance = 999999999; GenomicElement Partner = null; for (GenomicElement E1 : FirstQueries){ //reset values Partner = null; ClosestDistance = 999999999; //find closest for (GenomicElement E2 : 
SecondQueries){ if (E1.getContig().contentEquals(E2.getContig()) && Math.abs(E1.getCenter() - E2.getCenter()) < ClosestDistance && !E1.equals(E2)) { //Partners that are too far away are excluded anyway, if this option is specified if ((CSD.isGapLimit() && Math.abs(E1.getCenter() - E2.getCenter()) <= CSD.getGapLimitSize()) || !CSD.isGapLimit()){ //operon expansion option - same strand checks if (CSD.isOperonExpansion && CSD.SameStrandRequired){ //strand must match to count as a partner if (E1.getStrand().equals(E2.getStrand())){ //update the closest distance Math.abs(ClosestDistance = E1.getCenter() - E2.getCenter()); Partner = E2; } //no operon expansion - strandedness doesn't matter in this case } else { //update the closest distance Math.abs(ClosestDistance = E1.getCenter() - E2.getCenter()); Partner = E2; } } } } //there must be a partner to describe a partnership. if (Partner != null){ //add to hash set LinkedList<GenomicElement> Partnership = new LinkedList<GenomicElement>(); Partnership.add(E1); Partnership.add(Partner); Pairs.add(Partnership); } } //find second set matches ClosestDistance = 999999999; Partner = null; for (GenomicElement E2 : SecondQueries){ //reset values Partner = null; ClosestDistance = 999999999; //find closest for (GenomicElement E1 : FirstQueries){ if (E2.getContig().contentEquals(E1.getContig()) && Math.abs(E2.getCenter() - E1.getCenter()) < ClosestDistance && !E1.equals(E2) ) { //Partners that are too far away are excluded anyway, if this option is specified if ((CSD.isGapLimit() && Math.abs(E1.getCenter() - E2.getCenter()) <= CSD.getGapLimitSize()) || !CSD.isGapLimit()){ //operon expansion option - same strand checks if (CSD.isOperonExpansion && CSD.SameStrandRequired){ //strand must match to count as a partner if (E1.getStrand().equals(E2.getStrand())){ //update the closest distance ClosestDistance = Math.abs(E2.getCenter() - E1.getCenter()); Partner = E1; } //no operon expansion - strandedness doesn't matter in this case } else { 
//update the closest distance ClosestDistance = Math.abs(E2.getCenter() - E1.getCenter()); Partner = E1; } } } } //there must be a partner for this to even matter. if (Partner != null){ //add to hash set LinkedList<GenomicElement> Partnership = new LinkedList<GenomicElement>(); Partnership.add(Partner); Partnership.add(E2); Pairs.add(Partnership); } } //for all pairs, add all genomic elements Iterator<LinkedList<GenomicElement>> it = Pairs.iterator(); while(it.hasNext()){ LinkedList<GenomicElement> Pair = it.next(); //find starting /ending points int StartingE = -1; int StoppingE = -1; for (int i = 0; i < Elements.size(); i++){ if (this.Elements.get(i).equals(Pair.get(0))){ StartingE = i; } if (this.Elements.get(i).equals(Pair.get(1))){ StoppingE = i; } } //initialize an output linked list LinkedList<GenomicElementAndQueryMatch> LL = new LinkedList<GenomicElementAndQueryMatch>(); //re-order correctly if (StartingE > StoppingE){ int temp = StartingE; StartingE = StoppingE; StoppingE = temp; } //for operon expansion boolean AddListToMatches = true; //add all intermediate elements GenomicElementAndQueryMatch GandE = new GenomicElementAndQueryMatch(); GandE.setE(this.Elements.get(StartingE)); GandE.setQueryMatch(true); LL.add(GandE); int ElementNumber = StartingE + 1; int CurrentListCounter = 0; while (ElementNumber <= StoppingE){ GandE = new GenomicElementAndQueryMatch(); GandE.setE(Elements.get(ElementNumber)); if (ElementNumber == StoppingE){ GandE.setQueryMatch(true); } else { GandE.setQueryMatch(false); } //check against user-defined set of valid types boolean ElementIsValid = false; for (String s : this.FeatureIncludeTypes){ if (GandE.getE().getType().contentEquals(s)){ ElementIsValid = true; break; } } //only add elements of the appropriate type - otherwise, skip if (ElementIsValid){ //add these to list, and increment counter (for distance comparison) LL.add(GandE); CurrentListCounter++; //check against max gene count options if (CSD.InternalGeneNumberLimit){ if 
(CurrentListCounter - 1 > CSD.MaxNumInternalGenes){ AddListToMatches = false; break; } } //check against operon expansion options if (CSD.isOperonExpansion){ //check for same strand violation if (CSD.SameStrandRequired && !GandE.getE().getStrand().equals(LL.get(0).getE().getStrand())){ AddListToMatches = false; break; } //check for distance violation if (LL.get(CurrentListCounter).getE().getStart() - LL.get(CurrentListCounter-1).getE().getStop() > CSD.IntergenicGapLimit){ AddListToMatches = false; break; } } } ElementNumber++; } //debugging //System.out.println(LL); //add list to hash map if (AddListToMatches) Hits.add(LL); // //legacy // // //compute stats about stopping element // double StoppingECenter = this.Elements.get(StoppingE).getStart() // + (0.5*(this.Elements.get(StoppingE).getStop() - this.Elements.get(StoppingE).getStart())); // // //add list to hash map. Check for inappropriate cases. // if (CSD.isGapLimit()){ // //System.out.println("Starting: " + StartingECenter + " Stopping: " + StoppingECenter); // if (Math.abs(StoppingECenter - StartingECenter) <= CSD.getGapLimitSize()){ // Hits.add(LL); // } // } else { // Hits.add(LL); // } } } else if (CSD.getType().contentEquals("MultipleQuery")) { //all genomic element matches LinkedList<GenomicElementAndQueryMatch> MQMatches = new LinkedList<GenomicElementAndQueryMatch>(); //iterate through all elements, find all matches for (GenomicElement E : Elements){ //determine if the element is a query match. 
if (IsCluster){ for (int j = 0; j <ClusterNumbers.length; j++){ if (E.getClusterID() == ClusterNumbers[j]){ GenomicElementAndQueryMatch GandE = new GenomicElementAndQueryMatch(); GandE.setE(E); GandE.setQueryMatch(true); //check against user-defined set of valid types boolean ElementIsValid = false; for (String s : this.FeatureIncludeTypes){ if (GandE.getE().getType().contentEquals(s)){ ElementIsValid = true; break; } } if (ElementIsValid){ MQMatches.add(GandE); } } } } else { for (int j = 0; j < Queries.length; j++){ //check annotation if (E.getAnnotation().toUpperCase().contains(Queries[j].trim().toUpperCase())){ GenomicElementAndQueryMatch GandE = new GenomicElementAndQueryMatch(); GandE.setE(E); GandE.setQueryMatch(true); //check against user-defined set of valid types boolean ElementIsValid = false; for (String s : this.FeatureIncludeTypes){ if (GandE.getE().getType().contentEquals(s)){ ElementIsValid = true; break; } } if (ElementIsValid){ MQMatches.add(GandE); } //check gene ID } else if (E.getGeneID().toUpperCase().equals(Queries[j].trim().toUpperCase())){ GenomicElementAndQueryMatch GandE = new GenomicElementAndQueryMatch(); GandE.setE(E); GandE.setQueryMatch(true); //check against user-defined set of valid types boolean ElementIsValid = false; for (String s : this.FeatureIncludeTypes){ if (GandE.getE().getType().contentEquals(s)){ ElementIsValid = true; break; } } if (ElementIsValid){ MQMatches.add(GandE); } } } } } //add all non-null linked lists if (MQMatches != null){ Hits.add(MQMatches); } } else if (CSD.getType().contentEquals("IntergenicDist-pre")) { //Initialize a hashset for query matches, and for linked lists of genomic elements. HashSet<GenomicElement> QueryMatchSet = new HashSet<GenomicElement>(); HashSet<LinkedList<GenomicElement>> E_Hits = new HashSet<LinkedList<GenomicElement>>(); //iterate through all elements for (int i = 0; i < this.Elements.size(); i++){ //determine if the element is a query match. 
QueryMatch = false; if (IsCluster){ for (int j = 0; j < ClusterNumbers.length; j++){ if (this.Elements.get(i).getClusterID() == ClusterNumbers[j]){ QueryMatch = true; break; } } } else { for (int j = 0; j < Queries.length; j++){ if (this.Elements.get(i).getAnnotation().toUpperCase().contains(Queries[j].trim().toUpperCase())){ QueryMatch = true; break; } else if (this.Elements.get(i).getGeneID().toUpperCase().equals(Queries[j].trim().toUpperCase())){ QueryMatch = true; break; } } } //if it is, extract the appropriate range if (QueryMatch){ //System.out.println("Breakpoint!"); //current element is the query match. GenomicElement E_curr = this.Elements.get(i); //add to list of query matches QueryMatchSet.add(E_curr); //define a new GenomicElementAndQueryMatch LinkedList<GenomicElement> LL = new LinkedList<GenomicElement>(); LL.add(E_curr); // ----- global switches ---- // boolean AddUpstream = true; boolean AddDownstream = true; int GeneNumber; boolean ValidElementsRemain; GenomicElement E_can; boolean Add2Operon; // ----- Add upstream ---- // //switches GeneNumber = i; ValidElementsRemain = false; //Initialize a candidate genomic element for operon addition. E_can = null; //add to operon switch. Add2Operon = false; //add upstream elements to list, if appropriate. while (AddUpstream){ //default: no more valid elements, do not add to operon ValidElementsRemain = false; Add2Operon = false; //find next valid element for (int q = GeneNumber-1; q >= 0; q--){ E_can = Elements.get(q); for (String s : this.FeatureIncludeTypes){ if (E_can.getType().contentEquals(s)){ ValidElementsRemain = true; GeneNumber = q; break; } } //break out of outer loop if (ValidElementsRemain){ break; } } //compare to current to candidate. if (ValidElementsRemain){ //check operon requirements. if (E_can.getContig().equals(E_curr.getContig()) && //Contig Match E_curr.getStart()-E_can.getStop() <= CSD.getIntGenSpacing()){ //Distance match //check for same strand. 
if (CSD.isNeedSameStrand()){ if (E_can.getStrand().equals(E_curr.getStrand())){ Add2Operon = true; } } else { Add2Operon = true; } } //add, if appropriate if (Add2Operon){ //add genomic element to growing operon chain LL.add(0,E_can); //re-set counter E_curr = E_can; } else{ //once you stop adding, no going back. AddUpstream = false; } } else { //finished with operon. AddUpstream = false; } } // ----- Add downstream ---- // //switches GeneNumber = i; ValidElementsRemain = false; //Re-initialize genomic elements for comparison. E_can = null; E_curr = this.Elements.get(i); //add to operon switch. Add2Operon = false; //add upstream elements to list, if appropriate. while (AddDownstream){ //default: no more valid elements, do not add to operon ValidElementsRemain = false; Add2Operon = false; //find next valid element for (int q = GeneNumber+1; q < Elements.size(); q++){ E_can = Elements.get(q); for (String s : this.FeatureIncludeTypes){ if (E_can.getType().contentEquals(s)){ ValidElementsRemain = true; GeneNumber = q; break; } } //break out of outer loop if (ValidElementsRemain){ break; } } //compare to current to candidate. if (ValidElementsRemain){ //check operon requirements. if (E_can.getContig().equals(E_curr.getContig()) && //Contig Match E_can.getStart()-E_curr.getStop() <= CSD.getIntGenSpacing()){ //Distance match //check for same strand. if (CSD.isNeedSameStrand()){ if (E_can.getStrand().equals(E_curr.getStrand())){ Add2Operon = true; } } else { Add2Operon = true; } } //add, if appropriate if (Add2Operon){ //add genomic element to growing operon chain LL.add(E_can); //re-set counter E_curr = E_can; } else{ //once you stop adding, no going back. AddDownstream = false; } } else { //finished with operon. 
AddDownstream = false; } } //finally, add this to the hit list (pre-query match tags) E_Hits.add(LL); } } //build up actual hits - add query information for (LinkedList<GenomicElement> LL : E_Hits){ //initialize list LinkedList<GenomicElementAndQueryMatch> LLq = new LinkedList<GenomicElementAndQueryMatch>(); //iterate through elements, add query tag for (GenomicElement E : LL){ //initialize genomic element and query match GenomicElementAndQueryMatch GandE = new GenomicElementAndQueryMatch(); GandE.setE(E); //if this element is in the set of query matches, tag if (QueryMatchSet.contains(E)){ GandE.setQueryMatch(true); } else { GandE.setQueryMatch(false); } //add query-updated element to list LLq.add(GandE); } //add completed list to final output set. Hits.add(LLq); } } else if (CSD.getType().contentEquals("SingleGene")) { //iterate through all elements for (GenomicElement E : Elements){ //re-set for each gene. QueryMatch = false; //check for match if (IsCluster){ for (int j = 0; j < ClusterNumbers.length; j++){ if (E.getClusterID() == ClusterNumbers[j]){ QueryMatch = true; break; } } } else { for (int j = 0; j < Queries.length; j++){ if (E.getAnnotation().toUpperCase().contains(Queries[j].trim().toUpperCase())){ QueryMatch = true; break; } else if (E.getGeneID().toUpperCase().equals(Queries[j].trim().toUpperCase())){ QueryMatch = true; break; } } } //add to list if (QueryMatch){ //System.out.println("Breakpoint!"); //Define Match GenomicElementAndQueryMatch GandE = new GenomicElementAndQueryMatch(); GandE.setE(E); GandE.setQueryMatch(true); //gene should be in a class all of its own LinkedList<GenomicElementAndQueryMatch> LL = new LinkedList<GenomicElementAndQueryMatch>(); //add gene to list LL.add(GandE); //add list to set of lists Hits.add(LL); } } } // various gene grouping strategies return Hits; } //----------------------- GETTERS+SETTERS ------------------------// //Getters and Setters public String getGenus() { return Genus; } public void setGenus(String 
genus) { Genus = genus; } public String getSpecies() { return Species; } public void setSpecies(String species) { Species = species; } public LinkedList<GenomicElement> getElements() { return Elements; } public void setElements(LinkedList<GenomicElement> elements) { Elements = elements; } public File getGenomeFile() { return GenomeFile; } public void setGenomeFile(File genomeFile) { GenomeFile = genomeFile; } public LinkedList<ContextSet> getGroupings() { if (Groupings == null){ Groupings = new LinkedList<ContextSet>(); } return Groupings; } public void setGroupings(LinkedList<ContextSet> groupings) { Groupings = groupings; } public boolean isTryToComputeOperons() { return TryToComputeOperons; } public void setTryToComputeOperons(boolean tryToComputeOperons) { TryToComputeOperons = tryToComputeOperons; } public LinkedList<String> getIncludeTypes() { return FeatureIncludeTypes; } public void setIncludeTypes(LinkedList<String> includeTypes) { FeatureIncludeTypes = includeTypes; } public LinkedList<String> getDisplayOnlyTypes() { return FeatureDisplayTypes; } public void setDisplayOnlyTypes(LinkedList<String> displayOnlyTypes) { FeatureDisplayTypes = displayOnlyTypes; } public LinkedList<MotifGroup> getMotifs() { return Motifs; } public void setMotifs(LinkedList<MotifGroup> motifs) { Motifs = motifs; } public boolean isAGClustersLoaded() { return AGClustersLoaded; } public void setAGClustersLoaded(boolean aGClustersLoaded) { AGClustersLoaded = aGClustersLoaded; } public String getTextDescription() { return TextDescription; } public void setTextDescription(String textDescription) { TextDescription = textDescription; } public String getGenbankID() { return GenbankID; } public void setGenbankID(String genbankID) { GenbankID = genbankID; } public GBKFieldMapping getGFM() { return GFM; } public void setGFM(GBKFieldMapping gFM) { GFM = gFM; } public LinkedHashMap<String, Integer> getContigEnds() { return ContigEnds; } public void setContigEnds(LinkedHashMap<String, Integer> 
contigEnds) { ContigEnds = contigEnds; } public Integer getLargestCluster() { return LargestCluster; } public void setLargestCluster(Integer largestCluster) { LargestCluster = largestCluster; } public String getGenomeSequenceFile() { return GenomeSequenceFile; } public void setGenomeSequenceFile(String genomeSequenceFile) { GenomeSequenceFile = genomeSequenceFile; } //-----------------------Deprecated ----------------------// //DEPRECATED biojava this function simply returns a DNA sequence from a particular genome file. public String retrieveSequence(String contig, int start, int stop, Strand strand){ //initialize and instantiate variable String seq=null; //load genome, and recover sequence LinkedHashMap<String, DNASequence> genome; try { //import genome genome = FastaReaderHelper.readFastaDNASequence(new File(GenomeSequenceFile)); //retrieve string value + extract subsequence for (Entry<String, DNASequence> entry : genome.entrySet()) { if (entry.getValue().getOriginalHeader().contains(contig)){ seq = entry.getValue().getSequenceAsString(start, stop, strand).toUpperCase(); System.out.println("Start: " + start + " Stop: " + stop + " Strand: " + strand); break; } } } catch (Exception e) { e.printStackTrace(); } return seq; } public boolean isSeqsFromFile() { return SeqsFromFile; } public void setSeqsFromFile(boolean seqsFromFile) { SeqsFromFile = seqsFromFile; } } //completes classbody