package genomeObjects;
import haloGUI.GBKFieldMapping;
import java.io.*;
import java.net.URL;
import java.text.Collator;
import java.util.*;
import java.util.Map.Entry;
import javax.swing.JOptionPane;
import org.biojava3.core.sequence.DNASequence;
import org.biojava3.core.sequence.ProteinSequence;
import org.biojava3.core.sequence.RNASequence;
import org.biojava3.core.sequence.Strand;
import org.biojava3.core.sequence.io.FastaReaderHelper;
public class AnnotatedGenome implements Serializable {
/**
*
*/
private static final long serialVersionUID = -7721895130219179915L;
//Fields
private String Genus; // biological organization: genus (presumably set via accessors outside this view)
private String Species; // biological organization: species (presumably set via accessors outside this view)
// Genomic elements of this genome; replaced wholesale by the GFF/GBK importers
// with a list sorted by GenomicElementComparator and filtered by removeRedundantElements.
private LinkedList<GenomicElement> Elements; // genes, SigSeqs, and groups of genes
private LinkedList<MotifGroup> Motifs // motif groups associated with this genome
= new LinkedList<MotifGroup>(); // starts out empty
// Context sets (groupings of elements) created by MakeSingleGeneContextSet,
// ComputeContextSet, ImportContextSet and AdjustContextSet.
private LinkedList<ContextSet> Groupings = new LinkedList<ContextSet>(); // predicted groupings
private File GenomeFile; // associated genome file
// Location of the FASTA sequence read by DNASequence(): a local file path when
// SeqsFromFile is true, otherwise a URL string.
private String GenomeSequenceFile;
private boolean SeqsFromFile;
// Set to true at the start of ImportContextSet and reset to false if that import fails.
private boolean TryToComputeOperons;
// Feature types retained on import; also the only types ComputeContextSet places into contexts.
private LinkedList<String> FeatureIncludeTypes; // types of data worth importing/processing
// Additional feature types retained on import (checked after FeatureIncludeTypes).
private LinkedList<String> FeatureDisplayTypes;
// Set to true by importFromGFFFile when a retained line carries a cluster-ID column.
private boolean AGClustersLoaded = false;
// Human-readable summary assembled by the importers (sequence names and feature-type counts).
private String TextDescription = ""; // info about the genome
private String GenbankID;
// Largest cluster ID seen by importFromGFFFile.
private Integer LargestCluster = 0;
// Field mapping consulted by importFromGBKReader (Annotation, GeneID, GetTranslation,
// GetCluster, GetClusterTag).
private GBKFieldMapping GFM;
// Contig name -> contig end: the largest element stop seen (GFF import) or the
// third token of the LOCUS line (GBK import).
private LinkedHashMap<String, Integer> ContigEnds
= new LinkedHashMap<String, Integer>();
// ----------------------- Construction ------------------------//
//Constructor
public AnnotatedGenome() {
super();
}
//import annotated elements from a .GFF file.
/**
 * Imports annotated elements from a tab-delimited GFF-style file and stores
 * them, sorted and with redundant entries removed, in the Elements field.
 *
 * Lines starting with '#' and empty lines are skipped. Every other line must
 * have at least 9 tab-separated fields: [0] contig, [2] type, [3] start,
 * [4] stop, [6] strand ("1" or "+" is positive, anything else negative),
 * [8] annotation. Optional field [9] is a cluster ID and [10] a gene ID.
 * Only lines whose type appears in FeatureIncludeTypes or FeatureDisplayTypes
 * are retained. A retained line whose fields cannot be parsed is skipped
 * (its stack trace is printed); a line with fewer than 9 fields stops the
 * import with a message on System.err, keeping the elements read so far.
 *
 * Side effects: updates ContigEnds, LargestCluster, AGClustersLoaded and
 * (when the file is read to the end) TextDescription.
 *
 * @param filename path of the file to read
 */
public void importFromGFFFile(String filename){
    //elements parsed from this file; published to the Elements field once sorted and filtered.
    //(deliberately not named "Elements", so the field is not shadowed)
    LinkedList<GenomicElement> ParsedElements = new LinkedList<GenomicElement>();
    BufferedReader br = null;
    try{
        br = new BufferedReader(new FileReader(filename));
        String Line = null;
        int Counter = 0;

        //Information for statistics - type counts
        LinkedHashMap<String, Integer> Counts
            = new LinkedHashMap<String, Integer>();
        HashSet<String> ContigCount = new HashSet<String>();

        while((Line = br.readLine()) != null){

            //ignore commented and empty lines
            if (Line.startsWith("#") || Line.isEmpty()){
                continue;
            }

            //Counter numbers every data line, retained or not; it becomes the element ID
            Counter++;

            //import each line of the .gff file
            String ImportedLine[] = Line.split("\t");

            //GFF files must contain at least 9 fields
            if (ImportedLine.length < 9){
                throw new IOException("Data line " + Counter + " has fewer than 9 tab-separated fields");
            }

            //check include types first, then display types
            String FeatureType = ImportedLine[2].trim();
            boolean RetainElement = false;
            for (String s : this.FeatureIncludeTypes){
                if (FeatureType.contentEquals(s)){
                    RetainElement = true;
                    break;
                }
            }
            if (!RetainElement){
                for (String s : this.FeatureDisplayTypes){
                    if (FeatureType.contentEquals(s)){
                        RetainElement = true;
                        break;
                    }
                }
            }
            if (!RetainElement){
                continue;
            }

            //if a line or two are not formatted correctly, just ignore these lines.
            try {
                GenomicElement E = new GenomicElement();

                //set appropriate fields of this genomic element from the GFF fields
                E.setContig(ImportedLine[0]);
                E.setType(ImportedLine[2]);
                E.setStart(Integer.parseInt(ImportedLine[3]));
                E.setStop(Integer.parseInt(ImportedLine[4]));
                E.setElementID(Counter);
                E.DetermineCenter();

                //strand is either numeric (1 = positive) or symbolic ("+" = positive)
                boolean PositiveStrand;
                try {
                    PositiveStrand = (Integer.parseInt(ImportedLine[6]) == 1);
                } catch (NumberFormatException ex) {
                    PositiveStrand = ImportedLine[6].contentEquals("+");
                }
                if (PositiveStrand){
                    E.setStrand(Strand.POSITIVE);
                } else {
                    E.setStrand(Strand.NEGATIVE);
                }

                //set annotation
                E.setAnnotation(ImportedLine[8]);

                //add homology cluster + gene ID, if available
                if (ImportedLine.length > 9){
                    int ClustID = Integer.parseInt(ImportedLine[9]);
                    E.setClusterID(ClustID);
                    if (ClustID > LargestCluster){
                        LargestCluster = ClustID;
                    }
                    this.AGClustersLoaded = true;
                    if (ImportedLine.length > 10){
                        E.setGeneID(ImportedLine[10]);
                    }
                }

                //add to list (duplicates are filtered after sorting)
                ParsedElements.add(E);

                //contig end = largest stop coordinate seen on this contig
                Integer KnownEnd = ContigEnds.get(E.getContig());
                if (KnownEnd == null || KnownEnd < E.getStop()){
                    ContigEnds.put(E.getContig(), E.getStop());
                }

                //Record counts of types
                Integer OldCount = Counts.get(E.getType());
                if (OldCount != null){
                    Counts.put(E.getType(), OldCount + 1);
                } else {
                    Counts.put(E.getType(), 1);
                }

                //Record counts of contigs
                ContigCount.add(E.getContig());

            } catch (Exception ex) {
                ex.printStackTrace();
            }
        }

        //Convert feature counts to string, for display.
        StringBuilder Description = new StringBuilder();

        //Number of contigs / plasmids / chromosomes
        Description.append("Sequences (").append(String.valueOf(ContigCount.size())).append("):\n");
        for (String s : ContigCount){
            Description.append(s).append("\n");
        }

        //Feature tabulation
        Description.append("\nFeature Types (").append(String.valueOf(Counts.size())).append("):\n");
        for (String s : Counts.keySet()){
            Description.append(s).append(" (").append(String.valueOf(Counts.get(s))).append(")\n");
        }
        TextDescription = Description.toString();

    }catch(Exception ex){
        System.err.println("File format error! Please re-format and try again.");
    } finally {
        //always release the file handle, including when parsing failed part-way
        if (br != null){
            try {
                br.close();
            } catch (IOException ignored) {
                //nothing useful can be done if closing fails
            }
        }
    }

    //sort elements
    Collections.sort(ParsedElements, new GenomicElementComparator());

    //set elements to newly parsed, sorted, and redundancy-filtered elements.
    this.Elements = removeRedundantElements(ParsedElements);
}
//import annotated elements from a .GBK file.
/**
 * Imports annotated elements from a GenBank (.GBK) file by opening it and
 * handing the reader to importFromGBKReader. Any exception is reported via
 * printStackTrace and otherwise swallowed.
 *
 * @param filename path of the GenBank file to read
 */
public void importFromGBKFile(String filename){
    BufferedReader br = null;
    try {
        //create a buffered reader over the specified file
        br = new BufferedReader(new FileReader(filename));

        //call the reader!
        importFromGBKReader(br);

    } catch (Exception ex) {
        ex.printStackTrace();
    } finally {
        //importFromGBKReader closes the reader on its normal path, but not when it
        //exits with an unchecked exception; closing an already-closed reader has no effect.
        if (br != null){
            try {
                br.close();
            } catch (IOException ignored) {
                //nothing useful can be done if closing fails
            }
        }
    }
}
//import annotated elements streamed in from .GBK website.
/**
 * Imports annotated elements from GenBank flat-file text supplied by a reader
 * (a local file or a stream from a website), then stores them, sorted and
 * with redundant entries removed, in the Elements field. The reader is
 * always closed before this method returns.
 *
 * Parsing rules, as implemented here:
 * - A line starting with "LOCUS" names the current contig (2nd token); the
 *   3rd token, when numeric, is recorded in ContigEnds.
 * - Lines between "LOCUS" and "FEATURES" are appended to TextDescription.
 * - Between "FEATURES" and "BASE COUNT", a two-token line whose first token
 *   is a type in FeatureIncludeTypes/FeatureDisplayTypes and whose second
 *   token contains ".." starts a new feature. Any other two-token line with
 *   ".." marks a feature that is ignored until the next recognised feature.
 * - Within a feature, qualifier lines are matched against the GFM mapping to
 *   fill in annotation, translation, cluster ID ("COG" + number) and gene ID.
 * - A feature whose location cannot be parsed is skipped with a message on
 *   System.err instead of aborting the whole import.
 *
 * A summary (sequence names and feature-type counts) is appended to
 * TextDescription.
 *
 * @param br reader positioned at the start of the GenBank text
 */
public void importFromGBKReader(BufferedReader br){

    //Information for statistics - type counts
    LinkedHashMap<String, Integer> Counts
        = new LinkedHashMap<String, Integer>();
    HashSet<String> ContigCount = new HashSet<String>();

    String Line = null;
    boolean ReadFeatures = false;
    boolean NewFeature = false;
    boolean DescriptiveInfo = false;

    //Fields for genomic features.
    String ContigName = "";
    String TypeName = "";
    GenomicElement E = new GenomicElement();
    boolean WritingProduct = false;
    boolean WritingTranslation = false;
    boolean InAnIgnoreFeature = false;

    //define types for import.
    LinkedList<String> Types = new LinkedList<String>();
    Types.addAll(FeatureIncludeTypes);
    Types.addAll(FeatureDisplayTypes);

    //prepare list for addition
    Elements = new LinkedList<GenomicElement>();

    try {
        while ((Line = br.readLine()) != null){

            //trim the line to remove white space.
            Line = Line.trim();
            String[] L = Line.split("\\s+");

            //new contig
            if (Line.startsWith("LOCUS")){
                ContigName = L[1];
                ContigCount.add(ContigName);
                try {
                    ContigEnds.put(ContigName, Integer.parseInt(L[2]));
                } catch (Exception ex){
                    //length token missing or non-numeric: leave the contig end unrecorded
                }
                DescriptiveInfo = true;
            }

            //read lines for features
            if (ReadFeatures){

                //check if line is a new feature
                for (String s : Types){
                    if (Line.startsWith(s) && !WritingProduct
                            && !WritingTranslation && L[0].equals(s)){
                        if (L.length == 2 && L[1].contains("..")){
                            NewFeature = true;
                            InAnIgnoreFeature = false;
                            TypeName = s;
                            break;
                        }
                    }
                }

                if (NewFeature){

                    //write previous feature
                    if (E != null && E.getType() != null){
                        Elements.add(E);
                        tallyGBKFeatureType(Counts, E.getType());
                    }

                    //create new feature
                    E = new GenomicElement();
                    NewFeature = false;

                    //reset switches
                    WritingProduct = false;
                    WritingTranslation = false;

                    //type info
                    E.setType(TypeName);
                    E.setContig(ContigName);

                    //coordinates + strand
                    try {
                        applyGBKLocation(E, L[1]);
                    } catch (RuntimeException ex){
                        //a single malformed location should not abort the entire import:
                        //drop this feature and ignore its qualifier lines.
                        System.err.println("Skipping feature with unparseable location: " + L[1]);
                        E = null;
                        InAnIgnoreFeature = true;
                    }

                } else {
                    //Is this feature one to be ignored?
                    if (L.length == 2 && L[1].contains("..")){
                        InAnIgnoreFeature = true;
                    }
                }

                //add to an existing feature
                if (!InAnIgnoreFeature){

                    if (WritingProduct){

                        //continuation of a multi-line annotation
                        E.setAnnotation(E.getAnnotation() + " " + Line);

                        //a closing quotation mark ends the annotation.
                        //(endsWith is safe on empty lines, unlike substring(length-1))
                        if (Line.endsWith("\"")){
                            WritingProduct = false;
                        }

                    } else if (WritingTranslation){

                        if (Line.endsWith("\"")){
                            //last line in translation: drop the closing quote
                            E.setTranslation(E.getTranslation() + Line.substring(0,Line.length()-1));
                            WritingTranslation = false;
                        } else {
                            E.setTranslation(E.getTranslation() + Line);
                        }

                    //not writing anything - possibly open things up
                    } else {

                        //start product
                        if (L[0].startsWith(GFM.Annotation)){
                            //keep everything after the leading character of the line
                            E.setAnnotation(Line.substring(1));
                            //still open unless the closing quote is on this same line
                            WritingProduct = !Line.endsWith("\"");

                        //start translation; the sequence begins after the 14 characters of /translation="
                        } else if (GFM.GetTranslation && L[0].startsWith("/translation=")){

                            if (Line.length() > 14 && Line.endsWith("\"")){
                                //short translation - ends in quote on the same line
                                E.setTranslation(Line.substring(14, Line.length()-1));
                                WritingTranslation = false;
                            } else {
                                //normal translation - extends over multiple lines
                                E.setTranslation(Line.length() > 14 ? Line.substring(14) : "");
                                WritingTranslation = true;
                            }

                        //attempt to parse cluster tag
                        } else if (GFM.GetCluster && L[0].startsWith(GFM.GetClusterTag)){

                            String Info = Line.substring(GFM.GetClusterTag.length());
                            Info = Info.replaceAll("\"", "");
                            String[] InfoSplit = Info.split("\\s+");
                            for (String s : InfoSplit){
                                if (s.startsWith("COG")){
                                    try{
                                        E.setClusterID(Integer.parseInt(s.substring(3)));
                                        break;
                                    }catch (Exception ex){
                                        //not "COG" + number: try the next token
                                    }
                                }
                            }

                        //add gene ID
                        } else if (L[0].startsWith(GFM.GeneID)){
                            try {
                                String GIDNoQuotes = Line.substring(GFM.GeneID.length()).replaceAll("\"", "");
                                E.setGeneID(GIDNoQuotes);
                            } catch (Exception ex) {
                                //gene ID is optional: leave it unset
                            }
                        }
                    }
                }

            } else {
                if (DescriptiveInfo){
                    //Add introductory info to the text description.
                    if (!(TextDescription).equals("")){
                        TextDescription = TextDescription + "\n" + Line;
                    } else{
                        TextDescription = Line;
                    }
                }
            }

            //turn on feature-reading
            if (Line.startsWith("FEATURES")){
                ReadFeatures = true;
            }

            //turn off feature-reading
            if (Line.startsWith("BASE COUNT")){
                DescriptiveInfo = false;
                ReadFeatures = false;
            }
        }

        //add last element (only if a feature was actually started), and count it too.
        if (E != null && E.getType() != null && !Elements.contains(E)){
            Elements.add(E);
            tallyGBKFeatureType(Counts, E.getType());
        }

    } catch (NumberFormatException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        //close the stream on every exit path, including unchecked exceptions
        try {
            br.close();
        } catch (Exception ex) {
            //nothing useful can be done if closing fails
        }
    }

    //Convert feature counts to string, for display.
    StringBuilder Summary = new StringBuilder(TextDescription);

    //Number of contigs / plasmids / chromosomes
    Summary.append("\n\nSequences (").append(String.valueOf(ContigCount.size())).append("):\n");
    for (String s : ContigCount){
        Summary.append(s).append("\n");
    }

    //Feature tabulation
    Summary.append("\nFeature Types (").append(String.valueOf(Counts.size())).append("):\n");
    for (String s : Counts.keySet()){
        Summary.append(s).append(" (").append(String.valueOf(Counts.get(s))).append(")\n");
    }
    TextDescription = Summary.toString();

    //sort elements
    Collections.sort(Elements, new GenomicElementComparator());

    //remove redundant elements
    this.Elements = removeRedundantElements(Elements);
}

//Adds one to the running count for a feature type.
private static void tallyGBKFeatureType(LinkedHashMap<String, Integer> Counts, String Type){
    Integer OldCount = Counts.get(Type);
    Counts.put(Type, (OldCount == null) ? 1 : OldCount + 1);
}

//Sets start, stop, strand and center of E from a GenBank location token such as
//"12..34", "complement(12..34)", "join(1..5,8..20)" or "complement(join(1..5,8..20))".
//For joined locations the first and last coordinates are used. A leading '<' or '>'
//on either end coordinate is dropped. Throws a RuntimeException if no integer
//coordinates can be extracted.
private static void applyGBKLocation(GenomicElement E, String Location){
    String Loc = Location.trim();
    boolean Complement = Loc.contains("complement");
    boolean Join = Loc.contains("join");

    //strip the wrapper text around the coordinate list
    String Range;
    if (Complement && Join){
        Range = Loc.substring(16, Loc.length()-2);   //complement(join( ... ))
    } else if (Complement){
        Range = Loc.substring(11, Loc.length()-1);   //complement( ... )
    } else if (Join){
        Range = Loc.substring(5, Loc.length()-1);    //join( ... )
    } else {
        Range = Loc;
    }

    //regex: a literal dot followed by any character, i.e. the ".." separator
    String[] X = Range.split("\\..");
    String First = stripGBKPartialMarker(X[0]);
    String Last = stripGBKPartialMarker(X[X.length-1]);

    E.setStart(Integer.parseInt(First));
    E.setStop(Integer.parseInt(Last));
    E.setStrand(Complement ? Strand.NEGATIVE : Strand.POSITIVE);
    E.DetermineCenter();
}

//Drops the first character of a coordinate that carries a '<' or '>' marker.
private static String stripGBKPartialMarker(String Coordinate){
    if (Coordinate.contains(">") || Coordinate.contains("<")){
        return Coordinate.substring(1);
    }
    return Coordinate;
}
//remove redundant elements
/**
 * Returns a new list in which each run of adjacent elements sharing the same
 * contig, start, stop, strand and type is collapsed to a single element (the
 * last one of the run). The input list is expected to be sorted so that
 * duplicates are adjacent; it is not modified.
 *
 * A null or empty input yields an empty list, and a single-element input is
 * returned as a one-element list.
 *
 * @param InitialElements sorted elements to filter
 * @return a new list without adjacent duplicates
 */
public LinkedList<GenomicElement> removeRedundantElements(LinkedList<GenomicElement> InitialElements){

    //Initialize output
    LinkedList<GenomicElement> OutputElements = new LinkedList<GenomicElement>();
    if (InitialElements == null || InitialElements.isEmpty()){
        return OutputElements;
    }

    //walk the list with an iterator: indexed get() on a LinkedList is linear per call
    Iterator<GenomicElement> It = InitialElements.iterator();
    GenomicElement ECompare = It.next();
    while (It.hasNext()){
        GenomicElement ENew = It.next();

        //emit the pending element only when its successor is a different feature
        if (!isRedundantPair(ECompare, ENew)){
            OutputElements.add(ECompare);
        }
        ECompare = ENew;
    }

    //the final pending element always differs from everything emitted so far
    OutputElements.add(ECompare);

    //return with new set.
    return OutputElements;
}

//True when both elements describe the same feature (contig, start, stop, strand and type).
private static boolean isRedundantPair(GenomicElement A, GenomicElement B){
    return A.getContig().equals(B.getContig()) &&
            A.getStart() == B.getStart() &&
            A.getStop() == B.getStop() &&
            A.getStrand().equals(B.getStrand()) &&
            A.getType().equals(B.getType());
}
//----------------------- add cluster number -----------------------//
//Organism - Gene Name - Cluster Number [OR] Gene Name - Cluster Number [OR] Gene Name
/**
 * Assigns a cluster ID to every element whose annotation contains the given
 * text, compared case-insensitively after trimming the search text.
 *
 * @param Annotation    text to look for inside each element's annotation
 * @param Clusternumber cluster ID to assign to matching elements
 */
public void addClusterNumber(String Annotation, int Clusternumber){
    Iterator<GenomicElement> Walker = Elements.iterator();
    while (Walker.hasNext()){
        GenomicElement Candidate = Walker.next();
        String Haystack = Candidate.getAnnotation().toUpperCase();
        String Needle = Annotation.toUpperCase().trim();
        if (!Haystack.contains(Needle)){
            continue;
        }
        Candidate.setClusterID(Clusternumber);
    }
}
//Organism - Contig - Gene Name - Cluster Number
/**
 * Assigns a cluster ID to every element on the given contig whose annotation
 * contains the given text, compared case-insensitively after trimming the
 * search text.
 *
 * @param Contig        contig name the element must be on (exact match)
 * @param Annotation    text to look for inside each element's annotation
 * @param Clusternumber cluster ID to assign to matching elements
 */
public void addClusterNumber(String Contig, String Annotation, int Clusternumber){
    Iterator<GenomicElement> Walker = Elements.iterator();
    while (Walker.hasNext()){
        GenomicElement Candidate = Walker.next();

        //wrong contig: the annotation is not inspected at all
        if (!Candidate.getContig().contentEquals(Contig)){
            continue;
        }

        String Haystack = Candidate.getAnnotation().toUpperCase();
        String Needle = Annotation.toUpperCase().trim();
        if (Haystack.contains(Needle)){
            Candidate.setClusterID(Clusternumber);
        }
    }
}
//Organism - Contig - Gene Start - Gene Stop - Cluster Number
/**
 * Assigns a cluster ID to the first element found with the given contig,
 * start and stop; later elements are not examined.
 *
 * @param Contig        contig name (exact match)
 * @param Start         element start coordinate
 * @param Stop          element stop coordinate
 * @param Clusternumber cluster ID to assign
 */
public void addClusterNumber(String Contig, int Start, int Stop, int Clusternumber){
    Iterator<GenomicElement> Walker = Elements.iterator();
    boolean Assigned = false;
    while (!Assigned && Walker.hasNext()){
        GenomicElement Candidate = Walker.next();
        if (!Candidate.getContig().contentEquals(Contig)){
            continue;
        }
        if (Candidate.getStart() == Start && Candidate.getStop() == Stop){
            Candidate.setClusterID(Clusternumber);
            Assigned = true;   //only the first match is updated
        }
    }
}
//----------------------- Context Set computation ------------------//
//single gene context set
/**
 * Builds a pre-processed context set of type "SingleGene" in which every
 * element forms its own one-member group, keyed 1, 2, 3, ... in element
 * order, and appends it to Groupings.
 *
 * @param CSName name given to the new context set
 */
public void MakeSingleGeneContextSet(String CSName){

    //the new context set, flagged as pre-processed
    ContextSet SingleGeneSet = new ContextSet(CSName, "SingleGene");
    SingleGeneSet.setPreProcessed(true);

    //one singleton list per element; keys start at 1
    HashMap<Integer, LinkedList<GenomicElement>> Mapping
        = new HashMap<Integer, LinkedList<GenomicElement>>();
    int NextKey = 1;
    Iterator<GenomicElement> Walker = this.Elements.iterator();
    while (Walker.hasNext()){
        LinkedList<GenomicElement> Singleton = new LinkedList<GenomicElement>();
        Singleton.add(Walker.next());
        Mapping.put(NextKey, Singleton);
        NextKey = NextKey + 1;
    }
    SingleGeneSet.setContextMapping(Mapping);

    //register the context set, creating the container on demand
    if (Groupings == null){
        Groupings = new LinkedList<ContextSet>();
    }
    this.Groupings.add(SingleGeneSet);
}
/**
 * Computes intergenic-distance context sets for every tolerance from 0 to
 * 300 and writes one report file per tolerance ("Dist-0" ... "Dist-300")
 * into a fixed, developer-specific directory.
 *
 * Note that each call to ComputeContextSet also appends the computed context
 * set to Groupings.
 */
public void generateOperonReports(){
    writeOperonReportsTo("/Users/phillipseitzer/UCDavis/OperonEvolutionInHalophiles/NRC1-distance-vs-transcriptomics/OperonStats");
}

//Writes the per-tolerance operon reports into the given directory: one line per
//context, containing the space-separated gene IDs of its members (a trailing
//"m" on a gene ID is removed).
private void writeOperonReportsTo(String dir){

    //generate context sets
    String nameStem = "Dist-";
    for (int i = 0; i <= 300; i++){
        String name = nameStem + String.valueOf(i);
        ContextSet CS = ComputeContextSet(name, i, true);

        String fileName = dir + "/" + name;
        BufferedWriter bw = null;
        try {
            bw = new BufferedWriter(new FileWriter(fileName));
            for (LinkedList<GenomicElement> operon : CS.getContextMapping().values()){
                StringBuilder sb = new StringBuilder();
                for (GenomicElement ge : operon){
                    String geneId = ge.getGeneID();
                    if (geneId.endsWith("m")){
                        geneId = geneId.substring(0, geneId.length()-1);
                    }
                    sb.append(geneId);
                    sb.append(" ");
                }
                sb.append("\n");
                bw.write(sb.toString());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            //close (and flush) the writer even when writing failed part-way
            if (bw != null){
                try {
                    bw.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        System.out.println("Computed context set " + i + "/300.");
    }
}
//estimate contexts based on distance
/**
 * Groups elements into contexts (putative operons) by intergenic distance
 * and registers the result in Groupings as a pre-processed context set of
 * type "IntergenicDist".
 *
 * Only elements whose type is listed in FeatureIncludeTypes take part.
 * Walking the elements in list order (the list is assumed to be sorted), a
 * new context is started whenever the next participating element
 * (1) lies on a different contig, (2) lies more than {@code tolerance}
 * positions past the stop of the previous one (next start - previous stop),
 * or (3) is on the other strand, when RequireSameStrain is true.
 * Contexts are keyed 1, 2, 3, ... in order of appearance. Every
 * participating element, including the last one in the list, is placed in
 * exactly one context.
 *
 * @param CSName            name given to the new context set
 * @param tolerance         largest allowed gap between neighbours in one context
 * @param RequireSameStrain whether neighbours must share a strand
 * @return the newly created context set (also appended to Groupings)
 */
public ContextSet ComputeContextSet(String CSName, int tolerance, boolean RequireSameStrain){

    //initialize a new context set
    ContextSet CS = new ContextSet(CSName, "IntergenicDist");
    CS.setPreProcessed(true);
    HashMap<Integer, LinkedList<GenomicElement>> csmap
        = new HashMap<Integer, LinkedList<GenomicElement>>();

    //start counter, initialize the context currently being filled
    int OperonCounter = 1;
    LinkedList<GenomicElement> LL = new LinkedList<GenomicElement>();

    //previous participating element, null until the first one is seen
    GenomicElement Previous = null;

    //single pass with an iterator: indexed access on a LinkedList is linear per call
    for (GenomicElement Current : Elements){

        //skip element types that do not take part in context computation
        if (!isContextIncludeType(Current)){
            continue;
        }

        //close the running context when this element cannot belong to it
        if (Previous != null && beginsNewContext(Previous, Current, tolerance, RequireSameStrain)){
            csmap.put(OperonCounter, LL);
            LL = new LinkedList<GenomicElement>();
            OperonCounter++;
        }

        LL.add(Current);
        Previous = Current;
    }

    //store the final context, which no later element will close
    if (!LL.isEmpty()){
        csmap.put(OperonCounter, LL);
    }

    //add completed hash map
    CS.setContextMapping(csmap);

    //add this new context set to the Groupings field.
    if (Groupings == null){
        Groupings = new LinkedList<ContextSet>();
    }
    this.Groupings.add(CS);

    return CS;
}

//True when the element's type is one of the user-defined FeatureIncludeTypes.
private boolean isContextIncludeType(GenomicElement E){
    for (String s : this.FeatureIncludeTypes){
        if (E.getType().contentEquals(s)){
            return true;
        }
    }
    return false;
}

//True when Next must open a new context relative to Cur: different contig, gap
//larger than the tolerance, or (when strands matter) Next on the other strand.
private boolean beginsNewContext(GenomicElement Cur, GenomicElement Next,
        int tolerance, boolean RequireSameStrand){

    //different contig always splits
    if (!Cur.getContig().contentEquals(Next.getContig())){
        return true;
    }

    if (!RequireSameStrand){
        return Next.getStart() - Cur.getStop() > tolerance;
    }

    //a current element that is not POSITIVE is treated as NEGATIVE
    Strand Expected = (Cur.getStrand() == Strand.POSITIVE) ? Strand.POSITIVE : Strand.NEGATIVE;
    Strand Opposite = (Expected == Strand.POSITIVE) ? Strand.NEGATIVE : Strand.POSITIVE;
    if (Next.getStrand() == Opposite){
        return true;
    }
    return Next.getStrand() == Expected
            && Next.getStart() - Cur.getStop() > tolerance;
}
//add pre-computed contexts from file
/**
 * Loads a pre-computed context set from a tab-delimited file and appends it
 * to Groupings as a pre-processed context set of type "Loaded".
 *
 * Each line supplies [0] contig, [1] start, [2] stop and [3] an integer
 * context key. Lines with key 0 are skipped. For any other key, the first
 * element with matching contig, start and stop is added to that key's list
 * (the list is created even when no element matches).
 *
 * TryToComputeOperons is set to true on entry; if the file cannot be read or
 * parsed it is reset to false, an error dialog is shown, and no context set
 * is added.
 *
 * @param CSName   name given to the loaded context set
 * @param fileName path of the context file
 */
public void ImportContextSet(String CSName, String fileName) {

    this.TryToComputeOperons = true;

    BufferedReader br = null;
    try{
        br = new BufferedReader(new FileReader(fileName));
        String Line = null;

        //initialize a new context set
        ContextSet CS = new ContextSet(CSName, "Loaded");
        CS.setPreProcessed(true);
        LinkedHashMap<Integer, LinkedList<GenomicElement>> CSMap
            = new LinkedHashMap<Integer, LinkedList<GenomicElement>>();

        while((Line = br.readLine()) != null){

            //import line
            String ImportedLine[] = Line.split("\t");

            //if the ID is 0, then skip this entry entirely and move on the next one.
            int Key = Integer.parseInt(ImportedLine[3]);
            if (Key == 0){
                continue;
            }

            //create new list, if it doesn't already exist
            LinkedList<GenomicElement> Members = CSMap.get(Key);
            if (Members == null){
                Members = new LinkedList<GenomicElement>();
                CSMap.put(Key, Members);
            }

            //search through the genome to find the correct element, add to list
            for (GenomicElement e : this.Elements){
                if (e.getContig().equals(ImportedLine[0]) &&
                        e.getStart() == Integer.parseInt(ImportedLine[1]) &&
                        e.getStop() == Integer.parseInt(ImportedLine[2])){
                    Members.add(e);
                    break;
                }
            }
        }

        //add completed mapping to context set
        CS.setContextMapping(CSMap);

        //add this context set to existing context sets.
        if (this.Groupings == null){
            Groupings = new LinkedList<ContextSet>();
        }
        Groupings.add(CS);

    } catch (Exception ex) {
        this.TryToComputeOperons = false;
        String Message = "The Genome Context File " + "\n" +
            fileName + "\n" +
            "was improperly formatted. Please re-format this file and try again.";
        JOptionPane.showMessageDialog(null, Message, "Invalid File Format", JOptionPane.ERROR_MESSAGE);
    } finally {
        //release the file handle on success and on failure alike
        if (br != null){
            try {
                br.close();
            } catch (IOException ignored) {
                //nothing useful can be done if closing fails
            }
        }
    }
}
//adjust a context set
/**
 * Adds one element to one group of a named context set, creating the context
 * set and/or the group when they do not exist yet.
 *
 * The element is the first one matching the given contig, start and stop; if
 * none matches, the mapping is left without a new entry (but a missing
 * context set is still created and registered). An element already present
 * in the target group is not added a second time.
 *
 * @param CSName     name of the context set to adjust
 * @param ContigName contig of the element to add
 * @param Start      start coordinate of the element to add
 * @param Stop       stop coordinate of the element to add
 * @param Key        key of the group the element belongs to
 */
public void AdjustContextSet(String CSName, String ContigName, int Start, int Stop, int Key){

    //same guard the other context-set methods apply before touching Groupings
    if (Groupings == null){
        Groupings = new LinkedList<ContextSet>();
    }

    ContextSet CS = null;
    boolean AddCStoGroups = true;

    //Find the context set
    for (ContextSet CS1 : Groupings){
        if (CS1.getName().equals(CSName)){
            CS = CS1;
            AddCStoGroups = false;
            break;
        }
    }

    //create it if it doesn't yet exist
    if (CS == null){
        CS = new ContextSet();
        CS.setPreProcessed(true);
        CS.setName(CSName);
    }

    //Retrieve existing mapping, or create new one
    HashMap<Integer, LinkedList<GenomicElement>> CSMap = CS.getContextMapping();
    if (CSMap == null){
        CSMap = new HashMap<Integer, LinkedList<GenomicElement>>();
    }

    //find the appropriate element and add it
    for (GenomicElement E : Elements){

        //match start, stop, and contig
        if (E.getContig().equals(ContigName) &&
                E.getStart() == Start &&
                E.getStop() == Stop){

            LinkedList<GenomicElement> Members = CSMap.get(Key);
            if (Members == null){
                Members = new LinkedList<GenomicElement>();
                Members.add(E);
                CSMap.put(Key, Members);
            } else if (!Members.contains(E)){
                //if the group already contains the element, don't add again
                Members.add(E);
            }

            //only the first matching element is used
            break;
        }
    }

    //update the hash map
    CS.setContextMapping(CSMap);

    //add the CS to the groupings, if it's brand new.
    if (AddCStoGroups){
        Groupings.add(CS);
    }
}
//----------------------- Sorting ------------------------//
//sort genomic elements by (1) contig name, and within contigs, (2) start position.
/**
 * Orders genomic elements by contig name (ignoring case) and, within the
 * same contig, by ascending center position.
 */
public class GenomicElementComparator implements Comparator<GenomicElement> {

    @Override
    public int compare(GenomicElement E1, GenomicElement E2) {
        int ContigOrder = E1.getContig().compareToIgnoreCase(E2.getContig());
        if (ContigOrder == 0) {
            //same contig: fall back to the center coordinate
            Integer FirstCenter = Integer.valueOf(E1.getCenter());
            Integer SecondCenter = Integer.valueOf(E2.getCenter());
            return FirstCenter.compareTo(SecondCenter);
        }
        return ContigOrder;
    }
}
/**
 * Orders element/query-match pairs by the contig name of the wrapped element
 * (ignoring case) and, within the same contig, by ascending center position.
 */
public static class SortGandEByElements implements Comparator<GenomicElementAndQueryMatch> {

    @Override
    public int compare(GenomicElementAndQueryMatch GandE1, GenomicElementAndQueryMatch GandE2) {
        String FirstContig = GandE1.getE().getContig();
        String SecondContig = GandE2.getE().getContig();
        int ContigOrder = FirstContig.compareToIgnoreCase(SecondContig);
        if (ContigOrder != 0) {
            return ContigOrder;
        }

        //same contig: compare center coordinates
        Integer FirstCenter = Integer.valueOf(GandE1.getE().getCenter());
        Integer SecondCenter = Integer.valueOf(GandE2.getE().getCenter());
        return FirstCenter.compareTo(SecondCenter);
    }
}
// ----------------------- Export + Sequence -----------------------//
//return DNA sequence from a .fasta file (by streaming file in)
/**
 * Streams the genome's FASTA sequence (from a local file when SeqsFromFile is
 * true, otherwise from the URL in GenomeSequenceFile) and returns the
 * requested region in upper case.
 *
 * The contig is the first record whose header line contains {@code contig}.
 * Coordinates are 1-based and inclusive, counted over the record's sequence
 * lines. Reading stops at the next header, so the result never runs into a
 * following record; if the record ends before {@code stop}, the sequence
 * collected up to that point is returned. For the negative strand the
 * reverse complement is returned.
 *
 * Failures (I/O errors, invalid coordinates, reverse-complement errors) are
 * reported via printStackTrace.
 *
 * @param contig text identifying the FASTA header of the wanted record
 * @param start  first position to return (1-based)
 * @param stop   last position to return (1-based, inclusive)
 * @param strand strand of the requested region
 * @return the sequence, or null when the region was not found or reading failed
 */
public String DNASequence(String contig, int start, int stop, Strand strand){

    String seq = null;
    BufferedReader br = null;

    try {
        if (SeqsFromFile){
            //read from file
            br = new BufferedReader(new FileReader(GenomeSequenceFile));
        } else {
            //read from website
            URL SeqFile = new URL(GenomeSequenceFile);
            br = new BufferedReader(new InputStreamReader(SeqFile.openStream()));
        }

        //sequence collected so far; null until the line containing 'start' is reached
        StringBuilder Collected = null;
        String Line = null;
        boolean ThisContig = false;

        //number of sequence characters of this contig seen on earlier lines
        int ContigSeqBlock = 0;

        while ((Line = br.readLine()) != null){

            if (Line.startsWith(">")){
                //a header after the matched record means the requested contig has ended
                if (ThisContig){
                    break;
                }
                if (Line.contains(contig)){
                    ThisContig = true;
                }

            } else if (ThisContig){

                //coordinate range covered by this line
                int StartLine = ContigSeqBlock + 1;
                int StopLine = ContigSeqBlock + Line.length();

                if (Collected != null){
                    if (stop < StopLine){
                        //the region ends in this line - take the prefix and finish
                        Collected.append(Line, 0, stop - ContigSeqBlock);
                        break;
                    }
                    //whole line belongs to the region
                    Collected.append(Line);

                } else if (start >= StartLine && start <= StopLine){
                    //the region starts in this line
                    Collected = new StringBuilder();
                    if (stop <= StopLine){
                        //...and also ends in it
                        Collected.append(Line, (start-1)-ContigSeqBlock, stop-ContigSeqBlock);
                        break;
                    }
                    Collected.append(Line, (start-1)-ContigSeqBlock, Line.length());
                }

                //update the tally of all previous sequence
                ContigSeqBlock = StopLine;
            }
        }

        //nothing collected: leave seq null rather than failing on it below
        if (Collected != null){
            seq = Collected.toString();

            //flip, if appropriate
            if (strand.equals(Strand.NEGATIVE)){
                DNASequence d = new DNASequence(seq);
                seq = d.getReverseComplement().getSequenceAsString();
            }

            //return all sequence as upper case string.
            seq = seq.toUpperCase();
        }

    } catch (Exception ex){
        ex.printStackTrace();
    } finally {
        //close the file/URL stream on every exit path
        if (br != null){
            try {
                br.close();
            } catch (IOException ignored) {
                //nothing useful can be done if closing fails
            }
        }
    }

    return seq;
}
//Export a GFF file with gene IDs + cluster IDs (if applicable)
/**
 * Writes all elements to a tab-delimited, GFF-style file that also carries
 * cluster IDs and (when present) gene IDs.
 *
 * Columns per line: contig, "GenBank", type, start, stop, "+", strand
 * ("1" for positive, "-1" otherwise), ".", annotation, cluster ID, and the
 * gene ID as a final column only when it is non-null and non-empty.
 * Exceptions are reported via printStackTrace.
 *
 * @param FileName path of the file to write
 */
public void ExportExtendedGFFFile(String FileName){

    BufferedWriter bw = null;
    try {
        bw = new BufferedWriter(new FileWriter(FileName));

        for (GenomicElement E : this.Elements){

            String TheStrand;
            if (E.getStrand().equals(Strand.POSITIVE)){
                TheStrand = "1";
            } else {
                TheStrand = "-1";
            }

            //build line
            String Line = E.getContig() + "\tGenBank\t" + String.valueOf(E.getType())
                + "\t" + String.valueOf(E.getStart()) + "\t" + String.valueOf(E.getStop()) + "\t+\t"
                + TheStrand + "\t.\t" + E.getAnnotation() + "\t" + String.valueOf(E.getClusterID());

            //possibly add gene ID; compare by content, since != "" compares
            //object identity and is true for any non-interned empty string
            String GeneID = E.getGeneID();
            if (GeneID != null && !GeneID.isEmpty()){
                Line = Line + "\t" + GeneID;
            }

            Line = Line + "\n";
            bw.write(Line);
        }

    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        //closing flushes the buffer; done here so the file is released on failure too
        if (bw != null){
            try {
                bw.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
//----------------------- Search/Retrieval ------------------------//
//preprocessed == true
//return a hashset of gene groupings - annotation
/**
 * Searches a pre-processed context set for groups containing at least one
 * element that matches any of the text queries.
 *
 * An element matches a query when its annotation contains the query or its
 * gene ID equals the query, both compared case-insensitively after trimming
 * the query. Every group with at least one match is returned in full, each
 * member wrapped together with a flag telling whether that member matched
 * any query.
 *
 * @param query          text queries
 * @param ContextSetName name of the context set to search
 * @return the matching groups; empty when nothing matches or when the
 *         selected context set has no mapping
 */
public HashSet<LinkedList<GenomicElementAndQueryMatch>> AnnotationMatches(String[] query, String ContextSetName){

    //default to an empty context set when no context set has the requested name
    ContextSet CS = new ContextSet();
    for (ContextSet selectCS : Groupings){
        if (selectCS.getName().equals(ContextSetName)){
            CS = selectCS;
            break;
        }
    }

    //set of matching groups
    HashSet<LinkedList<GenomicElementAndQueryMatch>> Hits =
        new HashSet<LinkedList<GenomicElementAndQueryMatch>>();

    //a context set without a mapping has nothing to search
    if (CS.getContextMapping() == null){
        return Hits;
    }

    //normalize the queries once instead of once per element
    String[] Needles = new String[query.length];
    for (int j = 0; j < query.length; j++){
        Needles[j] = query[j].trim().toUpperCase();
    }

    //determine all matches
    for (LinkedList<GenomicElement> LL : CS.getContextMapping().values()){

        //default: do not add the set
        boolean AddtheSet = false;
        LinkedList<GenomicElementAndQueryMatch> TheList = new LinkedList<GenomicElementAndQueryMatch>();

        //iterate rather than index: get(i) on a LinkedList is linear per call
        for (GenomicElement Element : LL){

            GenomicElementAndQueryMatch GandE = new GenomicElementAndQueryMatch();
            GandE.setE(Element);

            //an element matches when ANY query matches; a later non-matching
            //query must not clear an earlier match
            boolean IsMatch = false;
            for (int j = 0; j < Needles.length && !IsMatch; j++){
                //check annotation first, then gene ID
                if (Element.getAnnotation().toUpperCase().contains(Needles[j])
                        || Element.getGeneID().toUpperCase().equals(Needles[j])){
                    IsMatch = true;
                }
            }
            GandE.setQueryMatch(IsMatch);
            if (IsMatch){
                AddtheSet = true;
            }

            //add this element to the list
            TheList.add(GandE);
        }

        //if even one match was discovered in an LL, add the whole LL.
        if (AddtheSet){
            Hits.add(TheList);
        }
    }

    return Hits;
}
//return a hashset of gene groupings - homology cluster
/**
 * Searches a pre-processed context set for groups containing at least one
 * element whose cluster ID equals any of the given cluster numbers.
 *
 * Every group with at least one match is returned in full, each member
 * wrapped together with a flag telling whether that member matched.
 *
 * @param ClusterNumber  cluster IDs to look for
 * @param ContextSetName name of the context set to search
 * @return the matching groups; empty when nothing matches or when the
 *         selected context set has no mapping
 */
public HashSet<LinkedList<GenomicElementAndQueryMatch>> ClusterMatches(int[] ClusterNumber, String ContextSetName){

    //default to an empty context set when no context set has the requested name
    ContextSet CS = new ContextSet();
    for (ContextSet selectCS : Groupings){
        if (selectCS.getName().equals(ContextSetName)){
            CS = selectCS;
            break;
        }
    }

    //set of matching groups
    HashSet<LinkedList<GenomicElementAndQueryMatch>> Hits =
        new HashSet<LinkedList<GenomicElementAndQueryMatch>>();

    //a context set without a mapping has nothing to search
    if (CS.getContextMapping() == null){
        return Hits;
    }

    //determine all matches
    for (LinkedList<GenomicElement> LL : CS.getContextMapping().values()){

        LinkedList<GenomicElementAndQueryMatch> TheList = new LinkedList<GenomicElementAndQueryMatch>();
        boolean AddtheSet = false;

        //iterate rather than index: get(i) on a LinkedList is linear per call
        for (GenomicElement Element : LL){

            GenomicElementAndQueryMatch GandE = new GenomicElementAndQueryMatch();
            GandE.setE(Element);

            //default: not a match
            GandE.setQueryMatch(false);

            //check every cluster number, for query match
            for (int j = 0; j < ClusterNumber.length; j++){
                if (Element.getClusterID() == ClusterNumber[j]){
                    AddtheSet = true;
                    GandE.setQueryMatch(true);
                }
            }

            //add this element to the list
            TheList.add(GandE);
        }

        //if even one match was discovered in an LL, add the whole LL.
        if (AddtheSet){
            Hits.add(TheList);
        }
    }

    return Hits;
}
//preprocessed == false
//return a hashset of gene groupings
/**
 * Finds query matches among this genome's elements and groups each match with
 * its genomic neighbourhood, computing the grouping at call time instead of
 * reading a stored context set.
 *
 * Search mode: when Queries is null, elements are matched by cluster ID against
 * ClusterNumbers; otherwise an element matches a query when its annotation
 * contains the trimmed query (case-insensitive) or its gene ID equals the
 * trimmed query (case-insensitive).
 *
 * The grouping strategy is chosen by CSD.getType(): "Range", "GenesAround",
 * "GenesBetween", "MultipleQuery", "IntergenicDist-pre" or "SingleGene". Any
 * other type yields an empty result.
 *
 * @param Queries        text queries, or null to search by cluster number
 * @param ClusterNumbers cluster IDs to search for (only read when Queries is null)
 * @param CSD            description of the grouping strategy and its parameters
 * @return set of groupings; each grouping lists elements tagged with whether
 *         they are query matches
 */
public HashSet<LinkedList<GenomicElementAndQueryMatch>> MatchesOnTheFly(String[] Queries,
int[] ClusterNumbers,
ContextSetDescription CSD){
    HashSet<LinkedList<GenomicElementAndQueryMatch>> Hits =
            new HashSet<LinkedList<GenomicElementAndQueryMatch>>();

    //dispatch on the requested gene grouping protocol
    String Type = CSD.getType();
    if (Type.contentEquals("Range")) {
        onTheFlyRange(Queries, ClusterNumbers, CSD, Hits);
    } else if (Type.contentEquals("GenesAround")) {
        onTheFlyGenesAround(Queries, ClusterNumbers, CSD, Hits);
    } else if (Type.contentEquals("GenesBetween")) {
        onTheFlyGenesBetween(Queries, ClusterNumbers, CSD, Hits);
    } else if (Type.contentEquals("MultipleQuery")) {
        onTheFlyMultipleQuery(Queries, ClusterNumbers, Hits);
    } else if (Type.contentEquals("IntergenicDist-pre")) {
        onTheFlyIntergenicDist(Queries, ClusterNumbers, CSD, Hits);
    } else if (Type.contentEquals("SingleGene")) {
        onTheFlySingleGene(Queries, ClusterNumbers, Hits);
    }
    return Hits;
}

//Number of queries in the active search mode (cluster numbers when Queries is null).
private static int onTheFlyQueryCount(String[] Queries, int[] ClusterNumbers){
    return (Queries == null) ? ClusterNumbers.length : Queries.length;
}

//True when element E matches the j-th query of the active search mode.
private static boolean onTheFlyMatchesAt(GenomicElement E, String[] Queries, int[] ClusterNumbers, int j){
    if (Queries == null){
        return E.getClusterID() == ClusterNumbers[j];
    }
    //annotation is a substring test, gene ID is an exact test; both ignore case
    return E.getAnnotation().toUpperCase().contains(Queries[j].trim().toUpperCase())
            || E.getGeneID().toUpperCase().equals(Queries[j].trim().toUpperCase());
}

//True when element E matches at least one query of the active search mode.
private static boolean onTheFlyIsQueryMatch(GenomicElement E, String[] Queries, int[] ClusterNumbers){
    for (int j = 0; j < onTheFlyQueryCount(Queries, ClusterNumbers); j++){
        if (onTheFlyMatchesAt(E, Queries, ClusterNumbers, j)){
            return true;
        }
    }
    return false;
}

//True when the type of E is listed in FeatureIncludeTypes.
private boolean onTheFlyIsIncludedType(GenomicElement E){
    for (String s : this.FeatureIncludeTypes){
        if (E.getType().contentEquals(s)){
            return true;
        }
    }
    return false;
}

//Wraps an element together with its query-match flag.
private static GenomicElementAndQueryMatch onTheFlyTag(GenomicElement E, boolean IsQueryMatch){
    GenomicElementAndQueryMatch GandE = new GenomicElementAndQueryMatch();
    GandE.setE(E);
    GandE.setQueryMatch(IsQueryMatch);
    return GandE;
}

//Index of the nearest included-type element after From in direction Step (+1/-1), or -1 if none.
private int onTheFlyNextIncluded(ArrayList<GenomicElement> All, int From, int Step){
    for (int q = From + Step; q >= 0 && q < All.size(); q += Step){
        if (onTheFlyIsIncludedType(All.get(q))){
            return q;
        }
    }
    return -1;
}

//"Range": each query match plus included-type elements whose centers lie within the nt ranges.
private void onTheFlyRange(String[] Queries, int[] ClusterNumbers, ContextSetDescription CSD,
        HashSet<LinkedList<GenomicElementAndQueryMatch>> Hits){
    //array-backed snapshot: indexed access on the LinkedList field is linear-time
    ArrayList<GenomicElement> All = new ArrayList<GenomicElement>(this.Elements);

    for (int i = 0; i < All.size(); i++){
        GenomicElement Query = All.get(i);
        if (!onTheFlyIsQueryMatch(Query, Queries, ClusterNumbers)){
            continue;
        }

        LinkedList<GenomicElementAndQueryMatch> LL = new LinkedList<GenomicElementAndQueryMatch>();
        LL.add(onTheFlyTag(Query, true));

        //distances are measured center-to-center from the query
        int Center = Query.getCenter();
        String CurrentContig = Query.getContig();

        //before genes: walk backwards, prepending so the list stays in file order
        int BeforeQuery = Center - Query.getCenter();
        int BeforeCounter = 0;
        boolean EndOfContig = false;
        while (BeforeQuery <= CSD.getNtRangeBefore() && !EndOfContig){
            BeforeCounter++;
            if (i - BeforeCounter >= 0){
                GenomicElement Candidate = All.get(i - BeforeCounter);
                BeforeQuery = Center - Candidate.getCenter();
                //elements of a non-included type are skipped, not treated as a boundary
                if (onTheFlyIsIncludedType(Candidate)){
                    if (CurrentContig.equals(Candidate.getContig())){
                        if (BeforeQuery < CSD.getNtRangeBefore()){
                            LL.add(0, onTheFlyTag(Candidate, false));
                        }
                    } else {
                        EndOfContig = true;
                    }
                }
            } else {
                //ran off the first element in the file
                EndOfContig = true;
            }
        }

        //after genes: walk forwards, appending
        int AfterQuery = Query.getCenter() - Center;
        int AfterCounter = 0;
        EndOfContig = false;
        while (AfterQuery <= CSD.getNtRangeAfter() && !EndOfContig){
            AfterCounter++;
            if (i + AfterCounter < All.size()){
                GenomicElement Candidate = All.get(i + AfterCounter);
                AfterQuery = Candidate.getCenter() - Center;
                if (onTheFlyIsIncludedType(Candidate)){
                    if (CurrentContig.equals(Candidate.getContig())){
                        if (AfterQuery < CSD.getNtRangeAfter()){
                            LL.add(onTheFlyTag(Candidate, false));
                        }
                    } else {
                        EndOfContig = true;
                    }
                }
            } else {
                //ran off the last element in the file
                EndOfContig = true;
            }
        }

        Hits.add(LL);
    }
}

//"GenesAround": each query match plus a fixed number of neighbouring elements on either side.
private void onTheFlyGenesAround(String[] Queries, int[] ClusterNumbers, ContextSetDescription CSD,
        HashSet<LinkedList<GenomicElementAndQueryMatch>> Hits){
    ArrayList<GenomicElement> All = new ArrayList<GenomicElement>(this.Elements);

    for (int i = 0; i < All.size(); i++){
        GenomicElement Query = All.get(i);
        if (!onTheFlyIsQueryMatch(Query, Queries, ClusterNumbers)){
            continue;
        }

        LinkedList<GenomicElementAndQueryMatch> LL = new LinkedList<GenomicElementAndQueryMatch>();
        LL.add(onTheFlyTag(Query, true));
        String CurrentContig = Query.getContig();

        //before genes
        //NOTE(review): upstream neighbours are appended after the query (nearest first),
        //unlike "Range", which prepends; ordering kept as-is - confirm what callers expect.
        int BeforeCounter = 0;
        boolean EndOfContig = false;
        while (BeforeCounter < CSD.getGenesBefore() && !EndOfContig){
            BeforeCounter++;
            //index 0 is a valid neighbour, so the bound is inclusive
            if (i - BeforeCounter >= 0){
                GenomicElement Candidate = All.get(i - BeforeCounter);
                //a non-included type or a contig change both end the walk
                if (onTheFlyIsIncludedType(Candidate)
                        && CurrentContig.equals(Candidate.getContig())){
                    LL.add(onTheFlyTag(Candidate, false));
                } else {
                    EndOfContig = true;
                }
            } else {
                //ran off the first element in the file
                EndOfContig = true;
            }
        }

        //after genes
        int AfterCounter = 0;
        EndOfContig = false;
        while (AfterCounter < CSD.getGenesAfter() && !EndOfContig){
            AfterCounter++;
            if (i + AfterCounter < All.size()){
                GenomicElement Candidate = All.get(i + AfterCounter);
                if (onTheFlyIsIncludedType(Candidate)
                        && CurrentContig.equals(Candidate.getContig())){
                    LL.add(onTheFlyTag(Candidate, false));
                } else {
                    EndOfContig = true;
                }
            } else {
                //ran off the last element in the file
                EndOfContig = true;
            }
        }

        Hits.add(LL);
    }
}

//Closest acceptable partner for Anchor among Candidates, or null when there is none.
//AnchorIsFirst tells whether Anchor came from the first query (it only fixes which
//element is the receiver of the equals() calls, mirroring the two search passes).
private static GenomicElement onTheFlyClosestPartner(GenomicElement Anchor,
        LinkedList<GenomicElement> Candidates, boolean AnchorIsFirst, ContextSetDescription CSD){
    GenomicElement Partner = null;
    int ClosestDistance = 999999999;
    for (GenomicElement Candidate : Candidates){
        GenomicElement E1 = AnchorIsFirst ? Anchor : Candidate;
        GenomicElement E2 = AnchorIsFirst ? Candidate : Anchor;
        int Distance = Math.abs(E1.getCenter() - E2.getCenter());

        //must share a contig, be strictly closer than the best so far, and be a different element
        if (Anchor.getContig().contentEquals(Candidate.getContig())
                && Distance < ClosestDistance
                && !E1.equals(E2)){
            //partners beyond the gap limit are excluded when that option is on
            if (!CSD.isGapLimit() || Distance <= CSD.getGapLimitSize()){
                //strand only matters under operon expansion with same-strand required
                boolean StrandOK = !(CSD.isOperonExpansion && CSD.SameStrandRequired)
                        || E1.getStrand().equals(E2.getStrand());
                if (StrandOK){
                    //store the absolute distance so later candidates compare correctly
                    ClosestDistance = Distance;
                    Partner = Candidate;
                }
            }
        }
    }
    return Partner;
}

//"GenesBetween": pairs matches of the first query with matches of the other queries and
//returns the stretch of included-type elements spanning each pair.
private void onTheFlyGenesBetween(String[] Queries, int[] ClusterNumbers, ContextSetDescription CSD,
        HashSet<LinkedList<GenomicElementAndQueryMatch>> Hits){
    ArrayList<GenomicElement> All = new ArrayList<GenomicElement>(this.Elements);

    //matches of query 0 versus matches of every other query
    LinkedList<GenomicElement> FirstQueries = new LinkedList<GenomicElement>();
    LinkedList<GenomicElement> SecondQueries = new LinkedList<GenomicElement>();
    for (GenomicElement E : All){
        for (int j = 0; j < onTheFlyQueryCount(Queries, ClusterNumbers); j++){
            if (onTheFlyMatchesAt(E, Queries, ClusterNumbers, j)){
                if (j == 0){
                    FirstQueries.add(E);
                } else {
                    SecondQueries.add(E);
                }
            }
        }
    }

    //pairings, always stored as (first-query element, second-query element);
    //the set collapses a pairing that is discovered from both sides
    HashSet<LinkedList<GenomicElement>> Pairs =
            new HashSet<LinkedList<GenomicElement>>();
    for (GenomicElement E1 : FirstQueries){
        GenomicElement Partner = onTheFlyClosestPartner(E1, SecondQueries, true, CSD);
        if (Partner != null){
            LinkedList<GenomicElement> Partnership = new LinkedList<GenomicElement>();
            Partnership.add(E1); Partnership.add(Partner);
            Pairs.add(Partnership);
        }
    }
    for (GenomicElement E2 : SecondQueries){
        GenomicElement Partner = onTheFlyClosestPartner(E2, FirstQueries, false, CSD);
        if (Partner != null){
            LinkedList<GenomicElement> Partnership = new LinkedList<GenomicElement>();
            Partnership.add(Partner); Partnership.add(E2);
            Pairs.add(Partnership);
        }
    }

    //expand every pair into the run of elements between its two members
    for (LinkedList<GenomicElement> Pair : Pairs){
        //locate both members (the last equal element wins, as no early exit is taken)
        int StartingE = -1; int StoppingE = -1;
        for (int i = 0; i < All.size(); i++){
            if (All.get(i).equals(Pair.get(0))){
                StartingE = i;
            }
            if (All.get(i).equals(Pair.get(1))){
                StoppingE = i;
            }
        }
        //re-order so that the walk goes forward through the file
        if (StartingE > StoppingE){
            int temp = StartingE;
            StartingE = StoppingE;
            StoppingE = temp;
        }

        LinkedList<GenomicElementAndQueryMatch> LL = new LinkedList<GenomicElementAndQueryMatch>();
        boolean AddListToMatches = true;

        //the starting element is always taken, whatever its type
        GenomicElement First = All.get(StartingE);
        LL.add(onTheFlyTag(First, true));
        GenomicElement Previous = First;   //most recently added element
        int CurrentListCounter = 0;        //elements added after the starting one

        for (int ElementNumber = StartingE + 1; ElementNumber <= StoppingE; ElementNumber++){
            GenomicElement E = All.get(ElementNumber);
            //only elements of an included type are added - others are skipped
            if (!onTheFlyIsIncludedType(E)){
                continue;
            }
            LL.add(onTheFlyTag(E, ElementNumber == StoppingE));
            CurrentListCounter++;

            //check against max gene count option
            if (CSD.InternalGeneNumberLimit){
                if (CurrentListCounter - 1 > CSD.MaxNumInternalGenes){
                    AddListToMatches = false;
                    break;
                }
            }
            //check against operon expansion options
            if (CSD.isOperonExpansion){
                //same strand violation (compared with the starting element)
                if (CSD.SameStrandRequired
                        && !E.getStrand().equals(First.getStrand())){
                    AddListToMatches = false;
                    break;
                }
                //intergenic distance violation (compared with the previously added element)
                if (E.getStart() - Previous.getStop() > CSD.IntergenicGapLimit){
                    AddListToMatches = false;
                    break;
                }
            }
            Previous = E;
        }

        if (AddListToMatches){
            Hits.add(LL);
        }
    }
}

//"MultipleQuery": one single grouping holding every included-type query match in the genome.
private void onTheFlyMultipleQuery(String[] Queries, int[] ClusterNumbers,
        HashSet<LinkedList<GenomicElementAndQueryMatch>> Hits){
    LinkedList<GenomicElementAndQueryMatch> MQMatches = new LinkedList<GenomicElementAndQueryMatch>();
    for (GenomicElement E : this.Elements){
        //an element is added once per query it matches
        for (int j = 0; j < onTheFlyQueryCount(Queries, ClusterNumbers); j++){
            if (onTheFlyMatchesAt(E, Queries, ClusterNumbers, j) && onTheFlyIsIncludedType(E)){
                MQMatches.add(onTheFlyTag(E, true));
            }
        }
    }
    //NOTE(review): the list is added even when it is empty (unchanged behaviour) - confirm
    //that callers expect an empty grouping rather than an empty result set.
    Hits.add(MQMatches);
}

//"IntergenicDist-pre": grows an operon-like chain in both directions from each query match,
//adding the next included-type element while contig, spacing and strand rules hold.
private void onTheFlyIntergenicDist(String[] Queries, int[] ClusterNumbers, ContextSetDescription CSD,
        HashSet<LinkedList<GenomicElementAndQueryMatch>> Hits){
    ArrayList<GenomicElement> All = new ArrayList<GenomicElement>(this.Elements);

    //query matches seen so far, and the untagged chains (set removes identical chains)
    HashSet<GenomicElement> QueryMatchSet = new HashSet<GenomicElement>();
    HashSet<LinkedList<GenomicElement>> E_Hits =
            new HashSet<LinkedList<GenomicElement>>();

    for (int i = 0; i < All.size(); i++){
        GenomicElement Query = All.get(i);
        if (!onTheFlyIsQueryMatch(Query, Queries, ClusterNumbers)){
            continue;
        }
        QueryMatchSet.add(Query);

        LinkedList<GenomicElement> LL = new LinkedList<GenomicElement>();
        LL.add(Query);

        // ----- Add upstream ---- //
        GenomicElement E_curr = Query;
        int GeneNumber = i;
        while (true){
            GeneNumber = onTheFlyNextIncluded(All, GeneNumber, -1);
            if (GeneNumber < 0){
                break;  //no more valid elements
            }
            GenomicElement E_can = All.get(GeneNumber);
            boolean Add2Operon =
                    E_can.getContig().equals(E_curr.getContig())                         //contig match
                    && E_curr.getStart() - E_can.getStop() <= CSD.getIntGenSpacing()     //distance match
                    && (!CSD.isNeedSameStrand()
                            || E_can.getStrand().equals(E_curr.getStrand()));            //strand match
            if (!Add2Operon){
                break;  //once you stop adding, no going back
            }
            LL.add(0, E_can);
            E_curr = E_can;  //the next comparison is against the element just added
        }

        // ----- Add downstream ---- //
        E_curr = Query;
        GeneNumber = i;
        while (true){
            GeneNumber = onTheFlyNextIncluded(All, GeneNumber, 1);
            if (GeneNumber < 0){
                break;
            }
            GenomicElement E_can = All.get(GeneNumber);
            boolean Add2Operon =
                    E_can.getContig().equals(E_curr.getContig())
                    && E_can.getStart() - E_curr.getStop() <= CSD.getIntGenSpacing()
                    && (!CSD.isNeedSameStrand()
                            || E_can.getStrand().equals(E_curr.getStrand()));
            if (!Add2Operon){
                break;
            }
            LL.add(E_can);
            E_curr = E_can;
        }

        E_Hits.add(LL);
    }

    //build up actual hits - tag each chain member that is itself a query match
    for (LinkedList<GenomicElement> LL : E_Hits){
        LinkedList<GenomicElementAndQueryMatch> LLq
                = new LinkedList<GenomicElementAndQueryMatch>();
        for (GenomicElement E : LL){
            LLq.add(onTheFlyTag(E, QueryMatchSet.contains(E)));
        }
        Hits.add(LLq);
    }
}

//"SingleGene": every query match becomes a grouping of its own.
private void onTheFlySingleGene(String[] Queries, int[] ClusterNumbers,
        HashSet<LinkedList<GenomicElementAndQueryMatch>> Hits){
    for (GenomicElement E : this.Elements){
        if (onTheFlyIsQueryMatch(E, Queries, ClusterNumbers)){
            LinkedList<GenomicElementAndQueryMatch> LL
                    = new LinkedList<GenomicElementAndQueryMatch>();
            LL.add(onTheFlyTag(E, true));
            Hits.add(LL);
        }
    }
}
//----------------------- GETTERS+SETTERS ------------------------//
//Getters and Setters
/**
 * @return the value of the Genus field
 */
public String getGenus() {
    return this.Genus;
}
/**
 * Stores a new value in the Genus field.
 *
 * @param genus value to store
 */
public void setGenus(String genus) {
    this.Genus = genus;
}
/**
 * @return the value of the Species field
 */
public String getSpecies() {
    return this.Species;
}
/**
 * Stores a new value in the Species field.
 *
 * @param species value to store
 */
public void setSpecies(String species) {
    this.Species = species;
}
/**
 * @return the Elements list itself (no copy is made)
 */
public LinkedList<GenomicElement> getElements() {
    return this.Elements;
}
/**
 * Replaces the Elements list with the supplied reference.
 *
 * @param elements list to store (not copied)
 */
public void setElements(LinkedList<GenomicElement> elements) {
    this.Elements = elements;
}
/**
 * @return the value of the GenomeFile field
 */
public File getGenomeFile() {
    return this.GenomeFile;
}
/**
 * Stores a new value in the GenomeFile field.
 *
 * @param genomeFile file reference to store
 */
public void setGenomeFile(File genomeFile) {
    this.GenomeFile = genomeFile;
}
/**
 * Returns the Groupings list, creating and storing an empty list first when
 * the field is currently null.
 *
 * @return the (never null) Groupings list
 */
public LinkedList<ContextSet> getGroupings() {
    //fast path: the list already exists
    if (this.Groupings != null){
        return this.Groupings;
    }
    //otherwise create it on demand and remember it
    this.Groupings = new LinkedList<ContextSet>();
    return this.Groupings;
}
/**
 * Replaces the Groupings list with the supplied reference.
 *
 * @param groupings list to store (not copied)
 */
public void setGroupings(LinkedList<ContextSet> groupings) {
    this.Groupings = groupings;
}
/**
 * @return the value of the TryToComputeOperons flag
 */
public boolean isTryToComputeOperons() {
    return this.TryToComputeOperons;
}
/**
 * Sets the TryToComputeOperons flag.
 *
 * @param tryToComputeOperons new flag value
 */
public void setTryToComputeOperons(boolean tryToComputeOperons) {
    this.TryToComputeOperons = tryToComputeOperons;
}
/**
 * @return the FeatureIncludeTypes list itself (no copy is made)
 */
public LinkedList<String> getIncludeTypes() {
    return this.FeatureIncludeTypes;
}
/**
 * Replaces the FeatureIncludeTypes list with the supplied reference.
 *
 * @param includeTypes list to store (not copied)
 */
public void setIncludeTypes(LinkedList<String> includeTypes) {
    this.FeatureIncludeTypes = includeTypes;
}
/**
 * @return the FeatureDisplayTypes list itself (no copy is made)
 */
public LinkedList<String> getDisplayOnlyTypes() {
    return this.FeatureDisplayTypes;
}
/**
 * Replaces the FeatureDisplayTypes list with the supplied reference.
 *
 * @param displayOnlyTypes list to store (not copied)
 */
public void setDisplayOnlyTypes(LinkedList<String> displayOnlyTypes) {
    this.FeatureDisplayTypes = displayOnlyTypes;
}
/**
 * @return the Motifs list itself (no copy is made)
 */
public LinkedList<MotifGroup> getMotifs() {
    return this.Motifs;
}
/**
 * Replaces the Motifs list with the supplied reference.
 *
 * @param motifs list to store (not copied)
 */
public void setMotifs(LinkedList<MotifGroup> motifs) {
    this.Motifs = motifs;
}
/**
 * @return the value of the AGClustersLoaded flag
 */
public boolean isAGClustersLoaded() {
    return this.AGClustersLoaded;
}
/**
 * Sets the AGClustersLoaded flag.
 *
 * @param aGClustersLoaded new flag value
 */
public void setAGClustersLoaded(boolean aGClustersLoaded) {
    this.AGClustersLoaded = aGClustersLoaded;
}
/**
 * @return the value of the TextDescription field
 */
public String getTextDescription() {
    return this.TextDescription;
}
/**
 * Stores a new value in the TextDescription field.
 *
 * @param textDescription text to store
 */
public void setTextDescription(String textDescription) {
    this.TextDescription = textDescription;
}
/**
 * @return the value of the GenbankID field
 */
public String getGenbankID() {
    return this.GenbankID;
}
/**
 * Stores a new value in the GenbankID field.
 *
 * @param genbankID identifier to store
 */
public void setGenbankID(String genbankID) {
    this.GenbankID = genbankID;
}
/**
 * @return the value of the GFM field
 */
public GBKFieldMapping getGFM() {
    return this.GFM;
}
/**
 * Stores a new value in the GFM field.
 *
 * @param gFM field mapping to store
 */
public void setGFM(GBKFieldMapping gFM) {
    this.GFM = gFM;
}
/**
 * @return the ContigEnds map itself (no copy is made)
 */
public LinkedHashMap<String, Integer> getContigEnds() {
    return this.ContigEnds;
}
/**
 * Replaces the ContigEnds map with the supplied reference.
 *
 * @param contigEnds map to store (not copied)
 */
public void setContigEnds(LinkedHashMap<String, Integer> contigEnds) {
    this.ContigEnds = contigEnds;
}
/**
 * @return the value of the LargestCluster field
 */
public Integer getLargestCluster() {
    return this.LargestCluster;
}
/**
 * Stores a new value in the LargestCluster field.
 *
 * @param largestCluster value to store
 */
public void setLargestCluster(Integer largestCluster) {
    this.LargestCluster = largestCluster;
}
/**
 * @return the value of the GenomeSequenceFile field
 */
public String getGenomeSequenceFile() {
    return this.GenomeSequenceFile;
}
/**
 * Stores a new value in the GenomeSequenceFile field.
 *
 * @param genomeSequenceFile value to store
 */
public void setGenomeSequenceFile(String genomeSequenceFile) {
    this.GenomeSequenceFile = genomeSequenceFile;
}
//-----------------------Deprecated ----------------------//
//DEPRECATED biojava this function simply returns a DNA sequence from a particular genome file.
/**
 * Reads the FASTA file named by GenomeSequenceFile and extracts a subsequence
 * from the first record whose original header contains the given contig name.
 *
 * The whole file is parsed on every call. On a match the coordinates and
 * strand are also printed to standard output.
 *
 * @param contig text to look for in each record's original header
 * @param start  start coordinate, passed through to getSequenceAsString
 * @param stop   stop coordinate, passed through to getSequenceAsString
 * @param strand strand, passed through to getSequenceAsString
 * @return the upper-cased subsequence, or null when no header matches or when
 *         any exception occurs (the stack trace is printed in that case)
 */
public String retrieveSequence(String contig, int start, int stop, Strand strand){
    //stays null unless a matching record is found
    String extracted = null;
    try {
        //parse every record of the genome file
        LinkedHashMap<String, DNASequence> records =
                FastaReaderHelper.readFastaDNASequence(new File(GenomeSequenceFile));
        //scan records in file order, stopping at the first header hit
        for (DNASequence record : records.values()) {
            if (!record.getOriginalHeader().contains(contig)){
                continue;
            }
            extracted = record.getSequenceAsString(start, stop, strand).toUpperCase();
            System.out.println("Start: " + start + " Stop: " + stop + " Strand: " + strand);
            break;
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return extracted;
}
/**
 * @return the value of the SeqsFromFile flag
 */
public boolean isSeqsFromFile() {
    return this.SeqsFromFile;
}
/**
 * Sets the SeqsFromFile flag.
 *
 * @param seqsFromFile new flag value
 */
public void setSeqsFromFile(boolean seqsFromFile) {
    this.SeqsFromFile = seqsFromFile;
}
} //completes classbody