package au.com.acpfg.misc.biojava;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.biojava.bio.seq.DNATools;
import org.biojava.bio.seq.RNATools;
import org.biojava.bio.symbol.SimpleSymbolList;
import org.biojava.bio.symbol.SymbolList;
import org.knime.core.data.DataCell;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataColumnSpecCreator;
import org.knime.core.data.DataRow;
import org.knime.core.data.DataTableSpec;
import org.knime.core.data.DataType;
import org.knime.core.data.RowIterator;
import org.knime.core.data.collection.CollectionCellFactory;
import org.knime.core.data.collection.ListCell;
import org.knime.core.data.def.DefaultRow;
import org.knime.core.data.def.IntCell;
import org.knime.core.data.def.JoinedRow;
import org.knime.core.data.def.StringCell;
import org.knime.core.node.BufferedDataContainer;
import org.knime.core.node.BufferedDataTable;
import org.knime.core.node.ExecutionContext;
import org.knime.core.node.InvalidSettingsException;
import org.knime.core.node.NodeLogger;
public class LongestFrameProcessor implements BioJavaProcessorInterface {
private boolean m_start_codon, m_stop_codon;
private boolean m_convert_to_protein;
private boolean m_forward, m_reverse;
public LongestFrameProcessor(BioJavaProcessorNodeModel m, String task) {
m_convert_to_protein = (task != null && task.trim().toLowerCase().endsWith("AA)")) ? true : false;
m_forward = (task != null && (task.indexOf("(all") > 0 || task.indexOf("(3 forward") > 0)) ? true : false;
m_reverse = (task != null && (task.indexOf("(all") > 0 || task.indexOf("(3 reverse") > 0)) ? true : false;
}
@Override
public void execute(BioJavaProcessorNodeModel m, ExecutionContext exec,
NodeLogger l, BufferedDataTable[] inData, BufferedDataContainer c)
throws Exception {
if (!m.areSequencesDNA()) {
throw new InvalidSettingsException("Only DNA sequences are currently supported. Re-configure...");
}
int n_rows = inData[0].getRowCount();
int done = 0;
RowIterator it = inData[0].iterator();
Pattern p = Pattern.compile("ATG((?:[^T]..)+?)T((?:GA)|(?:AG)|(?:AA))");
SymbolList rev_syms = null;
String rev_seq = null;
while (it.hasNext()) {
DataRow r = it.next();
// compute distance from methionine AA to stop codon
String seq= m.getSequence(r).toUpperCase();
if (m_reverse) {
rev_syms = DNATools.complement(DNATools.createDNA(seq)); // rev_syms is lowercase...
rev_seq = new StringBuffer(rev_syms.seqString()).reverse().toString().toUpperCase();
}
String seq_best = "";
int seq_dist = -1;
int seq_frame= 0;
int found_start = 0;
int found_stop = 0;
for (int offset=0; offset < 3; offset++) {
// try the forward frame first
if (m_forward) {
String rf = seq.substring(offset, seq.length() - (seq.length() - offset) % 3);
int best_dist = find_best_dist(p, rf);
if (best_dist > seq_dist) {
seq_dist = best_dist;
seq_best = rf;
seq_frame = offset+1;
found_start += m_start_codon ? 1 : 0;
found_stop += m_stop_codon ? 1 : 0;
}
}
// now the reverse
if (m_reverse) {
String rf = rev_seq.substring(offset, seq.length() - (seq.length() - offset) % 3);
//l.info(rf.substring(0,100));
int best_dist = find_best_dist(p, rf);
if (best_dist > seq_dist) {
seq_dist = best_dist;
seq_best = rf;
seq_frame = -(offset + 1);
found_start += m_start_codon ? 1 : 0;
found_stop += m_stop_codon ? 1 : 0;
}
}
}
DataCell[] cells = new DataCell[4];
if (seq_dist >= 0) {
cells[0] = dna2cell(seq_best);
cells[1] = new IntCell(seq_frame);
cells[2] = new IntCell(found_start);
cells[3] = new IntCell(found_stop);
} else {
cells[0] = DataType.getMissingCell();
cells[1] = new IntCell(1);
cells[2] = new IntCell(0);
cells[3] = new IntCell(0);
}
DataRow row = new DefaultRow(r.getKey(), cells);
c.addRowToTable(new JoinedRow(r, row));
done++;
if (done % 100 == 0) {
exec.checkCanceled();
exec.setProgress(((double)done) / n_rows, "Processed "+done+" sequences.");
}
}
}
protected StringCell dna2cell(String rf) throws Exception {
StringCell ret;
if (m_convert_to_protein) {
SymbolList syms = DNATools.createDNA(rf);
syms = DNATools.toRNA(syms);
// ensure multiple of 3 (trim excess)
if (syms.length() % 3 != 0) {
syms = syms.subList(1, syms.length() - (syms.length() % 3));
}
SymbolList prot = RNATools.translate(syms);
ret = new StringCell(prot.seqString());
} else {
ret = new StringCell(rf);
}
return ret;
}
protected int find_best_dist(Pattern p, String rf) {
Matcher matcher = p.matcher(rf);
int best_dist = -1;
int base = 0;
/*
* if the DNA sequence does not contain a stop codon, it's best distance extends from the start codon to
* the end of the sequence, otherwise we use the pattern p to determine the best distance for the transcript
*/
boolean found_start = false;
boolean found_stop = false;
while ((base = rf.indexOf("ATG", base)) >= 0) {
if (base % 3 != 0) {
base++;
continue;
}
found_start = true;
// no stop codon available?
if (rf.indexOf("TAA", base+3) < 0 && rf.indexOf("TAG", base+3) < 0 && rf.indexOf("TGA", base+3) < 0) {
m_start_codon = true;
m_stop_codon = false;
return rf.length() - base;
}
// else compute distance in terms of the number of nucleotides between start & stop codons
boolean found = false;
while (matcher.find(base)) {
String dist = matcher.group(1);
int len = dist.length();
if (len > best_dist) {
best_dist = len;
}
base += len + 3;
found = true;
found_stop = true;
}
if (!found) {
base += 3; // ensure no infinite loop by skipping to next codon
}
}
m_start_codon = found_start;
m_stop_codon = found_stop;
// if we could not find a start codon, the we compute the distance to any available stop codon instead
if (! m_start_codon) {
int end_codon1 = -1;
int end_codon2 = -1;
int end_codon3 = -1;
base = 0;
while ((end_codon1 = rf.indexOf("TAA", base)) >= 0) {
if (end_codon1 % 3 == 0)
break;
base = end_codon1 + 3;
}
base = 0;
while ((end_codon2 = rf.indexOf("TAG", base)) >= 0) {
if (end_codon2 % 3 == 0)
break;
base = end_codon2 + 3;
}
base = 0;
while ((end_codon3 = rf.indexOf("TGA", base)) >= 0) {
if (end_codon3 % 3 == 0)
break;
base = end_codon3 + 3;
}
if (end_codon1 >= 0 || end_codon2 >= 0 || end_codon3 >= 0) {
if (end_codon1 < 0)
end_codon1 = Integer.MAX_VALUE;
if (end_codon2 < 0)
end_codon2 = Integer.MAX_VALUE;
if (end_codon3 < 0)
end_codon3 = Integer.MAX_VALUE;
best_dist = Math.min(end_codon1, end_codon2);
best_dist = Math.min(end_codon3, best_dist);
m_stop_codon = true;
}
}
return best_dist;
}
@Override
public DataTableSpec get_table_spec() {
DataColumnSpec[] allColSpecs = new DataColumnSpec[4];
allColSpecs[0] =
new DataColumnSpecCreator("Longest Reading Frame Sequence", StringCell.TYPE).createSpec();
allColSpecs[1] =
new DataColumnSpecCreator("Chosen Frame", IntCell.TYPE).createSpec();
allColSpecs[2] =
new DataColumnSpecCreator("Start codons (total across reading frames)", IntCell.TYPE).createSpec();
allColSpecs[3] =
new DataColumnSpecCreator("Stop codons (total across reading frames)", IntCell.TYPE).createSpec();
// allColSpecs[4] =
// new DataColumnSpecCreator("debug", ListCell.getCollectionType(IntCell.TYPE)).createSpec();
return new DataTableSpec(allColSpecs);
}
@Override
public boolean isMerged() {
return true;
}
}