package au.com.acpfg.misc.biojava;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import org.knime.core.data.DataCell;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataColumnSpecCreator;
import org.knime.core.data.DataRow;
import org.knime.core.data.DataTableSpec;
import org.knime.core.data.RowKey;
import org.knime.core.data.def.DefaultRow;
import org.knime.core.data.def.DoubleCell;
import org.knime.core.data.def.IntCell;
import org.knime.core.data.def.StringCell;
import org.knime.core.node.BufferedDataContainer;
import org.knime.core.node.BufferedDataTable;
import org.knime.core.node.CanceledExecutionException;
import org.knime.core.node.defaultnodesettings.*;
import org.knime.core.node.ExecutionContext;
import org.knime.core.node.ExecutionMonitor;
import org.knime.core.node.InvalidSettingsException;
import org.knime.core.node.NodeLogger;
import org.knime.core.node.NodeModel;
import org.knime.core.node.NodeSettingsRO;
import org.knime.core.node.NodeSettingsWO;
import org.biojava.bio.symbol.*;
import org.biojava.bio.seq.*;
import au.com.acpfg.misc.biojava.AlternateTranslationProcessor;
import au.com.acpfg.misc.biojava.BioJavaProcessorInterface;
import au.com.acpfg.misc.biojava.FrameTranslationProcessor;
import au.com.acpfg.misc.biojava.HydrophobicityProcessor;
import au.com.acpfg.misc.biojava.SequenceTranslationProcessor;
/**
 * This is the model implementation of BioJavaProcessor.
 * Analyses the specified data using BioJava (see http://www.biojava.org) and produces the result at output.
 *
 * <p>The model is constructed with one input and one output port. The user selects a task
 * (see {@link #getTasks()}), the column that holds the sequences and the sequence type
 * (Protein, DNA or RNA); the actual work is delegated to a {@link BioJavaProcessorInterface}
 * implementation chosen by {@link #make_biojava_processor(String)}.
 *
 * @author Andrew Cassin
 */
public class BioJavaProcessorNodeModel extends NodeModel {
    // the logger instance
    private static final NodeLogger logger = NodeLogger
            .getLogger(BioJavaProcessorNodeModel.class);

    /** the settings keys which are used to retrieve and
        store the settings (from the dialog or from a settings file)
        (package visibility to be usable from the dialog). */
    static final String CFGKEY_TASK = "task";
    static final String CFGKEY_SEQUENCE_COL = "sequence-column";
    static final String CFGKEY_SEQTYPE = "sequence-type";
    static final String CFGKEY_MAXLEN = "max-seq-length";

    /** initial default task: the first entry of the (sorted) task list */
    private static final String DEFAULT_TASK = getTasks()[0];
    private static final String DEFAULT_SEQUENCE_COL = "Sequence";
    private static final String DEFAULT_SEQTYPE = "Protein";
    private static final int DEFAULT_MAXLEN = 75; // for Illumina short reads

    // User-configurable settings. Each model is created through make() so that
    // the key and default value are defined in exactly one place.
    private final SettingsModelString m_task = make_as_string(CFGKEY_TASK);
    private final SettingsModelString m_sequence_column = make_as_string(CFGKEY_SEQUENCE_COL);
    private final SettingsModelString m_seqtype = make_as_string(CFGKEY_SEQTYPE);
    private final SettingsModelInteger m_maxlen = (SettingsModelInteger) make(CFGKEY_MAXLEN);

    // state which is not persisted
    private int m_sequence_idx;         // index of the sequence column in the input table, -1 if unknown
    private boolean is_protein;         // cached areSequencesProtein(), refreshed by execute()
    private boolean is_dna;             // cached areSequencesDNA(), refreshed by execute()
    private boolean is_rna;             // cached areSequencesRNA(), refreshed by execute()
    private boolean m_warned_bad_chars; // a warning is logged if likely non-NA/AA letters are encountered during processing
    private int m_bad_char_count;       // number of characters dropped by getSequence() during the current run

    /**
     * Constructor for the node model.
     */
    protected BioJavaProcessorNodeModel() {
        super(1, 1);
        m_sequence_idx = -1;
    }

    /**
     * Creates the settings model for one of the <code>CFGKEY_*</code> keys, initialised
     * with the corresponding default value.
     *
     * @param cfgkey one of the <code>CFGKEY_*</code> constants of this class
     * @return a new settings model for the key, or <code>null</code> if the key is not recognised
     */
    public static SettingsModel make(String cfgkey) {
        if (cfgkey.equals(CFGKEY_TASK)) {
            return new SettingsModelString(CFGKEY_TASK, DEFAULT_TASK);
        } else if (cfgkey.equals(CFGKEY_SEQUENCE_COL)) {
            return new SettingsModelString(CFGKEY_SEQUENCE_COL, DEFAULT_SEQUENCE_COL);
        } else if (cfgkey.equals(CFGKEY_SEQTYPE)) {
            return new SettingsModelString(CFGKEY_SEQTYPE, DEFAULT_SEQTYPE);
        } else if (cfgkey.equals(CFGKEY_MAXLEN)) {
            SettingsModel sm = new SettingsModelIntegerBounded(CFGKEY_MAXLEN, DEFAULT_MAXLEN, 1, 10000000);
            sm.setEnabled(false); // since the default task is not the residue-by-position task
            return sm;
        }
        return null;
    }

    /**
     * Convenience wrapper around {@link #make(String)} for the string-valued settings.
     *
     * @param cfgkey key of a string-valued setting (task, sequence column or sequence type)
     * @return the settings model cast to {@link SettingsModelString}
     */
    public static SettingsModelString make_as_string(String cfgkey) {
        return (SettingsModelString) make(cfgkey);
    }

    /** @return true if the configured sequence type is exactly "Protein" */
    public boolean areSequencesProtein() {
        return m_seqtype.getStringValue().equals("Protein");
    }

    /** @return true if the configured sequence type is exactly "DNA" */
    public boolean areSequencesDNA() {
        return m_seqtype.getStringValue().equals("DNA");
    }

    /** @return true if the configured sequence type is exactly "RNA" */
    public boolean areSequencesRNA() {
        return m_seqtype.getStringValue().equals("RNA");
    }

    /**
     * Lists the names of all supported tasks.
     *
     * @return a freshly allocated array of task names in ascending (natural string) order
     */
    public static String[] getTasks() {
        String[] ret = new String[] {"Hydrophobicity, pI and total mass",
                "Convert DNA to RNA (Universal translation only)",
                "Convert RNA to Protein Sequence",
                "Convert DNA to Protein Sequence",
                "Alternate translation of DNA to Protein (all built-in tables)",
                "Count Residues",
                "Count Di-mers (overlapping)",
                "Residue Frequency by Position",
                "Longest reading frame (all 6 frames, DNA)",
                "Longest reading frame (3 forward frames, DNA)",
                "Longest reading frame (3 reverse frames, DNA)",
                "Longest reading frame (all 6 frames, AA)",
                "Longest reading frame (3 forward frames, AA)",
                "Longest reading frame (3 reverse frames, AA)",
                "Weighted Homopolymer Rate (WHR)",
                "SNP-assisted frameshift detection",
                "Tryptic Peptide Extraction (all 6 frames iff DNA/RNA, supports IUPAC code conversion)",
                "Six-Frame nucleotide translation (excl. NA frames)",
                "Six-Frame nucleotide translation (incl. NA frames)"
        };
        Arrays.sort(ret);
        return ret;
    }

    /**
     * Selects the processor which implements the given task. The task is matched
     * on the leading words of its name, so the names returned by {@link #getTasks()}
     * and the prefixes tested here must be kept in step.
     *
     * @param task task name, normally one of the values returned by {@link #getTasks()}
     * @return a new processor instance for the task
     * @throws Exception an {@link InvalidSettingsException} if no processor matches the task;
     *                   whatever the processor constructors throw is propagated as-is
     */
    public BioJavaProcessorInterface make_biojava_processor(String task) throws Exception {
        if (task.startsWith("Hydrophobicity")) {
            return new HydrophobicityProcessor();
        } else if (task.startsWith("Six")) {
            return new FrameTranslationProcessor(task);
        } else if (task.startsWith("Convert")) {
            return new SequenceTranslationProcessor(this, task);
        } else if (task.startsWith("Alternate translation")) {
            return new AlternateTranslationProcessor();
        } else if (task.startsWith("Count")) {
            return new ResidueFrequencyProcessor(this, task);
        } else if (task.equals("Residue Frequency by Position")) {
            return new PositionByResidueProcessor(this, task, m_maxlen.getIntValue());
        } else if (task.startsWith("Longest reading frame")) {
            return new LongestFrameProcessor(this, task);
        } else if (task.startsWith("Weighted")) {
            return new WeightedHomopolymerRateProcessor(this, task);
        } else if (task.startsWith("SNP")) {
            return new SNPFrameshiftDetector(this, task);
        } else if (task.startsWith("Tryptic")) {
            return new TrypticPeptideExtractor_v2(this, task);
        }
        // the task is a user setting, so report it with the settings exception type
        throw new InvalidSettingsException("Unknown BioJava task to perform! Probably a bug...");
    }

    /**
     * Retrieve the sequence as letters only in the user-configured cell. Other characters
     * are removed as this would upset biojava conversion (which would silently fail).
     * Letters, '-' and '*' are kept; every other character is dropped and counted, and a
     * warning is logged for the first dropped character of a run.
     *
     * @param r the row to read; the cell at the configured sequence column is used
     * @return the filtered sequence; empty if the cell is missing or holds no acceptable characters
     */
    public String getSequence(DataRow r) {
        assert m_sequence_idx >= 0;
        // A missing cell carries no sequence at all: return an empty sequence directly
        // rather than filtering the cell's placeholder text and reporting it as bad characters.
        if (r.getCell(m_sequence_idx).isMissing()) {
            return "";
        }
        String val = r.getCell(m_sequence_idx).toString();
        int len = val.length();
        // purely local buffer, so the unsynchronized StringBuilder is sufficient
        StringBuilder sb = new StringBuilder(len);
        for (int i = 0; i < len; i++) {
            char c = val.charAt(i);
            if (Character.isLetter(c) || c == '-' || c == '*') {
                sb.append(c);
            } else {
                m_bad_char_count++;
                if (!m_warned_bad_chars) {
                    logger.warn("Encountered non-letter symbol: "+c+" results may be incorrect (character ignored)");
                    m_warned_bad_chars = true;
                }
            }
        }
        return sb.toString();
    }

    /**
     * Converts a sequence string into a BioJava symbol list, using the sequence type
     * cached by the most recent {@link #execute} call (protein unless DNA or RNA was selected).
     *
     * @param seq the sequence letters, must not be null or empty
     * @return the symbol list created by the matching BioJava tools class
     * @throws Exception an {@link InvalidSettingsException} for a null or empty sequence;
     *                   anything thrown by the BioJava conversion is propagated as-is
     */
    public SymbolList getSequenceAsSymbol(String seq) throws Exception {
        if (seq == null || seq.length() < 1) {
            throw new InvalidSettingsException("Encountered a non-existant sequence - please fix!");
        }
        // SPEED: use is_* members rather than the slower areSequences*() methods
        if (is_dna) {
            return DNATools.createDNA(seq);
        } else if (is_rna) {
            return RNATools.createRNA(seq);
        } else {
            return ProteinTools.createProtein(seq);
        }
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected BufferedDataTable[] execute(final BufferedDataTable[] inData,
            final ExecutionContext exec) throws Exception {
        // the processor for the chosen task defines the columns it produces; when it
        // reports isMerged() those columns are appended to the input table's columns
        BioJavaProcessorInterface bjpi = make_biojava_processor(m_task.getStringValue());
        DataTableSpec inSpec = inData[0].getDataTableSpec();
        DataTableSpec result_spec = bjpi.get_table_spec();
        DataTableSpec outputSpec;
        if (bjpi.isMerged()) {
            outputSpec = new DataTableSpec("BioJava Processor Specification", inSpec, result_spec);
        } else {
            outputSpec = result_spec;
        }

        is_protein = areSequencesProtein(); // cache answers for speed
        is_dna = areSequencesDNA();
        is_rna = areSequencesRNA();

        // locate the sequence column BEFORE allocating the output container, so that a
        // bad column name does not leave an open container behind
        m_sequence_idx = inSpec.findColumnIndex(m_sequence_column.getStringValue());
        if (m_sequence_idx < 0) {
            throw new InvalidSettingsException("Cannot find column: "+m_sequence_column.getStringValue());
        }

        // the execution context will provide us with storage capacity, in this
        // case a data container to which we will add rows sequentially
        // Note, this container can also handle arbitrary big data tables, it
        // will buffer to disc if necessary.
        BufferedDataContainer container = exec.createDataContainer(outputSpec);

        m_warned_bad_chars = false;
        m_bad_char_count = 0;
        bjpi.execute(this, exec, logger, inData, container);
        if (m_bad_char_count > 0) {
            logger.warn("WARNING: encountered "+m_bad_char_count+" non-residue symbols during processing. Results may be incorrect!");
        }

        // once we are done, we close the container and return its table
        container.close();
        BufferedDataTable out = container.getTable();
        return new BufferedDataTable[]{out};
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void reset() {
        // drop the per-execution state so nothing stale survives into the next run
        m_sequence_idx = -1;
        m_warned_bad_chars = false;
        m_bad_char_count = 0;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected DataTableSpec[] configure(final DataTableSpec[] inSpecs)
            throws InvalidSettingsException {
        // Report a wrong sequence column at configure time rather than only when the
        // node is executed. The check is skipped when no input spec is supplied.
        if (inSpecs != null && inSpecs.length > 0 && inSpecs[0] != null) {
            if (inSpecs[0].findColumnIndex(m_sequence_column.getStringValue()) < 0) {
                throw new InvalidSettingsException("Cannot find column: "+m_sequence_column.getStringValue());
            }
        }
        // the output spec depends on the processor chosen at execution time, so it is
        // not announced in advance
        return new DataTableSpec[]{null};
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void saveSettingsTo(final NodeSettingsWO settings) {
        m_task.saveSettingsTo(settings);
        m_sequence_column.saveSettingsTo(settings);
        m_seqtype.saveSettingsTo(settings);
        m_maxlen.saveSettingsTo(settings);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void loadValidatedSettingsFrom(final NodeSettingsRO settings)
            throws InvalidSettingsException {
        m_task.loadSettingsFrom(settings);
        m_sequence_column.loadSettingsFrom(settings);
        m_seqtype.loadSettingsFrom(settings);
        m_maxlen.loadSettingsFrom(settings);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void validateSettings(final NodeSettingsRO settings)
            throws InvalidSettingsException {
        m_task.validateSettings(settings);
        m_sequence_column.validateSettings(settings);
        m_seqtype.validateSettings(settings);
        m_maxlen.validateSettings(settings);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void loadInternals(final File internDir,
            final ExecutionMonitor exec) throws IOException,
            CanceledExecutionException {
        // no internal state to restore
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void saveInternals(final File internDir,
            final ExecutionMonitor exec) throws IOException,
            CanceledExecutionException {
        // no internal state to persist
    }
}