package au.com.acpfg.misc.biojava; import org.knime.core.data.DataCell; import org.knime.core.data.DataColumnSpec; import org.knime.core.data.DataColumnSpecCreator; import org.knime.core.data.DataRow; import org.knime.core.data.DataTableSpec; import org.knime.core.data.DataType; import org.knime.core.data.RowIterator; import org.knime.core.data.def.DefaultRow; import org.knime.core.data.def.DoubleCell; import org.knime.core.data.def.JoinedRow; import org.knime.core.node.BufferedDataContainer; import org.knime.core.node.BufferedDataTable; import org.knime.core.node.ExecutionContext; import org.knime.core.node.InvalidSettingsException; import org.knime.core.node.NodeLogger; public class WeightedHomopolymerRateProcessor implements BioJavaProcessorInterface { public WeightedHomopolymerRateProcessor(BioJavaProcessorNodeModel m,String task) { } @Override public void execute(BioJavaProcessorNodeModel m, ExecutionContext exec, NodeLogger l, BufferedDataTable[] inData, BufferedDataContainer c) throws Exception { if (!m.areSequencesDNA()) { throw new InvalidSettingsException("Only DNA sequences are currently supported. No ambiguity in DNA sequence is permitted."); } RowIterator it = inData[0].iterator(); int done = 0; int n_rows = inData[0].getRowCount(); int total_bad = 0; while (it.hasNext()) { DataRow r = it.next(); String seq = m.getSequence(r).trim().toUpperCase(); // calculation as per http://www.broadinstitute.org/crd/wiki/index.php/Homopolymer int idx = 0; int cnt = 0; int sum = 0; int seq_len = seq.length(); boolean bad = false; while (idx < seq_len) { cnt++; char residue = seq.charAt(idx); if (residue != 'A' && residue != 'C' && residue != 'T' && residue != 'G') { bad = true; } idx++; int len = 1; while (idx < seq_len && seq.charAt(idx) == residue) { idx++; len++; } sum += len * len; } DataCell[] cells = new DataCell[1]; if (bad) { cells[0] = DataType.getMissingCell(); total_bad++; } else { double whr = ((double)sum) / cnt; cells[0] = new DoubleCell(whr); } c.addRowToTable(new JoinedRow(r, new DefaultRow(r.getKey(), cells))); done++; if (done % 100 == 0) { exec.checkCanceled(); exec.setProgress(((double)done)/n_rows, "Computed WHR for "+r.getKey()); } } if (total_bad > 0) { l.warn(""+total_bad+" sequences had ambiguous/unknown base calls, they are ignored (missing WHR)."); } } @Override public DataTableSpec get_table_spec() { DataColumnSpec[] cols = new DataColumnSpec[1]; cols[0] = new DataColumnSpecCreator("Weighted Homopolymer Rate (WHR)", DoubleCell.TYPE).createSpec(); return new DataTableSpec(cols); } @Override public boolean isMerged() { return true; } }