package au.com.acpfg.misc.biojava; import java.util.ArrayList; import java.util.HashMap; import org.knime.core.data.DataCell; import org.knime.core.data.DataColumnSpec; import org.knime.core.data.DataColumnSpecCreator; import org.knime.core.data.DataRow; import org.knime.core.data.DataTableSpec; import org.knime.core.data.RowIterator; import org.knime.core.data.RowKey; import org.knime.core.data.def.DefaultRow; import org.knime.core.data.def.IntCell; import org.knime.core.node.BufferedDataContainer; import org.knime.core.node.BufferedDataTable; import org.knime.core.node.ExecutionContext; import org.knime.core.node.NodeLogger; /** * Adds columns for each position (up to the specified maximum length in the constructor) * with a cell for each residue in each column. Each cells contains the number of residues at that * position. Positions are numbered from 1 to correspond to what biologists expect ;-) * * @author andrew.cassin * */ public class PositionByResidueProcessor implements BioJavaProcessorInterface { private BioJavaProcessorNodeModel m_owner; private int m_maxlen; public PositionByResidueProcessor(BioJavaProcessorNodeModel m, String task, int maxlen) { assert(m != null && maxlen > 0); m_maxlen = maxlen; m_owner = m; } @Override public void execute(BioJavaProcessorNodeModel m, ExecutionContext exec, NodeLogger l, BufferedDataTable[] inData, BufferedDataContainer c) throws Exception { RowIterator it = inData[0].iterator(); ResidueByPosition[] pos = new ResidueByPosition[m_maxlen]; for (int i=0; i<m_maxlen; i++) { pos[i] = new ResidueByPosition(i+1); } // scan the sequences -- speed is key here int done = 0; int n_rows = inData[0].getRowCount(); while (it.hasNext()) { DataRow r = it.next(); String seq = m.getSequence(r).toUpperCase(); int len = m_maxlen; if (seq.length() < m_maxlen) len = seq.length(); for (int i=0; i<len; i++ ) { pos[i].bump(seq.charAt(i)); } if (done % 1000 == 0) { exec.setProgress((double) done / n_rows); exec.checkCanceled(); } done++; } // build the output table... char[] letters = pos[0].getResidueLetters(); for (int i=0; i<letters.length; i++) { DataCell[] row = new DataCell[m_maxlen]; for (int j=0; j<m_maxlen; j++) { row[j] = new IntCell(pos[j].count(letters[i])); } c.addRowToTable(new DefaultRow(new RowKey(new Character(letters[i]).toString()), row)); } // all done c.close(); } @Override public DataTableSpec get_table_spec() { assert(m_maxlen > 0); DataColumnSpec[] cols = new DataColumnSpec[m_maxlen]; for (int i=0; i<m_maxlen; i++) { cols[i] = new DataColumnSpecCreator("Position "+new Integer(i+1).toString(), IntCell.TYPE).createSpec(); } return new DataTableSpec(cols); } public class ResidueByPosition { private int m_pos; private final char[] letters = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z' }; private int[] count; public ResidueByPosition(int i) { m_pos = i; count = new int[] {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; } public char[] getResidueLetters() { return letters; } public void bump(char c) { assert(c >= 'A' && c <= 'Z'); int offset = c - 'A'; count[offset]++; } public int count(char c) { assert(c >= 'A' && c <= 'Z'); int offset = c - 'A'; return count[offset]; } } @Override public boolean isMerged() { return false; } }