package au.com.acpfg.misc.biojava;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
import java.util.logging.Logger;
import org.biojava.bio.dist.IndexedCount;
import org.biojava.bio.symbol.Alphabet;
import org.biojava.bio.symbol.AlphabetManager;
import org.biojava.bio.symbol.AtomicSymbol;
import org.biojava.bio.symbol.FiniteAlphabet;
import org.biojava.bio.symbol.Symbol;
import org.biojava.bio.symbol.SymbolList;
import org.knime.core.data.DataCell;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataColumnSpecCreator;
import org.knime.core.data.DataRow;
import org.knime.core.data.DataTableSpec;
import org.knime.core.data.RowIterator;
import org.knime.core.data.def.DefaultRow;
import org.knime.core.data.def.IntCell;
import org.knime.core.data.def.JoinedRow;
import org.knime.core.node.BufferedDataContainer;
import org.knime.core.node.BufferedDataTable;
import org.knime.core.node.ExecutionContext;
import org.knime.core.node.NodeLogger;
/**
 * Counts how often each residue (or each pair of adjacent residues) occurs in every input sequence.
 * Speed is important here for large sequence databases (e.g. short reads from Solexa).
 *
 * @author acassin
 *
 */
public class ResidueFrequencyProcessor implements BioJavaProcessorInterface {
    /** Task name that selects single-residue counting; any other task counts adjacent residue pairs. */
    private static final String TASK_COUNT_RESIDUES = "Count Residues";
    /** Column names only ever contain ASCII letters, so lookup tables are indexed by 7-bit codes. */
    private static final int ASCII_RANGE = 128;
    /** Marker stored in a lookup table slot that has no output column. */
    private static final int NO_COLUMN = -1;
    /** Number of rows between cancellation checks / progress updates. */
    private static final int PROGRESS_INTERVAL = 1000;

    private final boolean m_single_residue;
    private final BioJavaProcessorNodeModel m_owner;
    // maps column name (ie. symbol name) to a corresponding column id
    private final HashMap<String, Integer> m_colmap;

    /**
     * @param owner node model that is queried for the sequence type when the output spec is built
     * @param task  task name; {@code "Count Residues"} selects single-residue counting, anything
     *              else (including {@code null}) selects di-mer/di-peptide counting
     */
    public ResidueFrequencyProcessor(BioJavaProcessorNodeModel owner, String task) {
        // constant-first comparison so that a null task does not throw
        m_single_residue = TASK_COUNT_RESIDUES.equals(task);
        m_owner = owner;
        m_colmap = new HashMap<String, Integer>();
    }

    /**
     * Appends one row per input row to {@code c}: the input row joined with one integer count
     * cell per column registered by {@link #get_table_spec()}.
     * <p>
     * Counting is case-insensitive in both modes (residues are upper-cased per character, which
     * is independent of the default locale). Characters that do not form a known column name,
     * such as whitespace, are ignored.
     *
     * @param m      node model used to extract the sequence string from each row
     * @param exec   used for cancellation checks and progress reporting every 1000 rows
     * @param l      logger (not used by this processor)
     * @param inData input tables; only the first one is read
     * @param c      container receiving the output rows
     * @throws Exception if execution is cancelled or a row cannot be processed
     */
    @Override
    public void execute(BioJavaProcessorNodeModel m, ExecutionContext exec,
            NodeLogger l, BufferedDataTable[] inData, BufferedDataContainer c)
            throws Exception {
        final int n = inData[0].getRowCount();
        final int width = m_single_residue ? 1 : 2;
        final int n_cols = m_colmap.size();
        // Built once per execution so each sequence is scanned in a single pass.
        final int[] lookup = buildColumnLookup(width);

        RowIterator it = inData[0].iterator();
        int i = 0;
        while (it.hasNext()) {
            DataRow r = it.next();
            i++;

            int[] counts = new int[n_cols];
            countKmers(m.getSequence(r), width, lookup, counts);

            DataCell[] cells = new DataCell[n_cols];
            for (int k = 0; k < n_cols; k++) {
                cells[k] = new IntCell(counts[k]);
            }
            c.addRowToTable(new JoinedRow(r, new DefaultRow(r.getKey(), cells)));

            if (i % PROGRESS_INTERVAL == 0) {
                exec.checkCanceled();
                exec.setProgress(((double) i) / n, "Processed " + i + " sequences");
            }
        }
    }

    /**
     * Builds the output column spec for the owner's current sequence type and (re)populates the
     * internal column-name to column-index map used by {@code execute}.
     * <p>
     * The map is rebuilt from scratch on every call; otherwise a second call after the sequence
     * type changed would mix symbols of both alphabets and assign colliding column indices.
     *
     * @return one integer column per residue (single-residue mode) or per ordered residue pair,
     *         or {@code null} if the owner reports an unsupported sequence type
     */
    @Override
    public DataTableSpec get_table_spec() {
        m_colmap.clear();

        // decide output columns based on the type of sequences being analysed
        char[] vec;
        if (m_owner.areSequencesProtein()) {
            vec = new char[] {'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V' };
        } else if (m_owner.areSequencesDNA()) {
            vec = new char[] {'A', 'C', 'G', 'T', 'N' }; // TODO: support all IUPAC ambiguity conventions?
        } else if (m_owner.areSequencesRNA()) {
            vec = new char[] {'A', 'C', 'G', 'U' };
        } else {
            System.err.println("ResidueFrequencyProcessor.java: unsupported sequence type! Aborting execution!");
            return null;
        }

        // Column indices follow alphabet order (single residues) or first-then-second residue
        // order (pairs), matching the order in which names are registered below.
        for (char first : vec) {
            if (m_single_residue) {
                registerColumn(String.valueOf(first));
            } else {
                for (char second : vec) {
                    registerColumn(new String(new char[] {first, second}));
                }
            }
        }

        // columns are built from the final map to avoid duplicates
        DataColumnSpec[] cols = new DataColumnSpec[m_colmap.size()];
        for (String colname : m_colmap.keySet()) {
            int idx = m_colmap.get(colname).intValue();
            cols[idx] = new DataColumnSpecCreator(colname, IntCell.TYPE).createSpec();
        }
        return new DataTableSpec(cols);
    }

    /**
     * @return always {@code true}; presumably tells the owner that output rows already contain
     *         the input row (execute emits joined rows) - confirm against BioJavaProcessorInterface
     */
    @Override
    public boolean isMerged() {
        return true;
    }

    /** Assigns the next free column index to {@code name} unless it is already registered. */
    private void registerColumn(String name) {
        if (!m_colmap.containsKey(name)) {
            m_colmap.put(name, Integer.valueOf(m_colmap.size()));
        }
    }

    /**
     * Flattens the column map into an array indexed by the ASCII-encoded k-mer, holding the
     * output column index or {@link #NO_COLUMN}.
     */
    private int[] buildColumnLookup(int width) {
        int size = 1;
        for (int p = 0; p < width; p++) {
            size *= ASCII_RANGE;
        }
        int[] table = new int[size];
        for (int t = 0; t < size; t++) {
            table[t] = NO_COLUMN;
        }
        for (String name : m_colmap.keySet()) {
            if (name.length() != width) {
                continue;
            }
            int code = 0;
            for (int p = 0; p < width && code >= 0; p++) {
                code = extendCode(code, name.charAt(p));
            }
            if (code >= 0) {
                table[code] = m_colmap.get(name).intValue();
            }
        }
        return table;
    }

    /** Increments {@code counts[col]} for every window of {@code width} residues that names a column. */
    private static void countKmers(String seq, int width, int[] lookup, int[] counts) {
        final int last = seq.length() - width;
        for (int pos = 0; pos <= last; pos++) {
            int code = 0;
            for (int off = 0; off < width && code >= 0; off++) {
                code = extendCode(code, Character.toUpperCase(seq.charAt(pos + off)));
            }
            if (code >= 0) {
                int col = lookup[code];
                if (col != NO_COLUMN) {
                    counts[col]++;
                }
            }
        }
    }

    /** Appends {@code ch} to an encoded k-mer; returns a negative value for non-ASCII characters. */
    private static int extendCode(int code, char ch) {
        return (ch < ASCII_RANGE) ? code * ASCII_RANGE + ch : -1;
    }
}