package au.com.acpfg.misc.fasta;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.knime.core.data.DataCell;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataColumnSpecCreator;
import org.knime.core.data.DataTableSpec;
import org.knime.core.data.DataType;
import org.knime.core.data.def.DefaultRow;
import org.knime.core.data.def.IntCell;
import org.knime.core.data.def.StringCell;
import org.knime.core.node.BufferedDataContainer;
/**
 * Accumulates sequence-length statistics for a single file: number of sequences, minimum,
 * maximum, mean, median, total length, totals/counts for sequences of at least 1kb, 10kb and
 * 100kb, and the N10..N90 values.
 *
 * The class assumes that all sequences of a given file are presented as one contiguous block
 * (which is fine for this node), so only the lengths of the current file have to be kept in
 * memory for the median and Nxx calculations.
 *
 * NOTE(review): all totals are held in <code>int</code> fields, so a file whose combined
 * sequence length exceeds <code>Integer.MAX_VALUE</code> will overflow. The output columns are
 * <code>IntCell</code>s, so widening would require a change to the output spec.
 *
 * @author andrew.cassin
 *
 */
public class SequenceStatistics {
    /** Total number of output columns, shared by <code>getOutputSpec()</code> and <code>addStats()</code>. */
    private static final int NUM_COLUMNS = 31;
    /** Index of the "N10" column; each Nxx step occupies two columns (count, then length). */
    private static final int FIRST_NXX_COLUMN = 13;
    /** Number of Nxx steps reported: N10, N20, ... N90. */
    private static final int NXX_STEPS = 9;
    /** Names of the integer columns 1..12, in output order (column 0 is the string "Filename"). */
    private static final String[] FIXED_INT_COLUMNS = {
        "N",
        "Minimum",
        "Maximum",
        "Mean (rounded)",
        "Median (actual sequence length)",
        "Total Length",
        "Total Length (sequences >1kb only)",
        "Total Length (sequences >10kb only)",
        "Total Length (sequences >100kb only)",
        "Number of sequences >1kb only",
        "Number of sequences >10kb only",
        "Number of sequences >100kb only"
    };

    private final File m_for_file;
    private int m_min, m_max;
    private int m_n, m_n_1kb, m_n_10kb, m_n_100kb;
    private int m_total, m_total_1kb, m_total_10kb, m_total_100kb;
    private final ArrayList<Integer> m_lengths = new ArrayList<Integer>(10*1000);
    // bumped for each row added to the output container; static, so shared by all instances
    private static int m_id = 1;
    // members used solely to pass the results of calc_nxx() back to its caller
    private int m_nxx, m_nxx_length;

    /**
     * Sole constructor which takes the file that the stats relate to.
     * Callers must call <code>grokSequence()</code> for all sequences that are part of this file,
     * before processing any other file with the invoking object.
     *
     * @param f the file the accumulated statistics belong to
     */
    public SequenceStatistics(File f) {
        m_for_file = f;
        // sentinels: replaced by the first sequence recorded in grokSequence()
        m_min = Integer.MAX_VALUE;
        m_max = Integer.MIN_VALUE;
        m_n = 0;
        m_n_1kb = 0;        // number of sequences of at least 1kb
        m_n_10kb = 0;
        m_n_100kb = 0;
        m_total = 0;        // total sequence length
        m_total_1kb = 0;    // total sequence length for all sequences of at least 1kb
        m_total_10kb = 0;
        m_total_100kb = 0;
    }

    /**
     * Tests whether these statistics belong to the specified file.
     *
     * @param f file to compare against
     * @return true if <code>f</code> equals the file given to the constructor
     */
    public boolean isFile(File f) {
        return m_for_file.equals(f);
    }

    /**
     * Describes the row produced by <code>addStats()</code>: the filename, twelve fixed integer
     * columns and a (count, length) column pair for each of N10..N90.
     *
     * @return the output table specification (31 columns)
     */
    public static DataTableSpec getOutputSpec() {
        DataColumnSpec[] cols = new DataColumnSpec[NUM_COLUMNS];
        cols[0] = new DataColumnSpecCreator("Filename", StringCell.TYPE).createSpec();
        for (int i = 0; i < FIXED_INT_COLUMNS.length; i++) {
            cols[i + 1] = intColumn(FIXED_INT_COLUMNS[i]);
        }
        for (int step = 1; step <= NXX_STEPS; step++) {
            int idx = FIRST_NXX_COLUMN + 2 * (step - 1);
            String label = "N" + (step * 10);
            cols[idx] = intColumn(label);
            cols[idx + 1] = intColumn(label + " Length");
        }
        return new DataTableSpec(cols);
    }

    /** Creates the spec of an integer column with the given name. */
    private static DataColumnSpec intColumn(String name) {
        return new DataColumnSpecCreator(name, IntCell.TYPE).createSpec();
    }

    /**
     * Adds the cells as defined by <code>getOutputSpec()</code> to the specified container.
     * Sorts the recorded lengths in place (longest first) as a side effect. If no sequence has
     * been recorded, the minimum and maximum cells are left missing rather than exposing the
     * internal sentinel values.
     *
     * @param c container that receives one row for this file
     */
    public void addStats(BufferedDataContainer c) {
        assert(c != null);
        DataCell[] cells = new DataCell[NUM_COLUMNS];
        for (int i = 0; i < cells.length; i++) {
            cells[i] = DataType.getMissingCell();
        }

        cells[0] = new StringCell(m_for_file.getName());
        cells[1] = new IntCell(m_n);
        if (m_n > 0) {
            cells[2] = new IntCell(m_min);
            cells[3] = new IntCell(m_max);
        }

        // longest first: required by both the median and the Nxx calculations below
        Collections.sort(m_lengths, Collections.<Integer>reverseOrder());

        cells[4] = new IntCell(calculate_mean_length(m_lengths));
        cells[5] = new IntCell(calculate_median_length(m_lengths));
        cells[6] = new IntCell(m_total);
        cells[7] = new IntCell(m_total_1kb);
        cells[8] = new IntCell(m_total_10kb);
        cells[9] = new IntCell(m_total_100kb);
        cells[10] = new IntCell(m_n_1kb);
        cells[11] = new IntCell(m_n_10kb);
        cells[12] = new IntCell(m_n_100kb);

        for (int step = 1; step <= NXX_STEPS; step++) {
            // derive the fraction from the step instead of repeatedly adding 0.1: the running
            // sum drifts (e.g. 0.7999999999999999) and can make the truncated target one too low
            calc_nxx(step / 10.0);
            int cell_idx = FIRST_NXX_COLUMN + 2 * (step - 1);
            cells[cell_idx] = new IntCell(m_nxx);
            cells[cell_idx + 1] = new IntCell(m_nxx_length);
        }

        c.addRowToTable(new DefaultRow("file" + m_id++, cells));
    }

    /**
     * Computes the Nxx statistic for the given fraction of the total length and stores the
     * result in <code>m_nxx</code> (number of sequences needed) and <code>m_nxx_length</code>
     * (length of the last sequence added). Walks <code>m_lengths</code> in its current order, so
     * the caller must have sorted it longest-first. Both results are 0 if no lengths are recorded.
     *
     * @param frac fraction of the total length to reach, e.g. 0.5 for N50
     */
    protected void calc_nxx(double frac) {
        // the target is truncated towards zero
        int sum_target = (int) (m_total * frac);
        m_nxx = 0;
        m_nxx_length = 0;
        int so_far = 0;
        for (Integer length : m_lengths) {
            int len = length.intValue();
            so_far += len;
            m_nxx++;
            m_nxx_length = len;
            if (so_far >= sum_target) {
                return;
            }
        }
    }

    /**
     * Records one sequence of the current file. Whitespace is stripped before the length is
     * taken; null or empty (after stripping) sequences are ignored entirely.
     *
     * Note that the size classes use inclusive thresholds (<code>&gt;= 1000</code> etc.) even
     * though the output column names read "&gt;1kb".
     *
     * @param seq the sequence residues, possibly containing whitespace
     */
    public void grokSequence(String seq) {
        if (seq == null) {
            return;
        }
        String tmp = seq.trim().replaceAll("\\s+", "");
        int len = tmp.length();
        // stupid sequences dont count
        if (len < 1) {
            return;
        }

        m_total += len;
        m_n++;
        // thresholds are nested: a sequence of at least 100kb also counts as 10kb and 1kb
        if (len >= 1000) {
            m_total_1kb += len;
            m_n_1kb++;
            if (len >= 10000) {
                m_total_10kb += len;
                m_n_10kb++;
                if (len >= 100000) {
                    m_total_100kb += len;
                    m_n_100kb++;
                }
            }
        }

        if (len < m_min) {
            m_min = len;
        }
        if (len > m_max) {
            m_max = len;
        }
        m_lengths.add(Integer.valueOf(len));
    }

    /**
     * Computes the arithmetic mean of the given lengths, rounded to the nearest integer.
     *
     * @param i the lengths to average
     * @return the rounded mean, or 0 if the list is empty
     */
    protected int calculate_mean_length(List<Integer> i) {
        if (i.isEmpty()) {
            return 0;
        }
        long sum = 0;
        for (Integer l : i) {
            sum += l.intValue();
        }
        // divide in floating point first so the result is rounded, not truncated
        return (int) Math.round((double) sum / i.size());
    }

    /**
     * Although not strictly a median calculation, we always want an actual length rather than an
     * average if the number of sequences is even. Hence this implementation for now: the element
     * at index <code>size/2</code> is returned, which is the exact middle for an odd number of
     * elements and the second of the two middle elements for an even number.
     *
     * @param i the lengths, which must already be sorted
     * @return the middle length, or 0 if the list is empty
     */
    protected int calculate_median_length(ArrayList<Integer> i) {
        if (i.size() < 1) {
            return 0;
        }
        return i.get(i.size() / 2).intValue();
    }
}