package au.com.acpfg.align.phobius;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.knime.core.data.DataCell;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataColumnSpecCreator;
import org.knime.core.data.DataRow;
import org.knime.core.data.DataTableSpec;
import org.knime.core.data.DataType;
import org.knime.core.data.RowIterator;
import org.knime.core.data.def.DefaultRow;
import org.knime.core.data.def.IntCell;
import org.knime.core.data.def.JoinedRow;
import org.knime.core.data.def.StringCell;
import org.knime.core.node.BufferedDataContainer;
import org.knime.core.node.BufferedDataTable;
import org.knime.core.node.CanceledExecutionException;
import org.knime.core.node.ExecutionContext;
import org.knime.core.node.ExecutionMonitor;
import org.knime.core.node.InvalidSettingsException;
import org.knime.core.node.NodeLogger;
import org.knime.core.node.NodeModel;
import org.knime.core.node.NodeSettingsRO;
import org.knime.core.node.NodeSettingsWO;
import org.knime.core.node.defaultnodesettings.SettingsModel;
import org.knime.core.node.defaultnodesettings.SettingsModelString;
import uk.ac.ebi.webservices.axis1.PhobiusClient;
import uk.ac.ebi.webservices.axis1.stubs.phobius.InputParameters;
import uk.ac.ebi.webservices.axis1.stubs.phobius.WsResultType;
/**
* This is the model implementation of PhobiusSource.
* Takes a list of protein sequences and appends the results of Phobius webservice invocations (the raw text plus summary counts) to the output port.
*
* @author Andrew Cassin
*/
public class PhobiusSourceNodeModel extends NodeModel {
// the logger instance
private static final NodeLogger logger = NodeLogger
.getLogger(PhobiusSourceNodeModel.class);
/** the settings keys which are used to retrieve and
store the settings (from the dialog or from a settings file)
(package visibility to be usable from the dialog). */
static final String CFGKEY_SEQUENCE_COL = "sequence";
static final String CFGKEY_EMAIL = "email";
private static final String DEFAULT_SEQUENCE_COL = "Sequence";
private static final String DEFAULT_EMAIL = "must@specify.this.to.use.this.node";
// internal state (persisted as part of workflow)
private final SettingsModelString m_seq_col = make_as_string(CFGKEY_SEQUENCE_COL);
private final SettingsModelString m_email = make_as_string(CFGKEY_EMAIL);
// internal state (not persisted)
private int m_done_rows;
private PhobiusClient m_phobius;
/**
* Constructor for the node model.
*/
protected PhobiusSourceNodeModel() {
super(1, 1);
m_phobius = null;
}
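/**
* Factory for the settings models used by this node and its dialog.
* Returns null if the key is not recognised.
*/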
public static SettingsModel make(String k) {
if (k.equals(CFGKEY_SEQUENCE_COL)) {
return new SettingsModelString(k, DEFAULT_SEQUENCE_COL);
} else if (k.equals(CFGKEY_EMAIL)) {
return new SettingsModelString(k, DEFAULT_EMAIL);
}
return null;
}
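/**
* Convenience wrapper around make(String) for keys known to hold string values.
*/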
public static SettingsModelString make_as_string(String k) {
SettingsModel sm = make(k);
return (SettingsModelString) sm;
}
/**
* {@inheritDoc}
*/
@Override
protected BufferedDataTable[] execute(final BufferedDataTable[] inData,
final ExecutionContext exec) throws Exception {
int seq_col_idx = inData[0].getDataTableSpec().findColumnIndex(m_seq_col.getStringValue());
int batch_size = 10; // number of sequences submitted to EBI per batch
if (seq_col_idx < 0) {
throw new Exception("Cannot find column: "+m_seq_col.getStringValue()+" - reset the node?");
}
DataColumnSpec[] cols = new DataColumnSpec[7];
cols[0] = new DataColumnSpecCreator("JobID (EBI)", StringCell.TYPE).createSpec();
cols[1] = new DataColumnSpecCreator("EBI Results (raw)", StringCell.TYPE).createSpec();
cols[2] = new DataColumnSpecCreator("Count(Predicted Signal Peptides)", IntCell.TYPE).createSpec();
cols[3] = new DataColumnSpecCreator("Count(Predicted Transmembrane Helices)", IntCell.TYPE).createSpec();
cols[4] = new DataColumnSpecCreator("Count(Predicted Domains)", IntCell.TYPE).createSpec();
cols[5] = new DataColumnSpecCreator("Count(Predicted Cytoplasmic Regions)", IntCell.TYPE).createSpec();
cols[6] = new DataColumnSpecCreator("Count(Predicted non-Cytoplasmic Regions)", IntCell.TYPE).createSpec();
DataTableSpec outputSpec = new DataTableSpec(inData[0].getDataTableSpec(), new DataTableSpec(cols));
BufferedDataContainer container = exec.createDataContainer(outputSpec);
RowIterator it = inData[0].iterator();
m_phobius = new PhobiusClient();
ArrayList<DataRow> rows_batch = new ArrayList<DataRow>();
int batch_cnt = 0;
m_done_rows = 0;
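// each batch entry maps "key"/"email"/"sequence"/"async" to the values submitted for one input row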
ArrayList<HashMap<String,String>> batch = new ArrayList<HashMap<String,String>>();
while (it.hasNext()) {
DataRow r = it.next();
String sequence = r.getCell(seq_col_idx).toString();
if (sequence == null || sequence.length() < 1) {
logger.warn("Cannot Phobius with an empty sequence... skipping row "+r.getKey().toString());
continue;
}
HashMap<String,String> f = new HashMap<String,String>();
String email = m_email.getStringValue();
if (email.equals(DEFAULT_EMAIL) || email.length() < 1) {
throw new Exception("Must set email address to be valid for you! EBI require this!");
}
f.put("key", r.getKey().getString());
f.put("email", email);
f.put("sequence", sequence);
f.put("async", "true");
batch.add(f);
rows_batch.add(r);
batch_cnt++;
// keep accumulating rows until the batch is full or the input is exhausted
if (batch_cnt < batch_size && it.hasNext()) {
continue;
} else {
try {
// got complete batch... time to run entire batch on EBI systems...
String[] jobs = runBatch(exec, batch);
waitForBatchCompletion(exec, jobs, inData[0].getRowCount());
batch.clear();
batch_cnt = 0;
int idx = 0;
for (String jobId : jobs) {
String result= getJobResult(jobId);
// fill in first output port
DataCell[] cells = new DataCell[cols.length];
if (jobId.length() > 0 && result.length() > 0) {
cells[0] = new StringCell(jobId);
cells[1] = new StringCell(result);
grok_cells(jobId, result, cells);
} else {
for (int j=0; j<cells.length; j++) {
cells[j] = DataType.getMissingCell();
}
}
container.addRowToTable(new JoinedRow(rows_batch.get(idx),new DefaultRow(rows_batch.get(idx).getKey(), cells)));
// check if the execution monitor was canceled
exec.checkCanceled();
idx++;
}
rows_batch.clear();
} catch (Exception e) {
logger.error("Phobius batch failed: "+e.getMessage(), e);
throw e;
}
}
}
// once we are done, we close the container and return its table
container.close();
BufferedDataTable out = container.getTable();
return new BufferedDataTable[]{out};
}
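/**
* Fetches the raw ("out") result file for the given EBI job, or an empty
* string if no result is available.
*/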
public String getJobResult(String jobId) throws Exception {
WsResultType[] results = m_phobius.getResultTypes(jobId);
//System.err.println("Found "+results.length+" files for "+jobId);
for (int i=0; i<results.length; i++) {
WsResultType file = results[i];
if (file.getIdentifier().equals("out")) {
byte[] ret = m_phobius.getSrvProxy().getResult(jobId, file.getIdentifier(), null);
if (ret == null) {
logger.warn("Could not get results for "+jobId+": assuming nothing to report!");
return "";
}
return new String(ret);
}
}
return "";
}
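/**
* Parses the raw Phobius output and fills in the count cells (signal
* peptides, transmembrane helices, domains, cytoplasmic and
* non-cytoplasmic regions) for the current row.
*/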
protected void grok_cells(String jobId, String result, DataCell[] cells) {
String[] lines = result.split("\n");
Pattern p = Pattern.compile("\\s*(FT)\\s*(\\w+)\\s*(\\d+)\\s*(\\d+)\\s*(.*)");
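// Feature lines in Phobius "long" output look roughly like (illustrative values only):
//   FT   TRANSMEM     28     50
//   FT   DOMAIN       51     70   NON CYTOPLASMIC.
// i.e. the capture groups are: entry type, feature type, start, end, description.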
int matched_cnt = 0;
int n_signals = 0;
int n_tm = 0;
int n_dom = 0;
int n_cyto = 0;
int n_non_cyto = 0;
for (String l : lines) {
Matcher m = p.matcher(l);
if (m.matches()) {
String entry_type = m.group(1).toUpperCase();
String type = m.group(2).toUpperCase();
String start_pos = m.group(3).toUpperCase();
String end_pos = m.group(4).toUpperCase();
String descr = m.group(5).toUpperCase();
matched_cnt++;
if (type.startsWith("SIGNAL")) {
n_signals++;
} else if (type.startsWith("DOM")) {
n_dom++;
} else if (type.startsWith("TRANS")) {
n_tm++;
}
if (descr.startsWith("CYTO")) {
n_cyto++;
} else if (descr.startsWith("NON CYTO")) {
n_non_cyto++;
}
}
}
cells[2] = new IntCell(n_signals);
cells[3] = new IntCell(n_tm);
cells[4] = new IntCell(n_dom);
cells[5] = new IntCell(n_cyto);
cells[6] = new IntCell(n_non_cyto);
if (matched_cnt < 1) {
logger.warn("Did not match any records from job: "+jobId);
}
}
/**
* {@inheritDoc}
*/
@Override
protected void reset() {
// TODO Code executed on reset.
// Models built during execute are cleared here.
// Also data handled in load/saveInternals will be erased here.
}
/**
* Called when each job completes, this routine is responsible for updating the progress bar
*/
protected void updateProgress(ExecutionContext exec, int n_rows) {
// and update node progress "traffic light"
exec.setProgress(((double) m_done_rows) / n_rows, "Searched " + m_done_rows);
}
/**
* Waits for the entire batch to complete. Since the batch has just been submitted, we wait
* for at least 60s before checking the first job for completion
*
* @param jobs
* @throws Exception
*/
protected void waitForBatchCompletion(ExecutionContext exec, String[] jobs, int n_rows) throws Exception {
int to_go = jobs.length; // assume none have completed
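// the batch has just been submitted, so wait at least 60s (in cancellable 5s slices) before the first status check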
for (int i=0; i<12; i++) {
Thread.sleep(5 * 1000); // sleep for five seconds and then check for cancel
exec.checkCanceled();
}
while (to_go > 0) {
waitForCompletion(exec, jobs[jobs.length - to_go]);
m_done_rows++;
logger.info("Job completed: "+jobs[jobs.length - to_go]);
exec.checkCanceled();
updateProgress(exec, n_rows);
to_go--;
}
logger.info("Batch completed.");
}
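/**
* Polls EBI until the given job leaves the PENDING/RUNNING states,
* backing off exponentially (capped at 200s) between status checks.
*/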
protected void waitForCompletion(ExecutionContext exec, String jobId) throws Exception {
if (jobId.length() > 0) {
int check_period = 20 * 1000; // initial polling interval: 20s (doubled after each check, capped below)
String status = "PENDING";
while (status.equals("PENDING") || status.equals("RUNNING")) {
try {
logger.info("Waiting for "+jobId);
status = m_phobius.checkStatus(jobId);
if (status.equals("RUNNING") || status.equals("PENDING")) {
logger.info(jobId + " " + status + ", sleeping for "+check_period+ " milliseconds");
// check ten times each check_period to see if the user pressed cancel
for (int i=0; i<10; i++) {
Thread.sleep(check_period / 10);
exec.checkCanceled();
}
// each time job is still going, we double check_period to reduce likelihood of overloading EBI
check_period *= 2;
if (check_period > 200000) {
check_period = 200000;
}
}
} catch (IOException e) {
throw new Exception("Cannot connect with Phobius (EBI)... aborting"+e);
}
}
} else {
throw new Exception("Bogus EBI job id... aborting!");
}
}
/**
* {@inheritDoc}
*/
@Override
protected DataTableSpec[] configure(final DataTableSpec[] inSpecs)
throws InvalidSettingsException {
// check that the configured sequence column is present in the input
// table; the full output spec is only known at execute time, so it is
// reported as unknown (null) for now
if (inSpecs[0].findColumnIndex(m_seq_col.getStringValue()) < 0) {
throw new InvalidSettingsException("Cannot find column: "+m_seq_col.getStringValue()+" - reset the node?");
}
return new DataTableSpec[]{null};
}
/**
* Submits a batch of jobs to EBI and returns the EBI-assigned job IDs to the caller.
* It will likely return before the jobs complete.
* @param exec used to check for user cancellation during submission
* @param batch the sequences (and associated metadata) to submit
* @return one EBI job ID per batch entry, in submission order
* @throws Exception if submission fails or the user cancels
*/
protected String[] runBatch(ExecutionContext exec, List<HashMap<String,String>> batch) throws Exception {
String[] jobs = new String[batch.size()];
int i = 0;
for (HashMap<String,String> h : batch) {
exec.checkCanceled(); // stop submitting once cancel chosen by user
InputParameters ip = new InputParameters();
ip.setSequence(h.get("sequence"));
ip.setFormat("long");
jobs[i++] = m_phobius.runApp(h.get("email"), h.get("key"), ip);
logger.info("Submitted Phobius job for row: " + h.get("key"));
}
return jobs;
}
/**
* {@inheritDoc}
*/
@Override
protected void saveSettingsTo(final NodeSettingsWO settings) {
m_email.saveSettingsTo(settings);
m_seq_col.saveSettingsTo(settings);
}
/**
* {@inheritDoc}
*/
@Override
protected void loadValidatedSettingsFrom(final NodeSettingsRO settings)
throws InvalidSettingsException {
m_email.loadSettingsFrom(settings);
m_seq_col.loadSettingsFrom(settings);
}
/**
* {@inheritDoc}
*/
@Override
protected void validateSettings(final NodeSettingsRO settings)
throws InvalidSettingsException {
m_email.validateSettings(settings);
m_seq_col.validateSettings(settings);
}
/**
* {@inheritDoc}
*/
@Override
protected void loadInternals(final File internDir,
final ExecutionMonitor exec) throws IOException,
CanceledExecutionException {
// TODO load internal data.
// Everything handed to output ports is loaded automatically (data
// returned by the execute method, models loaded in loadModelContent,
// and user settings set through loadSettingsFrom - is all taken care
// of). Load here only the other internals that need to be restored
// (e.g. data used by the views).
}
/**
* {@inheritDoc}
*/
@Override
protected void saveInternals(final File internDir,
final ExecutionMonitor exec) throws IOException,
CanceledExecutionException {
// TODO save internal models.
// Everything written to output ports is saved automatically (data
// returned by the execute method, models saved in the saveModelContent,
// and user settings saved through saveSettingsTo - is all taken care
// of). Save here only the other internals that need to be preserved
// (e.g. data used by the views).
}
}