package au.com.acpfg.misc.jemboss.local;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.emboss.jemboss.JembossParams;
import org.emboss.jemboss.programs.RunEmbossApplication2;
import org.knime.core.data.DataCell;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataColumnSpecCreator;
import org.knime.core.data.DataRow;
import org.knime.core.data.DataTableSpec;
import org.knime.core.data.DataType;
import org.knime.core.data.RowIterator;
import org.knime.core.data.container.BlobDataCell;
import org.knime.core.data.def.DefaultRow;
import org.knime.core.data.def.IntCell;
import org.knime.core.data.def.StringCell;
import org.knime.core.data.image.png.PNGImageCell;
import org.knime.core.data.image.png.PNGImageContent;
import org.knime.core.node.BufferedDataContainer;
import org.knime.core.node.BufferedDataTable;
import org.knime.core.node.CanceledExecutionException;
import org.knime.core.node.ExecutionContext;
import org.knime.core.node.ExecutionMonitor;
import org.knime.core.node.InvalidSettingsException;
import org.knime.core.node.NodeLogger;
import org.knime.core.node.NodeModel;
import org.knime.core.node.NodeSettingsRO;
import org.knime.core.node.NodeSettingsWO;
import org.knime.core.node.defaultnodesettings.SettingsModelInteger;
import org.knime.core.node.defaultnodesettings.SettingsModelString;
import au.com.acpfg.misc.jemboss.io.FastaUnmarshaller;
import au.com.acpfg.misc.jemboss.io.GFFUnmarshaller;
import au.com.acpfg.misc.jemboss.settings.OutputFileSetting;
import au.com.acpfg.misc.jemboss.settings.ProgramSetting;
/**
* This is the model implementation of JEmbossProcessor.
* Runs an EMBOSS command on the local computer, based on the configure-dialog settings. Input data is taken from the input table and automatically converted into a suitable form for EMBOSS based on the chosen program.
*
* @author Andrew Cassin
*/
public class JEmbossProcessorNodeModel extends NodeModel implements ProgramSettingsListener {
    /** Logger shared by every instance of this node model. */
    private static final NodeLogger logger = NodeLogger
            .getLogger(JEmbossProcessorNodeModel.class);

    // dialog configuration keys
    static final String CFGKEY_PROGRAM = "DLG_EMBOSS_SELECTED_PROGRAM"; // which program does the user want to run (empty iff none)
    static final String CFGKEY_ACD = "DLG_ACD"; // ACD file content (as string) which represents current program (empty iff none)
    static final String CFGKEY_SETTINGS = "DLG_SETTINGS"; // newline-separated program settings (split on '\n' in execute())
    static final String CFGKEY_BATCH_SIZE = "DLG_BATCH_SIZE";

    /** Matches a whole line of the form <code>Created &lt;name&gt;</code>; group 1 is the name. Compiled once. */
    private static final Pattern CREATED_FILE_PATTERN = Pattern.compile("^Created\\s+(\\S+)$", Pattern.MULTILINE);

    // state which is persisted via load/save/validate settings methods
    private final SettingsModelString m_acd = new SettingsModelString(CFGKEY_ACD, "");
    private final SettingsModelString m_program = new SettingsModelString(CFGKEY_PROGRAM, "");
    private final SettingsModelString m_input_ser = new SettingsModelString(CFGKEY_SETTINGS, "");
    // 0 denotes unlimited. NOTE(review): the value is persisted but execute() does not currently consult it.
    private final SettingsModelInteger m_batch_size = new SettingsModelInteger(CFGKEY_BATCH_SIZE, 0);

    // state which is not persisted
    private static final JembossParams m_je_params = new JembossParams();
    /** Command line being assembled for the EMBOSS program; (re)created at the start of every execute(). */
    private ArrayList<String> m_args = null;
    /** Settings fed from an input column -> file the cell is marshalled into. Every file in here is DELETED after execute(). */
    private final Map<ProgramSetting,File> m_input_files = new HashMap<ProgramSetting,File>();
    /** Output settings -> file written by the EMBOSS program. Every file in here is DELETED after execute(). */
    private final Map<ProgramSetting,File> m_output_files = new HashMap<ProgramSetting,File>();

    /**
     * Constructor for the node model: one input port and two output ports.
     */
    protected JEmbossProcessorNodeModel() {
        super(1, 2);
        JembossParams.setStandaloneMode(true); // always a local server for this node
        // setup the unmarshallers for various EMBOSS data formats (not all are supported)
        // if an emboss program produces a data type which is not available, the user
        // will only be able to see the raw data
        ProgramSetting.addUnmarshaller(new String[] {"outseq", "seqoutall", "seqoutset"},
                new FastaUnmarshaller());
        ProgramSetting.addUnmarshaller("marscan:report", new GFFUnmarshaller());
    }

    /**
     * @return the single {@link JembossParams} instance shared by all node models
     */
    public static JembossParams getSettings() {
        return m_je_params;
    }

    /**
     * Runs the configured EMBOSS program once per input row. For each row the cells of the
     * configured input columns are marshalled into files, the program is invoked, and its exit
     * status, stdout, stderr and output files are handed to a {@link RawAndFormattedTableMapper}
     * which fills the two output tables (raw table first, formatted table second).
     * Rows with a missing cell in any required column are skipped with a warning.
     * Files scheduled for deletion are removed whether or not execution succeeds.
     *
     * @param inData the input tables; only <code>inData[0]</code> is read
     * @param exec   used to create the output containers, report progress and check for cancellation
     * @return the raw table followed by the formatted table
     * @throws InvalidSettingsException if no program is selected or a configured column is absent from the input
     * @throws IOException if marshalling a cell produces an empty file
     * @throws Exception anything raised by the marshallers, unmarshallers or the process invocation
     */
    @Override
    protected BufferedDataTable[] execute(final BufferedDataTable[] inData,
            final ExecutionContext exec) throws Exception {
        // 0. load key data from the dialog into internal form
        final String input_ser = m_input_ser.getStringValue();
        final ProgramSettingsModel mdl = new ProgramSettingsModel();
        if (input_ser != null && input_ser.length() > 0) {
            mdl.addSettingsFrom(input_ser.split("\n"));
        }
        final String prog = m_program.getStringValue();
        if (prog == null || prog.trim().length() < 1) {
            throw new InvalidSettingsException("No EMBOSS program selected!");
        }
        m_args = new ArrayList<String>();
        m_args.add(prog);

        try {
            // populates m_args, m_input_files and m_output_files via the listener callbacks
            make_args(mdl);
            final String[] emboss_env = make_environment();
            logger.info(m_args.toString());

            // check that all the sequence columns are present in the input table
            final DataTableSpec in_spec = inData[0].getDataTableSpec();
            final Map<String,Integer> in_col2idx = new HashMap<String,Integer>();
            for (ProgramSetting ps : m_input_files.keySet()) {
                String col_name = ps.getColumnName();
                if (!in_spec.containsName(col_name)) {
                    throw new InvalidSettingsException("Column: "+col_name+" is not in input table - re-configure/reset the node?");
                }
                in_col2idx.put(col_name, Integer.valueOf(in_spec.findColumnIndex(col_name)));
            }

            // compute the output columns based on user settings and expected binary data (eg. PNG images)
            final RawAndFormattedTableMapper om = new RawAndFormattedTableMapper(null, null);
            for (ProgramSetting ps : mdl) {
                ps.addColumns(om, ps);
            }
            final BufferedDataContainer container = exec.createDataContainer(om.getRawTableSpec());
            final BufferedDataContainer c2 = exec.createDataContainer(om.getFormattedTableSpec());
            om.setContainers(container, c2);

            // the command line, environment and working folder do not change between rows
            final String[] args = m_args.toArray(new String[0]);
            final File tmp_folder = get_tmp_folder();
            final int n_rows = inData[0].getRowCount();
            int n_done = 0;

            // traverse the input data, invoking local emboss install as required
            final RowIterator it = inData[0].iterator();
            while (it.hasNext()) {
                exec.checkCanceled(); // checked for every row, including ones that end up skipped
                final DataRow r = it.next();
                final String rid = r.getKey().getString();
                n_done++;

                // marshal required values into required files
                if (!marshal_inputs(r, rid, in_col2idx)) {
                    exec.setProgress(((double) n_done) / n_rows, "Skipped row "+rid);
                    continue;
                }

                // run emboss program
                RunEmbossApplication2 rea = new RunEmbossApplication2(args, emboss_env, tmp_folder);
                int status = rea.getProcess().waitFor();
                String stdout = rea.getProcessStdout();
                String stderr = rea.getProcessStderr();
                rea.getProcess().destroy();

                // load results of the run into the output tables
                om.addRequiredCells(rid, status, stdout, stderr);
                for (Map.Entry<ProgramSetting,File> entry : m_output_files.entrySet()) {
                    File out_file = entry.getValue();
                    if (out_file != null) {
                        entry.getKey().unmarshal(out_file, om, mdl.getProgram());
                    }
                }
                om.emitRawRow();
                exec.setProgress(((double) n_done) / n_rows, "Processed row "+rid);
            }
            container.close();
            c2.close();
            return new BufferedDataTable[]{container.getTable(), c2.getTable()};
        } finally {
            // delete the temporary input & output files, even when execution failed or was cancelled
            delete_scheduled_files();
        }
    }

    /**
     * Builds the environment for the EMBOSS process: <code>EMBOSS_ROOT</code> set from
     * {@link #getEmbossRoot()} followed by every other variable of the current process.
     */
    private static String[] make_environment() {
        ArrayList<String> emboss_env = new ArrayList<String>();
        emboss_env.add("EMBOSS_ROOT="+getEmbossRoot());
        for (Map.Entry<String,String> entry : System.getenv().entrySet()) {
            if (!"EMBOSS_ROOT".equals(entry.getKey())) {
                emboss_env.add(entry.getKey()+"="+entry.getValue());
            }
        }
        return emboss_env.toArray(new String[0]);
    }

    /**
     * Writes the cells of one row into the input files registered in <code>m_input_files</code>.
     *
     * @return <code>false</code> if a required cell is missing (the row must be skipped), otherwise <code>true</code>
     * @throws IOException if a marshalled file is empty
     * @throws Exception anything raised by {@link ProgramSetting#marshal}
     */
    private boolean marshal_inputs(final DataRow r, final String rid,
            final Map<String,Integer> in_col2idx) throws Exception {
        for (Map.Entry<ProgramSetting,File> entry : m_input_files.entrySet()) {
            ProgramSetting ps = entry.getKey();
            DataCell c = r.getCell(in_col2idx.get(ps.getColumnName()).intValue());
            if (c == null || c.isMissing()) {
                logger.warn("Skipping row "+rid+" as it is missing "+ps.getColumnName());
                return false;
            }
            File infile = entry.getValue();
            PrintWriter fw = new PrintWriter(new FileWriter(infile));
            try {
                ps.marshal(rid, c, fw);
            } finally {
                fw.close(); // release the file handle even if marshalling fails
            }
            if (infile.length() < 1) {
                throw new IOException("Marshalling failed (zero length) for "+ps.getColumnName()+"! Aborting...");
            }
        }
        return true;
    }

    /** Deletes every file registered in the input/output maps; null entries are ignored. */
    private void delete_scheduled_files() {
        for (File f : m_input_files.values()) {
            delete_if_present(f);
        }
        for (File f : m_output_files.values()) {
            delete_if_present(f);
        }
    }

    /** Deletes <code>f</code> if it is non-null and exists, logging a warning when the delete fails. */
    private static void delete_if_present(final File f) {
        if (f != null && f.exists() && !f.delete()) {
            logger.warn("Unable to delete temporary file: "+f.getAbsolutePath());
        }
    }

    /**
     * Find the binary file, whose format is specified by <code>expected_file_format</code>,
     * by examining the stdout of the emboss program for <code>Created &lt;name&gt;</code> lines and then
     * attempting to load the first name with a matching suffix from <code>tmpdir</code> as a PNG image.
     * If this cannot be done, the method returns <code>DataType.getMissingCell()</code>.
     *
     * @param stdout standard output of the EMBOSS program (may be <code>null</code> or empty)
     * @param expected_file_format file name suffix to look for; compared against the lower-cased
     *        file name, so it must itself be lower case and at least three characters long
     * @param tmpdir folder in which the reported file name is resolved
     * @return an image cell with the file content, or a missing cell
     * @throws Exception if reading the file or decoding the image fails
     */
    protected DataCell find_bin_file(String stdout, String expected_file_format, File tmpdir) throws Exception {
        if (stdout == null || stdout.length() < 1 || expected_file_format == null || expected_file_format.length() < 3) {
            return DataType.getMissingCell();
        }
        Matcher m = CREATED_FILE_PATTERN.matcher(stdout);
        while (m.find()) {
            String fname = m.group(1);
            if (!fname.toLowerCase().endsWith(expected_file_format)) {
                continue;
            }
            File f = new File(tmpdir, fname);
            if (!f.exists() || !f.canRead()) {
                return DataType.getMissingCell();
            }
            byte[] bytes = read_fully(f);
            if (bytes == null) {
                return DataType.getMissingCell();
            }
            DataCell c = new PNGImageContent(bytes).toImageCell();
            // if we successfully load the PNG (without exception), then its ok to earmark the
            // file for deletion at successful exit
            f.deleteOnExit();
            return c;
        }
        return DataType.getMissingCell();
    }

    /**
     * Reads the whole of <code>f</code> into memory. A single <code>read</code> call may return
     * fewer bytes than requested, so the read is repeated until the buffer is full.
     *
     * @return the file content, or <code>null</code> if the file is too large for a byte array
     *         or ends before its reported length
     */
    private static byte[] read_fully(final File f) throws IOException {
        final long len = f.length();
        if (len > Integer.MAX_VALUE) {
            return null;
        }
        final byte[] bytes = new byte[(int) len];
        final FileInputStream fis = new FileInputStream(f);
        try {
            int off = 0;
            while (off < bytes.length) {
                int got = fis.read(bytes, off, bytes.length - off);
                if (got < 0) {
                    return null; // end of stream before the expected length
                }
                off += got;
            }
        } finally {
            fis.close();
        }
        return bytes;
    }

    /**
     * Wrapper method which iterates thru a specified model, getting the listener (this) to
     * handle each argument. Clears the input/output file maps first; the listener callbacks
     * append to <code>m_args</code>, which must therefore be initialised by the caller.
     *
     * @param psm the model to iterate over for every setting
     * @throws Exception
     */
    protected void make_args(ProgramSettingsModel psm) throws Exception {
        m_input_files.clear();
        m_output_files.clear();
        for (ProgramSetting ps : psm) {
            ps.getArguments(this);
        }
    }

    /**
     * Returns the location of the tmp folder to use for temporary calculation files and results
     * of the EMBOSS program (the <code>java.io.tmpdir</code> system property).
     */
    public static File get_tmp_folder() {
        return new File(System.getProperty("java.io.tmpdir"));
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void reset() {
        // nothing to reset: all per-run state is rebuilt at the start of execute()
    }

    /**
     * {@inheritDoc}
     * <p>Both output specs are reported as <code>null</code>, i.e. not known before execution.
     */
    @Override
    protected DataTableSpec[] configure(final DataTableSpec[] inSpecs)
            throws InvalidSettingsException {
        return new DataTableSpec[]{null,null};
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void saveSettingsTo(final NodeSettingsWO settings) {
        m_acd.saveSettingsTo(settings);
        m_input_ser.saveSettingsTo(settings);
        m_program.saveSettingsTo(settings);
        m_batch_size.saveSettingsTo(settings);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void loadValidatedSettingsFrom(final NodeSettingsRO settings)
            throws InvalidSettingsException {
        m_acd.loadSettingsFrom(settings);
        m_input_ser.loadSettingsFrom(settings);
        m_program.loadSettingsFrom(settings);
        m_batch_size.loadSettingsFrom(settings);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void validateSettings(final NodeSettingsRO settings)
            throws InvalidSettingsException {
        m_acd.validateSettings(settings);
        m_input_ser.validateSettings(settings);
        m_program.validateSettings(settings);
        m_batch_size.validateSettings(settings);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void loadInternals(final File internDir,
            final ExecutionMonitor exec) throws IOException,
            CanceledExecutionException {
        // this node keeps no internals
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void saveInternals(final File internDir,
            final ExecutionMonitor exec) throws IOException,
            CanceledExecutionException {
        // this node keeps no internals
    }

    /**
     * TODO: this should be a knime emboss preference setting (and maybe other stuff too)
     * @return the (currently hard-coded) root folder of the EMBOSS installation
     */
    public static String getEmbossRoot() {
        return "c:/mEMBOSS";
    }

    /**
     * Runs the specified emboss command with only a single environment variable set: EMBOSS_ROOT
     * (used in the program tree code to update the help page when the user changes the selected program).
     * The command runs in {@link #get_tmp_folder()}, the same working folder execute() uses.
     *
     * @param command_line the complete command line to run
     * @return whatever the process wrote to standard error
     *         (NOTE(review): presumably EMBOSS prints its help text there - confirm)
     */
    public static String run_emboss_command(String command_line) {
        RunEmbossApplication2 rea = new RunEmbossApplication2(command_line,
                new String[] { "EMBOSS_ROOT=" + getEmbossRoot() }, get_tmp_folder());
        rea.waitFor();
        return rea.getProcessStderr();
    }

    /**
     * Reads <code>&lt;emboss root&gt;/acd/&lt;name&gt;.acd</code> into a string, one '\n' after every line.
     *
     * @param name program name, without the <code>.acd</code> extension
     * @return the file content, or the empty string if the file cannot be read
     */
    public static String getACDText(String name) {
        String dir = getEmbossRoot() + File.separator + "acd";
        File f = new File(dir, name+".acd");
        StringBuilder sb = new StringBuilder();
        try {
            BufferedReader rdr = new BufferedReader(new FileReader(f));
            try {
                String line;
                while ((line = rdr.readLine()) != null) {
                    sb.append(line);
                    sb.append('\n');
                }
            } finally {
                rdr.close(); // always release the file handle, also when reading fails
            }
            return sb.toString();
        } catch (IOException e) {
            logger.warn("Unable to read ACD file: "+f.getAbsolutePath(), e);
            return "";
        }
    }

    /**
     * Returns the list of nucleotide and protein scoring matrices which are found in the "data"
     * directory of the mEMBOSS distribution. Guaranteed non-<code>null</code>.
     *
     * @return the list of filenames of the scoring matrices (empty if the folder is missing or unreadable)
     */
    public static String[] getMatrices() {
        // this code, taken from jemboss, ignores SSSUB (but who cares about secondary structure prediction ;-) ???
        File mfl = getEmbossDataFolder();
        if (mfl.isDirectory()) {
            String[] keys = mfl.list(new FilenameFilter() {
                @Override
                public boolean accept(File dir, String name) {
                    return name.startsWith("EPAM")
                            || name.startsWith("EBLOSUM")
                            || name.startsWith("EDNA");
                }
            });
            if (keys != null) { // list() yields null when the folder cannot be read
                return keys;
            }
        }
        return new String[] {};
    }

    /**
     * Returns the list of codons which are supported by a given EMBOSS installation. Guaranteed non-<code>null</code>.
     */
    public static String[] getCodons() {
        File mfl = new File(getEmbossDataFolder(), "CODONS");
        if (mfl.isDirectory()) {
            String[] keys = mfl.list(new FilenameFilter() {
                @Override
                public boolean accept(File dir, String name) {
                    // plain files only
                    return !new File(dir, name).isDirectory();
                }
            });
            if (keys != null) { // list() yields null when the folder cannot be read
                return keys;
            }
        }
        return new String[] {};
    }

    /**
     * returns a File instance (guaranteed non-<code>null</code>) to the Emboss data folder. Not guaranteed to
     * exist or be readable.
     *
     * @return <code>&lt;emboss root&gt;/data</code>
     */
    public static File getEmbossDataFolder() {
        return new File(getEmbossRoot() + File.separator + "data");
    }

    /********************** PROGRAMSETTINGSLISTENER INTERFACE METHODS (called during execute()) **************************************/

    /**
     * Appends every element of <code>str_list</code> to the command line, in order.
     */
    @Override
    public void addArgument(final ProgramSetting ps, String[] str_list) {
        for (String s : str_list) {
            m_args.add(s);
        }
    }

    /**
     * Appends <code>opt</code> and the output file name to the command line and, if the setting
     * reports that it is safe, schedules the output file for deletion after execute().
     */
    @Override
    public void addOutputFileArgument(final OutputFileSetting ps, String opt) {
        m_args.add(opt);
        m_args.add(ps.getFileName());
        if (ps.isSafeToDelete()) { // SAFETY: dont delete anything with data which exists prior to execute()
            m_output_files.put(ps, ps.getFile());
        }
    }

    /**
     * Appends <code>opt</code> and the absolute path of <code>in_file</code> to the command line.
     * If the setting takes its data from an input column the file is also registered for
     * marshalling, which schedules it for deletion after execute().
     */
    @Override
    public void addInputFileArgument(final ProgramSetting ps, String opt, File in_file) {
        m_args.add(opt);
        // in_file comes from either a pre-existing file (no more work required) or from a column, in which case we must marshal it
        if (ps.isInputFromColumn()) {
            m_input_files.put(ps, in_file); // placing a file in this map schedules it FOR DELETION: CAREFUL!
        }
        m_args.add(in_file.getAbsolutePath());
    }
}