package au.com.acpfg.io.genbank.reader; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; import java.util.zip.GZIPInputStream; import org.biojava.bio.seq.Feature.Template; import org.biojava.bio.seq.io.ParseException; import org.biojava.bio.seq.io.SymbolTokenization; import org.biojava.bio.symbol.Alphabet; import org.biojava.bio.symbol.IllegalAlphabetException; import org.biojava.bio.symbol.SimpleSymbolList; import org.biojava.bio.symbol.Symbol; import org.biojava.bio.symbol.SymbolList; import org.biojavax.Namespace; import org.biojavax.RankedCrossRef; import org.biojavax.RankedDocRef; import org.biojavax.SimpleNamespace; import org.biojavax.bio.BioEntryRelationship; import org.biojavax.bio.seq.RichFeature; import org.biojavax.bio.seq.RichSequence; import org.biojavax.bio.seq.SimpleRichFeature; import org.biojavax.bio.seq.io.GenbankFormat; import org.biojavax.bio.seq.io.RichSeqIOListener; import org.biojavax.bio.taxa.NCBITaxon; import org.knime.core.data.DataCell; import org.knime.core.data.DataColumnSpec; import org.knime.core.data.DataColumnSpecCreator; import org.knime.core.data.DataTableSpec; import org.knime.core.data.DataType; import org.knime.core.data.collection.CollectionCellFactory; import org.knime.core.data.collection.ListCell; import org.knime.core.data.def.DefaultRow; import org.knime.core.data.def.IntCell; import org.knime.core.data.def.StringCell; import org.knime.core.data.vector.bitvector.DenseBitVector; import org.knime.core.node.BufferedDataContainer; import org.knime.core.node.BufferedDataTable; import org.knime.core.node.CanceledExecutionException; import org.knime.core.node.ExecutionContext; import org.knime.core.node.ExecutionMonitor; import org.knime.core.node.InvalidSettingsException; import org.knime.core.node.NodeLogger; import org.knime.core.node.NodeModel; import org.knime.core.node.NodeSettingsRO; import org.knime.core.node.NodeSettingsWO; import org.knime.core.node.defaultnodesettings.SettingsModelBoolean; import org.knime.core.node.defaultnodesettings.SettingsModelString; import org.knime.core.node.defaultnodesettings.SettingsModelStringArray; /** * This is the model implementation of GenBankReader. * Using BioJava, this node reads the specified files/folder for compressed genbank or .gb files and loads the sequences into a single table along with most of key metadata * * @author http://www.plantcell.unimelb.edu.au */ public class GenBankReaderNodeModel extends NodeModel implements RichSeqIOListener { // the logger instance private static final NodeLogger logger = NodeLogger .getLogger(GenBankReaderNodeModel.class); /** the settings key which is used to retrieve and store the settings (from the dialog or from a settings file) (package visibility to be usable from the dialog). */ static final String CFGKEY_ISFILE = "folder-or-folder?"; static final String CFGKEY_FILE = "filename"; static final String CFGKEY_FOLDER = "foldername"; static final String CFGKEY_FEATURES="feature-list"; static final String CFGKEY_SEQTYPE="sequence-type"; static final String CFGKEY_TAXONOMY_FILTER="taxonomy-filter-keywords"; private final SettingsModelBoolean m_isfile = new SettingsModelBoolean(CFGKEY_ISFILE, true); private final SettingsModelString m_filename= new SettingsModelString(CFGKEY_FILE, "c:/temp/gb.seq"); private final SettingsModelString m_folder = new SettingsModelString(CFGKEY_FOLDER, "c:/temp"); private final SettingsModelStringArray m_features = new SettingsModelStringArray(CFGKEY_FEATURES, new String[] { "COMMENT", "ID" }); private final SettingsModelString m_seqtype = new SettingsModelString(CFGKEY_SEQTYPE, "DNA"); private final SettingsModelString m_taxonomy_filter = new SettingsModelString(CFGKEY_TAXONOMY_FILTER, "Lolium"); // internal state during execute -- not persisted private StringBuffer m_symbols; private String m_accsn; private StringBuffer m_comments; private NCBITaxon m_taxon; private boolean m_circular; private String m_descr; private String m_seq_version; private int m_entry_version; private RichFeature m_feature; private ArrayList<StringCell> m_feature_cells; /** * Constructor for the node model. */ protected GenBankReaderNodeModel() { // one outgoing port only super(0, 1); // ensure model is correctly initialised m_folder.setEnabled(!m_isfile.getBooleanValue()); m_filename.setEnabled(m_isfile.getBooleanValue()); } /** * {@inheritDoc} */ @Override protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception { ArrayList<File> files_to_read = new ArrayList<File>(); File[] scan_files; if (m_isfile.getBooleanValue()) { scan_files = new File[] { new File(m_filename.getStringValue()) }; } else { scan_files = new File(m_folder.getStringValue()).listFiles(); } for (File f : scan_files) { if (!f.isFile() || !f.canRead() || f.length() < 1) { logger.warn("Skipping inaccessible file: "+f.getName()); continue; } String fname = f.getName().toLowerCase(); if (fname.startsWith("gbest")) { files_to_read.add(f); } else if (fname.endsWith(".gb") || fname.endsWith(".gb.gz") || fname.endsWith(".gbk") || fname.endsWith(".gbk.gz")) { files_to_read.add(f); } else if (fname.endsWith(".seq.gz") || fname.endsWith(".seq")) { files_to_read.add(f); } } logger.info("GenBank Reader: found "+files_to_read.size()+" plausible GenBank data files to load"); // the data table spec of the single output table, // the table will have three columns: DataColumnSpec[] allColSpecs = new DataColumnSpec[9]; allColSpecs[0] = new DataColumnSpecCreator("GenBank ID", StringCell.TYPE).createSpec(); allColSpecs[1] = new DataColumnSpecCreator("GenBank Sequence", StringCell.TYPE).createSpec(); allColSpecs[2] = new DataColumnSpecCreator("Filename", StringCell.TYPE).createSpec(); allColSpecs[3] = new DataColumnSpecCreator("NCBI Taxon ID", IntCell.TYPE).createSpec(); allColSpecs[4] = new DataColumnSpecCreator("Sequence Version", StringCell.TYPE).createSpec(); allColSpecs[5] = new DataColumnSpecCreator("Entry Version", IntCell.TYPE).createSpec(); allColSpecs[6] = new DataColumnSpecCreator("Comments", StringCell.TYPE).createSpec(); allColSpecs[7] = new DataColumnSpecCreator("Description", StringCell.TYPE).createSpec(); allColSpecs[8] = new DataColumnSpecCreator("Feature Properties (list)", ListCell.getCollectionType(StringCell.TYPE)).createSpec(); DataTableSpec outputSpec = new DataTableSpec(allColSpecs); // the execution context will provide us with storage capacity, in this // case a data container to which we will add rows sequentially // Note, this container can also handle arbitrary big data tables, it // will buffer to disc if necessary. BufferedDataContainer container = exec.createDataContainer(outputSpec); int done_files = 0; int hit = 1; // setup the match data structure (int[]) for the taxa String[] taxa_ids = m_taxonomy_filter.getStringValue().split("\\s+"); int max = 0; ArrayList<Integer> bits_to_set = new ArrayList<Integer>(); for (String id : taxa_ids) { if (id.trim().length() > 0) { Integer taxa_id = new Integer(id.trim()); if (taxa_id.intValue() > max) { max = taxa_id.intValue(); bits_to_set.add(taxa_id); } } } DenseBitVector bv = new DenseBitVector(max+1); for (Integer i : bits_to_set) { bv.set(i.intValue()); } boolean has_taxa_filter = (bv.cardinality() > 0); int[] final_taxa_ids = new int[(int) bv.cardinality()]; // process the files int failed_files = 0; for (File f : files_to_read) { int cnt = 0; int accepted = 0; // here we use the fully qualified type to make it clear which biojava package we want org.biojavax.bio.seq.io.GenbankFormat gbf = new GenbankFormat(); InputStream is; SymbolTokenization st = RichSequence.IOTools.getDNAParser(); if (m_seqtype.getStringValue().equalsIgnoreCase("RNA")) { st = RichSequence.IOTools.getRNAParser(); } else if (m_seqtype.getStringValue().equalsIgnoreCase("Protein")) { st = RichSequence.IOTools.getProteinParser(); } // make a new stream rather than use one which has been partially read BufferedReader rdr = new BufferedReader(new InputStreamReader(make_input_stream(f))); // the SeqIOListener (this) will setup internal member variables for the loop // to process... boolean more = true; try { do { // setup internal state to ensure missing cells get generated if the entry does not specify it m_accsn = null; m_symbols = null; m_taxon = null; m_comments = null; m_seq_version = null; m_descr = null; m_feature_cells = null; // read the next genbank sequence from the input, failing gracefully to handle poor entries well more = gbf.readRichSequence(rdr, st, this, null); cnt++; if (has_taxa_filter) { int t_id = m_taxon.getNCBITaxID(); if (t_id < 0 || t_id >= bv.length()) { continue; } if (!bv.get(t_id)) { continue; } } // add the row to the table, since it has passed the taxonomy filter (if any) DataCell[] cells = new DataCell[9]; cells[0] = safe_cell(m_accsn); cells[1] = safe_cell(m_symbols); cells[2] = new StringCell(f.getName()); cells[3] = (m_taxon != null) ? new IntCell(m_taxon.getNCBITaxID()) : DataType.getMissingCell(); cells[4] = safe_cell(m_seq_version); cells[5] = new IntCell(m_entry_version); cells[6] = safe_cell(m_comments); cells[7] = safe_cell(m_descr); cells[8] = safe_cell(m_feature_cells); accepted++; container.addRowToTable(new DefaultRow("GB"+hit, cells)); hit++; if (hit % 200 == 0) { exec.checkCanceled(); } } while (more); logger.info("Processed "+cnt+" genbank entries (accepted "+accepted+") in "+f.getName()); } catch (Exception e) { failed_files++; logger.warn("Error in genbank record in "+f.getName()+" error msg is: "); logger.warn(e.getMessage()); e.printStackTrace(); } rdr.close(); done_files++; exec.checkCanceled(); exec.setProgress(((double) done_files) / files_to_read.size()); } logger.info("Processed "+done_files+" files ("+failed_files+" contained errors). Loading complete."); // once we are done, we close the container and return its table container.close(); BufferedDataTable out = container.getTable(); return new BufferedDataTable[]{out}; } /** * Responsible for ensuring a valid DataCell is returned (use a missing cell if <code>str</code> is not valid) * @param str * @return a valid KNIME data cell (NOT guaranteed to be a StringCell) */ protected DataCell safe_cell(String str) { return (str != null) ? new StringCell(str) : DataType.getMissingCell(); } /** * Responsible for ensuring a valid DataCell is returned (use a missing cell if <code>str</code> is not valid) * @param str * @return a valid KNIME data cell (NOT guaranteed to be a StringCell) */ protected DataCell safe_cell(StringBuffer str) { if (str == null) return DataType.getMissingCell(); return safe_cell(str.toString()); } /** * Responsible for ensuring a valid collection cell is returned */ protected DataCell safe_cell(List<StringCell> cells) { if (cells == null || cells.size() < 1) return DataType.getMissingCell(); return CollectionCellFactory.createListCell(cells); } /** * Returns an inputstream object, depending on whether the file is likely compressed or not * * @param f * @return * @throws IOException */ private InputStream make_input_stream(File f) throws IOException { boolean is_compressed = f.getName().endsWith(".gz"); if (is_compressed) { return new GZIPInputStream(new FileInputStream(f)); } else { return new FileInputStream(f); } } /** * {@inheritDoc} */ @Override protected void reset() { } /** * {@inheritDoc} */ @Override protected DataTableSpec[] configure(final DataTableSpec[] inSpecs) throws InvalidSettingsException { return new DataTableSpec[]{null}; } /** * {@inheritDoc} */ @Override protected void saveSettingsTo(final NodeSettingsWO settings) { m_isfile.saveSettingsTo(settings); m_folder.saveSettingsTo(settings); m_filename.saveSettingsTo(settings); m_features.saveSettingsTo(settings); m_seqtype.saveSettingsTo(settings); m_taxonomy_filter.saveSettingsTo(settings); } /** * {@inheritDoc} */ @Override protected void loadValidatedSettingsFrom(final NodeSettingsRO settings) throws InvalidSettingsException { m_isfile.loadSettingsFrom(settings); m_folder.loadSettingsFrom(settings); m_filename.loadSettingsFrom(settings); m_features.loadSettingsFrom(settings); m_seqtype.loadSettingsFrom(settings); m_taxonomy_filter.loadSettingsFrom(settings); } /** * {@inheritDoc} */ @Override protected void validateSettings(final NodeSettingsRO settings) throws InvalidSettingsException { m_isfile.validateSettings(settings); m_folder.validateSettings(settings); m_filename.validateSettings(settings); m_features.validateSettings(settings); m_seqtype.validateSettings(settings); m_taxonomy_filter.validateSettings(settings); } /** * {@inheritDoc} */ @Override protected void loadInternals(final File internDir, final ExecutionMonitor exec) throws IOException, CanceledExecutionException { } /** * {@inheritDoc} */ @Override protected void saveInternals(final File internDir, final ExecutionMonitor exec) throws IOException, CanceledExecutionException { } /** * **************************** RichSeqIOListener methods **************************** * */ @Override public void addFeatureProperty(Object arg0, Object arg1) throws ParseException { if (m_feature_cells == null) { m_feature_cells = new ArrayList<StringCell>(); } if (arg0 != null && arg1 != null) { String key = arg0.toString(); int colon_idx = key.indexOf(':'); // remove namespace prefix from key (not for users!) if (colon_idx >= 0) { key = key.substring(colon_idx+1); } m_feature_cells.add(new StringCell(key+"="+arg1.toString())); } } @Override public void addSequenceProperty(Object key, Object value) throws ParseException { // TODO Auto-generated method stub } @Override public void addSymbols(Alphabet alphabet, Symbol[] symbols, int start, int len) throws IllegalAlphabetException { assert(start < len && start >= 0); assert(alphabet != null && symbols != null); SymbolList sl; if (start == 0) { sl = new SimpleSymbolList(symbols, len, alphabet); } else { Symbol[] new_list = new Symbol[len-start]; System.arraycopy(symbols, start, new_list, 0, len-start); sl = new SimpleSymbolList(new_list, new_list.length, alphabet); } m_symbols.append(sl.seqString().toUpperCase()); } @Override public void endFeature() throws ParseException { } @Override public void endSequence() throws ParseException { } @Override public void setName(String arg0) throws ParseException { } @Override public void startFeature(Template arg0) throws ParseException { m_feature = RichFeature.Tools.makeEmptyFeature(); } @Override public void startSequence() throws ParseException { m_symbols = new StringBuffer(1024); } @Override public RichFeature getCurrentFeature() throws ParseException { return m_feature; } @Override public void setAccession(String arg0) throws ParseException { m_accsn = arg0; } @Override public void setCircular(boolean arg0) throws ParseException { m_circular = arg0; } @Override public void setComment(String arg0) throws ParseException { if (m_comments == null) { m_comments = new StringBuffer(1024); } m_comments.append(arg0+"\n"); } @Override public void setDescription(String arg0) throws ParseException { m_descr = arg0; } @Override public void setDivision(String arg0) throws ParseException { } @Override public void setIdentifier(String arg0) throws ParseException { } @Override public void setNamespace(Namespace arg0) throws ParseException { } @Override public void setRankedCrossRef(RankedCrossRef arg0) throws ParseException { } @Override public void setRankedDocRef(RankedDocRef arg0) throws ParseException { } @Override public void setRelationship(BioEntryRelationship arg0) throws ParseException { } @Override public void setSeqVersion(String arg0) throws ParseException { m_seq_version = arg0; } @Override public void setTaxon(NCBITaxon arg0) throws ParseException { m_taxon = arg0; } @Override public void setURI(String arg0) throws ParseException { } @Override public void setVersion(int arg0) throws ParseException { m_entry_version = arg0; } }