package edu.harvard.med.iccbl.screensaver.pipelinepilot; import org.apache.log4j.Logger; import com.scitegic.pilot.Component; import com.scitegic.pilot.Context; import com.scitegic.pilot.DataRecord; import com.scitegic.pilot.Property; import com.scitegic.pilot.PropertyCollection; import com.scitegic.pilot.Value; import edu.harvard.med.screensaver.util.StringUtils; import edu.harvard.med.screensaver.util.eutils.NCBIGeneInfo; import edu.harvard.med.screensaver.util.eutils.NCBIGeneInfoProvider; import edu.harvard.med.screensaver.util.eutils.NCBIGeneInfoProviderImpl; /** * TODO: place this code in a command line utility, i.e. PubchemChembankQueryUtility, however; * TODO: rework code using SOAP; see note - sde4 * NOTE: As noted at: http://eutils.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/DOC/esoap_help.html WSDL2Java from Axis2 generates for efetch_gene.xsd a source code which could not be compiled. This can be fixed if you add some prefix to local variables names in Value_type47 declaration. For example, add "x" as a prefix, so _fun variable become x_fun and fun variable become xfun, etc. There 34 variable names to change: _fun, _inv, _mam, _org, _phg, _pln, _pri, _pro, _rod, _syn, _una, _vrl, _vrt, _pat, _est, _sts, _other, fun, inv, mam, org, phg, pln, pri, pro, rod, syn, una, vrl, vrt, pat, est, sts, other. * @author sde4 */ public class NCBIGeneInfoComponent implements com.scitegic.pilot.Component, ScreensaverComponent { private static Logger log = Logger.getLogger(NCBIGeneInfoComponent.class); public static final String PROPERTY_FAIL_REASON_FIELD = "ncbi_fail_reason"; public static final String PROPERTY_VENDOR_ENTREZ_GENE_SYMBOLS = "Vendor EntrezGene Symbols"; public static final String PROPERTY_VENDOR_GENE_NAME = "Vendor Gene Name"; public static final String PROPERTY_VENDOR_SPECIES = "Vendor Species"; private String _listDelimiter = ";"; private String _inputField = "Vendor EntrezGene ID"; private NCBIGeneInfoProvider _geneInfoProvider; /** * From Pipeline Pilot help: "Java Component Development" guide, pp. 14:<br> * "Component.onInitialize(): This method is invoked once before any data records arrive. This * method receives one argument, a com.scitegic.pilot.Context object. Also, it must return a * Component.State value, one of ReadyForInputData, DoneProcessingData, * ReadyForNewData or ReadyForInputThenNewData." */ public State onInitialize(Context context) throws Exception { if(context != null) { PropertyCollection params = context.getComponentParameters(); Property prop = params.findByName(PROPERTY_LIST_DELIMITER); if(prop != null) { _listDelimiter = prop.getValue().getString(); } else { String msg = "Define the property: \""+ PROPERTY_LIST_DELIMITER + "\" to use this component"; log.warn(msg); throw new IllegalArgumentException(msg); } prop = params.findByName(PROPERTY_INPUT_FIELD); if(prop != null) { _inputField = prop.getValue().getString(); } else { String msg = "Define the property: \""+ PROPERTY_INPUT_FIELD + "\" to use this component"; log.warn(msg); throw new IllegalArgumentException(msg); } } else { String msg = "FATAL: no Context defined"; log.error(msg); throw new IllegalArgumentException(msg); } _geneInfoProvider = new NCBIGeneInfoProviderImpl(); return Component.State.ReadyForInputData; } /** * From Pipeline Pilot help: "Java Component Development" guide, pp. 14:<br> * "Component.onProcess(): This method is called once per data record. This method receives * two arguments (a com.scitegic.pilot.Context object and a com.scitegic.pilot.DataRecord * object) and returns a Component.State value to indicate the component's state after * processing the data." */ public State onProcess(Context context, DataRecord data) throws Exception { log.info("onProcess called" ); PropertyCollection record = data.getProperties(); Property property = record.getByName(_inputField); Value value = null; if(( value = property.getValue() )== null || StringUtils.isEmpty(value.getString()) ) { data.routeTo(DataRecord.Route.FailPort); String errMsg ="Field: " + _inputField + " is empty."; record.define(PROPERTY_FAIL_REASON_FIELD, errMsg); record.define(PROPERTY_FAIL_FAST_FIELD, "true"); log.warn(errMsg); //TODO: note that logging is unreliable in multiprocess subprotocols } else { int vendorEntrezGeneId = value.getInteger(); try { long before = System.currentTimeMillis(); NCBIGeneInfo geneInfo = _geneInfoProvider.getGeneInfoForEntrezgeneId(vendorEntrezGeneId); log.info("query time: " + (System.currentTimeMillis()-before) + ", entrezGeneId: " + vendorEntrezGeneId + ", geneInfo: " + geneInfo); record.define(PROPERTY_VENDOR_ENTREZ_GENE_SYMBOLS, geneInfo.getEntrezgeneSymbol() ); record.define(PROPERTY_VENDOR_GENE_NAME, geneInfo.getGeneName()); // TODO: is this req'd (not req'd in the spec: https://wiki.med.harvard.edu/ICCBL/RNAiLibrariesExcelFileFormat2 ) record.define(PROPERTY_VENDOR_SPECIES, geneInfo.getSpeciesName()); } catch (Exception e) { data.routeTo(DataRecord.Route.FailPort); String errMsg = "Exception on querying on inputfield: " + _inputField + ", value: " + value.toString() + ", exception: " + e.getClass().getName() + ": " + e.getMessage(); record.define(PROPERTY_FAIL_REASON_FIELD, errMsg); log.warn(errMsg, e); } } return Component.State.ReadyForInputData; } /** * From Pipeline Pilot help: "Java Component Development" guide, pp. 14:<br> * " * Component.onFinalize(): This method is called when no more data records are available. * " */ public void onFinalize(Context arg0) throws Exception { } }