package eu.dnetlib.iis.wf.export.actionmanager.entity; import static eu.dnetlib.iis.wf.export.actionmanager.ExportWorkflowRuntimeParameters.EXPORT_ACTION_SETID; import static eu.dnetlib.iis.wf.export.actionmanager.ExportWorkflowRuntimeParameters.EXPORT_SEQ_FILE_OUTPUT_DIR_NAME; import static eu.dnetlib.iis.wf.export.actionmanager.ExportWorkflowRuntimeParameters.EXPORT_SEQ_FILE_OUTPUT_DIR_ROOT; import static eu.dnetlib.iis.common.WorkflowRuntimeParameters.OOZIE_ACTION_OUTPUT_FILENAME; import java.io.IOException; import java.lang.reflect.Constructor; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.avro.Schema; import org.apache.avro.specific.SpecificRecordBase; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.log4j.Logger; import org.springframework.core.io.ClassPathResource; import org.springframework.core.io.Resource; import com.google.common.base.Preconditions; import com.google.common.collect.Maps; import eu.dnetlib.actionmanager.actions.ActionFactory; import eu.dnetlib.actionmanager.actions.AtomicAction; import eu.dnetlib.actionmanager.actions.XsltInfoPackageAction; import eu.dnetlib.actionmanager.common.Operation; import eu.dnetlib.actionmanager.common.Provenance; import eu.dnetlib.data.mdstore.DocumentNotFoundException; import eu.dnetlib.iis.common.InfoSpaceConstants; import eu.dnetlib.iis.common.WorkflowRuntimeParameters; import eu.dnetlib.iis.common.counter.NamedCounters; import eu.dnetlib.iis.common.counter.NamedCountersFileWriter; import eu.dnetlib.iis.common.java.PortBindings; import eu.dnetlib.iis.common.java.Process; import eu.dnetlib.iis.common.java.ProcessUtils; import eu.dnetlib.iis.common.java.io.CloseableIterator; import eu.dnetlib.iis.common.java.io.DataStore; import eu.dnetlib.iis.common.java.io.FileSystemPath; import eu.dnetlib.iis.common.java.porttype.AvroPortType; import eu.dnetlib.iis.common.java.porttype.PortType; import eu.dnetlib.iis.wf.export.actionmanager.api.ActionManagerServiceFacade; import eu.dnetlib.iis.wf.export.actionmanager.api.SequenceFileActionManagerServiceFacade; import eu.dnetlib.iis.wf.export.actionmanager.cfg.StaticConfigurationProvider; import eu.dnetlib.iis.wf.export.actionmanager.entity.facade.MDStoreFacade; import eu.dnetlib.iis.wf.export.actionmanager.entity.facade.MDStoreFacadeFactory; /** * Common codebase responsible for exporting generic entities. * To be extended by classes responsible for exporting specific entity types. * * @author mhorst * */ public abstract class AbstractEntityExporterProcess<T extends SpecificRecordBase> implements Process { public static final String PORT_INPUT = "input"; public static final String MDSTORE_FACADE_FACTORY_CLASS = "mdstore.facade.factory.classname"; public static final String TOTAL_ENTITIES_COUNTER_NAME = "TOTAL_ENTITIES_COUNTER"; public static final String MISSING_ENTITIES_COUNTER_NAME = "MISSING_ENTITIES_COUNTER"; private static final Provenance PROVENANCE_DEFAULT = Provenance.sysimport_mining_repository; private final Logger log = Logger.getLogger(this.getClass()); private final Schema inputPortSchema; private final String entityXSLTName; private final String entityXSLTLocation; private final ActionFactory actionFactory; private final NamedCountersFileWriter countersWriter = new NamedCountersFileWriter(); // ------------------------ CONSTRUCTORS ----------------------------- /** * @param inputPortSchema input port avro schema * @param entityXSLTName entity XSL transformation name * @param entityXSLTLocation entity XSL transformation location */ public AbstractEntityExporterProcess(Schema inputPortSchema, String entityXSLTName, String entityXSLTLocation) { this.inputPortSchema = inputPortSchema; this.entityXSLTName = entityXSLTName; this.entityXSLTLocation = entityXSLTLocation; this.actionFactory = buildActionFactory(); } // ------------------------ LOGIC ----------------------------- /** * @param portBindings input port name bound to HDFS location * @param conf hadoop configuration * @param parameters process parameters configuring action manager and mdstore facade */ @Override public void run(PortBindings portBindings, Configuration conf, Map<String, String> parameters) throws Exception { String actionSetId = ProcessUtils.getParameterValue(EXPORT_ACTION_SETID, conf, parameters); Preconditions.checkArgument(StringUtils.isNotBlank(actionSetId) && !WorkflowRuntimeParameters.UNDEFINED_NONEMPTY_VALUE.equals(actionSetId), "unable to export document entities to action manager due to missing action set identifier, " + "no '%s' required parameter provided!", EXPORT_ACTION_SETID); try (ActionManagerServiceFacade actionManager = buildActionManager(conf, parameters); CloseableIterator<T> it = getIterator(portBindings.getInput().get(PORT_INPUT), conf)) { NamedCounters counters = new NamedCounters(new String[] { TOTAL_ENTITIES_COUNTER_NAME, MISSING_ENTITIES_COUNTER_NAME }); MDStoreFacade mdStore = buildMDStoreFacade(parameters); long counter = 0; while (it.hasNext()) { MDStoreIdWithEntityId mdStoreComplexId = convertIdentifier(it.next()); String mdRecordId = convertToMDStoreEntityId(mdStoreComplexId.getEntityId()); try { String mdRecord = mdStore.fetchRecord(mdStoreComplexId.getMdStoreId(), mdRecordId); handleRecord(mdRecord, actionSetId, actionManager); counter++; } catch (DocumentNotFoundException e) { log.error("mdrecord: " + mdRecordId + " wasn't found in mdstore: " + mdStoreComplexId.getMdStoreId(), e); counters.increment(MISSING_ENTITIES_COUNTER_NAME, 1l); } catch (Exception e) { log.error("got exception when trying to retrieve " + "MDStore record for mdstore id " + mdStoreComplexId.getMdStoreId() + ", and document id: " + mdRecordId, e); throw e; } } counters.increment(TOTAL_ENTITIES_COUNTER_NAME, counter); countersWriter.writeCounters(counters, System.getProperty(OOZIE_ACTION_OUTPUT_FILENAME)); } } @Override public Map<String, PortType> getInputPorts() { HashMap<String, PortType> inputPorts = Maps.newHashMap(); inputPorts.put(PORT_INPUT, new AvroPortType(inputPortSchema)); return inputPorts; } @Override public Map<String, PortType> getOutputPorts() { return Collections.emptyMap(); } /** * Converts specific identifier object into generic representation with mdStore and entity identifiers explicitly set. * * @return mdstore and entity identifiers pair */ abstract protected MDStoreIdWithEntityId convertIdentifier(T element); // ------------------------ PRIVATE ----------------------------- /** * Handles single record retrieved from MDStore. * * @param mdStoreRecord MDStore record to be processed * @param actionSetId action set identifier to be attached to generated actions * @param actionManager generated actions consumer */ private void handleRecord(String mdStoreRecord, String actionSetId, ActionManagerServiceFacade actionManager) throws Exception { XsltInfoPackageAction xsltAction = actionFactory.generateInfoPackageAction(entityXSLTName, actionSetId, StaticConfigurationProvider.AGENT_DEFAULT, Operation.INSERT, mdStoreRecord, PROVENANCE_DEFAULT, StaticConfigurationProvider.ACTION_TRUST_0_9); List<AtomicAction> atomicActions = xsltAction.asAtomicActions(); actionManager.storeActions(atomicActions); } /** * Creates action manager instance. * * @param conf * @param parameters * @return action manager instance * @throws IOException */ protected ActionManagerServiceFacade buildActionManager(Configuration conf, Map<String, String> parameters) throws IOException { return new SequenceFileActionManagerServiceFacade(conf, ProcessUtils.getParameterValue(EXPORT_SEQ_FILE_OUTPUT_DIR_ROOT, conf, parameters), ProcessUtils.getParameterValue(EXPORT_SEQ_FILE_OUTPUT_DIR_NAME, conf, parameters)); } /** * @param inputPath input path containing avro records * @param conf hadoop configuration required to initialize {@link FileSystem} * @return closeable iterator over avro input records */ protected CloseableIterator<T> getIterator(Path inputPath, Configuration conf) throws IOException { return DataStore.<T> getReader(new FileSystemPath(FileSystem.get(conf), inputPath)); } /** * Builds MDStore service facade. * * @param parameters set of parameters configuring {@link MDStoreFacade}, * at least {@value AbstractEntityExporterProcess#MDSTORE_FACADE_FACTORY_CLASS} parameter is required */ private MDStoreFacade buildMDStoreFacade(Map<String, String> parameters) { String serviceFactoryClassName = parameters.get(MDSTORE_FACADE_FACTORY_CLASS); Preconditions.checkArgument(StringUtils.isNotBlank(serviceFactoryClassName), "unknown service facade factory, no '%s' parameter provided!", MDSTORE_FACADE_FACTORY_CLASS); try { Class<?> clazz = Class.forName(serviceFactoryClassName); Constructor<?> constructor = clazz.getConstructor(); MDStoreFacadeFactory serviceFactory = (MDStoreFacadeFactory) constructor.newInstance(); return serviceFactory.create(parameters); } catch (Exception e) { throw new RuntimeException("exception occurred while instantiating service by facade factory: " + MDSTORE_FACADE_FACTORY_CLASS, e); } } /** * Creates action factory transforming MDStore records into actions. * */ private ActionFactory buildActionFactory() { Map<String, Resource> xslts = new HashMap<String, Resource>(); xslts.put(entityXSLTName, new ClassPathResource(entityXSLTLocation)); ActionFactory localActionFactory = new ActionFactory(); localActionFactory.setXslts(xslts); return localActionFactory; } /** * Converts entity identifier to MDStore internal entity id by removing result entity prefix being part of InfoSpace model. * * @param id source entity identifier to be processed * @return MDStore compliant entity identifier */ private final String convertToMDStoreEntityId(String id) { if (id != null && id.startsWith(InfoSpaceConstants.ROW_PREFIX_RESULT)) { return id.substring(InfoSpaceConstants.ROW_PREFIX_RESULT.length()); } else { return id; } } // ------------------------ INNER CLASS ---------------------------------- public class MDStoreIdWithEntityId { private final String mdStoreId; private final String entityId; // ------------------------ CONSTRUCTORS ---------------------------------- public MDStoreIdWithEntityId(String mdStoreId, String entityId) { this.mdStoreId = mdStoreId; this.entityId = entityId; } // ------------------------ GETTERS ---------------------------------- public String getMdStoreId() { return mdStoreId; } public String getEntityId() { return entityId; } } }