package eu.dnetlib.iis.wf.importer.infospace;

import static eu.dnetlib.iis.wf.importer.ImportWorkflowRuntimeParameters.IMPORT_INFERENCE_PROVENANCE_BLACKLIST;
import static eu.dnetlib.iis.wf.importer.ImportWorkflowRuntimeParameters.IMPORT_MERGE_BODY_WITH_UPDATES;
import static eu.dnetlib.iis.wf.importer.ImportWorkflowRuntimeParameters.IMPORT_SKIP_DELETED_BY_INFERENCE;
import static eu.dnetlib.iis.wf.importer.ImportWorkflowRuntimeParameters.IMPORT_TRUST_LEVEL_THRESHOLD;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.List;
import java.util.Map;

import org.apache.avro.mapred.AvroKey;
import org.apache.avro.specific.SpecificRecord;
import org.apache.commons.collections.CollectionUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.log4j.Logger;

import com.google.common.base.Preconditions;
import com.googlecode.protobuf.format.JsonFormat;
import com.googlecode.protobuf.format.JsonFormat.ParseException;

import eu.dnetlib.data.mapreduce.util.OafRelDecoder;
import eu.dnetlib.data.proto.DedupProtos.Dedup;
import eu.dnetlib.data.proto.OafProtos.Oaf;
import eu.dnetlib.data.proto.ProjectOrganizationProtos.ProjectOrganization;
import eu.dnetlib.data.proto.RelTypeProtos.RelType;
import eu.dnetlib.data.proto.RelTypeProtos.SubRelType;
import eu.dnetlib.data.proto.ResultProjectProtos.ResultProject.Outcome;
import eu.dnetlib.data.proto.TypeProtos.Type;
import eu.dnetlib.iis.common.InfoSpaceConstants;
import eu.dnetlib.iis.common.WorkflowRuntimeParameters;
import eu.dnetlib.iis.common.javamapreduce.MultipleOutputs;
import eu.dnetlib.iis.importer.schemas.DocumentMetadata;
import eu.dnetlib.iis.importer.schemas.ProjectToOrganization;
import eu.dnetlib.iis.wf.importer.OafHelper;
import eu.dnetlib.iis.wf.importer.infospace.approver.DataInfoBasedApprover;
import eu.dnetlib.iis.wf.importer.infospace.approver.ResultApprover;
import eu.dnetlib.iis.wf.importer.infospace.converter.DeduplicationMappingConverter;
import eu.dnetlib.iis.wf.importer.infospace.converter.DocumentMetadataConverter;
import eu.dnetlib.iis.wf.importer.infospace.converter.DocumentToProjectRelationConverter;
import eu.dnetlib.iis.wf.importer.infospace.converter.InfoSpaceRecordUtils;
import eu.dnetlib.iis.wf.importer.infospace.converter.OafEntityToAvroConverter;
import eu.dnetlib.iis.wf.importer.infospace.converter.OafRelToAvroConverter;
import eu.dnetlib.iis.wf.importer.infospace.converter.OrganizationConverter;
import eu.dnetlib.iis.wf.importer.infospace.converter.PersonConverter;
import eu.dnetlib.iis.wf.importer.infospace.converter.ProjectConverter;
import eu.dnetlib.iis.wf.importer.infospace.converter.ProjectToOrganizationRelationConverter;

/**
 * InformationSpace reducer phase importing {@link InfoSpaceRecord}s grouped by row identifier.
 * Emits entities and relations as avro records written to multiple outputs.
 * Each output is associated with an individual entity or relation type.
 *
 * @author mhorst
 */
public class ImportInformationSpaceReducer extends Reducer<Text, InfoSpaceRecord, NullWritable, NullWritable> {

    protected static final Logger log = Logger.getLogger(ImportInformationSpaceReducer.class);

    // context property names
    protected static final String OUTPUT_NAME_DOCUMENT_META = "output.name.document_meta";

    protected static final String OUTPUT_NAME_DOCUMENT_PROJECT = "output.name.document_project";

    protected static final String OUTPUT_NAME_PROJECT = "output.name.project";

    protected static final String OUTPUT_NAME_PERSON = "output.name.person";

    protected static final String OUTPUT_NAME_DEDUP_MAPPING = "output.name.dedup_mapping";

    protected static final String OUTPUT_NAME_ORGANIZATION = "output.name.organization";

    protected static final String OUTPUT_NAME_PROJECT_ORGANIZATION = "output.name.project_organization";

    // column family names
    protected final String projOrgColumnFamily = OafRelDecoder.getCFQ(RelType.projectOrganization,
            SubRelType.participation, ProjectOrganization.Participation.RelName.hasParticipant.toString());

    protected final String resProjColumnFamily = OafRelDecoder.getCFQ(RelType.resultProject,
            SubRelType.outcome, Outcome.RelName.isProducedBy.toString());

    protected final String dedupMappingColumnFamily = OafRelDecoder.getCFQ(RelType.resultResult,
            SubRelType.dedup, Dedup.RelName.merges.toString());

    // output names
    private String outputNameDocumentMeta;

    private String outputNameDocumentProject;

    private String outputNameProject;

    private String outputNamePerson;

    private String outputNameDedupMapping;

    private String outputNameOrganization;

    private String outputNameProjectOrganization;

    // converters
    private DocumentMetadataConverter docMetaConverter;

    private DocumentToProjectRelationConverter docProjectConverter;

    private DeduplicationMappingConverter deduplicationMappingConverter;

    private PersonConverter personConverter;

    private ProjectConverter projectConverter;

    private OrganizationConverter organizationConverter;

    private ProjectToOrganizationRelationConverter projectOrganizationConverter;

    // others
    private MultipleOutputs outputs;

    private ResultApprover resultApprover;

    /**
     * Flag indicating whether {@link Oaf} retrieved from the body column family should be merged
     * with all update columns. Set to false by default.
     */
    private boolean mergeBodyWithUpdates;

    // ------------------------ LOGIC --------------------------
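    /**
     * Initializes output names, the body-with-updates merge flag, the result approver
     * and all entity and relation converters based on the job configuration.
     */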
    @Override
    public void setup(Context context) {
        setOutputDirs(context);
        mergeBodyWithUpdates = context.getConfiguration().getBoolean(IMPORT_MERGE_BODY_WITH_UPDATES, false);
        DataInfoBasedApprover dataInfoBasedApprover = buildApprover(context);
        this.resultApprover = dataInfoBasedApprover;
        // initializing converters
        docMetaConverter = new DocumentMetadataConverter(this.resultApprover, dataInfoBasedApprover);
        deduplicationMappingConverter = new DeduplicationMappingConverter();
        docProjectConverter = new DocumentToProjectRelationConverter();
        personConverter = new PersonConverter();
        projectConverter = new ProjectConverter();
        organizationConverter = new OrganizationConverter();
        projectOrganizationConverter = new ProjectToOrganizationRelationConverter();
    }

    /**
     * Closes the multiple outputs, making sure they are closed even when superclass cleanup fails.
     */
    @Override
    public void cleanup(Context context) throws IOException, InterruptedException {
        try {
            super.cleanup(context);
        } finally {
            outputs.close();
        }
    }
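    /**
     * Dispatches each grouped row to the relevant handler based on the row identifier prefix:
     * result, person, project (along with project-organization relations) or organization.
     */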
    @Override
    public void reduce(Text key, Iterable<InfoSpaceRecord> values, Context context)
            throws IOException, InterruptedException {
        String id = key.toString();
        Map<String, List<QualifiedOafJsonRecord>> mappedRecords = InfoSpaceRecordUtils.mapByColumnFamily(values);
        if (id.startsWith(InfoSpaceConstants.ROW_PREFIX_RESULT)) {
            handleResult(id, mappedRecords);
        } else if (id.startsWith(InfoSpaceConstants.ROW_PREFIX_PERSON)) {
            handleEntity(id, mappedRecords.get(Type.person.name()), personConverter, outputNamePerson);
        } else if (id.startsWith(InfoSpaceConstants.ROW_PREFIX_PROJECT)) {
            handleEntity(id, mappedRecords.get(Type.project.name()), projectConverter, outputNameProject,
                    new RelationConversionDTO<ProjectToOrganization>(mappedRecords.get(projOrgColumnFamily),
                            projectOrganizationConverter, outputNameProjectOrganization));
        } else if (id.startsWith(InfoSpaceConstants.ROW_PREFIX_ORGANIZATION)) {
            handleEntity(id, mappedRecords.get(Type.organization.name()), organizationConverter,
                    outputNameOrganization);
        }
    }

    /**
     * Instantiates multiple outputs.
     */
    protected MultipleOutputs instantiateMultipleOutputs(Context context) {
        return new MultipleOutputs(context);
    }

    // ------------------------ PRIVATE --------------------------

    /**
     * Sets output names and instantiates multiple outputs.
     *
     * @param context hadoop context providing output names
     */
    private void setOutputDirs(Context context) {
        outputNameDocumentMeta = Preconditions.checkNotNull(
                context.getConfiguration().get(OUTPUT_NAME_DOCUMENT_META),
                "document metadata output name not provided!");
        outputNameDocumentProject = Preconditions.checkNotNull(
                context.getConfiguration().get(OUTPUT_NAME_DOCUMENT_PROJECT),
                "document project relation output name not provided!");
        outputNameProject = Preconditions.checkNotNull(
                context.getConfiguration().get(OUTPUT_NAME_PROJECT),
                "project output name not provided!");
        outputNamePerson = Preconditions.checkNotNull(
                context.getConfiguration().get(OUTPUT_NAME_PERSON),
                "person output name not provided!");
        outputNameDedupMapping = Preconditions.checkNotNull(
                context.getConfiguration().get(OUTPUT_NAME_DEDUP_MAPPING),
                "deduplication mapping output name not provided!");
        outputNameOrganization = Preconditions.checkNotNull(
                context.getConfiguration().get(OUTPUT_NAME_ORGANIZATION),
                "organization output name not provided!");
        outputNameProjectOrganization = Preconditions.checkNotNull(
                context.getConfiguration().get(OUTPUT_NAME_PROJECT_ORGANIZATION),
                "project to organization output name not provided!");
        outputs = instantiateMultipleOutputs(context);
    }

    /**
     * Creates data approver based on runtime parameters: inference provenance blacklist,
     * skipping records deleted by inference and trust level threshold.
     */
    private DataInfoBasedApprover buildApprover(Context context) {
        boolean skipDeletedByInference = true;
        String skipDeletedByInferenceParamValue = WorkflowRuntimeParameters.getParamValue(
                IMPORT_SKIP_DELETED_BY_INFERENCE, context.getConfiguration());
        if (skipDeletedByInferenceParamValue != null) {
            skipDeletedByInference = Boolean.valueOf(skipDeletedByInferenceParamValue);
        }
        Float trustLevelThreshold = null;
        String trustLevelThresholdParamValue = WorkflowRuntimeParameters.getParamValue(
                IMPORT_TRUST_LEVEL_THRESHOLD, context.getConfiguration());
        if (trustLevelThresholdParamValue != null) {
            trustLevelThreshold = Float.valueOf(trustLevelThresholdParamValue);
        }
        return new DataInfoBasedApprover(
                WorkflowRuntimeParameters.getParamValue(IMPORT_INFERENCE_PROVENANCE_BLACKLIST,
                        context.getConfiguration()),
                skipDeletedByInference, trustLevelThreshold);
    }
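    // The handlers below convert approved body records and their relations into avro
    // and write them to type-specific named outputs; records rejected by the result
    // approver or lacking a body are skipped.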
    /**
     * Handles result entity along with its relations.
     */
    private void handleResult(final String id, Map<String, List<QualifiedOafJsonRecord>> mappedRecords)
            throws InterruptedException, IOException {
        Oaf oafObj = buildOafObject(mappedRecords.get(Type.result.name()));
        if (oafObj == null) {
            log.error("missing 'body' qualifier value for record " + id);
            return;
        }
        if (resultApprover.approve(oafObj)) {
            DocumentMetadata docMeta = docMetaConverter.convert(oafObj.getEntity(), mappedRecords);
            if (docMeta != null) {
                outputs.write(outputNameDocumentMeta, new AvroKey<DocumentMetadata>(docMeta));
            }
            // handling project relations
            handleRelation(mappedRecords.get(resProjColumnFamily), docProjectConverter, outputNameDocumentProject);
            // handling deduplication relations, required for contents deduplication and identifiers translation
            handleRelation(mappedRecords.get(dedupMappingColumnFamily), deduplicationMappingConverter,
                    outputNameDedupMapping);
        }
    }

    /**
     * Handles relations by converting them to avro format and writing to output.
     */
    private <T extends SpecificRecord> void handleRelation(List<QualifiedOafJsonRecord> relations,
            OafRelToAvroConverter<T> converter, String outputName) throws InterruptedException, IOException {
        if (!CollectionUtils.isEmpty(relations)) {
            for (QualifiedOafJsonRecord relationRecord : relations) {
                Oaf relOaf = OafHelper.buildOaf(relationRecord.getOafJson());
                if (resultApprover.approve(relOaf)) {
                    T avroRelation = converter.convert(relOaf.getRel());
                    if (avroRelation != null) {
                        outputs.write(outputName, new AvroKey<T>(avroRelation));
                    }
                }
            }
        }
    }

    /**
     * Handles entity by converting it to avro format and writing to output.
     * Each entity may consist of multiple parts: a body with optional updates.
     * Optional relations are expected as the last parameters.
     */
    private <T extends SpecificRecord> void handleEntity(final String id, List<QualifiedOafJsonRecord> bodyParts,
            OafEntityToAvroConverter<T> converter, String outputName,
            RelationConversionDTO<?>... relationConversionDTO) throws InterruptedException, IOException {
        Oaf oafObj = buildOafObject(bodyParts);
        if (oafObj == null) {
            log.error("missing 'body' qualifier value for record " + id);
            return;
        }
        if (resultApprover.approve(oafObj)) {
            T avroEntity = converter.convert(oafObj.getEntity());
            if (avroEntity != null) {
                outputs.write(outputName, new AvroKey<T>(avroEntity));
            }
            // handling relations
            if (relationConversionDTO != null) {
                for (RelationConversionDTO<?> currentDTO : relationConversionDTO) {
                    handleRelation(currentDTO.getOafJsonParts(), currentDTO.getConverter(),
                            currentDTO.getOutputName());
                }
            }
        }
    }

    /**
     * Builds {@link Oaf} object from JSON body representation and updates.
     *
     * @param bodyRecords body records with optional updates
     * @return {@link Oaf} object built from JSON representation or null when body was undefined
     * @throws UnsupportedEncodingException
     * @throws ParseException
     */
    private Oaf buildOafObject(List<QualifiedOafJsonRecord> bodyRecords)
            throws UnsupportedEncodingException, ParseException {
        if (bodyRecords != null) {
            OafBodyWithOrderedUpdates bodyWithUpdates = new OafBodyWithOrderedUpdates(bodyRecords);
            if (bodyWithUpdates.getBody() != null) {
                Oaf.Builder oafBuilder = Oaf.newBuilder();
                JsonFormat.merge(bodyWithUpdates.getBody(), oafBuilder);
                if (this.mergeBodyWithUpdates) {
                    for (String oafUpdate : bodyWithUpdates.getOrderedUpdates()) {
                        JsonFormat.merge(oafUpdate, oafBuilder);
                    }
                }
                return oafBuilder.build();
            }
        }
        return null;
    }

    // ------------------------ INNER CLASSES --------------------------

    /**
     * Encapsulates set of parameters required to perform relation conversion.
     *
     * @param <T> avro relation record type
     */
    private static class RelationConversionDTO<T extends SpecificRecord> {

        private final List<QualifiedOafJsonRecord> oafJsonParts;

        private final OafRelToAvroConverter<T> converter;

        private final String outputName;

        // ------------------------ CONSTRUCTORS --------------------------

        public RelationConversionDTO(List<QualifiedOafJsonRecord> oafJsonParts,
                OafRelToAvroConverter<T> converter, String outputName) {
            this.oafJsonParts = oafJsonParts;
            this.converter = converter;
            this.outputName = outputName;
        }

        // ------------------------ GETTERS --------------------------

        List<QualifiedOafJsonRecord> getOafJsonParts() {
            return oafJsonParts;
        }

        public OafRelToAvroConverter<T> getConverter() {
            return converter;
        }

        String getOutputName() {
            return outputName;
        }
    }
}