package eu.dnetlib.iis.wf.affmatching.bucket.projectorg.read;

import java.io.Serializable;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;

import com.google.common.base.Preconditions;

import eu.dnetlib.iis.wf.affmatching.bucket.projectorg.model.AffMatchDocumentOrganization;
import eu.dnetlib.iis.wf.affmatching.bucket.projectorg.model.AffMatchDocumentProject;
import eu.dnetlib.iis.wf.affmatching.bucket.projectorg.model.AffMatchProjectOrganization;

/**
 * Creates {@link AffMatchDocumentOrganization} objects based on the combined
 * {@link AffMatchDocumentProject} and {@link AffMatchProjectOrganization} datastores.
 *
 * @author mhorst
 *
 */
public class DocumentOrganizationCombiner implements Serializable {

    private static final long serialVersionUID = 1L;

    /**
     * Creates {@link AffMatchDocumentOrganization} relations.
     *
     * @param docProjRDD {@link AffMatchDocumentProject} relations
     * @param projOrgRDD {@link AffMatchProjectOrganization} relations
     * @param docProjConfidenceLevelThreshold document-project relation confidence level threshold;
     *        the confidence level check is skipped when this parameter is null
     * @return distinct document-organization relations inferred via shared project ids
     */
    public JavaRDD<AffMatchDocumentOrganization> combine(JavaRDD<AffMatchDocumentProject> docProjRDD,
            JavaRDD<AffMatchProjectOrganization> projOrgRDD, Float docProjConfidenceLevelThreshold) {

        Preconditions.checkNotNull(docProjRDD);
        Preconditions.checkNotNull(projOrgRDD);

        // key document-project relations by project id, dropping relations below the
        // confidence threshold (unless the threshold is null)
        JavaPairRDD<String, AffMatchDocumentProject> projIdToDocProj = docProjRDD
                .keyBy(docProj -> docProj.getProjectId())
                .filter(docProj -> (docProjConfidenceLevelThreshold == null
                        || docProj._2.getConfidenceLevel() >= docProjConfidenceLevelThreshold));

        // key project-organization relations by project id
        JavaPairRDD<String, AffMatchProjectOrganization> projIdToProjOrg = projOrgRDD
                .keyBy(projOrg -> projOrg.getProjectId());

        // join on project id and emit distinct (documentId, organizationId) pairs
        return projIdToDocProj.join(projIdToProjOrg)
                .map(x -> new AffMatchDocumentOrganization(x._2._1.getDocumentId(), x._2._2.getOrganizationId()))
                .distinct();
    }

}
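A minimal usage sketch, assuming the two input RDDs have already been read elsewhere (for example by the reader classes in this package) and that a confidence threshold of 0.5 is acceptable for the workflow at hand; the wrapper class and method names below are hypothetical and not part of the module.

package eu.dnetlib.iis.wf.affmatching.bucket.projectorg.read;

import org.apache.spark.api.java.JavaRDD;

import eu.dnetlib.iis.wf.affmatching.bucket.projectorg.model.AffMatchDocumentOrganization;
import eu.dnetlib.iis.wf.affmatching.bucket.projectorg.model.AffMatchDocumentProject;
import eu.dnetlib.iis.wf.affmatching.bucket.projectorg.model.AffMatchProjectOrganization;

/**
 * Hypothetical usage sketch for {@link DocumentOrganizationCombiner}.
 */
public class DocumentOrganizationCombinerUsage {

    /**
     * Combines already-read relation RDDs, keeping only document-project links
     * with a confidence level of at least 0.5 (an illustrative value).
     */
    public static JavaRDD<AffMatchDocumentOrganization> combineWithThreshold(
            JavaRDD<AffMatchDocumentProject> docProjRDD,
            JavaRDD<AffMatchProjectOrganization> projOrgRDD) {

        DocumentOrganizationCombiner combiner = new DocumentOrganizationCombiner();

        // passing null instead of 0.5f would skip the confidence level check entirely
        return combiner.combine(docProjRDD, projOrgRDD, 0.5f);
    }
}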