package eu.dnetlib.iis.wf.affmatching.bucket; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import eu.dnetlib.iis.wf.affmatching.bucket.projectorg.model.AffMatchDocumentOrganization; import eu.dnetlib.iis.wf.affmatching.bucket.projectorg.read.DocumentOrganizationFetcher; import eu.dnetlib.iis.wf.affmatching.model.AffMatchAffiliation; import eu.dnetlib.iis.wf.affmatching.model.AffMatchOrganization; import scala.Tuple2; /** * Implementation of {@link AffOrgJoiner} that joins {@link AffMatchAffiliation} with {@link AffMatchOrganization} * based on document-organization relations. * * @author madryk */ public class DocOrgRelationAffOrgJoiner implements AffOrgJoiner { private static final long serialVersionUID = 1L; private DocumentOrganizationFetcher documentOrganizationFetcher; //------------------------ LOGIC -------------------------- /** * Joins the given affiliations with organizations based on document-organization relations.<br /> * Method uses {@link DocumentOrganizationFetcher} internally to fetch document-organization pairs.<br/> * * Affiliation will be joined with the organization if fetched document-organization relations will * contain pair ({@link AffMatchAffiliation#getDocumentId()}, {@link AffMatchOrganization#getId()}) */ @Override public JavaRDD<Tuple2<AffMatchAffiliation, AffMatchOrganization>> join(JavaRDD<AffMatchAffiliation> affiliations, JavaRDD<AffMatchOrganization> organizations) { JavaPairRDD<String, AffMatchAffiliation> affiliationsDocIdKey = affiliations.keyBy(aff -> aff.getDocumentId()); JavaPairRDD<String, AffMatchOrganization> organizationsOrgIdKey = organizations.keyBy(org -> org.getId()); JavaRDD<AffMatchDocumentOrganization> documentOrganizations = documentOrganizationFetcher.fetchDocumentOrganizations(); JavaPairRDD<String, AffMatchDocumentOrganization> documentOrganizationDocIdKey = documentOrganizations.keyBy(docOrg -> docOrg.getDocumentId()); JavaRDD<Tuple2<AffMatchAffiliation, AffMatchOrganization>> affOrgBucketPairs = affiliationsDocIdKey .join(documentOrganizationDocIdKey) .mapToPair(x -> new Tuple2<String, AffMatchAffiliation>(x._2._2.getOrganizationId(), x._2._1)) .join(organizationsOrgIdKey) .values(); return affOrgBucketPairs; } //------------------------ SETTERS -------------------------- public void setDocumentOrganizationFetcher(DocumentOrganizationFetcher documentOrganizationFetcher) { this.documentOrganizationFetcher = documentOrganizationFetcher; } }