package eu.dnetlib.iis.wf.affmatching.bucket.projectorg.read; import java.io.Serializable; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import eu.dnetlib.iis.wf.affmatching.bucket.projectorg.model.AffMatchDocumentProject; import scala.Tuple2; /** * Merger of two input {@link AffMatchDocumentProject} rdds. * * @author madryk */ public class DocumentProjectMerger implements Serializable { private static final long serialVersionUID = 1L; //------------------------ LOGIC -------------------------- /** * Returns merged {@link AffMatchDocumentProject} rdd from two input rdds.<br/> * Method returns only unique records (equality of records is checked based on * {@link AffMatchDocumentProject#getDocumentId()} and {@link AffMatchDocumentProject#getProjectId()} fields).<br/> * In case of duplicates, the one with higher {@link AffMatchDocumentProject#getConfidenceLevel()} * will be picked. */ public JavaRDD<AffMatchDocumentProject> merge(JavaRDD<AffMatchDocumentProject> firstDocumentProjects, JavaRDD<AffMatchDocumentProject> secondDocumentProjects) { JavaPairRDD<Tuple2<String, String>, AffMatchDocumentProject> firstDocProjWithKey = firstDocumentProjects .keyBy(x -> new Tuple2<>(x.getDocumentId(), x.getProjectId())); JavaPairRDD<Tuple2<String, String>, AffMatchDocumentProject> secondDocProjWithKey = secondDocumentProjects .keyBy(x -> new Tuple2<>(x.getDocumentId(), x.getProjectId())); return firstDocProjWithKey .union(secondDocProjWithKey) .reduceByKey((x, y) -> x.getConfidenceLevel() >= y.getConfidenceLevel() ? x : y) .values(); } }