package eu.dnetlib.iis.wf.citationmatching.direct.service; import java.io.Serializable; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.function.Function; import eu.dnetlib.iis.citationmatching.direct.schemas.DocumentMetadata; /** * Extractor of external id to internal id mapping * * @author madryk * */ public class IdentifierMappingExtractor implements Serializable { private static final long serialVersionUID = 1L; //------------------------ LOGIC -------------------------- /** * Extracts an 'external to internal id' mapping from documents * * @param documents * @param idType - type of external identifier (e.g. doi, pmid) * @param pickSingle - function used in case there will be more than one document with the same external identifier. * Function should pick one of those documents and return it as a result. * @return pair rdd where keys are external ids of {@literal idType} type * and values are documents ids */ public JavaPairRDD<String, String> extractIdMapping(JavaRDD<DocumentMetadata> documents, String idType, Function<Iterable<DocumentMetadata>, DocumentMetadata> pickSingle) { JavaPairRDD<String, String> externalIdToId = documents .filter(document -> document.getExternalIdentifiers() != null && document.getExternalIdentifiers().containsKey(idType)) .keyBy(document -> document.getExternalIdentifiers().get(idType).toString()) .groupByKey() .mapValues(pickSingle) .mapValues(document -> document.getId().toString()); return externalIdToId; } }