package eu.dnetlib.iis.wf.citationmatching.direct.service; import java.io.Serializable; import java.util.List; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import eu.dnetlib.iis.citationmatching.direct.schemas.Citation; import eu.dnetlib.iis.citationmatching.direct.schemas.DocumentMetadata; import eu.dnetlib.iis.citationmatching.direct.schemas.ReferenceMetadata; import scala.Tuple2; /** * Extractor of document references * * @author madryk * */ public class ExternalIdReferenceExtractor implements Serializable { private static final long serialVersionUID = 1L; //------------------------ LOGIC -------------------------- /** * Extracts document references based on external id with {@literal idType} type. * * @param documentsMetadata * @param idType - type of external identifier (e.g. doi, pmid) * @return pair rdd where keys are external ids of {@literal idType} type * and values are partially filled {@link Citation} objects * (with null {@link Citation#getDestinationDocumentId()}) */ public JavaPairRDD<String, Citation> extractExternalIdReferences(JavaRDD<DocumentMetadata> documentsMetadata, String idType) { Preconditions.checkNotNull(documentsMetadata); Preconditions.checkNotNull(idType); JavaPairRDD<String, Citation> externalIdReferencesRdd = documentsMetadata .flatMapToPair(metadata -> { List<Tuple2<String, Citation>> externalIdReferences = Lists.newArrayList(); for (ReferenceMetadata referenceMetadata : metadata.getReferences()) { if (referenceMetadata.getExternalIds() == null || !referenceMetadata.getExternalIds().containsKey(idType)) { continue; } String externalId = referenceMetadata.getExternalIds().get(idType).toString(); Citation partialCitation = new Citation(metadata.getId(), referenceMetadata.getPosition(), null); externalIdReferences.add(new Tuple2<String, Citation>(externalId, partialCitation)); } return externalIdReferences; }); return externalIdReferencesRdd; } }