package eu.dnetlib.iis.wf.citationmatching.direct.service;
import java.io.Serializable;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import com.google.common.base.Preconditions;
import eu.dnetlib.iis.citationmatching.direct.schemas.Citation;
import eu.dnetlib.iis.citationmatching.direct.schemas.DocumentMetadata;
/**
 * Finds citations between {@link DocumentMetadata} records by comparing
 * external identifiers that share the same identifier type.
 *
 * @author madryk
 */
public class ExternalIdCitationMatcher implements Serializable {

    private static final long serialVersionUID = 1L;

    private final IdentifierMappingExtractor idMappingExtractor = new IdentifierMappingExtractor();

    private final ExternalIdReferenceExtractor referencePicker = new ExternalIdReferenceExtractor();


    //------------------------ LOGIC --------------------------

    /**
     * Matches citations within the given {@link DocumentMetadata} rdd using
     * external identifiers of type {@code idType}.
     * <p>
     * A citation from one document (document1) to another (document2) is reported
     * when any reference of document1 carries an external id equal to the external
     * id of document2, both ids being of type {@code idType}.
     * <p>
     * For instance, with {@code idType} equal to <code>pmid</code>, a citation is
     * found for the two documents below:
     * <pre>
     * {id: "document1", references: [ {externalIds: {"pmid": "123456"}} ], ... }
     * {id: "document2", externalIdentifiers: {"pmid": "123456"}, ... }
     * </pre>
     * <p>
     * Internally the external-id-to-document-id mapping and the references keyed by
     * external id are obtained from the two extractor collaborators and then joined
     * on the external id; each joined partial citation is copied and completed with
     * the destination document id.
     *
     * @param documents - rdd of documents used both as the source of references
     *      and as the set of possible citation targets; must not be null
     * @param idType - type of external identifier (e.g. doi, pmid); must not be null
     * @param pickSingle - function used when more than one document has the same
     *      external identifier; it should pick one of those documents and return it.
     *      Must not be null
     * @return rdd of matched citations
     * @throws NullPointerException if any of the arguments is null
     */
    public JavaRDD<Citation> matchCitations(JavaRDD<DocumentMetadata> documents, String idType, Function<Iterable<DocumentMetadata>, DocumentMetadata> pickSingle) {

        Preconditions.checkNotNull(documents);
        Preconditions.checkNotNull(idType);
        Preconditions.checkNotNull(pickSingle);

        JavaPairRDD<String, String> documentIdByExternalId =
                idMappingExtractor.extractIdMapping(documents, idType, pickSingle);

        JavaPairRDD<String, Citation> referenceByExternalId =
                referencePicker.extractExternalIdReferences(documents, idType);

        // Join value is (partial citation, destination document id); the join key itself is dropped.
        return referenceByExternalId
                .join(documentIdByExternalId)
                .map(joined -> Citation.newBuilder(joined._2._1)
                        .setDestinationDocumentId(joined._2._2)
                        .build());
    }
}