package eu.dnetlib.iis.wf.citationmatching;

import java.io.Serializable;
import java.util.stream.Collectors;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import eu.dnetlib.iis.citationmatching.schemas.DocumentMetadata;
import eu.dnetlib.iis.citationmatching.schemas.ReferenceMetadata;
import eu.dnetlib.iis.wf.citationmatching.converter.entity_id.CitEntityId;
import pl.edu.icm.coansys.citations.InputCitationReader;
import pl.edu.icm.sparkutils.avro.SparkAvroLoader;
import scala.Tuple2;

/**
 * {@link InputCitationReader} implementation that produces the input citations rdd
 * from a datastore of {@link DocumentMetadata} records.
 *
 * @author madryk
 */
public class ReferenceMetadataInputReader implements InputCitationReader<String, ReferenceMetadata>, Serializable {

    private static final long serialVersionUID = 1L;

    private final SparkAvroLoader avroLoader = new SparkAvroLoader();


    //------------------------ LOGIC --------------------------

    /**
     * Loads {@link DocumentMetadata} records from the avro datastore found under
     * the given path and emits one pair per reference of every loaded document.
     * <p>
     * The key of each pair is the citation id, i.e. the string form of a
     * {@link CitEntityId} created from the id of the owning document and the
     * position of the reference (document id with {@literal cit_} prefix
     * and {@literal _position} suffix).
     * The value of each pair is the reference itself, as a {@link ReferenceMetadata} object.
     *
     * @param sparkContext context used to load the datastore
     * @param inputCitationsPath location of the avro {@link DocumentMetadata} datastore
     * @return rdd of (citation id, citation) pairs
     */
    @Override
    public JavaPairRDD<String, ReferenceMetadata> readCitations(JavaSparkContext sparkContext, String inputCitationsPath) {

        JavaRDD<DocumentMetadata> documents = avroLoader.loadJavaRDD(sparkContext, inputCitationsPath, DocumentMetadata.class);

        return documents.flatMapToPair(this::extractCitations);
    }


    //------------------------ PRIVATE --------------------------

    /**
     * Turns all references of a single document into (citation id, reference) pairs.
     */
    private Iterable<Tuple2<String, ReferenceMetadata>> extractCitations(DocumentMetadata document) {

        return document.getReferences().stream()
                // document id is read per reference on purpose, so a document
                // without references never has its id dereferenced
                .map(citation -> new Tuple2<String, ReferenceMetadata>(
                        buildCitationId(document.getId().toString(), citation),
                        citation))
                .collect(Collectors.toList());
    }

    /**
     * Builds the citation id out of the owning document id and the position of the reference.
     */
    private String buildCitationId(String documentId, ReferenceMetadata referenceMetadata) {

        CitEntityId citationId = new CitEntityId(documentId, referenceMetadata.getPosition());

        return citationId.toString();
    }

}