package eu.dnetlib.iis.wf.citationmatching;

import java.io.Serializable;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import eu.dnetlib.iis.citationmatching.schemas.DocumentMetadata;
import eu.dnetlib.iis.wf.citationmatching.converter.entity_id.DocEntityId;
import pl.edu.icm.coansys.citations.InputDocumentReader;
import pl.edu.icm.sparkutils.avro.SparkAvroLoader;
import scala.Tuple2;

/**
 * Reader of the input documents RDD.
 * 
 * @author madryk
 */
public class DocumentMetadataInputReader implements InputDocumentReader<String, DocumentMetadata>, Serializable {

    private static final long serialVersionUID = 1L;

    private final SparkAvroLoader avroLoader = new SparkAvroLoader();


    //------------------------ LOGIC --------------------------

    /**
     * Reads the input documents RDD from an Avro {@link DocumentMetadata} datastore.
     * Keys of the returned RDD contain the document id with the {@literal doc_} prefix added.
     * Values of the returned RDD contain the document in the form of a {@link DocumentMetadata} object.
     */
    @Override
    public JavaPairRDD<String, DocumentMetadata> readDocuments(JavaSparkContext sparkContext, String inputDocumentsPath) {

        JavaRDD<DocumentMetadata> documents = avroLoader.loadJavaRDD(sparkContext, inputDocumentsPath, DocumentMetadata.class);

        return documents.mapToPair(doc -> new Tuple2<>(buildDocumentId(doc), doc));
    }


    //------------------------ PRIVATE --------------------------

    /**
     * Builds the RDD key for a document: its id wrapped in a {@link DocEntityId},
     * which renders as the id with the {@literal doc_} prefix.
     */
    private String buildDocumentId(DocumentMetadata documentMetadata) {
        return new DocEntityId(documentMetadata.getId().toString()).toString();
    }
}
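
// ------------------------ USAGE (illustrative sketch) --------------------------
//
// A minimal sketch of how this reader could be driven from a Spark job; the
// application name, master setting and "some/input/path" below are assumptions
// for illustration, not values taken from this project:
//
//   SparkConf conf = new SparkConf().setAppName("citation-matching").setMaster("local[*]");
//   try (JavaSparkContext sparkContext = new JavaSparkContext(conf)) {
//       DocumentMetadataInputReader reader = new DocumentMetadataInputReader();
//       JavaPairRDD<String, DocumentMetadata> documents =
//               reader.readDocuments(sparkContext, "some/input/path");
//       // each key is the document id prefixed with "doc_", e.g. "doc_12345"
//   }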