package eu.dnetlib.iis.wf.affmatching.bucket.projectorg.read; import org.apache.commons.lang3.StringUtils; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import com.google.common.base.Preconditions; import eu.dnetlib.iis.importer.schemas.DocumentToProject; import eu.dnetlib.iis.wf.affmatching.bucket.projectorg.model.AffMatchDocumentProject; import pl.edu.icm.sparkutils.avro.SparkAvroLoader; /** * Implementation of {@link DocumentProjectReader} that reads * {@link DocumentToProject} objects written in avro files. * * @author madryk * */ public class IisDocumentProjectReader implements DocumentProjectReader { private static final long serialVersionUID = 1L; private SparkAvroLoader avroLoader = new SparkAvroLoader(); private DocumentProjectConverter converter = new DocumentProjectConverter(); //------------------------ LOGIC -------------------------- /** * Reads {@link AffMatchDocumentProject}s rdd written as avro files under <code>inputPath</code> * with {@link DocumentToProject} schema. */ @Override public JavaRDD<AffMatchDocumentProject> readDocumentProjects(JavaSparkContext sc, String inputPath) { Preconditions.checkNotNull(sc); Preconditions.checkArgument(StringUtils.isNotBlank(inputPath)); return avroLoader.loadJavaRDD(sc, inputPath, DocumentToProject.class) .map(docProj -> converter.convert(docProj)); } }