package eu.dnetlib.iis.wf.citationmatching.output; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; import com.beust.jcommander.Parameters; import eu.dnetlib.iis.citationmatching.schemas.Citation; import pl.edu.icm.sparkutils.avro.SparkAvroLoader; import pl.edu.icm.sparkutils.avro.SparkAvroSaver; /** * Output transformer job for citation matching. * It converts avro datastore from {@link eu.dnetlib.iis.citationmatching.schemas.Citation} * to {@link eu.dnetlib.iis.common.citations.schemas.Citation} format * * @author madryk * */ public class CitationMatchingOutputTransformerJob { private static SparkAvroLoader avroLoader = new SparkAvroLoader(); private static SparkAvroSaver avroSaver = new SparkAvroSaver(); private static CitationToCommonCitationConverter citationToCommonCitationConverter = new CitationToCommonCitationConverter(); //------------------------ LOGIC -------------------------- public static void main(String[] args) { CitationMatchingOutputTransformerJobParameters params = new CitationMatchingOutputTransformerJobParameters(); JCommander jcommander = new JCommander(params); jcommander.parse(args); SparkConf conf = new SparkConf(); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); try (JavaSparkContext sc = new JavaSparkContext(conf)) { JavaRDD<Citation> inputCitations = avroLoader.loadJavaRDD(sc, params.input, Citation.class); JavaRDD<eu.dnetlib.iis.common.citations.schemas.Citation> outputCitations = inputCitations.map(inputCitation -> citationToCommonCitationConverter.convert(inputCitation)); avroSaver.saveJavaRDD(outputCitations, eu.dnetlib.iis.common.citations.schemas.Citation.SCHEMA$, params.output); } } //------------------------ PRIVATE -------------------------- @Parameters(separators = "=") private static class CitationMatchingOutputTransformerJobParameters { @Parameter(names = "-input", required = true) private String input; @Parameter(names = "-output", required = true) private String output; } }