package eu.dnetlib.iis.wf.citationmatching.input; import java.util.Map; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; import com.beust.jcommander.Parameters; import eu.dnetlib.iis.citationmatching.schemas.DocumentMetadata; import eu.dnetlib.iis.importer.schemas.Person; import eu.dnetlib.iis.transformers.metadatamerger.schemas.ExtractedDocumentMetadataMergedWithOriginal; import pl.edu.icm.sparkutils.avro.SparkAvroLoader; import pl.edu.icm.sparkutils.avro.SparkAvroSaver; import scala.Tuple2; /** * * @author madryk * */ public class CitationMatchingInputTransformerJob { private static SparkAvroLoader avroLoader = new SparkAvroLoader(); private static SparkAvroSaver avroSaver = new SparkAvroSaver(); private static DocumentToCitationDocumentConverter documentToCitationDocumentConverter = new DocumentToCitationDocumentConverter(); private static AuthorNameMappingExtractor authorNameMappingExtractor = new AuthorNameMappingExtractor(); private static AuthorNameAttacher authorNameAttacher = new AuthorNameAttacher(); //------------------------ LOGIC -------------------------- public static void main(String[] args) throws InterruptedException { CitationMatchingInputTransformerJobParameters params = new CitationMatchingInputTransformerJobParameters(); JCommander jcommander = new JCommander(params); jcommander.parse(args); SparkConf conf = new SparkConf(); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.set("spark.kryo.registrator", "pl.edu.icm.sparkutils.avro.AvroCompatibleKryoRegistrator"); try (JavaSparkContext sc = new JavaSparkContext(conf)) { JavaRDD<ExtractedDocumentMetadataMergedWithOriginal> inputDocuments = avroLoader.loadJavaRDD(sc, params.inputMetadata, ExtractedDocumentMetadataMergedWithOriginal.class); JavaRDD<Person> inputPersons = avroLoader.loadJavaRDD(sc, params.inputPerson, Person.class); JavaPairRDD<String, DocumentMetadata> documents = inputDocuments.mapToPair( document -> new Tuple2<>(document.getId().toString(), documentToCitationDocumentConverter.convert(document))); JavaPairRDD<String, Map<String, String>> documentAuthors = authorNameMappingExtractor.extractAuthorNameMapping(documents, inputPersons); JavaPairRDD<String, DocumentMetadata> documentsWithAuthorNames = authorNameAttacher.attachAuthorNames(documents, documentAuthors); avroSaver.saveJavaRDD(documentsWithAuthorNames.values(), DocumentMetadata.SCHEMA$, params.output); } } //------------------------ PRIVATE -------------------------- @Parameters(separators = "=") private static class CitationMatchingInputTransformerJobParameters { @Parameter(names = "-inputMetadata", required = true) private String inputMetadata; @Parameter(names = "-inputPerson", required = true) private String inputPerson; @Parameter(names = "-output", required = true) private String output; } }