package eu.dnetlib.iis.wf.citationmatching.direct; import java.io.IOException; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import pl.edu.icm.sparkutils.avro.SparkAvroLoader; import pl.edu.icm.sparkutils.avro.SparkAvroSaver; import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; import com.beust.jcommander.Parameters; import eu.dnetlib.iis.citationmatching.direct.schemas.Citation; import eu.dnetlib.iis.citationmatching.direct.schemas.DocumentMetadata; import eu.dnetlib.iis.common.java.io.HdfsUtils; import eu.dnetlib.iis.transformers.metadatamerger.schemas.ExtractedDocumentMetadataMergedWithOriginal; import eu.dnetlib.iis.wf.citationmatching.direct.converter.DirectCitationToCitationConverter; import eu.dnetlib.iis.wf.citationmatching.direct.converter.DocumentToDirectCitationMetadataConverter; import eu.dnetlib.iis.wf.citationmatching.direct.model.IdWithPosition; import eu.dnetlib.iis.wf.citationmatching.direct.service.CitationMatchingDirectCounterReporter; import eu.dnetlib.iis.wf.citationmatching.direct.service.ExternalIdCitationMatcher; import eu.dnetlib.iis.wf.citationmatching.direct.service.PickFirstDocumentFunction; import eu.dnetlib.iis.wf.citationmatching.direct.service.PickResearchArticleDocumentFunction; public class CitationMatchingDirectJob { private static SparkAvroLoader avroLoader = new SparkAvroLoader(); private static SparkAvroSaver avroSaver = new SparkAvroSaver(); private static DocumentToDirectCitationMetadataConverter documentToDirectCitationMetadataConverter = new DocumentToDirectCitationMetadataConverter(); private static ExternalIdCitationMatcher externalIdCitationMatcher = new ExternalIdCitationMatcher(); private static DirectCitationToCitationConverter directCitationToCitationConverter = new DirectCitationToCitationConverter(); private static CitationMatchingDirectCounterReporter citationMatchingDirectReporter = new CitationMatchingDirectCounterReporter(); //------------------------ LOGIC -------------------------- public static void main(String[] args) throws IOException { CitationMatchingDirectJobParameters params = new CitationMatchingDirectJobParameters(); JCommander jcommander = new JCommander(params); jcommander.parse(args); SparkConf conf = new SparkConf(); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.set("spark.kryo.registrator", "pl.edu.icm.sparkutils.avro.AvroCompatibleKryoRegistrator"); try (JavaSparkContext sc = new JavaSparkContext(conf)) { HdfsUtils.remove(sc.hadoopConfiguration(), params.outputAvroPath); HdfsUtils.remove(sc.hadoopConfiguration(), params.outputReportPath); JavaRDD<ExtractedDocumentMetadataMergedWithOriginal> documents = avroLoader.loadJavaRDD(sc, params.inputAvroPath, ExtractedDocumentMetadataMergedWithOriginal.class); JavaRDD<DocumentMetadata> simplifiedDocuments = documents.map(document -> documentToDirectCitationMetadataConverter.convert(document)); simplifiedDocuments = simplifiedDocuments.cache(); JavaRDD<Citation> directDoiCitations = externalIdCitationMatcher.matchCitations(simplifiedDocuments, "doi", new PickFirstDocumentFunction()); JavaRDD<Citation> directPmidCitations = externalIdCitationMatcher.matchCitations(simplifiedDocuments, "pmid", new PickResearchArticleDocumentFunction()); JavaRDD<Citation> directCitations = mergeCitations(directDoiCitations, directPmidCitations); JavaRDD<eu.dnetlib.iis.common.citations.schemas.Citation> citations = directCitations.map(directCitation -> directCitationToCitationConverter.convert(directCitation)); citations.cache(); citationMatchingDirectReporter.report(sc, citations, params.outputReportPath); avroSaver.saveJavaRDD(citations, eu.dnetlib.iis.common.citations.schemas.Citation.SCHEMA$, params.outputAvroPath); } } //------------------------ PRIVATE -------------------------- private static JavaRDD<Citation> mergeCitations(JavaRDD<Citation> directDoiCitations, JavaRDD<Citation> directPmidCitations) { JavaPairRDD<IdWithPosition, Citation> directDoiCitationsWithKey = attachIdWithPositionKey(directDoiCitations); JavaPairRDD<IdWithPosition, Citation> directPmidCitationsWithKey = attachIdWithPositionKey(directPmidCitations); JavaRDD<Citation> directCitations = directDoiCitationsWithKey.fullOuterJoin(directPmidCitationsWithKey) .map(x -> x._2._1.isPresent() ? x._2._1.get() : x._2._2.get() ); return directCitations; } private static JavaPairRDD<IdWithPosition, Citation> attachIdWithPositionKey(JavaRDD<Citation> directCitations) { JavaPairRDD<IdWithPosition, Citation> directCitationsWithKey = directCitations .keyBy(directCitation -> new IdWithPosition(directCitation.getSourceDocumentId().toString(), directCitation.getPosition())); return directCitationsWithKey; } @Parameters(separators = "=") private static class CitationMatchingDirectJobParameters { @Parameter(names = "-inputAvroPath", required = true) private String inputAvroPath; @Parameter(names = "-outputAvroPath", required = true) private String outputAvroPath; @Parameter(names = "-outputReportPath", required = true) private String outputReportPath; } }