package eu.dnetlib.iis.wf.citationmatching.direct.service; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import com.google.common.collect.Lists; import eu.dnetlib.iis.common.citations.schemas.Citation; import eu.dnetlib.iis.common.report.ReportEntryFactory; import eu.dnetlib.iis.common.schemas.ReportEntry; import pl.edu.icm.sparkutils.avro.SparkAvroSaver; /** * Reporter of citation matching direct job counters.<br/> * It calculates citation matching direct counters and saves them * as {@link ReportEntry} datastore. * * @author madryk */ public class CitationMatchingDirectCounterReporter { private static final String MATCHED_CITATIONS_COUNTER = "processing.citationMatching.direct.citDocReference"; private static final String DOCS_WITH_MATCHED_CITATIONS_COUNTER = "processing.citationMatching.direct.doc"; private SparkAvroSaver avroSaver = new SparkAvroSaver(); //------------------------ LOGIC -------------------------- /** * Calculates citation matching counters using citations rdd * and saves them under outputReportPath. */ public void report(JavaSparkContext sparkContext, JavaRDD<Citation> citations, String outputReportPath) { ReportEntry matchedCitationsCounter = generateMatchedCitationsCounter(citations); ReportEntry docsWithMatchedCitationsCounter = generateDocsWithCitationsCounter(citations); JavaRDD<ReportEntry> report = sparkContext.parallelize(Lists.newArrayList(matchedCitationsCounter, docsWithMatchedCitationsCounter)); avroSaver.saveJavaRDD(report, ReportEntry.SCHEMA$, outputReportPath); } //------------------------ PRIVATE -------------------------- private ReportEntry generateMatchedCitationsCounter(JavaRDD<Citation> matchedCitations) { long citationsCount = matchedCitations.count(); return ReportEntryFactory.createCounterReportEntry(MATCHED_CITATIONS_COUNTER, citationsCount); } private ReportEntry generateDocsWithCitationsCounter(JavaRDD<Citation> matchedCitations) { long docsWithCitationCount = matchedCitations .map(x -> x.getSourceDocumentId().toString()) .distinct().count(); return ReportEntryFactory.createCounterReportEntry(DOCS_WITH_MATCHED_CITATIONS_COUNTER, docsWithCitationCount); } }