package eu.dnetlib.iis.wf.citationmatching; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import eu.dnetlib.iis.citationmatching.schemas.Citation; import eu.dnetlib.iis.common.report.ReportEntryFactory; import eu.dnetlib.iis.common.schemas.ReportEntry; import pl.edu.icm.sparkutils.avro.SparkAvroSaver; /** * Reporter of citation matching job counters.<br/> * It calculates citation matching counters and saves them * as {@link ReportEntry} datastore. * * @author madryk */ public class CitationMatchingCounterReporter { private static final String MATCHED_CITATIONS_COUNTER = "processing.citationMatching.fuzzy.citDocReference"; private static final String DOCS_WITH_MATCHED_CITATIONS_COUNTER = "processing.citationMatching.fuzzy.doc"; private SparkAvroSaver avroSaver = new SparkAvroSaver(); private String reportPath; private JavaSparkContext sparkContext; //------------------------ LOGIC -------------------------- /** * Calculates citation matching counters using matchedCitations rdd * and saves them under {@link #setReportPath(String)} */ public void report(JavaRDD<Citation> matchedCitations) { checkState(); ReportEntry matchedCitationsCounter = generateMatchedCitationsCounter(matchedCitations); ReportEntry docsWithMatchedCitationsCounter = generateDocsWithCitationsCounter(matchedCitations); JavaRDD<ReportEntry> report = sparkContext.parallelize(Lists.newArrayList(matchedCitationsCounter, docsWithMatchedCitationsCounter)); avroSaver.saveJavaRDD(report, ReportEntry.SCHEMA$, reportPath); } //------------------------ PRIVATE -------------------------- private void checkState() { Preconditions.checkNotNull(reportPath, "reportPath has not been set"); Preconditions.checkNotNull(sparkContext, "sparkContext has not been set"); } private ReportEntry generateMatchedCitationsCounter(JavaRDD<Citation> matchedCitations) { long citationsCount = matchedCitations.count(); return ReportEntryFactory.createCounterReportEntry(MATCHED_CITATIONS_COUNTER, citationsCount); } private ReportEntry generateDocsWithCitationsCounter(JavaRDD<Citation> matchedCitations) { long docsWithCitationCount = matchedCitations .map(x -> x.getSourceDocumentId().toString()) .distinct().count(); return ReportEntryFactory.createCounterReportEntry(DOCS_WITH_MATCHED_CITATIONS_COUNTER, docsWithCitationCount); } //------------------------ SETTERS -------------------------- public void setReportPath(String reportPath) { this.reportPath = reportPath; } public void setSparkContext(JavaSparkContext sparkContext) { this.sparkContext = sparkContext; } }