package eu.dnetlib.iis.wf.referenceextraction.project;
import java.io.IOException;
import java.util.Locale;

import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.Parameters;

import eu.dnetlib.iis.common.java.io.HdfsUtils;
import eu.dnetlib.iis.common.schemas.ReportEntry;
import eu.dnetlib.iis.common.schemas.ReportEntryType;
import eu.dnetlib.iis.importer.schemas.Project;
import eu.dnetlib.iis.referenceextraction.project.schemas.DocumentToProject;
import pl.edu.icm.sparkutils.avro.SparkAvroLoader;
import pl.edu.icm.sparkutils.avro.SparkAvroSaver;
import scala.Tuple2;
/**
 * Generates {@link DocumentToProject} relation counters grouped by funders.
 *
 * <p>The job loads document-to-project relations and projects from Avro datastores, joins them
 * by project identifier, counts the relations per funder and writes one {@link ReportEntry}
 * counter per funder to the output report location.
 *
 * @author mhorst
 */
public class ProjectFunderReportJob {

    /** Separator between the funder name and the rest of the funding class. */
    private static final String FUNDER_FUNDING_SEPARATOR = "::";

    /** Placeholder in the report key template which gets replaced with the funder name. */
    private static final String FUNDER_TOKEN = "#{funder}";

    /** Funder name used when no funder can be extracted from the funding class. */
    private static final String FUNDER_UNKNOWN = "unknown";

    private static SparkAvroLoader avroLoader = new SparkAvroLoader();

    private static SparkAvroSaver avroSaver = new SparkAvroSaver();

    //------------------------ LOGIC --------------------------

    /**
     * Job entry point.
     *
     * @param args command line arguments in {@code -name=value} form, see
     *        {@link ProjectFunderReportJobParameters} for the required set
     * @throws IOException declared for the I/O operations performed by the job
     */
    public static void main(String[] args) throws IOException {
        ProjectFunderReportJobParameters params = new ProjectFunderReportJobParameters();
        JCommander jcommander = new JCommander(params);
        jcommander.parse(args);

        SparkConf conf = new SparkConf();
        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        conf.set("spark.kryo.registrator", "pl.edu.icm.sparkutils.avro.AvroCompatibleKryoRegistrator");

        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // the output location is removed up front, before anything is written to it
            HdfsUtils.remove(sc.hadoopConfiguration(), params.outputReportPath);

            JavaRDD<DocumentToProject> documentToProject = avroLoader.loadJavaRDD(
                    sc, params.inputDocumentToProjectAvroPath, DocumentToProject.class);
            JavaRDD<Project> project = avroLoader.loadJavaRDD(
                    sc, params.inputProjectAvroPath, Project.class);

            // one (projectId, 1) pair per document-to-project relation
            JavaPairRDD<CharSequence, Integer> projIdToOne = documentToProject.mapToPair(
                    x -> new Tuple2<CharSequence, Integer>(x.getProjectId(), 1));
            JavaPairRDD<CharSequence, String> projIdToFunder = project.mapToPair(
                    x -> new Tuple2<CharSequence, String>(x.getId(), extractFunderName(x.getFundingClass())));

            // inner join: relations whose project id has no matching project are not counted
            JavaPairRDD<CharSequence, Tuple2<Integer, String>> joinedByProjectId = projIdToOne.join(projIdToFunder);

            // re-key by funder name and sum up the ones
            JavaPairRDD<CharSequence, Integer> funderWithOne = joinedByProjectId.mapToPair(
                    x -> new Tuple2<CharSequence, Integer>(x._2._2, x._2._1));
            JavaPairRDD<CharSequence, Integer> reducedFunderWithCount = funderWithOne.reduceByKey(Integer::sum);

            // entries are written in ascending funder name order
            avroSaver.saveJavaRDD(
                    convertToReportEntries(reducedFunderWithCount.sortByKey(true), params.reportKeyTemplate),
                    ReportEntry.SCHEMA$, params.outputReportPath);
        }
    }

    //------------------------ PRIVATE --------------------------

    /**
     * Extracts funder name out of the funding class.
     *
     * <p>The funder name is the part of the funding class preceding the first
     * {@value #FUNDER_FUNDING_SEPARATOR} separator. It is returned as is, without trimming.
     *
     * @param fundingClass funding class to extract the funder from, may be null
     * @return funder name, never null; "unknown" when the funding class is null, holds no
     *         separator, starts with the separator or when the part preceding the separator is blank
     */
    static String extractFunderName(CharSequence fundingClass) {
        if (fundingClass == null) {
            return FUNDER_UNKNOWN;
        }
        String fundingClassStr = fundingClass.toString();
        int separatorIdx = fundingClassStr.indexOf(FUNDER_FUNDING_SEPARATOR);
        // -1: no separator at all, 0: nothing precedes the separator
        if (separatorIdx > 0) {
            String funder = fundingClassStr.substring(0, separatorIdx);
            if (StringUtils.isNotBlank(funder)) {
                return funder;
            }
        }
        return FUNDER_UNKNOWN;
    }

    /**
     * Converts all funder names into report entry keys using template and replacing FUNDER_TOKEN
     * with real funder name.
     *
     * <p>Funder names are lower-cased with {@link Locale#ROOT} so the generated keys do not depend
     * on the default locale of the JVM running the job. Lower-casing happens here, on already
     * counted pairs, so funder names differing only by letter case produce separate entries
     * sharing the same key.
     *
     * @param source funder name to relations count pairs
     * @param reportKeyTemplate report key template holding the {@code #{funder}} token
     * @return {@link ReportEntryType#COUNTER} report entries, one for each source pair
     */
    static JavaRDD<ReportEntry> convertToReportEntries(JavaPairRDD<CharSequence, Integer> source,
            String reportKeyTemplate) {
        return source.map(x -> new ReportEntry(
                StringUtils.replace(reportKeyTemplate, FUNDER_TOKEN, x._1.toString().toLowerCase(Locale.ROOT)),
                ReportEntryType.COUNTER, String.valueOf(x._2)));
    }

    /**
     * Command line parameters of the job, all of them are required.
     */
    @Parameters(separators = "=")
    private static class ProjectFunderReportJobParameters {

        @Parameter(names = "-inputProjectAvroPath", required = true)
        private String inputProjectAvroPath;

        @Parameter(names = "-inputDocumentToProjectAvroPath", required = true)
        private String inputDocumentToProjectAvroPath;

        @Parameter(names = "-outputReportPath", required = true)
        private String outputReportPath;

        @Parameter(names = "-reportKeyTemplate", required = true)
        private String reportKeyTemplate;
    }
}