package eu.dnetlib.iis.wf.citationmatching.input;

import java.util.List;
import java.util.Map;

import org.apache.spark.api.java.JavaPairRDD;

import com.google.common.collect.Lists;

import eu.dnetlib.iis.citationmatching.schemas.BasicMetadata;
import eu.dnetlib.iis.citationmatching.schemas.DocumentMetadata;

/**
 * Attacher of author names into documents rdd
 *
 * @author madryk
 */
public class AuthorNameAttacher {


    //------------------------ LOGIC --------------------------

    /**
     * Attaches author names to documents rdd.
     * <p>
     * Documents are left-outer-joined with the author mapping, so every input document
     * is present in the result. Author ids that cannot be resolved to a name (either because
     * the document has no entry in <code>documentAuthors</code> or because the entry does not
     * contain the given id) are omitted from the resulting authors list instead of
     * failing the job or producing <code>null</code> elements.
     *
     * @param documents - pair rdd with documents (keys should contain documentIds; values should
     *      contain {@link DocumentMetadata} with authorIds in {@link BasicMetadata#getAuthors()})
     * @param documentAuthors - authorId to authorName mapping rdd grouped by documents
     * @return documents rdd with authorNames instead of authorIds in {@link BasicMetadata#getAuthors()}
     */
    public JavaPairRDD<String, DocumentMetadata> attachAuthorNames(JavaPairRDD<String, DocumentMetadata> documents, JavaPairRDD<String, Map<String, String>> documentAuthors) {

        // The lambda only calls a static helper, so it does not capture `this`
        // (this class is not declared Serializable).
        return documents
                .leftOuterJoin(documentAuthors)
                .mapValues(joined -> replaceAuthorIdsWithNames(joined._1, joined._2.orNull()));
    }


    //------------------------ PRIVATE --------------------------

    /**
     * Builds a new {@link DocumentMetadata} from the given one, with author ids
     * replaced by the names found in the given mapping.
     *
     * @param document - document metadata with authorIds in {@link BasicMetadata#getAuthors()}
     * @param authorNamesById - authorId to authorName mapping of the document; may be
     *      <code>null</code> when the join found no mapping for the document
     * @return new document metadata whose authors list contains only the resolved author names
     */
    private static DocumentMetadata replaceAuthorIdsWithNames(DocumentMetadata document, Map<String, String> authorNamesById) {

        List<CharSequence> authorNames = Lists.newArrayList();

        for (CharSequence authorId : document.getBasicMetadata().getAuthors()) {
            // Map keys are Strings, while ids are CharSequences, hence the toString() lookup.
            String authorName = (authorNamesById == null) ? null : authorNamesById.get(authorId.toString());

            // Skip ids without a known name rather than storing null in the authors list.
            if (authorName != null) {
                authorNames.add(authorName);
            }
        }

        // NOTE(review): the builder is expected to copy the nested basic metadata, so the
        // input record should stay untouched - confirm against the generated schema class.
        DocumentMetadata documentWithNames = DocumentMetadata.newBuilder(document).build();
        documentWithNames.getBasicMetadata().setAuthors(authorNames);

        return documentWithNames;
    }

}