package eu.dnetlib.iis.wf.citationmatching.input;
import java.util.List;
import java.util.Map;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import eu.dnetlib.iis.citationmatching.schemas.BasicMetadata;
import eu.dnetlib.iis.citationmatching.schemas.DocumentMetadata;
import eu.dnetlib.iis.importer.schemas.Person;
import scala.Tuple2;
/**
 * Builds, for every document, a lookup from authorId to the author's full name.
 *
 * @author madryk
 */
public class AuthorNameMappingExtractor {

    private static final String AUTHOR_NOT_FOUND_NAME_FALLBACK = "_UNDEFINED_";


    //------------------------ LOGIC --------------------------

    /**
     * Produces an rdd of per-document authorId to authorName mappings.
     * An author that has no counterpart in the persons rdd (or whose matching person
     * has no full name) is mapped to {@literal _UNDEFINED_}.
     *
     * @param documents - pair rdd with documents (keys should contain documentIds; values should
     *      contain {@link DocumentMetadata} with authorIds in {@link BasicMetadata#getAuthors()})
     * @param persons - rdd with persons
     * @return rdd where key is documentId and value is a map with authorId to authorName mapping
     *      for that document. A document that lists no authors is absent from the returned rdd
     */
    public JavaPairRDD<String, Map<String, String>> extractAuthorNameMapping(JavaPairRDD<String, DocumentMetadata> documents, JavaRDD<Person> persons) {

        return matchPersonNamesWithDocumentAuthors(
                extractAuthorIdToDocumentIdMapping(documents),
                extractPersonIdToNameMapping(persons));
    }


    //------------------------ PRIVATE --------------------------

    /**
     * Emits one (authorId, documentId) pair for every author listed in every document.
     * The documentId is read from {@link DocumentMetadata#getId()} of the value, not from the rdd key.
     */
    private JavaPairRDD<String, String> extractAuthorIdToDocumentIdMapping(JavaPairRDD<String, DocumentMetadata> documents) {

        return documents.flatMapToPair(documentEntry -> {
            DocumentMetadata document = documentEntry._2();
            List<Tuple2<String, String>> authorDocumentPairs = Lists.newArrayList();

            for (CharSequence authorId : document.getBasicMetadata().getAuthors()) {
                String authorKey = authorId.toString();
                String documentId = document.getId().toString();
                authorDocumentPairs.add(new Tuple2<>(authorKey, documentId));
            }

            return authorDocumentPairs;
        });
    }

    /**
     * Converts persons into (personId, fullName) pairs, dropping persons without a full name.
     */
    private JavaPairRDD<String, String> extractPersonIdToNameMapping(JavaRDD<Person> persons) {

        JavaRDD<Person> namedPersons = persons.filter(person -> person.getFullname() != null);

        JavaPairRDD<String, Person> personsById = namedPersons.keyBy(person -> person.getId().toString());

        return personsById.mapValues(person -> person.getFullname().toString());
    }

    /**
     * Joins (authorId, documentId) pairs with (personId, fullName) pairs on the id and
     * collects the outcome into one authorId to authorName map per document.
     */
    private JavaPairRDD<String, Map<String, String>> matchPersonNamesWithDocumentAuthors(JavaPairRDD<String, String> authorIdToDocumentIdMapping, JavaPairRDD<String, String> personIdToNameMapping) {

        // left outer join keeps every author of every document; the fallback name fills in the unmatched ones
        JavaPairRDD<String, Tuple2<String, String>> authorNamesByDocumentId = authorIdToDocumentIdMapping
                .leftOuterJoin(personIdToNameMapping)
                .mapToPair(joined -> {
                    Tuple2<String, String> authorIdWithName =
                            new Tuple2<>(joined._1(), joined._2()._2().or(AUTHOR_NOT_FOUND_NAME_FALLBACK));
                    return new Tuple2<>(joined._2()._1(), authorIdWithName);
                });

        return authorNamesByDocumentId
                .groupByKey()
                .mapValues(authorEntries -> {
                    Map<String, String> namesByAuthorId = Maps.newHashMap();
                    // a repeated authorId within one document keeps the entry iterated last
                    for (Tuple2<String, String> authorEntry : authorEntries) {
                        namesByAuthorId.put(authorEntry._1(), authorEntry._2());
                    }
                    return namesByAuthorId;
                });
    }

}