package eu.dnetlib.iis.wf.importer.infospace.converter; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.TreeMap; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.apache.log4j.Logger; import com.google.common.base.Preconditions; import eu.dnetlib.data.mapreduce.util.OafRelDecoder; import eu.dnetlib.data.proto.FieldTypeProtos.KeyValue; import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier; import eu.dnetlib.data.proto.FieldTypeProtos.StringField; import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty; import eu.dnetlib.data.proto.OafProtos.Oaf; import eu.dnetlib.data.proto.OafProtos.OafEntity; import eu.dnetlib.data.proto.OafProtos.OafRel; import eu.dnetlib.data.proto.PersonResultProtos.PersonResult.Authorship; import eu.dnetlib.data.proto.RelTypeProtos.RelType; import eu.dnetlib.data.proto.RelTypeProtos.SubRelType; import eu.dnetlib.data.proto.ResultProtos; import eu.dnetlib.data.proto.ResultProtos.Result.Instance; import eu.dnetlib.data.proto.ResultProtos.Result.Journal; import eu.dnetlib.iis.common.InfoSpaceConstants; import eu.dnetlib.iis.importer.schemas.DocumentMetadata; import eu.dnetlib.iis.importer.schemas.PublicationType; import eu.dnetlib.iis.wf.importer.OafHelper; import eu.dnetlib.iis.wf.importer.infospace.QualifiedOafJsonRecord; import eu.dnetlib.iis.wf.importer.infospace.approver.FieldApprover; import eu.dnetlib.iis.wf.importer.infospace.approver.ResultApprover; /** * {@link OafEntity} containing document details to {@link DocumentMetadata} converter. * * @author mhorst * */ public class DocumentMetadataConverter implements OafEntityWithRelsToAvroConverter<DocumentMetadata> { protected static final Logger log = Logger.getLogger(DocumentMetadataConverter.class); private static final String NULL_STRING_VALUE = "null"; private static final String LANG_CLASSID_UNDEFINED = "und"; /** * Person-result relation column family. */ private final String personResultColumnFamily; /** * Result approver to be used when validating person result relations. */ private ResultApprover resultApprover; /** * Field approver to be used when validating inferred fields. */ private FieldApprover fieldApprover; // ------------------------ CONSTRUCTORS -------------------------- /** * * @param resultApprover approves {@link OafRel} objects * @param fieldApprover approves fields */ public DocumentMetadataConverter(ResultApprover resultApprover, FieldApprover fieldApprover) { this.resultApprover = Preconditions.checkNotNull(resultApprover); this.fieldApprover = Preconditions.checkNotNull(fieldApprover); this.personResultColumnFamily = OafRelDecoder.getCFQ(RelType.personResult, SubRelType.authorship, Authorship.RelName.hasAuthor.toString()); } // ------------------------ LOGIC -------------------------- @Override public DocumentMetadata convert(OafEntity oafEntity, Map<String, List<QualifiedOafJsonRecord>> relations) throws IOException { Preconditions.checkNotNull(oafEntity); if (!oafEntity.hasResult()) { log.error("skipping: no result object for id " + oafEntity.getId()); return null; } DocumentMetadata.Builder builder = DocumentMetadata.newBuilder(); builder.setId(oafEntity.getId()); ResultProtos.Result sourceResult = oafEntity.getResult(); createBasicMetadata(sourceResult, builder); handleAdditionalIds(oafEntity, builder); handleDatasourceIds(oafEntity, builder); List<QualifiedOafJsonRecord> personRelations = relations.get(personResultColumnFamily); if (CollectionUtils.isNotEmpty(personRelations)) { handlePersons(personRelations, builder); } return builder.build(); } // ------------------------ PRIVATE -------------------------- /** * Creates basic metadata object. * * @param sourceResult * @param metaBuilder * @return basic metadata object */ private DocumentMetadata.Builder createBasicMetadata(ResultProtos.Result sourceResult, DocumentMetadata.Builder metaBuilder) { handlePublicationType(sourceResult.getInstanceList(), metaBuilder); if (sourceResult.hasMetadata()) { handleTitle(sourceResult.getMetadata().getTitleList(), metaBuilder); handleDescription(sourceResult.getMetadata().getDescriptionList(), metaBuilder); handleLanguage(sourceResult.getMetadata().getLanguage(), metaBuilder); handlePublisher(sourceResult.getMetadata().getPublisher(), metaBuilder); handleJournal(sourceResult.getMetadata().getJournal(), metaBuilder); handleYear(sourceResult.getMetadata().getDateofacceptance(), metaBuilder); handleKeywords(sourceResult.getMetadata().getSubjectList(), metaBuilder); } return metaBuilder; } private void handleTitle(List<StructuredProperty> titleList, DocumentMetadata.Builder metaBuilder) { if (CollectionUtils.isNotEmpty(titleList)) { for (StructuredProperty titleProp : titleList) { if (InfoSpaceConstants.SEMANTIC_CLASS_MAIN_TITLE.equals(titleProp.getQualifier().getClassid()) && fieldApprover.approve(titleProp.getDataInfo())) { metaBuilder.setTitle(titleProp.getValue()); } } if (!metaBuilder.hasTitle()) { // if no main title available, setting first applicable title from the list for (StructuredProperty titleProp : titleList) { if (fieldApprover.approve(titleProp.getDataInfo())) { metaBuilder.setTitle(titleProp.getValue()); break; } } } } } private void handleDescription(List<StringField> descriptionList, DocumentMetadata.Builder metaBuilder) { if (CollectionUtils.isNotEmpty(descriptionList)) { for (StringField currentDescription : descriptionList) { if (fieldApprover.approve(currentDescription.getDataInfo()) && StringUtils.isNotBlank(currentDescription.getValue()) && !NULL_STRING_VALUE.equals(currentDescription.getValue())) { metaBuilder.setAbstract$(currentDescription.getValue()); break; } } } } private void handleLanguage(Qualifier language, DocumentMetadata.Builder metaBuilder) { if (StringUtils.isNotBlank(language.getClassid()) && !LANG_CLASSID_UNDEFINED.equals(language.getClassid())) { metaBuilder.setLanguage(language.getClassid()); } } private void handlePublisher(StringField publisher, DocumentMetadata.Builder metaBuilder) { if (StringUtils.isNotBlank(publisher.getValue()) && fieldApprover.approve(publisher.getDataInfo())) { metaBuilder.setPublisher(publisher.getValue()); } } private void handleJournal(Journal journal, DocumentMetadata.Builder metaBuilder) { if (StringUtils.isNotBlank(journal.getName()) && fieldApprover.approve(journal.getDataInfo())) { metaBuilder.setJournal(journal.getName()); } } private void handleYear(StringField dateOfAcceptance, DocumentMetadata.Builder metaBuilder) { if (fieldApprover.approve(dateOfAcceptance.getDataInfo())) { Integer yearValue = extractYear(dateOfAcceptance.getValue()); if (yearValue != null) { metaBuilder.setYear(yearValue); } } } private void handleKeywords(List<StructuredProperty> subjectList, DocumentMetadata.Builder metaBuilder) { if (CollectionUtils.isNotEmpty(subjectList)) { // setting only selected subjects as keywords, skipping inferred data List<String> extractedKeywords = extractValues(subjectList); if (CollectionUtils.isNotEmpty(extractedKeywords)) { if (metaBuilder.getKeywords() == null) { metaBuilder.setKeywords(new ArrayList<CharSequence>()); } metaBuilder.getKeywords().addAll(extractedKeywords); } } } private void handlePublicationType(List<Instance> instanceList, DocumentMetadata.Builder metaBuilder) { PublicationType.Builder publicationTypeBuilder = PublicationType.newBuilder(); if (CollectionUtils.isNotEmpty(instanceList)) { for (Instance instance : instanceList) { if (InfoSpaceConstants.SEMANTIC_CLASS_INSTANCE_TYPE_ARTICLE .equals(instance.getInstancetype().getClassid())) { publicationTypeBuilder.setArticle(true); } else if (InfoSpaceConstants.SEMANTIC_CLASS_INSTANCE_TYPE_DATASET .equals(instance.getInstancetype().getClassid())) { publicationTypeBuilder.setDataset(true); } } } metaBuilder.setPublicationType(publicationTypeBuilder.build()); } /** * Extracts year integer value from date. * * @param date * @return year int value or null when provided date in invalid format */ private static Integer extractYear(String date) { // expected date format: yyyy-MM-dd if (StringUtils.isNotBlank(date) && date.indexOf('-') == 4) { return Integer.valueOf(date.substring(0, date.indexOf('-'))); } else { return null; } } /** * Handles additional identifiers. * */ private DocumentMetadata.Builder handleAdditionalIds(OafEntity oafEntity, DocumentMetadata.Builder metaBuilder) { // setting additional identifiers Map<CharSequence, CharSequence> additionalIds = new HashMap<CharSequence, CharSequence>(); if (CollectionUtils.isNotEmpty(oafEntity.getPidList())) { for (StructuredProperty currentPid : oafEntity.getPidList()) { if (StringUtils.isNotBlank(currentPid.getQualifier().getClassid()) && StringUtils.isNotBlank(currentPid.getValue()) && fieldApprover.approve(currentPid.getDataInfo())) { additionalIds.put(currentPid.getQualifier().getClassid(), currentPid.getValue()); } } } if (!additionalIds.isEmpty()) { metaBuilder.setExternalIdentifiers(additionalIds); } return metaBuilder; } /** * Handles datasource identifiers. * */ private DocumentMetadata.Builder handleDatasourceIds(OafEntity oafEntity, DocumentMetadata.Builder metaBuilder) { if (CollectionUtils.isNotEmpty(oafEntity.getCollectedfromList())) { List<CharSequence> datasourceIds = new ArrayList<CharSequence>( oafEntity.getCollectedfromList().size()); for (KeyValue currentCollectedFrom : oafEntity.getCollectedfromList()) { datasourceIds.add(currentCollectedFrom.getKey()); } metaBuilder.setDatasourceIds(datasourceIds); } return metaBuilder; } /** * Handles persons. * * @param relations person result relations * @param builder * @return builder with persons set * @throws IOException */ private DocumentMetadata.Builder handlePersons(List<QualifiedOafJsonRecord> relations, DocumentMetadata.Builder builder) throws IOException { TreeMap<CharSequence, CharSequence> authorsSortedMap = new TreeMap<CharSequence, CharSequence>(); for (QualifiedOafJsonRecord personResultRecord : relations) { Oaf persResOAF = OafHelper.buildOaf(personResultRecord.getOafJson()); OafRel personResRel = persResOAF.getRel(); if (resultApprover.approve(persResOAF)) { authorsSortedMap.put(personResRel.getPersonResult().getAuthorship().getRanking(), personResRel.getTarget()); } } // storing authors sorted by ranking if (!authorsSortedMap.isEmpty()) { builder.setAuthorIds(new ArrayList<CharSequence>(authorsSortedMap.values())); } return builder; } /** * Extracts values from {@link StructuredProperty} list. Checks DataInfo * element whether this piece of information should be approved. * */ private List<String> extractValues(Collection<StructuredProperty> source) { if (CollectionUtils.isNotEmpty(source)) { List<String> results = new ArrayList<String>(source.size()); for (StructuredProperty current : source) { if (fieldApprover.approve(current.getDataInfo())) { results.add(current.getValue()); } } return results; } else { return Collections.emptyList(); } } }