/**
* =============================================================================
*
* ORCID (R) Open Source
* http://orcid.org
*
* Copyright (c) 2012-2014 ORCID, Inc.
* Licensed under an MIT-Style License (MIT)
* http://orcid.org/open-source-license
*
* This copyright and license information (including a link to the full license)
* shall be included in its entirety in all copies or substantial portion of
* the software.
*
* =============================================================================
*/
package org.orcid.core.cli;
import java.io.File;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.orcid.core.manager.OrcidProfileManager;
import org.orcid.jaxb.model.message.OrcidProfile;
import org.orcid.jaxb.model.message.OrcidWork;
import org.orcid.jaxb.model.message.OrcidWorks;
import org.orcid.jaxb.model.message.Visibility;
import org.orcid.utils.NullUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
public class FindOrcidWorkDuplicates {
public static final String TAB = "\t";
public static final String SEPERATOR = TAB;
public static final String NEW_LINE = "\n";
public static final String CARRIAGE_RETURN = "\r";
public static final String MISSING_ENTRY = "Missing";
public static final String HEADER = "ORCID" + SEPERATOR + "Definitive/Duplicate" + SEPERATOR + "Put Code" + SEPERATOR + "Title" + SEPERATOR + "Visibility";
private static final Logger LOG = LoggerFactory.getLogger(FindOrcidWorkDuplicates.class);
@Option(name = "-f", usage = "Path to write output results file to")
private String outputFileName;
@Option(name = "-i", usage = "Path to take input from")
private String inputFileName;
@Option(name = "-o", usage = "Standalone orcid identifier")
private String orcid;
private OrcidProfileManager orcidProfileManager;
private List<String> orcidsToQuery;
/**
* @param args
*/
public static void main(String[] args) throws Exception {
FindOrcidWorkDuplicates findOrcidWorkDuplicates = new FindOrcidWorkDuplicates();
CmdLineParser parser = new CmdLineParser(findOrcidWorkDuplicates);
parser.parseArgument(args);
findOrcidWorkDuplicates.validateArgs(parser);
findOrcidWorkDuplicates.createOutputFile();
}
private void validateArgs(CmdLineParser parser) throws Exception {
if (NullUtils.allNull(outputFileName)) {
throw new CmdLineException(parser, "You must specify a file name");
}
// prefer processing single orcid if both are passed, don't accidentally hit DB needlessly
if (orcid != null) {
orcidsToQuery = new ArrayList<String>(Arrays.asList(new String[] { orcid }));
}
else if (inputFileName != null) {
orcidsToQuery = FileUtils.readLines(new File(inputFileName));
}
else {
throw new CmdLineException(parser, "You must specify either a single orcid or provide an input name");
}
}
@SuppressWarnings("resource")
private void createOutputFile() throws Exception {
ApplicationContext context = new ClassPathXmlApplicationContext("orcid-core-context.xml");
orcidProfileManager = (OrcidProfileManager) context.getBean("orcidProfileManager");
LOG.info(MessageFormat.format("Started building file {0} at {1}", new Object[] { outputFileName, new Date() }));
File outputFile = new File(outputFileName);
FileUtils.writeStringToFile(outputFile, "\n" + HEADER + "\n", true);
int counter = 0;
for (String orcidIdentifier : orcidsToQuery) {
try {
StringBuilder records = new StringBuilder();
OrcidProfile orcidProfileWorksOnly = orcidProfileManager.retrieveClaimedOrcidWorks(orcidIdentifier);
// is there are less than 2 works there obv can't be duplicates
if (!multipleWorks(orcidProfileWorksOnly))
continue;
List<OrcidWorkDeduped> dedupedWorks = dedupeWorksForOrcid(orcidProfileWorksOnly.getOrcidActivities().getOrcidWorks());
if (dedupedWorks != null) {
LOG.debug("Found orcid with duplicate works: " + orcidIdentifier);
records.append(buildDuplicationString(dedupedWorks, orcidIdentifier));
}
FileUtils.writeStringToFile(outputFile, records.toString(), true);
records.delete(0, records.length());
}
catch (Exception e) {
LOG.error("exception processing ORCID: " + orcidIdentifier, e);
}
LOG.debug("iteration: " + counter++);
}
// create file with tab seperated headers..
LOG.info(MessageFormat.format("Finished building file {0} at {1}", new Object[] { outputFileName, new Date() }));
}
private List<OrcidWorkDeduped> dedupeWorksForOrcid(OrcidWorks orcidWorks) {
Map<OrcidWorkMatcher, List<OrcidWork>> worksSplitByDuplicates = splitWorksIntoDuplicateSets(orcidWorks);
List<OrcidWorkDeduped> orcidWorkDupes = new ArrayList<FindOrcidWorkDuplicates.OrcidWorkDeduped>();
for (Map.Entry<OrcidWorkMatcher, List<OrcidWork>> entry : worksSplitByDuplicates.entrySet()) {
List<OrcidWork> allOrcidWorks = entry.getValue();
//there may have been more than one work on a profile, but may not be duplicates
if (allOrcidWorks.size() < 2) {
continue;
}
// sort by desc put code in case we cant rely on visibility
Collections.sort(allOrcidWorks, new Comparator<OrcidWork>() {
public int compare(OrcidWork work1, OrcidWork work2) {
return Integer.valueOf(work2.getPutCode()).compareTo(Integer.valueOf(work1.getPutCode()));
}
});
// yes
// add to string
// determine which is the dupe and which the definitive XML
OrcidWork definitiveWork = null;
OrcidWork definitivePublicWork = null;
OrcidWork definitiveLimitedWork = null;
// if there are varying visibilities then the definitive is the must public level of visibility
for (OrcidWork orcidWork : allOrcidWorks) {
if (Visibility.PUBLIC.equals(orcidWork.getVisibility())) {
definitivePublicWork = orcidWork;
break;
}
// keep looping around in case we find a public work, but don't override the most recent limited work
// once set
else if (Visibility.LIMITED.equals(orcidWork.getVisibility()) && definitiveLimitedWork == null) {
definitiveLimitedWork = orcidWork;
}
}
// fallback onto limited work and if nothing else the max put code
definitiveWork = definitivePublicWork != null ? definitivePublicWork : definitiveLimitedWork;
// if they all match the definitive is the most recent date
definitiveWork = definitiveWork != null ? definitiveWork : allOrcidWorks.get(0);
allOrcidWorks.remove(definitiveWork);
orcidWorkDupes.add(new OrcidWorkDeduped(definitiveWork, allOrcidWorks));
}
return orcidWorkDupes;
}
private StringBuffer buildDuplicationString(List<OrcidWorkDeduped> dedupedWorks, String orcid) {
StringBuffer allDupes = new StringBuffer();
for (OrcidWorkDeduped dedupedWork : dedupedWorks) {
allDupes.append(deriveOrcidData(orcid, true, Arrays.asList(new OrcidWork[] { dedupedWork.getDefinitive() })));
allDupes.append(deriveOrcidData(orcid, false, dedupedWork.getDupes()));
}
return allDupes;
}
private StringBuffer deriveOrcidData(String orcid, boolean definitive, List<OrcidWork> orcidWorks) {
StringBuffer duplicationString = new StringBuffer();
String definitiveIdentifier = definitive ? "Definitive" : "Duplicate";
for (OrcidWork duplicate : orcidWorks) {
String putCode = duplicate.getPutCode();
String title = duplicate.getWorkTitle() != null && duplicate.getWorkTitle().getTitle() != null
&& StringUtils.isNotBlank(duplicate.getWorkTitle().getTitle().getContent()) ? duplicate.getWorkTitle().getTitle().getContent() : MISSING_ENTRY;
String visibility = duplicate.getVisibility() != null ? duplicate.getVisibility().value() : MISSING_ENTRY;
duplicationString.append(orcid).append(SEPERATOR);
duplicationString.append(definitiveIdentifier).append(SEPERATOR);
duplicationString.append(putCode).append(SEPERATOR);
duplicationString.append(title).append(SEPERATOR);
duplicationString.append(visibility).append(SEPERATOR);
duplicationString.append(NEW_LINE);
}
return duplicationString;
}
private Map<OrcidWorkMatcher, List<OrcidWork>> splitWorksIntoDuplicateSets(OrcidWorks orcidWorks) {
// do any works match, bar the put code and visibility
Map<OrcidWorkMatcher, List<OrcidWork>> orcidWorksAsDupes = new HashMap<OrcidWorkMatcher, List<OrcidWork>>();
// for each work associated with a profile
for (OrcidWork orcidWork : orcidWorks.getOrcidWork()) {
OrcidWorkMatcher orcidMatcherKey = new OrcidWorkMatcher(orcidWork);
// does anything exist in the map for that key
if (orcidWorksAsDupes.containsKey(orcidMatcherKey)) {
// if so get the map and add
List<OrcidWork> existingDupesForWork = orcidWorksAsDupes.get(orcidMatcherKey);
existingDupesForWork.add(orcidWork);
orcidWorksAsDupes.put(orcidMatcherKey, existingDupesForWork);
}
else {
// if not build a new list - may be adding duplicates for this work
List<OrcidWork> orcidWorksForKey = new ArrayList<OrcidWork>();
orcidWorksForKey.add(orcidWork);
orcidWorksAsDupes.put(orcidMatcherKey, orcidWorksForKey);
}
}
return orcidWorksAsDupes;
}
private boolean multipleWorks(OrcidProfile orcidProfile) {
return orcidProfile != null && orcidProfile.getOrcidActivities() != null && orcidProfile.getOrcidActivities().getOrcidWorks() != null
&& orcidProfile.getOrcidActivities().getOrcidWorks().getOrcidWork().size() > 1;
}
private class OrcidWorkDeduped {
private OrcidWork definitive;
private List<OrcidWork> dupes;
public OrcidWorkDeduped(OrcidWork definitive, List<OrcidWork> dupes) {
super();
this.definitive = definitive;
this.dupes = dupes;
}
public OrcidWork getDefinitive() {
return definitive;
}
public List<OrcidWork> getDupes() {
return dupes;
}
}
}