/** * ============================================================================= * * ORCID (R) Open Source * http://orcid.org * * Copyright (c) 2012-2014 ORCID, Inc. * Licensed under an MIT-Style License (MIT) * http://orcid.org/open-source-license * * This copyright and license information (including a link to the full license) * shall be included in its entirety in all copies or substantial portion of * the software. * * ============================================================================= */ package org.orcid.core.cli; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.io.UnsupportedEncodingException; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; import org.apache.commons.lang3.StringUtils; import org.kohsuke.args4j.CmdLineException; import org.kohsuke.args4j.CmdLineParser; import org.kohsuke.args4j.Option; import org.orcid.core.manager.OrgManager; import org.orcid.jaxb.model.message.Iso3166Country; import org.orcid.persistence.dao.OrgDisambiguatedDao; import org.orcid.persistence.dao.OrgDisambiguatedSolrDao; import org.orcid.persistence.jpa.entities.IndexingStatus; import org.orcid.persistence.jpa.entities.OrgDisambiguatedEntity; import org.orcid.persistence.jpa.entities.OrgEntity; import org.orcid.pojo.ajaxForm.PojoUtil; import org.orcid.utils.NullUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.context.ApplicationContext; import org.springframework.context.support.ClassPathXmlApplicationContext; import au.com.bytecode.opencsv.CSVReader; /** * * @author Will Simpson * */ public class LoadRinggoldData { private static final String RINGGOLD_CHARACTER_ENCODING = "UTF-8"; private static final String RINGGOLD_SOURCE_TYPE = "RINGGOLD"; private static final String DN = "DN"; private static final Logger LOGGER = LoggerFactory.getLogger(LoadRinggoldData.class); @Option(name = "-f", usage = "Path to CSV file containing Ringgold parents to load into DB") private File fileToLoad; @Option(name = "-d", usage = "Path to CSV file containing Ringgold deleted IDs to process") private File deletedIdsFile; @Option(name = "-z", usage = "Path to zip file containing Ringgold data to process") private File zipFile; @Option(name = "-c", usage = "Check for duplicates only (no load)") private Boolean checkForDuplicates; private OrgDisambiguatedDao orgDisambiguatedDao; private OrgDisambiguatedSolrDao orgDisambiguatedSolrDao; private OrgManager orgManager; private int numAdded; private int numUpdated; private int numUnchanged; private int numSkipped; private int numDeleted; private int numDeletionsSkipped; public static void main(String[] args) { LoadRinggoldData loadRinggoldData = new LoadRinggoldData(); CmdLineParser parser = new CmdLineParser(loadRinggoldData); try { parser.parseArgument(args); loadRinggoldData.validateArgs(parser); loadRinggoldData.init(); loadRinggoldData.execute(); } catch (CmdLineException e) { System.err.println(e.getMessage()); parser.printUsage(System.err); System.exit(1); } catch (Throwable t) { System.err.println(t); t.printStackTrace(); System.exit(2); } System.exit(0); } private void validateArgs(CmdLineParser parser) throws CmdLineException { if (NullUtils.allNull(fileToLoad, deletedIdsFile, zipFile, checkForDuplicates)) { throw new CmdLineException(parser, "At least one of -f | -d | -z | -c must be specificed"); } } @SuppressWarnings("resource") private void init() { ApplicationContext context = new ClassPathXmlApplicationContext("orcid-core-context.xml"); orgDisambiguatedDao = (OrgDisambiguatedDao) context.getBean("orgDisambiguatedDao"); orgDisambiguatedSolrDao = (OrgDisambiguatedSolrDao) context.getBean("orgDisambiguatedSolrDao"); orgManager = (OrgManager) context.getBean("orgManager"); } public void execute() { if (checkForDuplicates != null && checkForDuplicates) { checkForDuplicates(); return; } dropUniqueConstraint(); if (fileToLoad != null) { processParentsCsv(); } if (deletedIdsFile != null) { processDeletedIds(); } if (zipFile != null) { processZip(); } createUniqueConstraint(); LOGGER.info("Finished loading Ringgold data"); } private void checkForDuplicates() { LOGGER.info("Checking for duplicates"); List<OrgDisambiguatedEntity> duplicates = orgDisambiguatedDao.findDuplicates(); for (OrgDisambiguatedEntity duplicate : duplicates) { LOGGER.info("Found duplicate: {}\t{}\t{}\t{}\t{}\t{}\t{}", new Object[] { duplicate.getSourceType(), duplicate.getSourceId(), duplicate.getName(), duplicate.getCity(), duplicate.getRegion(), duplicate.getCountry(), duplicate.getOrgType() }); } LOGGER.info("Finished checking for duplicates"); } private void processDeletedIds() { try (Reader reader = openFile(deletedIdsFile)) { processDeletedIdsReader(reader); } catch (IOException e) { throw new RuntimeException("Error reading csv file", e); } } private void processZip() { try (ZipFile zip = new ZipFile(zipFile)) { ZipEntry parentsEntry = null; ZipEntry deletedIdsEntry = null; ZipEntry altNamesEntry = null; for (ZipEntry entry : Collections.list(zip.entries())) { String entryName = entry.getName(); if (entryName.endsWith("_parents.csv")) { LOGGER.info("Found parents file: " + entryName); parentsEntry = entry; } if (entryName.endsWith("deleted_ids.csv")) { LOGGER.info("Found deleted ids file: " + entryName); deletedIdsEntry = entry; } if (entryName.endsWith("alt_names.csv")) { LOGGER.info("Found alt names file: " + entryName); altNamesEntry = entry; } } if (parentsEntry != null) { Reader reader = getReader(zip, parentsEntry); if (altNamesEntry != null) { Reader altNamesReader = getReader(zip, altNamesEntry); Map<String, String> altNames = processAltNamesFile(altNamesReader); processReader(reader, altNames); } else { processReader(reader, null); } } if (deletedIdsEntry != null) { Reader reader = getReader(zip, deletedIdsEntry); processDeletedIdsReader(reader); } } catch (IOException e) { throw new RuntimeException("Error reading zip file", e); } } private Reader getReader(ZipFile zip, ZipEntry entry) throws IOException, UnsupportedEncodingException { InputStream is = zip.getInputStream(entry); Reader reader = new InputStreamReader(is, RINGGOLD_CHARACTER_ENCODING); return reader; } private void processParentsCsv() { try (Reader reader = openFile(fileToLoad)) { processReader(reader, null); } catch (IOException e) { throw new RuntimeException("Error reading csv file", e); } } private Reader openFile(File fileToLoad) { Reader reader = null; try { FileInputStream fis = new FileInputStream(fileToLoad); reader = new InputStreamReader(fis, RINGGOLD_CHARACTER_ENCODING); } catch (FileNotFoundException e) { if (!fileToLoad.exists()) { throw new IllegalArgumentException("Input file does not exist: " + fileToLoad); } if (!fileToLoad.canRead()) { throw new IllegalArgumentException("Input exists, but can't read: " + fileToLoad); } throw new IllegalArgumentException("Unable to read input file: " + fileToLoad + "\n" + e); } catch (UnsupportedEncodingException e) { throw new RuntimeException(e); } return reader; } private void processReader(Reader reader, Map<String, String> altNames) throws IOException { try (CSVReader csvReader = createCSVReader(reader)) { String[] line; while ((line = csvReader.readNext()) != null) { processLine(line, altNames); } } finally { LOGGER.info("Number added={}, number updated={}, number unchanged={}, num skipped={}, total={}", new Object[] { numAdded, numUpdated, numUnchanged, numSkipped, getTotal() }); } } private int getTotal() { return numAdded + numUpdated + numUnchanged + numSkipped; } private CSVReader createCSVReader(Reader reader) { return new CSVReader(reader, ',', '"', 1); } private void processLine(String[] line, Map<String, String> altNames) { String gpCode = line[0]; String pCode = line[1]; String name = line[2]; String extName = line[3]; if (StringUtils.isNotBlank(extName)) { name = extName; } String city = line[4]; String extCity = line[5]; if (StringUtils.isNotBlank(extCity)) { city = extCity; } Iso3166Country country = parseCountry(line[7]); String state = line[8]; if (StringUtils.isBlank(state)) { state = null; } String type = line[9]; /** * Look for the name in the alt names map, if there is one name, replace * the one found in the parents file */ if (altNames != null && altNames.containsKey(pCode)) { if (!PojoUtil.isEmpty(altNames.get(pCode))) { name = altNames.get(pCode); } } processOrg(gpCode, pCode, name, city, state, country, type); } private Map<String, String> processAltNamesFile(Reader reader) throws IOException { Map<String, String> altNamesMap = new HashMap<String, String>(); Map<String, Date> altNamesTimestamps = new HashMap<String, Date>(); try (CSVReader csvReader = createCSVReader(reader)) { String[] line; while ((line = csvReader.readNext()) != null) { // If the DN indicator exists if (!PojoUtil.isEmpty(line[7]) && DN.equals(line[7])) { String name = null; // Get the name // If the ext_name is not empty, use it if (!PojoUtil.isEmpty(line[2])) { LOGGER.info("Using ext_name {} for pCode {}", new Object[] { line[2], line[0] }); name = line[2]; } else { LOGGER.info("Using name {} for pCode {}", new Object[] { line[2], line[0] }); name = line[1]; } // get the timestamp Date timestamp = null; try { timestamp = getDateFromTimestamp(line[8]); } catch (ParseException p) { LOGGER.warn("Unable to parse timestamp {} for p_code {}", new Object[] { line[8], line[0] }); } // Check if there is already a name for that pCode if (altNamesMap.containsKey(line[0])) { // If the timestamp is not empty, check it against the // new timestamp if (altNamesTimestamps.containsKey(line[0]) && altNamesTimestamps.get(line[0]) != null) { Date existing = altNamesTimestamps.get(line[0]); if (existing.before(timestamp)) { LOGGER.info("Replacing old name {}({}) with {}({})", new Object[] { altNamesMap.get(line[0]), altNamesTimestamps.get(line[0]), name, timestamp }); altNamesMap.put(line[0], name); altNamesTimestamps.put(line[0], timestamp); } else { LOGGER.info("Leaving old name {}({}) instead of using this one {}({})", new Object[] { altNamesMap.get(line[0]), altNamesTimestamps.get(line[0]), name, timestamp }); } } else { // Else, just replace it with the new one altNamesMap.put(line[0], name); altNamesTimestamps.put(line[0], timestamp); } } else { altNamesMap.put(line[0], name); altNamesTimestamps.put(line[0], timestamp); } } } } finally { LOGGER.info("Number added={}, number updated={}, number unchanged={}, num skipped={}, total={}", new Object[] { numAdded, numUpdated, numUnchanged, numSkipped, getTotal() }); } return altNamesMap; } private Date getDateFromTimestamp(String timestamp) throws ParseException { SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMdd HH:mm:ss"); try { return formatter.parse(timestamp); } catch (ParseException e) { throw e; } } private void processOrg(String gpCode, String pCode, String name, String city, String state, Iso3166Country country, String type) { OrgDisambiguatedEntity existingEntity = orgDisambiguatedDao.findBySourceIdAndSourceType(pCode, RINGGOLD_SOURCE_TYPE); if (existingEntity == null) { LOGGER.info("No existing disambiguated org with sourceId={} and sourceType={}", pCode, RINGGOLD_SOURCE_TYPE); processNew(gpCode, pCode, name, city, state, country, type); } else { LOGGER.info("Found existing disambiguated org with sourceId={} and sourceType={}", pCode, RINGGOLD_SOURCE_TYPE); processExisting(existingEntity, gpCode, pCode, name, city, country, state, type); } } private void processNew(String gpCode, String pCode, String name, String city, String state, Iso3166Country country, String type) { if (isDuplicate(pCode, name, city, state, country)) { return; } OrgDisambiguatedEntity orgDisambiguatedEntity = new OrgDisambiguatedEntity(); setFields(orgDisambiguatedEntity, gpCode, pCode, name, city, country, state, type); orgDisambiguatedDao.persist(orgDisambiguatedEntity); createOrUpdateOrg(name, city, country, state, orgDisambiguatedEntity.getId()); numAdded++; } private void processExisting(OrgDisambiguatedEntity existingEntity, String gpCode, String pCode, String name, String city, Iso3166Country country, String state, String type) { if (!hasChanged(existingEntity, gpCode, name, city, country, state, type)) { numUnchanged++; return; } existingEntity.setIndexingStatus(IndexingStatus.PENDING); setFields(existingEntity, gpCode, pCode, name, city, country, state, type); orgDisambiguatedDao.merge(existingEntity); createOrUpdateOrg(name, city, country, state, existingEntity.getId()); numUpdated++; } private boolean isDuplicate(String pCode, String name, String city, String state, Iso3166Country country) { OrgDisambiguatedEntity duplicate = orgDisambiguatedDao.findByNameCityRegionCountryAndSourceType(name, city, state, country, RINGGOLD_SOURCE_TYPE); if (duplicate != null) { LOGGER.info("Skipping disambiguated org with sourceId={} because it appears to be a duplicate of sourceId={}, sourceType={}", new Object[] { pCode, duplicate.getSourceId(), RINGGOLD_SOURCE_TYPE }); numSkipped++; return true; } return false; } private void createOrUpdateOrg(String name, String city, Iso3166Country country, String state, Long orgDisambiguatedId) { // Ensure there is a corresponding org and that the org is linked to the // disambiguated org OrgEntity orgEntity = new OrgEntity(); orgEntity.setName(name); orgEntity.setRegion(state); orgEntity.setCity(city); orgEntity.setCountry(country); orgManager.createUpdate(orgEntity, orgDisambiguatedId); } private boolean hasChanged(OrgDisambiguatedEntity existingEntity, String gpCode, String name, String city, Iso3166Country country, String state, String type) { if (!gpCode.equals(existingEntity.getSourceParentId())) { return true; } if (!name.equals(existingEntity.getName())) { return true; } if (!city.equals(existingEntity.getCity())) { return true; } if (!country.equals(existingEntity.getCountry())) { return true; } String existingRegion = existingEntity.getRegion(); if (state == null) { if (existingRegion != null) { return true; } } else if (!state.equals(existingRegion)) { return true; } if (!type.equals(existingEntity.getOrgType())) { return true; } return false; } private void setFields(OrgDisambiguatedEntity orgDisambiguatedEntity, String gpCode, String pCode, String name, String city, Iso3166Country country, String state, String type) { orgDisambiguatedEntity.setName(name); orgDisambiguatedEntity.setCity(city); orgDisambiguatedEntity.setRegion(state); orgDisambiguatedEntity.setCountry(country); orgDisambiguatedEntity.setOrgType(type); orgDisambiguatedEntity.setSourceId(pCode); orgDisambiguatedEntity.setSourceParentId(gpCode); orgDisambiguatedEntity.setSourceType(RINGGOLD_SOURCE_TYPE); } private Iso3166Country parseCountry(String countryString) { countryString = countryString.toUpperCase(); if ("USA".equals(countryString)) { countryString = "US"; } else if ("CAN".equals(countryString)) { countryString = "CA"; } return Iso3166Country.valueOf(countryString); } private void processDeletedIdsReader(Reader reader) throws IOException { try (CSVReader csvReader = createCSVReader(reader)) { String[] line; while ((line = csvReader.readNext()) != null) { processDeletedIdsLine(line); } } finally { LOGGER.info("Number deleted={}, number deletions skipped={}", new Object[] { numDeleted, numDeletionsSkipped }); } } private void processDeletedIdsLine(String[] line) { String deletedSourceId = line[0]; String replacementSourceId = line[1]; OrgDisambiguatedEntity deletedEntity = orgDisambiguatedDao.findBySourceIdAndSourceType(deletedSourceId, RINGGOLD_SOURCE_TYPE); if (deletedEntity != null) { LOGGER.info("Deleted ID exists in DB, id={}", deletedSourceId); Long deletedEntityId = deletedEntity.getId(); OrgDisambiguatedEntity replacementEntity = orgDisambiguatedDao.findBySourceIdAndSourceType(replacementSourceId, RINGGOLD_SOURCE_TYPE); if (replacementEntity == null) { LOGGER.warn("Replacement does not exist, id={}", replacementEntity); numDeletionsSkipped++; } else { Long replacementEntityId = replacementEntity.getId(); orgDisambiguatedSolrDao.remove(deletedEntityId); orgDisambiguatedDao.replace(deletedEntityId, replacementEntityId); orgDisambiguatedDao.remove(deletedEntityId); numDeleted++; } } } private void createUniqueConstraint() { LOGGER.info("About to create unique constraint"); try { orgDisambiguatedDao.createUniqueConstraint(); LOGGER.info("Finished creating unique constraint"); } catch (RuntimeException e) { LOGGER.warn("Problem creating unique constraint"); checkForDuplicates(); } } private void dropUniqueConstraint() { LOGGER.info("About to drop unique constraint"); orgDisambiguatedDao.dropUniqueConstraint(); } }