/** * ============================================================================= * * ORCID (R) Open Source * http://orcid.org * * Copyright (c) 2012-2014 ORCID, Inc. * Licensed under an MIT-Style License (MIT) * http://orcid.org/open-source-license * * This copyright and license information (including a link to the full license) * shall be included in its entirety in all copies or substantial portion of * the software. * * ============================================================================= */ package org.orcid.core.cli; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.util.Date; import java.util.HashMap; import javax.ws.rs.core.MultivaluedMap; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; import org.apache.commons.lang.StringUtils; import org.kohsuke.args4j.CmdLineException; import org.kohsuke.args4j.CmdLineParser; import org.kohsuke.args4j.Option; import org.orcid.jaxb.model.message.Iso3166Country; import org.orcid.persistence.constants.OrganizationStatus; import org.orcid.persistence.dao.GenericDao; import org.orcid.persistence.dao.OrgDisambiguatedDao; import org.orcid.persistence.jpa.entities.IndexingStatus; import org.orcid.persistence.jpa.entities.OrgDisambiguatedEntity; import org.orcid.persistence.jpa.entities.OrgDisambiguatedExternalIdentifierEntity; import org.orcid.pojo.ajaxForm.PojoUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.context.ApplicationContext; import org.springframework.context.support.ClassPathXmlApplicationContext; import org.w3c.dom.Document; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.sun.jersey.api.client.Client; import com.sun.jersey.api.client.WebResource; import com.sun.jersey.core.util.MultivaluedMapImpl; /** * * @author Angel Montenegro * */ public class LoadFundRefData { class RDFOrganization { String doi, name, country, state, stateCode, city, type, subtype, status; } private static final Logger LOGGER = LoggerFactory.getLogger(LoadFundRefData.class); private static final String FUNDREF_SOURCE_TYPE = "FUNDREF"; private static final String STATE_NAME = "STATE"; private static final String STATE_ABBREVIATION = "abbr"; private static final String DEPRECATED_INDICATOR = "http://data.crossref.org/fundingdata/vocabulary/Deprecated"; private static String geonamesApiUrl; // Params @Option(name = "-f", usage = "Path to RDF file containing FundRef info to load into DB") private File fileToLoad; // Resources private GenericDao<OrgDisambiguatedExternalIdentifierEntity, Long> orgDisambiguatedExternalIdentifierDao; private OrgDisambiguatedDao orgDisambiguatedDao; private String apiUser; // Cache private HashMap<String, String> cache = new HashMap<String, String>(); // xPath queries private String conceptsExpression = "/RDF/ConceptScheme/hasTopConcept"; private String itemExpression = "/RDF/Concept[@about='%s']"; private String orgNameExpression = "prefLabel/Label/literalForm"; private String orgCountryExpression = "country"; private String orgStateExpression = "state"; private String orgTypeExpression = "fundingBodyType"; private String orgSubTypeExpression = "fundingBodySubType"; private String statusExpression = "status"; // xPath init private XPath xPath = XPathFactory.newInstance().newXPath(); // Statistics private long updatedOrgs = 0; private long addedDisambiguatedOrgs = 0; private long addedExternalIdentifiers = 0; public static void main(String[] args) { LoadFundRefData loadFundRefData = new LoadFundRefData(); CmdLineParser parser = new CmdLineParser(loadFundRefData); try { parser.parseArgument(args); loadFundRefData.validateArgs(parser); loadFundRefData.init(); loadFundRefData.execute(); } catch (CmdLineException e) { System.err.println(e.getMessage()); parser.printUsage(System.err); } System.exit(0); } private void validateArgs(CmdLineParser parser) throws CmdLineException { if (fileToLoad == null) { throw new CmdLineException(parser, "-f parameter must be specificed"); } } @SuppressWarnings({ "resource", "unchecked" }) private void init() { ApplicationContext context = new ClassPathXmlApplicationContext("orcid-core-context.xml"); orgDisambiguatedDao = (OrgDisambiguatedDao) context.getBean("orgDisambiguatedDao"); orgDisambiguatedExternalIdentifierDao = (GenericDao) context.getBean("orgDisambiguatedExternalIdentifierEntityDao"); // Geonames params geonamesApiUrl = (String) context.getBean("geonamesApiUrl"); apiUser = (String) context.getBean("geonamesUser"); } /** * Executes the import process * */ private void execute() { try { long start = System.currentTimeMillis(); FileInputStream file = new FileInputStream(fileToLoad); DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = builderFactory.newDocumentBuilder(); Document xmlDocument = builder.parse(file); // Parent node NodeList nodeList = (NodeList) xPath.compile(conceptsExpression).evaluate(xmlDocument, XPathConstants.NODESET); for (int i = 0; i < nodeList.getLength(); i++) { RDFOrganization rdfOrganization = getOrganization(xmlDocument, nodeList.item(i).getAttributes()); LOGGER.info("Processing organization from RDF, doi:{}, name:{}, country:{}, state:{}, stateCode:{}, type:{}, subtype:{}, status:{}", new String[] { rdfOrganization.doi, rdfOrganization.name, rdfOrganization.country, rdfOrganization.state, rdfOrganization.stateCode, rdfOrganization.type, rdfOrganization.subtype, rdfOrganization.status }); // #1: Look for an existing org OrgDisambiguatedEntity existingEntity = findByDetails(rdfOrganization); if(existingEntity != null) { // #2: If the name, city or region changed, update those values if(entityChanged(rdfOrganization, existingEntity)) { existingEntity.setCity(rdfOrganization.city); Iso3166Country country = StringUtils.isNotBlank(rdfOrganization.country) ? Iso3166Country.fromValue(rdfOrganization.country) : null; existingEntity.setCountry(country); existingEntity.setName(rdfOrganization.name); String orgType = rdfOrganization.type + (StringUtils.isNotBlank(rdfOrganization.subtype) ? ('/' + rdfOrganization.subtype) : ""); existingEntity.setOrgType(orgType); existingEntity.setRegion(rdfOrganization.stateCode); existingEntity.setSourceId(rdfOrganization.doi); existingEntity.setSourceType(FUNDREF_SOURCE_TYPE); existingEntity.setSourceUrl(rdfOrganization.doi); existingEntity.setLastModified(new Date()); existingEntity.setIndexingStatus(IndexingStatus.PENDING); existingEntity.setStatus(rdfOrganization.status); orgDisambiguatedDao.merge(existingEntity); updatedOrgs += 1; } else if(idChanged(rdfOrganization, existingEntity)){ // #3: If the ID changed, create an external identifier createExternalIdentifier(existingEntity, rdfOrganization.doi); addedExternalIdentifiers += 1; } else if(statusChanged(rdfOrganization, existingEntity)) { //If the status changed, update the status existingEntity.setStatus(rdfOrganization.status); existingEntity.setLastModified(new Date()); existingEntity.setIndexingStatus(IndexingStatus.PENDING); orgDisambiguatedDao.merge(existingEntity); } } else { // #4: Else, create the new org createDisambiguatedOrg(rdfOrganization); addedDisambiguatedOrgs += 1; } } long end = System.currentTimeMillis(); LOGGER.info("Time taken to process the files: {}", (end - start)); } catch (FileNotFoundException fne) { LOGGER.error("Unable to read file {}", fileToLoad); } catch (ParserConfigurationException pce) { LOGGER.error("Unable to initialize the DocumentBuilder"); } catch (IOException ioe) { LOGGER.error("Unable to parse document {}", fileToLoad); } catch (SAXException se) { LOGGER.error("Unable to parse document {}", fileToLoad); } catch (XPathExpressionException xpe) { LOGGER.error("XPathExpressionException {}", xpe.getMessage()); } finally { LOGGER.info("Number new Disambiguated Orgs={}, Updated Orgs={}, new External Identifiers={}", new Object[] { addedDisambiguatedOrgs, updatedOrgs, addedExternalIdentifiers, getTotal() }); } } /** * FUNDREF FUNCTIONS * */ /** * Get an RDF organization from the given RDF file * */ private RDFOrganization getOrganization(Document xmlDocument, NamedNodeMap attrs) { RDFOrganization organization = new RDFOrganization(); try { Node node = attrs.getNamedItem("rdf:resource"); String itemDoi = node.getNodeValue(); LOGGER.info("Processing item {}", itemDoi); //Get item node Node organizationNode = (Node) xPath.compile(itemExpression.replace("%s", itemDoi)).evaluate(xmlDocument, XPathConstants.NODE); // Get organization name String orgName = (String) xPath.compile(orgNameExpression).evaluate(organizationNode, XPathConstants.STRING); // Get status indicator Node statusNode = (Node) xPath.compile(statusExpression).evaluate(organizationNode, XPathConstants.NODE); String status = null; if(statusNode != null) { NamedNodeMap statusAttrs = statusNode.getAttributes(); if(statusAttrs != null) { String statusAttribute = statusAttrs.getNamedItem("rdf:resource").getNodeValue(); if(isDeprecatedStatus(statusAttribute)) { status = OrganizationStatus.DEPRECATED.name(); } } } // Get country code Node countryNode = (Node) xPath.compile(orgCountryExpression).evaluate(organizationNode, XPathConstants.NODE); NamedNodeMap countryAttrs = countryNode.getAttributes(); String countryGeonameUrl = countryAttrs.getNamedItem("rdf:resource").getNodeValue(); String countryCode = fetchFromGeoNames(countryGeonameUrl, "countryCode"); // Get state name Node stateNode = (Node) xPath.compile(orgStateExpression).evaluate(organizationNode, XPathConstants.NODE); String stateName = null; String stateCode = null; if (stateNode != null) { NamedNodeMap stateAttrs = stateNode.getAttributes(); String stateGeoNameCode = stateAttrs.getNamedItem("rdf:resource").getNodeValue(); stateName = fetchFromGeoNames(stateGeoNameCode, "name"); stateCode = fetchFromGeoNames(stateGeoNameCode, STATE_NAME); } // Get type String orgType = (String) xPath.compile(orgTypeExpression).evaluate(organizationNode, XPathConstants.STRING); // Get subType String orgSubType = (String) xPath.compile(orgSubTypeExpression).evaluate(organizationNode, XPathConstants.STRING); // Fill the organization object organization.doi = itemDoi; organization.name = orgName; organization.country = countryCode; organization.state = stateName; organization.stateCode = stateCode; // TODO: since we don't have city, we fill this with the state, this // should be modified soon organization.city = stateCode; organization.type = orgType; organization.subtype = orgSubType; organization.status = status; } catch (XPathExpressionException xpe) { LOGGER.error("XPathExpressionException {}", xpe.getMessage()); } return organization; } /** * Indicates if an organization has been marked as deprecated * */ private boolean isDeprecatedStatus(String statusAttribute) { if(!PojoUtil.isEmpty(statusAttribute)) { return DEPRECATED_INDICATOR.equalsIgnoreCase(statusAttribute); } return false; } /** * GEONAMES FUNCTIONS * */ /** * Fetch a property from geonames * */ private String fetchFromGeoNames(String geoNameUri, String propertyToFetch) { String result = null; String geoNameId = geoNameUri.replaceAll("[^\\d]", ""); if (StringUtils.isNotBlank(geoNameId)) { String cacheKey = propertyToFetch + '_' + geoNameId; if (cache.containsKey(cacheKey)) { result = cache.get(cacheKey); } else { String jsonResponse = fetchJsonFromGeoNames(geoNameId); if (STATE_NAME.equals(propertyToFetch)) { result = fetchStateAbbreviationFromJson(jsonResponse); } else { result = fetchValueFromJson(jsonResponse, propertyToFetch); } cache.put(cacheKey, result); } } return result; } /** * Queries GeoNames API for a given geonameId and return the JSON string * */ private String fetchJsonFromGeoNames(String geoNameId) { String result = null; if (cache.containsKey("geoname_json_" + geoNameId)) { return cache.get("geoname_json_" + geoNameId); } else { Client c = Client.create(); WebResource r = c.resource(geonamesApiUrl); MultivaluedMap<String, String> params = new MultivaluedMapImpl(); params.add("geonameId", geoNameId); params.add("username", apiUser); result = r.queryParams(params).get(String.class); cache.put("geoname_json_" + geoNameId, result); } return result; } /** * It only fetches properties in the first level * */ private String fetchValueFromJson(String jsonString, String propetyName) { String result = null; try { ObjectMapper m = new ObjectMapper(); JsonNode rootNode = m.readTree(jsonString); JsonNode nameNode = rootNode.path(propetyName); if (nameNode != null) result = nameNode.asText(); } catch (Exception e) { } return result; } /** * Fetch the state abbreviation from a geonames response * */ private String fetchStateAbbreviationFromJson(String jsonString) { String result = null; try { ObjectMapper m = new ObjectMapper(); JsonNode rootNode = m.readTree(jsonString); JsonNode arrayNode = rootNode.get("alternateNames"); if (arrayNode != null && arrayNode.isArray()) { for (final JsonNode altNameNode : arrayNode) { JsonNode langNode = altNameNode.get("lang"); if (langNode != null && STATE_ABBREVIATION.equals(langNode.asText())) { JsonNode nameNode = altNameNode.get("name"); result = nameNode.asText(); break; } } } } catch (Exception e) { } return result; } /** * DATABASE FUNCTIONS * */ /** * TODO * */ private OrgDisambiguatedEntity findByDetails(RDFOrganization org) { Iso3166Country country = StringUtils.isBlank(org.country) ? null : Iso3166Country.valueOf(org.country); // Find the org by name, city, country and state OrgDisambiguatedEntity existingEntity = orgDisambiguatedDao.findByNameCityRegionCountryAndSourceType(org.name, org.stateCode, org.stateCode, country, FUNDREF_SOURCE_TYPE); // If no match is found, try with the doi and source type if (existingEntity == null) { existingEntity = orgDisambiguatedDao.findBySourceIdAndSourceType(org.doi, FUNDREF_SOURCE_TYPE); } return existingEntity; } /** * Indicates if an entity changed his name, country, state or city * * @param org * The organization with the new values * @param entity * The organization we have stored in the database * * @return true if the entity has changed. */ private boolean entityChanged(RDFOrganization org, OrgDisambiguatedEntity entity) { // Check name if (StringUtils.isNotBlank(org.name)) { if (!org.name.equalsIgnoreCase(entity.getName())) return true; } else if (StringUtils.isNotBlank(entity.getName())) { return true; } // Check country if (StringUtils.isNotBlank(org.country)) { if (entity.getCountry() == null || !org.country.equals(entity.getCountry().value())) { return true; } } else if (entity.getCountry() != null) { return true; } // Check state if (StringUtils.isNotBlank(org.stateCode)) { if (entity.getRegion() == null || !org.stateCode.equals(entity.getRegion())) { return true; } } else if (StringUtils.isNotBlank(entity.getRegion())) { return true; } // Check city if (StringUtils.isNotBlank(org.city)) { if (entity.getCity() == null || !org.city.equals(entity.getCity())) { return true; } } else if (StringUtils.isNotBlank(entity.getCity())) { return true; } return false; } /** * Indicates if an entity status has changed * * @param org * The organization with the new values * @param entity * The organization we have stored in the database * * @return true if the entity status has changed. */ private boolean statusChanged(RDFOrganization org, OrgDisambiguatedEntity entity) { if(!PojoUtil.isEmpty(org.status)) { if(!org.status.equalsIgnoreCase(entity.getStatus())) { return true; } } else if(!PojoUtil.isEmpty(entity.getStatus())) { //If for some reason, the status of the updated organization is removed, remove it also from our data return true; } return false; } /** * TODO * */ private boolean idChanged(RDFOrganization org, OrgDisambiguatedEntity entity) { if(org.doi.equals(entity.getSourceId())) return false; return true; } /** * Creates a disambiguated ORG in the org_disambiguated table * */ private OrgDisambiguatedEntity createDisambiguatedOrg(RDFOrganization organization) { LOGGER.info("Creating disambiguated org {}", organization.name); String orgType = organization.type + (StringUtils.isEmpty(organization.subtype) ? "" : "/" + organization.subtype); Iso3166Country country = StringUtils.isNotBlank(organization.country) ? Iso3166Country.fromValue(organization.country) : null; OrgDisambiguatedEntity orgDisambiguatedEntity = new OrgDisambiguatedEntity(); orgDisambiguatedEntity.setName(organization.name); orgDisambiguatedEntity.setCountry(country); orgDisambiguatedEntity.setCity(organization.city); orgDisambiguatedEntity.setRegion(organization.stateCode); orgDisambiguatedEntity.setOrgType(orgType); orgDisambiguatedEntity.setSourceId(organization.doi); orgDisambiguatedEntity.setSourceUrl(organization.doi); orgDisambiguatedEntity.setSourceType(FUNDREF_SOURCE_TYPE); if(!PojoUtil.isEmpty(organization.status)) { orgDisambiguatedEntity.setStatus(OrganizationStatus.DEPRECATED.name()); } orgDisambiguatedDao.persist(orgDisambiguatedEntity); return orgDisambiguatedEntity; } /** * Creates an external identifier in the * org_disambiguated_external_identifier table * */ private boolean createExternalIdentifier(OrgDisambiguatedEntity disambiguatedOrg, String identifier) { LOGGER.info("Creating external identifier for {}", disambiguatedOrg.getId()); Date creationDate = new Date(); OrgDisambiguatedExternalIdentifierEntity externalIdentifier = new OrgDisambiguatedExternalIdentifierEntity(); externalIdentifier.setIdentifier(identifier); externalIdentifier.setIdentifierType(FUNDREF_SOURCE_TYPE); externalIdentifier.setOrgDisambiguated(disambiguatedOrg); externalIdentifier.setDateCreated(creationDate); externalIdentifier.setLastModified(creationDate); orgDisambiguatedExternalIdentifierDao.persist(externalIdentifier); return true; } /** * STATISTICS * */ /** * Get the total number of orgs processed * */ private long getTotal() { return updatedOrgs + addedDisambiguatedOrgs + addedExternalIdentifiers; } }