package org.orcid.core.cli;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import javax.ws.rs.core.MultivaluedMap;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.lang.StringUtils;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.orcid.persistence.dao.GenericDao;
import org.orcid.persistence.jpa.entities.OrgDisambiguatedExternalIdentifierEntity;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import au.com.bytecode.opencsv.CSVWriter;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.sun.jersey.api.client.Client;
import com.sun.jersey.api.client.WebResource;
import com.sun.jersey.core.util.MultivaluedMapImpl;
public class TransformFundRefDataIntoCSV {
class FundRefOrganization {
String id, name, altName, country, state, type, subtype;
}
private static final Logger LOGGER = LoggerFactory.getLogger(TransformFundRefDataIntoCSV.class);
private static String geonamesApiUrl;
// Params
@Option(name = "-f", usage = "Path to RDF file containing the FundRef organizations")
private File fundRefFile;
// xPath init
private XPath xPath = XPathFactory.newInstance().newXPath();
// GeoNames Cache
private HashMap<String, String> cache = new HashMap<String, String>();
// Resources
private GenericDao<OrgDisambiguatedExternalIdentifierEntity, Long> genericDao;
private String apiUser;
// xPath queries
private String conceptsExpression = "/RDF/ConceptScheme/hasTopConcept";
private String itemExpression = "/RDF/Concept[@about='%s']";
private String orgNameExpression = itemExpression + "/prefLabel/Label/literalForm";
private String orgAltNameExpression = itemExpression + "/altLabel/Label/literalForm";
private String orgCountryExpression = itemExpression + "/country";
private String orgStateExpression = itemExpression + "/state";
private String orgTypeExpression = itemExpression + "/fundingBodyType";
private String orgSubTypeExpression = itemExpression + "/fundingBodySubType";
private String FUNDREF_CSV = "C:/Users/angel.montenegro/Desktop/fundref/crossref_complete.csv";
private CSVWriter fundrefCSV = null;
/**
* INIT
* */
@SuppressWarnings("resource")
private void init() {
ApplicationContext context = new ClassPathXmlApplicationContext("orcid-core-context.xml");
// Geonames params
geonamesApiUrl = (String) context.getBean("geonamesApiUrl");
apiUser = (String) context.getBean("geonamesUser");
// Init the CSV file for existing orgs
try {
Writer writer1 = new FileWriter(this.FUNDREF_CSV);
fundrefCSV = createCSVWriter(writer1);
// Write headers
String[] headers = { "id","name","altName","country","state","type","subtype" };
fundrefCSV.writeNext(headers);
} catch (IOException ioe) {
// TODO
}
}
/**
* Validate cmd arguments
* */
private void validateArgs(CmdLineParser parser) throws CmdLineException {
if (fundRefFile == null) {
throw new CmdLineException(parser, "-f parameter must be specificed");
}
}
public void process() {
// Init
init();
// Load fundref organizations
List<FundRefOrganization> fundRefOrgs = loadFundRefOrgs();
for(FundRefOrganization fOrg : fundRefOrgs) {
writeFundRefOrg(fOrg);
}
try {
fundrefCSV.close();
} catch (IOException ioe) {
System.out.println("ERROR CLOSING CSV FILES");
}
// TODO Write duplicates names into a csv
}
/*****************************************************************************
******************************* FUNDREF FUNCTIONS ***************************
***************************************************************************** */
/**
* Load data from FundRef
* */
private List<FundRefOrganization> loadFundRefOrgs() {
List<FundRefOrganization> fundRefOrgs = new ArrayList<FundRefOrganization>();
System.out.println("Begin loading FundRef orgs");
try {
FileInputStream file = new FileInputStream(fundRefFile);
DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder builder = builderFactory.newDocumentBuilder();
Document xmlDocument = builder.parse(file);
// Parent node
NodeList nodeList = (NodeList) xPath.compile(conceptsExpression).evaluate(xmlDocument, XPathConstants.NODESET);
for (int i = 0; i < nodeList.getLength(); i++) {
FundRefOrganization fOrg = getFundrefOrganization(xmlDocument, nodeList.item(i).getAttributes());
fundRefOrgs.add(fOrg);
}
} catch (FileNotFoundException fne) {
LOGGER.error("Unable to read file {}", fundRefFile);
} catch (ParserConfigurationException pce) {
LOGGER.error("Unable to initialize the DocumentBuilder");
} catch (IOException ioe) {
LOGGER.error("Unable to parse document {}", fundRefFile);
} catch (SAXException se) {
LOGGER.error("Unable to parse document {}", fundRefFile);
} catch (XPathExpressionException xpe) {
LOGGER.error("XPathExpressionException {}", xpe.getMessage());
}
return fundRefOrgs;
}
/**
* Parse a RDF node and convert it into a FundRefOrganization object
* */
private FundRefOrganization getFundrefOrganization(Document xmlDocument, NamedNodeMap attrs) {
FundRefOrganization organization = new FundRefOrganization();
try {
Node node = attrs.getNamedItem("rdf:resource");
String itemDoi = node.getNodeValue();
LOGGER.info("Processing item {}", itemDoi);
// Get organization name
String orgName = (String) xPath.compile(orgNameExpression.replace("%s", itemDoi)).evaluate(xmlDocument, XPathConstants.STRING);
//Replace "U.S." with "US" to match RingGold info
orgName = orgName.replace("U.S.", "US");
// Get organization alt name
String orgAltName = (String) xPath.compile(orgAltNameExpression.replace("%s", itemDoi)).evaluate(xmlDocument, XPathConstants.STRING);
// Get country geoname id
Node countryNode = (Node) xPath.compile(orgCountryExpression.replace("%s", itemDoi)).evaluate(xmlDocument, XPathConstants.NODE);
NamedNodeMap countryAttrs = countryNode.getAttributes();
String countryGeonameUrl = countryAttrs.getNamedItem("rdf:resource").getNodeValue();
// Get state geoname id
Node stateNode = (Node) xPath.compile(orgStateExpression.replace("%s", itemDoi)).evaluate(xmlDocument, XPathConstants.NODE);
String stateGeoNameCode = null;
if (stateNode != null) {
NamedNodeMap stateAttrs = stateNode.getAttributes();
stateGeoNameCode = stateAttrs.getNamedItem("rdf:resource").getNodeValue();
}
// Get type
String orgType = (String) xPath.compile(orgTypeExpression.replace("%s", itemDoi)).evaluate(xmlDocument, XPathConstants.STRING);
// Get subType
String orgSubType = (String) xPath.compile(orgSubTypeExpression.replace("%s", itemDoi)).evaluate(xmlDocument, XPathConstants.STRING);
// Fill the organization object
organization.type = StringUtils.isBlank(orgType) ? null : orgType;
organization.id = StringUtils.isBlank(itemDoi) ? null : itemDoi;
organization.name = StringUtils.isBlank(orgName) ? null : orgName;
organization.altName = StringUtils.isBlank(orgAltName) ? null : orgAltName;
organization.subtype = StringUtils.isBlank(orgSubType) ? null : orgSubType;
// By this moment the geonames uris hasnt been resolved, so, resolve them
// Fetch country code from geonames
if (StringUtils.isNotBlank(countryGeonameUrl))
organization.country = fetchFromGeoNames(countryGeonameUrl, "countryCode");
// Fetch state from geonames
if (StringUtils.isNotBlank(stateGeoNameCode)) {
organization.state = fetchFromGeoNames(stateGeoNameCode, "STATE");
}
} catch (XPathExpressionException xpe) {
LOGGER.error("XPathExpressionException {}", xpe.getMessage());
}
return organization;
}
/*****************************************************************************
******************************* RINGOLD FUNCTIONS ***************************
***************************************************************************** */
private CSVWriter createCSVWriter(Writer writer) {
return new CSVWriter(writer, ',', '"');
}
private void writeFundRefOrg(FundRefOrganization organization) {
// { "id","name","altName","country","state","type","subtype" }
String[] newOrgLine = new String[7];
newOrgLine[0] = organization.id;
newOrgLine[1] = organization.name;
newOrgLine[2] = organization.altName;
newOrgLine[3] = organization.country;
newOrgLine[4] = organization.state;
newOrgLine[5] = organization.type;
newOrgLine[6] = organization.subtype;
fundrefCSV.writeNext(newOrgLine);
}
/*****************************************************************************
***************************** GEONAMES FUNCTIONS ****************************
***************************************************************************** */
/**
* Fetch a property from geonames
* */
private String fetchFromGeoNames(String geoNameUri, String propertyToFetch) {
String result = null;
String geoNameId = geoNameUri.replaceAll("[^\\d]", "");
if (StringUtils.isNotBlank(geoNameId)) {
String cacheKey = propertyToFetch + '_' + geoNameId;
if (cache.containsKey(cacheKey)) {
result = cache.get(cacheKey);
} else {
String jsonResponse = fetchJsonFromGeoNames(geoNameId);
if(propertyToFetch.equals("STATE")){
result = fetchStateAbbrev(jsonResponse);
} else {
result = fetchValueFromJson(jsonResponse, propertyToFetch);
}
cache.put(cacheKey, result);
}
}
return result;
}
/**
* Queries GeoNames API for a given geonameId and return the JSON string
* */
private String fetchJsonFromGeoNames(String geoNameId) {
String result = null;
if (cache.containsKey("geoname_json_" + geoNameId)) {
return cache.get("geoname_json_" + geoNameId);
} else {
Client c = Client.create();
WebResource r = c.resource(geonamesApiUrl);
MultivaluedMap<String, String> params = new MultivaluedMapImpl();
params.add("geonameId", geoNameId);
params.add("username", apiUser);
result = r.queryParams(params).get(String.class);
cache.put("geoname_json_" + geoNameId, result);
}
return result;
}
/**
* It only fetches properties in the first level
* */
private String fetchValueFromJson(String jsonString, String propetyName) {
String result = null;
try {
ObjectMapper m = new ObjectMapper();
JsonNode rootNode = m.readTree(jsonString);
JsonNode nameNode = rootNode.path(propetyName);
if (nameNode != null)
result = nameNode.asText();
} catch (Exception e) {
}
return result;
}
private String fetchStateAbbrev(String jsonString){
String result = null;
try {
ObjectMapper m = new ObjectMapper();
JsonNode rootNode = m.readTree(jsonString);
JsonNode altNameNode = rootNode.path("alternateNames");
if (altNameNode != null && altNameNode.isArray()){
for(JsonNode node : altNameNode){
JsonNode type = node.path("lang");
if(type != null && "abbr".equalsIgnoreCase(type.asText())){
JsonNode state = node.path("name");
result = state.asText();
break;
}
}
}
} catch (Exception e) {
}
return result;
}
/*****************************************************************************
************************************* MAIN **********************************
***************************************************************************** */
public static void main(String[] args) {
TransformFundRefDataIntoCSV mergeData = new TransformFundRefDataIntoCSV();
CmdLineParser parser = new CmdLineParser(mergeData);
try {
parser.parseArgument(args);
mergeData.validateArgs(parser);
mergeData.process();
} catch (CmdLineException e) {
System.err.println(e.getMessage());
parser.printUsage(System.err);
}
}
}