package edu.unc.ils.mrc.hive.sync.lcsh;

import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import com.ibm.icu.text.SimpleDateFormat;

import edu.unc.ils.mrc.hive.api.SKOSScheme;
import edu.unc.ils.mrc.hive.api.impl.elmo.SKOSSchemeImpl;

/**
 * Synchronizes the LCSH vocabulary in HIVE using the id.loc.gov Atom feed.
 *
 * @author craig.willis@unc.edu
 */
public class AtomSynchronizer {

    private static final Log logger = LogFactory.getLog(AtomSynchronizer.class);

    /* Current feed page being processed */
    protected int currentPage = 1;

    /* SKOSScheme representation of the vocabulary */
    protected SKOSScheme scheme;

    /* URL for the Atom feed */
    protected String feedUrl;

    /* Maximum number of pages to process */
    protected static int MAX_PAGES = 4000;

    /* List of updated concept URIs */
    protected List<String> updatedEntries = new ArrayList<String>();

    /* List of deleted concept URIs */
    protected List<String> deletedEntries = new ArrayList<String>();

    /**
     * Constructs a synchronizer for the specified scheme.
     *
     * @param scheme
     */
    public AtomSynchronizer(SKOSScheme scheme) {
        this.scheme = scheme;
        this.feedUrl = scheme.getAtomFeedURL();
    }

    /**
     * Processes the feed from the date of the last update to now.
     */
    public void processUpdates() {
        logger.trace("processUpdates()");

        try {
            // Get the date this vocabulary was last updated. If the
            // last update date is empty, use the creation date from the
            // configuration file.
            Date lastUpdate = (scheme.getLastUpdateDate() != null)
                    ? scheme.getLastUpdateDate() : scheme.getCreationDate();
            processUpdates(lastUpdate, null);
        } catch (Exception e) {
            logger.error(e);
        }
    }

    /**
     * Processes the feed for the specified date range.
     *
     * @param startDate
     * @param endDate
     */
    public void processUpdates(Date startDate, Date endDate) {
        logger.trace("processUpdates " + startDate + "," + endDate);

        try {
            // Read the updates and deletes from the feed
            readFeed(startDate, endDate);

            // Process updates
            for (String uri : updatedEntries) {
                QName qname = new QName(uri, "#concept");
                logger.debug("Updating " + qname);
                scheme.importConcept(qname, uri + ".rdf");
            }
            logger.info("Updated " + updatedEntries.size() + " concepts");

            // Process deletes
            for (String uri : deletedEntries) {
                QName qname = new QName(uri, "#concept");
                logger.debug("Deleting " + qname);
                scheme.deleteConcept(qname);
            }
            logger.info("Deleted " + deletedEntries.size() + " concepts");
        } catch (Exception e) {
            logger.error(e);
        }
    }
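
    /*
     * For reference, readFeed() below expects Atom pages shaped roughly like
     * the following sketch. This is reconstructed from the parsing logic, not
     * copied from the live feed; the URIs and dates are illustrative only.
     *
     *   <feed>
     *     <entry>
     *       <updated>2011-04-07T12:00:00-04:00</updated>
     *       <link href="http://id.loc.gov/authorities/sh85101653"/>
     *     </entry>
     *     <deleted-entry>
     *       <updated>2011-04-07T13:00:00-04:00</updated>
     *       <id>info:lc/authorities/sh85101653#concept</id>
     *     </deleted-entry>
     *   </feed>
     */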

    /**
     * Reads the updates and deletes from the feed into the
     * updatedEntries and deletedEntries lists.
     *
     * @param startDate
     * @param endDate
     * @throws Exception
     */
    public void readFeed(Date startDate, Date endDate) throws Exception {
        logger.trace("readFeed " + startDate + "," + endDate);

        if (endDate == null)
            endDate = new Date();

        logger.debug("Using feed URL: " + feedUrl);

        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        DocumentBuilder db = dbf.newDocumentBuilder();

        boolean done = false;
        for (int x = 1; x < MAX_PAGES; x++) {
            if (done)
                break;

            URL url = new URL(feedUrl + "page/" + x);
            logger.debug("Reading updates from feed URL " + url);
            Document document = db.parse(url.openStream());
            XPath xpath = XPathFactory.newInstance().newXPath();

            // Get all of the updates
            String expression = "/feed/entry";
            NodeList nodes = (NodeList) xpath.evaluate(expression, document,
                    XPathConstants.NODESET);
            for (int i = 0; i < nodes.getLength(); i++) {
                Node node = nodes.item(i);

                Date updatedDate = null;
                String uri = null;
                NodeList children = node.getChildNodes();
                for (int j = 0; j < children.getLength(); j++) {
                    Node child = children.item(j);
                    String name = child.getNodeName();
                    if (name != null) {
                        if (name.equals("updated")) {
                            // ISO-8601 timestamp of the change
                            NodeList text = child.getChildNodes();
                            String date = text.item(0).getNodeValue();
                            if (date != null) {
                                updatedDate = javax.xml.bind.DatatypeConverter
                                        .parseDateTime(date).getTime();
                            }
                        } else if (name.equals("link")) {
                            // The link without a "type" attribute holds the
                            // concept URI.
                            NamedNodeMap attrMap = child.getAttributes();
                            Node href = attrMap.getNamedItem("href");
                            Node type = attrMap.getNamedItem("type");
                            if (type == null)
                                uri = href.getNodeValue();
                        }
                    }
                }

                if (updatedDate != null) {
                    if (startDate != null && endDate != null) {
                        if (updatedDate.after(startDate) && updatedDate.before(endDate)) {
                            updatedEntries.add(uri);
                        } else if (updatedDate.before(startDate)) {
                            // Entries are assumed to be newest-first, so an
                            // entry older than startDate means the remaining
                            // pages are out of range.
                            done = true;
                            break;
                        }
                    }
                }
            }

            // Get all of the deletions
            expression = "/feed/deleted-entry";
            NodeList deletedNodes = (NodeList) xpath.evaluate(expression, document,
                    XPathConstants.NODESET);
            for (int i = 0; i < deletedNodes.getLength(); i++) {
                Node node = deletedNodes.item(i);

                Date updatedDate = null;
                String uri = null;
                NodeList children = node.getChildNodes();
                for (int j = 0; j < children.getLength(); j++) {
                    Node child = children.item(j);
                    if (child.getNodeName().equals("updated")) {
                        NodeList text = child.getChildNodes();
                        String date = text.item(0).getNodeValue();
                        if (date != null)
                            updatedDate = javax.xml.bind.DatatypeConverter
                                    .parseDateTime(date).getTime();
                    } else if (child.getNodeName().equals("id")) {
                        // Deleted entries are identified by an "info:lc/" URI;
                        // map it back to the resolvable id.loc.gov form.
                        NodeList id = child.getChildNodes();
                        String entryId = id.item(0).getNodeValue();
                        uri = entryId.replace("info:lc/", "http://id.loc.gov/");
                    }
                }
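
                // As with the updates above, the assumed newest-first ordering
                // lets us stop paging once a deleted entry predates startDate.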
                if (updatedDate != null) {
                    if (startDate != null && endDate != null) {
                        if (updatedDate.after(startDate) && updatedDate.before(endDate)) {
                            deletedEntries.add(uri);
                        } else if (updatedDate.before(startDate)) {
                            done = true;
                            break;
                        }
                    }
                }
            }
        }
    }

    /**
     * Command-line entry point: runs a full synchronization for the
     * configured LCSH scheme.
     */
    public static void main(String[] args) throws Exception {
        String confPath = "/user/local/hive/conf/"; // args[0];
        SKOSScheme scheme = new SKOSSchemeImpl(confPath, "lcsh", true);

        AtomSynchronizer as = new AtomSynchronizer(scheme);
        //SimpleDateFormat df = new SimpleDateFormat("MM-dd-yyyy HH:mm:ss aa");
        //Date d1 = df.parse("04-07-2011 12:00:00 PM");
        //Date d2 = df.parse("04-08-2011 09:00:00 AM");
        //as.processUpdates(d1, d2);
        as.processUpdates();
        scheme.close();
    }
}
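
/*
 * A minimal driver sketch for a bounded, date-range sync, following the
 * commented-out example in main() above. The class name, configuration path,
 * and date window here are illustrative assumptions, not part of HIVE.
 */
class AtomSynchronizerRangeExample {

    public static void main(String[] args) throws Exception {
        // Hypothetical configuration path; substitute your HIVE conf directory.
        SKOSScheme scheme = new SKOSSchemeImpl("/usr/local/hive/conf/", "lcsh", true);
        AtomSynchronizer sync = new AtomSynchronizer(scheme);

        // Parse an explicit window; "hh" (1-12) pairs with the AM/PM marker.
        SimpleDateFormat df = new SimpleDateFormat("MM-dd-yyyy hh:mm:ss aa");
        Date start = df.parse("04-07-2011 12:00:00 PM");
        Date end = df.parse("04-08-2011 09:00:00 AM");

        // Apply only the updates and deletes published within the window.
        sync.processUpdates(start, end);
        scheme.close();
    }
}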