/* * LCNameAuthority.java * * Version: $Revision: 3705 $ * * Date: $Date: 2009-04-11 13:02:24 -0400 (Sat, 11 Apr 2009) $ * * Copyright (c) 2002-2009, The DSpace Foundation. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * - Neither the name of the DSpace Foundation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ package org.dspace.content.authority; import java.io.IOException; import java.io.FileNotFoundException; import java.io.File; import java.io.FileInputStream; import java.io.InputStream; import java.io.InputStreamReader; import java.io.FileReader; import java.io.BufferedReader; import java.util.Enumeration; import java.util.List; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; import javax.xml.parsers.SAXParserFactory; import javax.xml.parsers.SAXParser; import javax.xml.parsers.ParserConfigurationException; import org.xml.sax.XMLReader; import org.xml.sax.InputSource; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.SAXNotRecognizedException; import org.xml.sax.SAXParseException; import org.apache.log4j.Logger; import org.dspace.core.ConfigurationManager; import org.dspace.content.DCPersonName; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.util.EncodingUtil; import org.apache.commons.httpclient.NameValuePair; import org.apache.commons.httpclient.HttpException; /** * Sample personal name authority based on Library of Congress Name Authority * Also serves as an example of an SRU client as authority. * * This is tuned for the data in the LC Name Authority test instance, see * http://alcme.oclc.org/srw/search/lcnaf * * WARNING: This is just a proof-of-concept implementation. It would need * WARNING: lots of refinement to be used in production, because it is very * WARNING: sloppy about digging through the MARC/XML results. No doubt * WARNING: it is losing a lot of valid results and information. * WARNING: Could also do a better job including more info (title, life dates * WARNING: etc) in the label instead of just the name. * * Reads these DSpace Config properties: * * lcname.url = http://alcme.oclc.org/srw/search/lcnaf * * TODO: make # of results to ask for (and return) configurable. * * @author Larry Stone * @version $Revision $ */ public class LCNameAuthority implements ChoiceAuthority { private static Logger log = Logger.getLogger(LCNameAuthority.class); // get these from configuration private static String url = null; // NS URI for SRU respones private static final String NS_SRU = "http://www.loc.gov/zing/srw/"; // NS URI for MARC/XML private static final String NS_MX = "http://www.loc.gov/MARC21/slim"; // constructor does static init too.. public LCNameAuthority() { if (url == null) { url = ConfigurationManager.getProperty("lcname.url"); // sanity check if (url == null) throw new IllegalStateException("Missing DSpace configuration keys for LCName Query"); } } // punt! this is a poor implementation.. public Choices getBestMatch(String text, int collection, String locale) { return getMatches(text, collection, 0, 2, locale); } /** * Match a proposed value against name authority records * Value is assumed to be in "Lastname, Firstname" format. */ public Choices getMatches(String text, int collection, int start, int limit, String locale) { boolean error = false; Choices result = queryPerson(text, start, limit); if (result == null) result = new Choices(true); return result; } // punt; supposed to get the canonical display form of a metadata authority key // XXX FIXME implement this with a query on the authority key, cache results public String getLabel(String key, String locale) { return key; } /** * Guts of the implementation, returns a complete Choices result, or * null for a failure. */ private Choices queryPerson(String text, int start, int limit) { // punt if there is no query text if (text == null || text.trim().length() == 0) return new Choices(true); // 1. build CQL query DCPersonName pn = new DCPersonName(text); StringBuilder query = new StringBuilder(); query.append("local.FirstName = \"").append(pn.getFirstNames()). append("\" and local.FamilyName = \"").append(pn.getLastName()). append("\""); // XXX arbitrary default limit - should be configurable? if (limit == 0) limit = 50; NameValuePair args[] = new NameValuePair[6]; args[0] = new NameValuePair("operation", "searchRetrieve"); args[1] = new NameValuePair("version", "1.1"); args[2] = new NameValuePair("recordSchema", "info:srw/schema/1/marcxml-v1.1"); args[3] = new NameValuePair("query", query.toString()); args[4] = new NameValuePair("maximumRecords", String.valueOf(limit)); args[5] = new NameValuePair("startRecord", String.valueOf(start+1)); HttpClient hc = new HttpClient(); String srUrl = url + "?" + EncodingUtil.formUrlEncode(args, "UTF8"); GetMethod get = new GetMethod(srUrl); log.debug("Trying SRU query, URL="+srUrl); // 2. web request try { int status = hc.executeMethod(get); if (status == 200) { SAXParserFactory spf = SAXParserFactory.newInstance(); SAXParser sp = spf.newSAXParser(); XMLReader xr = sp.getXMLReader(); SRUHandler handler = new SRUHandler(); // XXX FIXME: should turn off validation here explicitly, but // it seems to be off by default. xr.setFeature("http://xml.org/sax/features/namespaces", true); xr.setContentHandler(handler); xr.setErrorHandler(handler); xr.parse(new InputSource(get.getResponseBodyAsStream())); // this probably just means more results available.. if (handler.hits != handler.result.size()) log.warn("Discrepency in results, result.length="+handler.result.size()+ ", yet expected results="+handler.hits); boolean more = handler.hits > (start + handler.result.size()); // XXX add non-auth option; perhaps the UI should do this? // XXX it's really a policy matter if they allow unauth result. // XXX good, stop it. // handler.result.add(new Choice("", text, "Non-Authority: \""+text+"\"")); int confidence; if (handler.hits == 0) confidence = Choices.CF_NOTFOUND; else if (handler.hits == 1) confidence = Choices.CF_UNCERTAIN; else confidence = Choices.CF_AMBIGUOUS; return new Choices(handler.result.toArray(new Choice[handler.result.size()]), start, handler.hits, confidence, more); } } catch (HttpException e) { log.error("SRU query failed: ", e); return new Choices(true); } catch (IOException e) { log.error("SRU query failed: ", e); return new Choices(true); } catch (ParserConfigurationException e) { log.warn("Failed parsing SRU result: ", e); return new Choices(true); } catch (SAXException e) { log.warn("Failed parsing SRU result: ", e); return new Choices(true); } finally { get.releaseConnection(); } return new Choices(true); } /** * XXX FIXME TODO: Very sloppy MARC/XML parser. * This only reads subfields 010.a (for LCCN, to use as key) * and 100.a (for "established personal name") * Maybe look at Indicator on 100 too. * Should probably read other 100 subfields to build a more detailed label. */ private static class SRUHandler extends DefaultHandler { private List<Choice> result = new ArrayList<Choice>(); private int hits = -1; private String textValue = null; private String name = null; private String oname = null; private String bname = null; private String lccn = null; private String lastTag = null; private String lastCode = null; // NOTE: text value MAY be presented in multiple calls, even if // it all one word, so be ready to splice it together. // BEWARE: subclass's startElement method should call super() // to null out 'value'. (Don't you miss the method combination // options of a real object system like CLOS?) public void characters(char[] ch, int start, int length) throws SAXException { String newValue = new String(ch, start, length); if (newValue.length() > 0) { if (textValue == null) textValue = newValue; else textValue += newValue; } } public void endElement(String namespaceURI, String localName, String qName) throws SAXException { if (localName.equals("numberOfRecords") && namespaceURI.equals(NS_SRU)) { hits = Integer.parseInt(textValue.trim()); if (hits > 0) { name = null; lccn = null; log.debug("Expecting "+hits+" records in results."); } } // after record get next hit ready else if (localName.equals("record") && namespaceURI.equals(NS_SRU)) { if (name != null && lccn != null) { // HACK: many LC name entries end with ',' ...trim it. if (name.endsWith(",")) name = name.substring(0, name.length()-1); // XXX DEBUG // log.debug("Got result, name="+name+", lccn="+lccn); result.add(new Choice(lccn, name, name)); } else log.warn("Got anomalous result, at least one of these null: lccn="+lccn+", name="+name); name = null; lccn = null; } else if (localName.equals("subfield") && namespaceURI.equals(NS_MX)) { if (lastTag != null && lastCode != null) { // 010.a is lccn, "authority code" if (lastTag.equals("010") && lastCode.equals("a")) lccn = textValue; // 100.a is the personal name else if (lastTag.equals("100") && lastCode.equals("a")) name = textValue; if (lastTag.equals("100") && lastCode.equals("d") && (name != null)) name = name+" "+textValue; } } } // subclass overriding this MUST call it with super() public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { textValue = null; if (localName.equals("datafield") && namespaceURI.equals(NS_MX)) { lastTag = atts.getValue("tag"); if (lastTag == null) log.warn("MARC datafield without tag attribute!"); } else if (localName.equals("subfield") && namespaceURI.equals(NS_MX)) { lastCode = atts.getValue("code"); if (lastCode == null) log.warn("MARC subfield without code attribute!"); } } public void error(SAXParseException exception) throws SAXException { throw new SAXException(exception); } public void fatalError(SAXParseException exception) throws SAXException { throw new SAXException(exception); } } }