/* The contents of this file are subject to the license and copyright terms * detailed in the license directory at the root of the source tree (also * available online at http://fedora-commons.org/license/). */ package fedora.server.storage.translation; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.text.ParseException; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.apache.log4j.Logger; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; import fedora.common.Constants; import fedora.common.xml.format.XMLFormat; import fedora.common.xml.namespace.XMLNamespace; import fedora.server.errors.ObjectIntegrityException; import fedora.server.errors.StreamIOException; import fedora.server.errors.ValidationException; import fedora.server.storage.types.AuditRecord; import fedora.server.storage.types.DSBinding; import fedora.server.storage.types.DSBindingMap; import fedora.server.storage.types.Datastream; import fedora.server.storage.types.DatastreamManagedContent; import fedora.server.storage.types.DatastreamReferencedContent; import fedora.server.storage.types.DatastreamXMLMetadata; import fedora.server.storage.types.DigitalObject; import fedora.server.storage.types.Disseminator; import fedora.server.utilities.DateUtility; import fedora.server.utilities.StreamUtility; import fedora.server.validation.ValidationUtility; import fedora.utilities.Base64; /** * Deserializes objects in the constructor-provided version of the METS Fedora * Extension format. * * @author Sandy Payette * @author Chris Wilper */ @SuppressWarnings("deprecation") public class METSFedoraExtDODeserializer extends DefaultHandler implements Constants, DODeserializer { /** * The format this deserializer will read if unspecified at construction. * This defaults to the latest FOXML format. */ public static final XMLFormat DEFAULT_FORMAT = METS_EXT1_1; /** Logger for this class. */ private static final Logger LOG = Logger.getLogger(METSFedoraExtDODeserializer.class.getName()); /** The format this deserializer reads. */ private final XMLFormat m_format; /** The xlink namespace this deserializer understands; depends on format. */ private final XMLNamespace m_xlink; /** The current translation context. */ private int m_transContext; /** The object to deserialize to. */ private DigitalObject m_obj; /** Buffer to build RDF expression of ADMID and DMDID relationships * */ private StringBuffer m_relsBuffer; private boolean hasRels = false; /** Hashtables to record DMDID references */ private HashMap<String, List<String>> m_dsDMDIDs; // key=dsVersionID, value=ArrayList of dsID /** Hashtables to record ADMID references */ private HashMap<String, List<String>> m_dsADMIDs; // key=dsVersionID, value=ArrayList of dsID /** Hashtables to correlate audit record ids to datastreams */ private HashMap<String, String> m_AuditIdToComponentId; private SAXParser m_parser; private String m_characterEncoding; /** Namespace prefix-to-URI mapping info from SAX2 startPrefixMapping events. */ private HashMap<String, String> m_prefixMap; private HashMap<String, String> m_localPrefixMap; private ArrayList<String> m_prefixList; /** Variables to parse into */ private boolean m_rootElementFound; private String m_agentRole; private String m_dsId; private String m_dsVersId; private Date m_dsCreateDate; private String m_dissemId; private String m_dissemState; private String m_dsState; private String m_dsInfoType; private String m_dsOtherInfoType; private String m_dsLabel; private int m_dsMDClass; private long m_dsSize; private String m_dsLocation; private String m_dsLocationType; private String m_dsMimeType; private String m_dsControlGrp; private boolean m_dsVersionable; private String m_dsFormatURI; private String[] m_dsAltIDs; private String m_dsChecksum; private String m_dsChecksumType; private StringBuffer m_dsXMLBuffer; // are we reading binary in an FContent element? (base64-encoded) private boolean m_readingContent; // indicates reading element content private boolean m_readingBinaryContent; // indicates reading binary element content private File m_binaryContentTempFile; private StringBuffer m_elementContent; // single element /** While parsing, are we inside XML metadata? */ private boolean m_inXMLMetadata; /** * Used to differentiate between a metadata section in this object and a * metadata section in an inline XML datastream that happens to be a METS * document. */ private int m_xmlDataLevel; /** String buffer for audit element contents */ private StringBuffer m_auditBuffer; private String m_auditId; private String m_auditProcessType; private String m_auditAction; private String m_auditComponentID; private String m_auditResponsibility; private String m_auditDate; private String m_auditJustification; /** * Hashmap for holding disseminators during parsing, keyed by structMapId */ private HashMap<String, Disseminator> m_dissems; /** * Currently-being-initialized disseminator, during structmap parsing. */ private Disseminator m_diss; /** * Whether, while in structmap, we've already seen a div */ private boolean m_indiv; /** The structMapId of the dissem currently being parsed. */ private String m_structId; /** * Creates a deserializer that reads the default Fedora METS Extension * format. */ public METSFedoraExtDODeserializer() { this(DEFAULT_FORMAT); } /** * Creates a deserializer that reads the given Fedora METS Extension format. * * @param format * the version-specific Fedora METS Extension format. * @throws IllegalArgumentException * if format is not a known Fedora METS Extension format. */ public METSFedoraExtDODeserializer(XMLFormat format) { if (format.equals(METS_EXT1_0)) { m_xlink = OLD_XLINK; } else if (format.equals(METS_EXT1_1)) { m_xlink = XLINK; } else { throw new IllegalArgumentException("Not a METSFedoraExt format: " + format.uri); } m_format = format; } //--- // DODeserializer implementation //--- /** * {@inheritDoc} */ public DODeserializer getInstance() { return new METSFedoraExtDODeserializer(m_format); } /** * {@inheritDoc} */ public void deserialize(InputStream in, DigitalObject obj, String encoding, int transContext) throws ObjectIntegrityException, StreamIOException, UnsupportedEncodingException { LOG.debug("Deserializing " + m_format.uri + " for transContext: " + transContext); // initialize sax for this parse try { SAXParserFactory spf = SAXParserFactory.newInstance(); spf.setValidating(false); spf.setNamespaceAware(true); m_parser = spf.newSAXParser(); } catch (Exception e) { throw new RuntimeException("Error initializing SAX parser", e); } m_obj = obj; m_obj.setOwnerId(""); m_obj.setLabel(""); m_characterEncoding = encoding; m_transContext = transContext; initialize(); try { m_parser.parse(in, this); } catch (IOException ioe) { throw new StreamIOException("Low-level stream IO problem occurred " + "while SAX parsing this object."); } catch (SAXException se) { throw new ObjectIntegrityException("METS stream was bad : " + se.getMessage()); } if (!m_rootElementFound) { throw new ObjectIntegrityException("METS root element not found"); } // POST-PROCESSING... // convert audit records to contain component ids convertAudits(); // preserve ADMID and DMDID relationships in a RELS-INT // datastream, if one does not already exist. createRelsInt(); DOTranslationUtility.normalizeDatastreams(m_obj, m_transContext, m_characterEncoding); if (m_format.equals(METS_EXT1_0)) { // DISSEMINATORS... put disseminators in the instantiated digital // object Iterator<Disseminator> dissemIter = m_dissems.values().iterator(); while (dissemIter.hasNext()) { Disseminator diss = dissemIter.next(); m_obj.disseminators(diss.dissID).add(diss); } } } //--- // DefaultHandler overrides //--- /** * {@inheritDoc} */ @Override public void startPrefixMapping(String prefix, String uri) { // Keep the prefix map up-to-date throughout the entire parse, // and maintain a list of newly mapped prefixes on a per-element basis. m_prefixMap.put(prefix, uri); if (m_inXMLMetadata) { m_localPrefixMap.put(prefix, uri); m_prefixList.add(prefix); } } /** * {@inheritDoc} */ @Override public void endPrefixMapping(String prefix) { m_prefixMap.remove(prefix); if (m_inXMLMetadata) { m_localPrefixMap.remove(prefix); } } /** * {@inheritDoc} */ @Override public void startElement(String uri, String localName, String qName, Attributes a) throws SAXException { if (uri.equals(METS.uri) && !m_inXMLMetadata) { // a new mets element is starting if (localName.equals("mets")) { m_rootElementFound = true; m_obj.setPid(grab(a, METS.uri, "OBJID")); m_obj.setLabel(grab(a, METS.uri, "LABEL")); if (m_format.equals(METS_EXT1_0)) { // In METS_EXT 1.0, the PROFILE attribute mapped to an // object property, fedora-model:contentModel. This will be // retained as an extended property in the DigitalObject. m_obj.setExtProperty(MODEL.CONTENT_MODEL.uri, grab(a, METS.uri, "PROFILE")); // Similarly, the TYPE attribute mapped to rdf:type, and // will also be retained as an external property. m_obj.setExtProperty(RDF.TYPE.uri, grab(a, METS.uri, "TYPE")); } } else if (localName.equals("metsHdr")) { m_obj.setCreateDate(DateUtility .convertStringToDate(grab(a, METS.uri, "CREATEDATE"))); m_obj.setLastModDate(DateUtility .convertStringToDate(grab(a, METS.uri, "LASTMODDATE"))); try { m_obj.setState(DOTranslationUtility .readStateAttribute(grab(a, METS.uri, "RECORDSTATUS"))); } catch (ParseException e) { throw new SAXException("Could not read object state", e); } } else if (localName.equals("agent")) { m_agentRole = grab(a, METS.uri, "ROLE"); } else if (localName.equals("name") && m_agentRole.equals("IPOWNER")) { m_readingContent = true; m_elementContent = new StringBuffer(); } else if (localName.equals("amdSec")) { m_dsId = grab(a, METS.uri, "ID"); m_dsState = grab(a, METS.uri, "STATUS"); String dsVersionable = grab(a, METS.uri, "VERSIONABLE"); if (dsVersionable != null && !dsVersionable.equals("")) { m_dsVersionable = new Boolean(grab(a, METS.uri, "VERSIONABLE")) .booleanValue(); } else { m_dsVersionable = true; } } else if (localName.equals("dmdSecFedora")) { m_dsId = grab(a, METS.uri, "ID"); m_dsState = grab(a, METS.uri, "STATUS"); String dsVersionable = grab(a, METS.uri, "VERSIONABLE"); if (dsVersionable != null && !dsVersionable.equals("")) { m_dsVersionable = new Boolean(grab(a, METS.uri, "VERSIONABLE")) .booleanValue(); } else { m_dsVersionable = true; } } else if (localName.equals("techMD") || localName.equals("descMD") || localName.equals("sourceMD") || localName.equals("rightsMD") || localName.equals("digiprovMD")) { m_dsVersId = grab(a, METS.uri, "ID"); if (localName.equals("techMD")) { m_dsMDClass = DatastreamXMLMetadata.TECHNICAL; } if (localName.equals("sourceMD")) { m_dsMDClass = DatastreamXMLMetadata.SOURCE; } if (localName.equals("rightsMD")) { m_dsMDClass = DatastreamXMLMetadata.RIGHTS; } if (localName.equals("digiprovMD")) { m_dsMDClass = DatastreamXMLMetadata.DIGIPROV; } if (localName.equals("descMD")) { m_dsMDClass = DatastreamXMLMetadata.DESCRIPTIVE; } String dateString = grab(a, METS.uri, "CREATED"); if (dateString != null && !dateString.equals("")) { m_dsCreateDate = DateUtility.convertStringToDate(dateString); } } else if (localName.equals("mdWrap")) { m_dsInfoType = grab(a, METS.uri, "MDTYPE"); m_dsOtherInfoType = grab(a, METS.uri, "OTHERMDTYPE"); m_dsLabel = grab(a, METS.uri, "LABEL"); m_dsMimeType = grab(a, METS.uri, "MIMETYPE"); m_dsFormatURI = grab(a, METS.uri, "FORMAT_URI"); String altIDs = grab(a, METS.uri, "ALT_IDS"); if (altIDs.length() == 0) { m_dsAltIDs = new String[0]; } else { m_dsAltIDs = altIDs.split(" "); } m_dsChecksum = grab(a, METS.uri, "CHECKSUM"); m_dsChecksumType = grab(a, METS.uri, "CHECKSUMTYPE"); } else if (localName.equals("xmlData")) { m_dsXMLBuffer = new StringBuffer(); m_xmlDataLevel = 0; m_inXMLMetadata = true; } else if (localName.equals("fileGrp")) { m_dsId = grab(a, METS.uri, "ID"); String dsVersionable = grab(a, METS.uri, "VERSIONABLE"); if (dsVersionable != null && !dsVersionable.equals("")) { m_dsVersionable = new Boolean(grab(a, METS.uri, "VERSIONABLE")) .booleanValue(); } else { m_dsVersionable = true; } // reset the values for the next file m_dsVersId = ""; m_dsCreateDate = null; m_dsMimeType = ""; m_dsControlGrp = ""; m_dsFormatURI = ""; m_dsAltIDs = new String[0]; m_dsState = grab(a, METS.uri, "STATUS"); m_dsSize = -1; m_dsChecksum = ""; m_dsChecksumType = ""; } else if (localName.equals("file")) { m_dsVersId = grab(a, METS.uri, "ID"); String dateString = grab(a, METS.uri, "CREATED"); if (dateString != null && !dateString.equals("")) { m_dsCreateDate = DateUtility.convertStringToDate(dateString); } m_dsMimeType = grab(a, METS.uri, "MIMETYPE"); m_dsControlGrp = grab(a, METS.uri, "OWNERID"); String ADMID = grab(a, METS.uri, "ADMID"); if (ADMID != null && !"".equals(ADMID)) { ArrayList<String> al = new ArrayList<String>(); if (ADMID.indexOf(" ") != -1) { String[] admIds = ADMID.split(" "); for (String element : admIds) { al.add(element); } } else { al.add(ADMID); } m_dsADMIDs.put(m_dsVersId, al); } String DMDID = grab(a, METS.uri, "DMDID"); if (DMDID != null && !"".equals(DMDID)) { ArrayList<String> al = new ArrayList<String>(); if (DMDID.indexOf(" ") != -1) { String[] dmdIds = DMDID.split(" "); for (String element : dmdIds) { al.add(element); } } else { al.add(DMDID); } m_dsDMDIDs.put(m_dsVersId, al); } String sizeString = grab(a, METS.uri, "SIZE"); if (sizeString != null && !sizeString.equals("")) { try { m_dsSize = Long.parseLong(sizeString); } catch (NumberFormatException nfe) { throw new SAXException("If specified, a datastream's " + "SIZE attribute must be an xsd:long."); } } String formatURI = grab(a, METS.uri, "FORMAT_URI"); if (formatURI != null && !formatURI.equals("")) { m_dsFormatURI = formatURI; } String altIDs = grab(a, METS.uri, "ALT_IDS"); if (altIDs.length() == 0) { m_dsAltIDs = new String[0]; } else { m_dsAltIDs = altIDs.split(" "); } m_dsChecksum = grab(a, METS.uri, "CHECKSUM"); m_dsChecksumType = grab(a, METS.uri, "CHECKSUMTYPE"); // inside a "file" element, it's either going to be // FLocat (a reference) or FContent (inline) } else if (localName.equals("FLocat")) { m_dsLabel = grab(a, m_xlink.uri, "title"); String dsLocation = grab(a, m_xlink.uri, "href"); if (dsLocation == null || dsLocation.equals("")) { throw new SAXException("xlink:href must be specified in FLocat element"); } if (m_dsControlGrp.equalsIgnoreCase("E") || m_dsControlGrp.equalsIgnoreCase("R")) { // URL FORMAT VALIDATION for dsLocation: // make sure we have a properly formed URL (must have protocol) try { ValidationUtility.validateURL(dsLocation, m_dsControlGrp); } catch (ValidationException ve) { throw new SAXException(ve.getMessage()); } // system will set dsLocationType for E and R datastreams... m_dsLocationType = "URL"; m_dsInfoType = "DATA"; m_dsLocation = dsLocation; instantiateDatastream(new DatastreamReferencedContent()); } else if (m_dsControlGrp.equalsIgnoreCase("M")) { // URL FORMAT VALIDATION for dsLocation: // For Managed Content the URL is only checked when we are parsing a // a NEW ingest file because the URL is replaced with an internal identifier // once the repository has sucked in the content for storage. if (m_obj.isNew()) { try { ValidationUtility.validateURL(dsLocation, m_dsControlGrp); } catch (ValidationException ve) { throw new SAXException(ve.getMessage()); } } m_dsLocationType = "INTERNAL_ID"; m_dsInfoType = "DATA"; m_dsLocation = dsLocation; instantiateDatastream(new DatastreamManagedContent()); } } else if (localName.equals("FContent")) { // In METS_EXT, the FContent element contains base64-encoded // data. m_readingContent = true; m_elementContent = new StringBuffer(); if (m_dsControlGrp.equalsIgnoreCase("M")) { m_readingBinaryContent = true; m_binaryContentTempFile = null; try { m_binaryContentTempFile = File.createTempFile("binary-datastream", null); } catch (IOException ioe) { throw new SAXException(new StreamIOException("Unable to create temporary file for binary content")); } } } else if (m_format.equals(METS_EXT1_0)) { startDisseminators(localName, a); } } else { if (m_inXMLMetadata) { // must be in xmlData... just output it, remembering the number // of METS:xmlData elements we see appendElementStart(uri, localName, qName, a, m_dsXMLBuffer); // METS INSIDE METS! we have an inline XML datastream // that is itself METS. We do not want to parse this! if (uri.equals(METS.uri) && localName.equals("xmlData")) { m_xmlDataLevel++; } // remember this stuff... (we don't have to look at level // because the audit schema doesn't allow for xml elements inside // these, so they're never set incorrectly) // signaling that we're interested in sending char data to // the m_auditBuffer by making it non-null, and getting // ready to accept data by allocating a new StringBuffer if (m_dsId.equals("FEDORA-AUDITTRAIL") || m_dsId.equals("AUDIT")) { if (localName.equals("record")) { m_auditId = grab(a, uri, "ID"); } else if (localName.equals("process")) { m_auditProcessType = grab(a, uri, "type"); } else if (localName.equals("action") || localName.equals("componentID") || localName.equals("responsibility") || localName.equals("date") || localName.equals("justification")) { m_auditBuffer = new StringBuffer(); } } } else { // ignore all else } } } /** * {@inheritDoc} */ @Override public void characters(char[] ch, int start, int length) { if (m_inXMLMetadata) { if (m_auditBuffer != null) { m_auditBuffer.append(ch, start, length); } else { // since this data is encoded straight back to xml, // we need to make sure special characters &, <, >, ", and ' // are re-converted to the xml-acceptable equivalents. StreamUtility.enc(ch, start, length, m_dsXMLBuffer); } } else if (m_readingContent) { // read normal element content into a string buffer if (m_elementContent != null) { m_elementContent.append(ch, start, length); } } } /** * {@inheritDoc} */ @Override public void endElement(String uri, String localName, String qName) throws SAXException { // first, deal with the situation when we are processing a block of inline XML if (m_inXMLMetadata) { if (uri.equals(METS.uri) && localName.equals("xmlData") && m_xmlDataLevel == 0) { // finished all xml metadata for this datastream if (m_dsId.equals("FEDORA-AUDITTRAIL") || m_dsId.equals("AUDIT")) { // we've been looking at an audit trail... set audit record AuditRecord a = new AuditRecord(); // In METS each audit record is in its own <digiprovMD> // element within an <amdSec>. So, pick up the XML ID // of the <digiprovMD> element for the audit record id. // This amdSec is treated like a datastream, and each // digiprovMD is a version, so id was parsed into dsVersId. a.id = m_auditId; //m_dsVersId; a.processType = m_auditProcessType; a.action = m_auditAction; a.componentID = m_auditComponentID; a.responsibility = m_auditResponsibility; a.date = DateUtility.convertStringToDate(m_auditDate); a.justification = m_auditJustification; m_obj.getAuditRecords().add(a); m_inXMLMetadata = false; // other stuff is re-initted upon // startElement for next xml metadata // element } else { // Create the right kind of datastream and add to the object DatastreamXMLMetadata ds = new DatastreamXMLMetadata(); instantiateXMLDatastream(ds); m_inXMLMetadata = false; m_localPrefixMap.clear(); } } else { // finished an element within inline xml metadata m_dsXMLBuffer.append("</" + qName + ">"); // make sure we know when to pay attention to METS again if (uri.equals(METS.uri) && localName.equals("xmlData")) { m_xmlDataLevel--; } if (m_dsId.equals("FEDORA-AUDITTRAIL") || m_dsId.equals("AUDIT")) { if (localName.equals("action")) { m_auditAction = m_auditBuffer.toString(); m_auditBuffer = null; } else if (localName.equals("componentID")) { m_auditComponentID = m_auditBuffer.toString(); m_auditBuffer = null; } else if (localName.equals("responsibility")) { m_auditResponsibility = m_auditBuffer.toString(); m_auditBuffer = null; } else if (localName.equals("date")) { m_auditDate = m_auditBuffer.toString(); m_auditBuffer = null; } else if (localName.equals("justification")) { m_auditJustification = m_auditBuffer.toString(); m_auditBuffer = null; } } } // ALL OTHER ELEMENT CASES: we are NOT processing a block of inline XML metadata } else { if (m_readingBinaryContent) { // In the version of METS Fedora uses, FContent assumes base64-encoded content if (uri.equals(METS.uri) && localName.equals("FContent")) { if (m_binaryContentTempFile != null) { try { FileOutputStream os = new FileOutputStream(m_binaryContentTempFile); // remove all spaces and newlines, this might not be necessary. String elementStr = m_elementContent.toString() .replaceAll("\\s", ""); byte elementBytes[] = Base64.decode(elementStr); os.write(elementBytes); os.close(); m_dsLocationType = "INTERNAL_ID"; m_dsLocation = DatastreamManagedContent.TEMP_SCHEME + m_binaryContentTempFile .getAbsolutePath(); instantiateDatastream(new DatastreamManagedContent()); } catch (FileNotFoundException fnfe) { throw new SAXException(new StreamIOException("Unable to open temporary file created for binary content")); } catch (IOException fnfe) { throw new SAXException(new StreamIOException("Error writing to temporary file created for binary content")); } } } m_binaryContentTempFile = null; m_readingBinaryContent = false; m_elementContent = null; // all other cases... } else { if (m_readingContent) { // elements for which we were reading regular content if (uri.equals(METS.uri) && localName.equals("name") && m_agentRole.equals("IPOWNER")) { m_obj.setOwnerId(m_elementContent.toString()); } else if (uri.equals(METS.uri) && localName.equals("agent")) { m_agentRole = null; } m_readingContent = false; m_elementContent = null; } else { // no other processing requirements at this time } } } } //--- // Instance helpers //--- private void startDisseminators(String localName, Attributes a) throws SAXException { if (localName.equals("structMap")) { // this is a component of a disseminator. here we assume the rest // of the disseminator's information will be seen later, so we // construct a new Disseminator object to hold the structMap... // and later, the other info // // Building up a global map of Disseminators, m_dissems, // keyed by bindingmap ID. // if (grab(a, METS.uri, "TYPE").equals("fedora:dsBindingMap")) { String bmId = grab(a, METS.uri, "ID"); if (bmId == null || bmId.equals("")) { throw new SAXException("structMap with TYPE " + "fedora:dsBindingMap must specify a non-empty " + "ID attribute."); } else { Disseminator diss = new Disseminator(); diss.dsBindMapID = bmId; m_dissems.put(bmId, diss); m_diss = diss; m_diss.dsBindMap = new DSBindingMap(); m_diss.dsBindMap.dsBindMapID = bmId; m_indiv = false; // flag we're not looking at inner part yet } } else { throw new SAXException("StructMap must have TYPE fedora:dsBindingMap"); } } else if (localName.equals("div")) { if (m_indiv) { // inner part of structmap DSBinding binding = new DSBinding(); if (m_diss.dsBindMap.dsBindings == null) { // none yet.. create array of size one DSBinding[] bindings = new DSBinding[1]; m_diss.dsBindMap.dsBindings = bindings; m_diss.dsBindMap.dsBindings[0] = binding; } else { // need to expand the array size by one, // and do an array copy. int curSize = m_diss.dsBindMap.dsBindings.length; DSBinding[] oldArray = m_diss.dsBindMap.dsBindings; DSBinding[] newArray = new DSBinding[curSize + 1]; for (int i = 0; i < curSize; i++) { newArray[i] = oldArray[i]; } newArray[curSize] = binding; m_diss.dsBindMap.dsBindings = newArray; } // now populate 'binding' values...we'll have // everything at this point except datastreamID... // that comes as a child: <fptr FILEID="DS2"/> binding.bindKeyName = grab(a, METS.uri, "TYPE"); binding.bindLabel = grab(a, METS.uri, "LABEL"); binding.seqNo = grab(a, METS.uri, "ORDER"); } else { m_indiv = true; // first (outer div) part of structmap m_diss.dsBindMap.dsBindMechanismPID = grab(a, METS.uri, "TYPE"); m_diss.dsBindMap.dsBindMapLabel = grab(a, METS.uri, "LABEL"); } } else if (localName.equals("fptr")) { // assume we're inside the inner div... that's the // only place the fptr element is valid. DSBinding binding = m_diss.dsBindMap.dsBindings[m_diss.dsBindMap.dsBindings.length - 1]; binding.datastreamID = grab(a, METS.uri, "FILEID"); } else if (localName.equals("behaviorSec")) { // looks like we're in a disseminator... it should be in the // hash by now because we've already gone through structmaps // ...keyed by structmap id... remember the id (group id) // so we can put it in when parsing serviceBinding m_dissemId = grab(a, METS.uri, "ID"); m_dissemState = grab(a, METS.uri, "STATUS"); } else if (localName.equals("serviceBinding")) { // remember the structId so we can grab the right dissem // when parsing children m_structId = grab(a, METS.uri, "STRUCTID"); // grab the disseminator associated with the provided structId Disseminator dissem = m_dissems.get(m_structId); // plug known items in.. dissem.dissID = m_dissemId; dissem.dissState = m_dissemState; // then grab the new stuff for the dissem for this element, and // put it in. dissem.dissVersionID = grab(a, METS.uri, "ID"); dissem.bDefID = grab(a, METS.uri, "BTYPE"); dissem.dissCreateDT = DateUtility .convertStringToDate(grab(a, METS.uri, "CREATED")); dissem.dissLabel = grab(a, METS.uri, "LABEL"); } else if (localName.equals("interfaceMD")) { Disseminator dissem = m_dissems.get(m_structId); } else if (localName.equals("serviceBindMD")) { Disseminator dissem = m_dissems.get(m_structId); dissem.sDepID = grab(a, m_xlink.uri, "href"); } } private void appendElementStart(String uri, String localName, String qName, Attributes a, StringBuffer out) { out.append("<" + qName); // add the current qName's namespace to m_localPrefixMap // and m_prefixList if it's not already in m_localPrefixMap // This ensures that all namespaces used in inline XML are declared within, // since it's supposed to be a standalone chunk. String[] parts = qName.split(":"); if (parts.length == 2) { String nsuri = m_localPrefixMap.get(parts[0]); if (nsuri == null) { m_localPrefixMap.put(parts[0], parts[1]); m_prefixList.add(parts[0]); } } // do we have any newly-mapped namespaces? while (m_prefixList.size() > 0) { String prefix = m_prefixList.remove(0); out.append(" xmlns"); if (prefix.length() > 0) { out.append(":"); } out.append(prefix + "=\"" + StreamUtility.enc(m_prefixMap.get(prefix)) + "\""); } for (int i = 0; i < a.getLength(); i++) { out.append(" " + a.getQName(i) + "=\"" + StreamUtility.enc(a.getValue(i)) + "\""); } out.append(">"); } private void instantiateDatastream(Datastream ds) throws SAXException { // set datastream variables with values grabbed from the SAX parse ds.DatastreamID = m_dsId; ds.DSVersionable = m_dsVersionable; ds.DSFormatURI = m_dsFormatURI; ds.DatastreamAltIDs = m_dsAltIDs; ds.DSVersionID = m_dsVersId; ds.DSLabel = m_dsLabel; ds.DSCreateDT = m_dsCreateDate; ds.DSMIME = m_dsMimeType; ds.DSControlGrp = m_dsControlGrp; ds.DSState = m_dsState; ds.DSLocation = m_dsLocation; ds.DSLocationType = m_dsLocationType; ds.DSInfoType = m_dsInfoType; ds.DSChecksumType = m_dsChecksumType; LOG.debug("instantiate datastream: dsid = " + m_dsId + "checksumType = " + m_dsChecksumType + "checksum = " + m_dsChecksum); if (m_obj.isNew()) { if (m_dsChecksum != null && !m_dsChecksum.equals("") && !m_dsChecksum.equals(Datastream.CHECKSUM_NONE)) { String tmpChecksum = ds.getChecksum(); LOG.debug("checksum = " + tmpChecksum); if (!m_dsChecksum.equals(tmpChecksum)) { throw new SAXException(new ValidationException("Checksum Mismatch: " + tmpChecksum)); } } ds.DSChecksumType = ds.getChecksumType(); } else { ds.DSChecksum = m_dsChecksum; } // Normalize the dsLocation for the deserialization context ds.DSLocation = (DOTranslationUtility.normalizeDSLocationURLs(m_obj.getPid(), ds, m_transContext)).DSLocation; // FINALLY! add the datastream to the digital object instantiation m_obj.addDatastreamVersion(ds, true); } private void instantiateXMLDatastream(DatastreamXMLMetadata ds) throws SAXException { // set the attrs common to all datastream versions ds.DatastreamID = m_dsId; ds.DSVersionable = m_dsVersionable; ds.DSFormatURI = m_dsFormatURI; ds.DatastreamAltIDs = m_dsAltIDs; ds.DSVersionID = m_dsVersId; ds.DSLabel = m_dsLabel; ds.DSCreateDT = m_dsCreateDate; if (m_dsMimeType == null || m_dsMimeType.equals("")) { ds.DSMIME = "text/xml"; } else { ds.DSMIME = m_dsMimeType; } // set the attrs specific to datastream version ds.DSControlGrp = "X"; ds.DSState = m_dsState; ds.DSLocation = m_obj.getPid() + "+" + m_dsId + "+" + m_dsVersId; ds.DSLocationType = m_dsLocationType; ds.DSInfoType = m_dsInfoType; // METS only ds.DSMDClass = m_dsMDClass; // METS only ds.DSChecksumType = m_dsChecksumType; // now set the xml content stream itself... try { String xmlString = m_dsXMLBuffer.toString(); ds.xmlContent = xmlString.getBytes(m_characterEncoding); //LOOK! this sets bytes, not characters. Do we want to set this? ds.DSSize = ds.xmlContent.length; } catch (Exception uee) { LOG.debug("Error processing inline xml content in SAX parse: " + uee.getMessage()); } LOG.debug("instantiate datastream: dsid = " + m_dsId + "checksumType = " + m_dsChecksumType + "checksum = " + m_dsChecksum); if (m_obj.isNew()) { if (m_dsChecksum != null && !m_dsChecksum.equals("") && !m_dsChecksum.equals(Datastream.CHECKSUM_NONE)) { String tmpChecksum = ds.getChecksum(); LOG.debug("checksum = " + tmpChecksum); if (!m_dsChecksum.equals(tmpChecksum)) { throw new SAXException(new ValidationException("Checksum Mismatch: " + tmpChecksum)); } } ds.DSChecksumType = ds.getChecksumType(); } else { ds.DSChecksum = m_dsChecksum; } // FINALLY! add the xml datastream to the digitalObject m_obj.addDatastreamVersion(ds, true); } /** * convertAudits: In Fedora 2.0 and beyond, we want self-standing audit * records. Make sure audit records are converted to new format that * contains a componentID to show what component in the object the audit * record is about. */ private void convertAudits() { // Only do this if ADMID values were found in the object. if (m_dsADMIDs.size() > 0) { // Look at datastreams to see if there are audit records for them. // NOTE: we do not look at disseminators because in pre-2.0 // the disseminators did not point to their audit records as // did the datastreams. Iterator<String> dsIdIter = m_obj.datastreamIdIterator(); while (dsIdIter.hasNext()) { for (Datastream ds : m_obj.datastreams(dsIdIter.next())) { // ADMID processing... // get list of ADMIDs that go with a datastream version List<String> admIdList = m_dsADMIDs.get(ds.DSVersionID); List<String> cleanAdmIdList = new ArrayList<String>(); if (admIdList != null) { Iterator<String> admIdIter = admIdList.iterator(); while (admIdIter.hasNext()) { String admId = admIdIter.next(); // Detect ADMIDs that reference audit records // vs. regular admin metadata. Drop audits from // the list. We know we have an audit if the ADMID // is not a regular datatream in the object. Iterator<Datastream> matchedDatastreams = m_obj.datastreams(admId).iterator(); if (matchedDatastreams.hasNext()) { // Keep track of audit metadata correlated with the // datastream version it's about (for later use). m_AuditIdToComponentId.put(admId, ds.DSVersionID); } else { // Keep track of non-audit metadata in a new list. cleanAdmIdList.add(admId); } } } if (cleanAdmIdList.size() <= 0) { // we keep track of admin metadata references // for each datastream, but we exclude the audit // records from this list. If there are no // non-audit metadata references, remove the // datastream entry from the master hashmap. m_dsADMIDs.remove(ds.DSVersionID); } else { // otherwise, update the master hashmap with the // clean list of non-audit metadata m_dsADMIDs.put(ds.DSVersionID, cleanAdmIdList); } } } // Now, put component ids on audit records. Pre-Fedora 2.0 // datastream versions pointed to their audit records. Iterator<AuditRecord> iter = m_obj.getAuditRecords().iterator(); while (iter.hasNext()) { AuditRecord au = iter.next(); if (au.componentID == null || au.componentID.equals("")) { // Before Fedora 2.0 audit records were associated with // datastream version ids. From now on, the datastream id // will be posted as the component id in the audit record, // and associations to particular datastream versions can // be derived via the datastream version dates and the audit // record dates. String dsVersId = m_AuditIdToComponentId.get(au.id); if (dsVersId != null && !dsVersId.equals("")) { au.componentID = dsVersId.substring(0, dsVersId.indexOf(".")); } } } } } /** * addRelsInt: Build an RDF relationship datastream to preserve DMDID and * ADMID references in the digital object when METS is converted to FOXML * (or other formats in the future). If there is no pre-existing RELS-INT, * look for DMDID and ADMID attributes to create new RELS-INT datastream. */ private void createRelsInt() { // create a new RELS-INT datastream only if one does not already exist. Iterator<Datastream> metsrels = m_obj.datastreams("RELS-INT").iterator(); if (metsrels.hasNext()) { m_relsBuffer = new StringBuffer(); appendRDFStart(m_relsBuffer); Iterator<String> dsIds = m_obj.datastreamIdIterator(); while (dsIds.hasNext()) { // initialize hash sets to keep a list of // unique DMDIDs or ADMIDs at the datatream id level. HashSet<String> uniqueDMDIDs = new HashSet<String>(); HashSet<String> uniqueADMIDs = new HashSet<String>(); // get list of datastream *versions* for (Datastream dsVersion : m_obj.datastreams(dsIds .next())) { // DMDID processing... List<String> dmdIdList = m_dsDMDIDs.get(dsVersion.DSVersionID); if (dmdIdList != null) { hasRels = true; Iterator<String> dmdIdIter = dmdIdList.iterator(); while (dmdIdIter.hasNext()) { String dmdId = dmdIdIter.next(); // APPEND TO RDF: record the DMDID relationship. // Relationships will now be recorded at the // datastream level, not the datastream version level. // So, is the relationship existed on more than one // datastream version, only write it once to the RDF. if (!uniqueDMDIDs.contains(dmdId)) { appendRDFRel(m_relsBuffer, m_obj.getPid(), dsVersion.DatastreamID, "hasDescMetadata", dmdId); } uniqueDMDIDs.add(dmdId); } } // ADMID processing (already cleansed of audit refs)... List<String> cleanAdmIdList = m_dsADMIDs.get(dsVersion.DSVersionID); if (cleanAdmIdList != null) { hasRels = true; Iterator<String> admIdIter = cleanAdmIdList.iterator(); while (admIdIter.hasNext()) { String admId = admIdIter.next(); // APPEND TO RDF: record the ADMID relationship. // Relationships will now be recorded at the // datastream level, not the datastream version level. // So, is the relationship existed on more than one // datastream version, only write it once to the RDF. if (!uniqueADMIDs.contains(admId)) { appendRDFRel(m_relsBuffer, m_obj.getPid(), dsVersion.DatastreamID, "hasAdminMetadata", admId); } uniqueADMIDs.add(admId); } } } } // APPEND RDF: finish up and add RDF as a system-generated datastream if (hasRels) { appendRDFEnd(m_relsBuffer); setRDFAsDatastream(m_relsBuffer); } else { m_relsBuffer = null; } } } // Create a system-generated datastream from the RDF expression of the // DMDID and ADMID relationships found in the METS file. private void setRDFAsDatastream(StringBuffer buf) { DatastreamXMLMetadata ds = new DatastreamXMLMetadata(); // set the attrs common to all datastream versions ds.DatastreamID = "RELS-INT"; ds.DSVersionable = false; ds.DSFormatURI = m_dsFormatURI; ds.DatastreamAltIDs = m_dsAltIDs; ds.DSVersionID = "RELS-INT.0"; ds.DSLabel = "DO NOT EDIT: System-generated datastream to preserve METS DMDID/ADMID relationships."; ds.DSCreateDT = new Date(); ds.DSMIME = "application/rdf+xml"; // set the attrs specific to datastream version ds.DSControlGrp = "X"; ds.DSState = "A"; ds.DSLocation = m_obj.getPid() + "+" + ds.DatastreamID + "+" + ds.DSVersionID; ds.DSLocationType = "INTERNAL_ID"; ds.DSInfoType = "DATA"; ds.DSMDClass = DatastreamXMLMetadata.TECHNICAL; // now set the xml content stream itself... try { ds.xmlContent = buf.toString().getBytes(m_characterEncoding); ds.DSSize = ds.xmlContent.length; } catch (UnsupportedEncodingException uee) { LOG.error("Encoding error when creating RELS-INT datastream", uee); } // FINALLY! add the RDF and an inline xml datastream in the digital object m_obj.addDatastreamVersion(ds, true); } private StringBuffer appendRDFStart(StringBuffer buf) { buf.append("<" + RDF.prefix + ":RDF" + " xmlns:" + RDF.prefix + "=\"" + RDF.uri + "\"" + " xmlns:" + RELS_EXT.prefix + "=\"" + RELS_EXT.uri + "\">\n"); return buf; } private StringBuffer appendRDFRel(StringBuffer buf, String pid, String subjectNodeId, String relType, String objectNodeId) { // RDF subject node buf.append(" <" + RDF.prefix + ":Description " + RDF.prefix + ":about=\"" + "info:fedora/" + pid + "/" + subjectNodeId + "\">\n"); // RDF relationship property and object node buf.append(" <" + RELS_EXT.prefix + ":" + relType + " " + RDF.prefix + ":resource=\"" + "info:fedora/" + pid + "/" + objectNodeId + "\"/>\n"); buf.append(" </" + RDF.prefix + ":Description" + ">\n"); return buf; } private void initialize() { // temporary variables and state variables m_rootElementFound = false; m_inXMLMetadata = false; m_prefixMap = new HashMap<String, String>(); m_localPrefixMap = new HashMap<String, String>(); m_prefixList = new ArrayList<String>(); // temporary variables for processing datastreams m_dsId = ""; m_dsVersionable = true; m_dsVersId = ""; m_dsCreateDate = null; m_dsState = ""; m_dsFormatURI = ""; m_dsAltIDs = new String[0]; m_dsSize = -1; m_dsLocationType = ""; m_dsLocation = ""; m_dsMimeType = ""; m_dsControlGrp = ""; m_dsInfoType = ""; m_dsOtherInfoType = ""; m_dsMDClass = 0; m_dsLabel = ""; m_dsXMLBuffer = null; m_dsADMIDs = new HashMap<String, List<String>>(); m_dsDMDIDs = new HashMap<String, List<String>>(); m_dsChecksum = ""; m_dsChecksumType = ""; // temporary variables for processing disseminators m_dissems = new HashMap<String, Disseminator>(); // temporary variables for processing audit records m_auditBuffer = null; m_auditId = ""; m_auditComponentID = ""; m_auditProcessType = ""; m_auditAction = ""; m_auditResponsibility = ""; m_auditDate = ""; m_auditJustification = ""; m_AuditIdToComponentId = new HashMap<String, String>(); m_relsBuffer = null; } //--- // Static helpers //--- private static StringBuffer appendRDFEnd(StringBuffer buf) { buf.append("</" + RDF.prefix + ":RDF>\n"); return buf; } private static String grab(Attributes a, String namespace, String elementName) { String ret = a.getValue(namespace, elementName); if (ret == null) { ret = a.getValue(elementName); } // set null attribute value to empty string since it's // generally helpful in the code to avoid null pointer exception // when operations are performed on attributes values. if (ret == null) { ret = ""; } return ret; } }