/* The contents of this file are subject to the license and copyright terms * detailed in the license directory at the root of the source tree (also * available online at http://fedora-commons.org/license/). */ package fedora.server.storage.translation; import java.io.BufferedInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.Reader; import java.io.StringReader; import java.io.UnsupportedEncodingException; import java.net.URISyntaxException; import java.text.ParseException; import java.util.Comparator; import java.util.Date; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; import javax.xml.stream.XMLStreamException; import javax.activation.MimeType; import org.apache.commons.io.IOUtils; import org.apache.log4j.Logger; import org.apache.abdera.Abdera; import org.apache.abdera.ext.thread.ThreadHelper; import org.apache.abdera.i18n.iri.IRI; import org.apache.abdera.model.Category; import org.apache.abdera.model.Content; import org.apache.abdera.model.Document; import org.apache.abdera.model.Entry; import org.apache.abdera.model.Feed; import org.apache.abdera.model.Person; import org.apache.abdera.parser.Parser; import org.apache.abdera.util.MimeTypeHelper; import org.apache.abdera.xpath.XPath; import fedora.common.Constants; import fedora.common.MalformedPIDException; import fedora.common.PID; import fedora.common.xml.format.XMLFormat; import fedora.server.errors.ObjectIntegrityException; import fedora.server.errors.StreamIOException; import fedora.server.errors.ValidationException; import fedora.server.storage.types.Datastream; import fedora.server.storage.types.DatastreamManagedContent; import fedora.server.storage.types.DatastreamReferencedContent; import fedora.server.storage.types.DatastreamXMLMetadata; import fedora.server.storage.types.DigitalObject; import fedora.server.utilities.DateUtility; import fedora.server.validation.ValidationUtility; import fedora.utilities.FileUtils; import fedora.utilities.NormalizedURI; /** * Deserializer for Fedora Objects in Atom format. * * @author Edwin Shin * @since 3.0 * @version $Id$ */ public class AtomDODeserializer implements DODeserializer, Constants { public static final XMLFormat DEFAULT_FORMAT = ATOM1_1; /** Logger for this class. */ private static final Logger LOG = Logger.getLogger(AtomDODeserializer.class); /** The object to deserialize to. */ private DigitalObject m_obj; private String m_encoding; /** The current translation context. */ private int m_transContext; /** The format this deserializer reads. */ private final XMLFormat m_format; private Abdera abdera = Abdera.getInstance(); private Feed m_feed; private XPath m_xpath; private ZipInputStream m_zin; /** * Temporary directory for the unpacked contents of an Atom Zip archive. */ private File m_tempDir; public AtomDODeserializer() { this(DEFAULT_FORMAT); } public AtomDODeserializer(XMLFormat format) { if (format.equals(ATOM1_1) || format.equals(ATOM_ZIP1_1)) { m_format = format; } else { throw new IllegalArgumentException("Not an Atom format: " + format.uri); } } /** * {@inheritDoc} */ public void deserialize(InputStream in, DigitalObject obj, String encoding, int transContext) throws ObjectIntegrityException, StreamIOException, UnsupportedEncodingException { if (m_format.equals(ATOM_ZIP1_1)) { try { m_tempDir = FileUtils.createTempDir("atomzip", null); m_zin = new ZipInputStream(new BufferedInputStream(in)); ZipEntry entry; while ((entry = m_zin.getNextEntry()) != null) { FileUtils.copy(m_zin, new FileOutputStream(new File(m_tempDir, entry.getName()))); } in = new FileInputStream(new File(m_tempDir, "atommanifest.xml")); } catch (FileNotFoundException e) { throw new StreamIOException(e.getMessage(), e); } catch (IOException e) { throw new StreamIOException(e.getMessage(), e); } } Parser parser = abdera.getParser(); Document<Feed> feedDoc = parser.parse(in); m_feed = feedDoc.getRoot(); m_xpath = abdera.getXPath(); m_obj = obj; m_encoding = encoding; m_transContext = transContext; addObjectProperties(); addDatastreams(); DOTranslationUtility.normalizeDatastreams(m_obj, m_transContext, m_encoding); FileUtils.delete(m_tempDir); } /** * {@inheritDoc} */ public DODeserializer getInstance() { return new AtomDODeserializer(m_format); } /** * Set the Fedora Object properties from the Feed metadata. * * @throws ObjectIntegrityException */ private void addObjectProperties() throws ObjectIntegrityException { PID pid; try { pid = new PID(m_feed.getId().toString()); } catch (MalformedPIDException e) { throw new ObjectIntegrityException(e.getMessage(), e); } String label = m_feed.getTitle(); String state = m_xpath.valueOf("/a:feed/a:category[@scheme='" + MODEL.STATE.uri + "']/@term", m_feed); String createDate = m_xpath.valueOf("/a:feed/a:category[@scheme='" + MODEL.CREATED_DATE.uri + "']/@term", m_feed); m_obj.setPid(pid.toString()); try { m_obj.setState(DOTranslationUtility.readStateAttribute(state)); } catch (ParseException e) { throw new ObjectIntegrityException("Could not read object state", e); } m_obj.setLabel(label); m_obj.setOwnerId(getOwnerId()); m_obj.setCreateDate(DateUtility.convertStringToDate(createDate)); m_obj.setLastModDate(m_feed.getUpdated()); setExtProps(); } private void addDatastreams() throws UnsupportedEncodingException, StreamIOException, ObjectIntegrityException { m_feed.sortEntries(new UpdatedIdComparator(true)); List<Entry> entries = m_feed.getEntries(); for (Entry entry : entries) { if (ThreadHelper.getInReplyTo(entry) != null) { addDatastreamVersion(entry); } } } private void addDatastreamVersion(Entry entry) throws UnsupportedEncodingException, StreamIOException, ObjectIntegrityException { IRI ref = ThreadHelper.getInReplyTo(entry).getRef(); Entry parent = m_feed.getEntry(ref.toString()); Datastream ds; String controlGroup = getDSControlGroup(parent); if (controlGroup.equals("X")) { ds = addInlineDatastreamVersion(entry); } else if (controlGroup.equals("M")) { ds = addManagedDatastreamVersion(entry); } else { ds = addExternalReferencedDatastreamVersion(entry); } m_obj.addDatastreamVersion(ds, true); } private Datastream addInlineDatastreamVersion(Entry entry) throws ObjectIntegrityException, StreamIOException { DatastreamXMLMetadata ds = new DatastreamXMLMetadata(); setDSCommonProperties(ds, entry); String dsId = ds.DatastreamID; String dsvId = ds.DSVersionID; ds.DSLocation = m_obj.getPid() + "+" + dsId + "+" + dsvId; if (ds.DSVersionID.equals("AUDIT.0")) { addAuditDatastream(entry); } else { try { if (m_format.equals(ATOM_ZIP1_1)) { ByteArrayOutputStream bout = new ByteArrayOutputStream(); FileUtils.copy(new FileInputStream(getContentSrcAsFile(entry.getContentSrc())), bout); ds.xmlContent = bout.toByteArray(); } else { ds.xmlContent = entry.getContent().getBytes(m_encoding); //IOUtils.toByteArray(entry.getContentStream()); } } catch (UnsupportedEncodingException e) { throw new StreamIOException(e.getMessage(), e); } catch (FileNotFoundException e) { throw new ObjectIntegrityException(e.getMessage(), e); } } if (ds.xmlContent != null) { ds.DSSize = ds.xmlContent.length; } MimeType mimeType = entry.getContentMimeType(); if (mimeType == null) { ds.DSMIME = "text/xml"; } else { ds.DSMIME = mimeType.toString(); } return ds; } private Datastream addExternalReferencedDatastreamVersion(Entry entry) throws ObjectIntegrityException { Datastream ds = new DatastreamReferencedContent(); setDSCommonProperties(ds, entry); ds.DSLocation = entry.getContentSrc().toString(); // Normalize the dsLocation for the deserialization context ds.DSLocation = (DOTranslationUtility.normalizeDSLocationURLs(m_obj.getPid(), ds, m_transContext)).DSLocation; ds.DSLocationType = "URL"; ds.DSMIME = entry.getContentMimeType().toString(); return ds; } private Datastream addManagedDatastreamVersion(Entry entry) throws StreamIOException, ObjectIntegrityException { Datastream ds = new DatastreamManagedContent(); setDSCommonProperties(ds, entry); ds.DSLocationType = "INTERNAL_ID"; ds.DSMIME = getDSMimeType(entry); // Managed Content can take any of the following forms: // 1) inline text (plaintext, html, xml) // 2) inline Base64 // 3) referenced content IRI contentLocation = entry.getContentSrc(); if (contentLocation != null) { // URL FORMAT VALIDATION for dsLocation: // For Managed Content the URL is only checked when we are parsing a // a NEW ingest file because the URL is replaced with an internal identifier // once the repository has sucked in the content for storage. if (m_obj.isNew()) { ValidationUtility .validateURL(contentLocation.toString(),ds.DSControlGrp); } if (m_format.equals(ATOM_ZIP1_1)) { if (!contentLocation.isAbsolute() && !contentLocation.isPathAbsolute()) { File f = getContentSrcAsFile(contentLocation); contentLocation = new IRI(DatastreamManagedContent.TEMP_SCHEME + f.getAbsolutePath()); } } ds.DSLocation = contentLocation.toString(); ds.DSLocation = (DOTranslationUtility.normalizeDSLocationURLs(m_obj .getPid(), ds, m_transContext)).DSLocation; return ds; } try { File temp = File.createTempFile("binary-datastream", null); OutputStream out = new FileOutputStream(temp); if (MimeTypeHelper.isText(ds.DSMIME) || MimeTypeHelper.isXml(ds.DSMIME)) { IOUtils.copy(new StringReader(entry.getContent()), out, m_encoding); } else { IOUtils.copy(entry.getContentStream(), out); } ds.DSLocation = DatastreamManagedContent.TEMP_SCHEME + temp.getAbsolutePath(); } catch (IOException e) { throw new StreamIOException(e.getMessage(), e); } return ds; } private void addAuditDatastream(Entry entry) throws ObjectIntegrityException, StreamIOException { try { Reader auditTrail; if (m_format.equals(ATOM_ZIP1_1)) { File f = getContentSrcAsFile(entry.getContentSrc()); auditTrail = new InputStreamReader(new FileInputStream(f), m_encoding); } else { auditTrail = new StringReader(entry.getContent()); } m_obj.getAuditRecords().addAll(DOTranslationUtility .getAuditRecords(auditTrail)); auditTrail.close(); } catch (XMLStreamException e) { throw new ObjectIntegrityException(e.getMessage(), e); } catch (IOException e) { throw new StreamIOException(e.getMessage(), e); } } private String getOwnerId() { Person owner = m_feed.getAuthor(); if (owner == null) { return ""; } else { return owner.getName(); } } /** * Parses the id to determine a datastreamId. * * @param id * @return */ private String getDatastreamId(Entry entry) { String entryId = entry.getId().toString(); // matches info:fedora/pid/dsid/timestamp Pattern pattern = Pattern.compile("^" + Constants.FEDORA.uri + ".+?/([^/]+)/?.*"); Matcher matcher = pattern.matcher(entryId); if (matcher.find()) { return matcher.group(1); } else { return m_obj.newDatastreamID(); } } private String getDatastreamVersionId(Entry entry) { String dsId = getDatastreamId(entry); String dsvId = entry.getTitle(); // e.g. Match DS1.0 but not DS1 if (dsvId.matches("^" + dsId + ".*\\.[\\w]")) { return dsvId; } else { if (!m_obj.datastreams(dsId).iterator().hasNext()) { return dsId + ".0"; } else { return m_obj.newDatastreamID(dsId); } } } private String getDSControlGroup(Entry entry) throws ObjectIntegrityException { List<Category> controlGroups = entry.getCategories(MODEL.CONTROL_GROUP.uri); // Try to infer the control group if not provided if (controlGroups.isEmpty() || controlGroups.size() > 1) { if (entry.getContentType() != null) { if (entry.getContentType().equals(Content.Type.XML)) { return "X"; } else { // only XML can be inline return "M"; } } if (entry.getContentSrc() != null) { return "M"; } // TODO other cases // link alts, link enclosures else { throw new ObjectIntegrityException("No control group provided by " + m_obj.getPid()); } } else { return controlGroups.get(0).getTerm(); } } private String getDSState(Entry entry) { List<Category> state = entry.getCategories(MODEL.STATE.uri); if (state.isEmpty() || state.size() > 1) { return "A"; } else { return state.get(0).getTerm(); } } /** * Note: AUDIT datastreams always return false, otherwise defaults to true. * * @param entry * @return */ private boolean getDSVersionable(Entry entry) { if (getDatastreamId(entry).equals("AUDIT")) { return false; } List<Category> versionable = entry.getCategories(MODEL.VERSIONABLE.uri); if (versionable.isEmpty() || versionable.size() > 1) { return true; } else { return Boolean.valueOf(versionable.get(0).getTerm()); } } private String[] getDSAltIds(Entry entry) { List<Category> altIds = entry.getCategories(MODEL.ALT_IDS.uri); if (altIds.isEmpty()) { return new String[0]; } else { return altIds.get(0).getTerm().split(" "); // TODO we could handle size > 1 } } private String getDSFormatURI(Entry entry) { List<Category> formatURI = entry.getCategories(MODEL.FORMAT_URI.uri); if (formatURI.isEmpty() || formatURI.size() > 1) { return null; } else { return formatURI.get(0).getTerm(); } } private String getDSLabel(Entry entry) { List<Category> label = entry.getCategories(MODEL.LABEL.uri); if (label.isEmpty()) { return ""; } return label.get(0).getTerm(); } private String getDSMimeType(Entry entry) { String dsMimeType = "application/unknown"; MimeType mimeType = entry.getContentMimeType(); if (mimeType == null) { Content.Type type = entry.getContentType(); if (type != null) { if (type == Content.Type.HTML) { dsMimeType = "text/html"; } else if (type == Content.Type.TEXT) { dsMimeType = "text/plain"; } else if (type == Content.Type.XHTML) { dsMimeType = "application/xhtml+xml"; } else if (type == Content.Type.XML) { dsMimeType = "text/xml"; } } } else { dsMimeType = mimeType.toString(); } return dsMimeType; } private String getDSChecksumType(Entry entry) { List<Category> digestType = entry.getCategories(MODEL.DIGEST_TYPE.uri); if (digestType.isEmpty()) { return Datastream.CHECKSUMTYPE_DISABLED; } else { return digestType.get(0).getTerm(); } } private String getDSChecksum(Entry entry) { List<Category> digest = entry.getCategories(MODEL.DIGEST.uri); if (digest.isEmpty()) { return Datastream.CHECKSUM_NONE; } else { return digest.get(0).getTerm(); } } private void setDSCommonProperties(Datastream dsVersion, Entry entry) throws ObjectIntegrityException { IRI ref = ThreadHelper.getInReplyTo(entry).getRef(); Entry parent = m_feed.getEntry(ref.toString()); dsVersion.DatastreamID = getDatastreamId(parent); dsVersion.DSControlGrp = getDSControlGroup(parent); dsVersion.DSState = getDSState(parent); dsVersion.DSVersionable = getDSVersionable(parent); setDatastreamVersionProperties(dsVersion, entry); } private void setDatastreamVersionProperties(Datastream ds, Entry entry) throws ValidationException { ds.DatastreamAltIDs = getDSAltIds(entry); ds.DSCreateDT = entry.getUpdated(); ds.DSFormatURI = getDSFormatURI(entry); ds.DSLabel = getDSLabel(entry); ds.DSVersionID = getDatastreamVersionId(entry); ds.DSChecksumType = getDSChecksumType(entry); String checksum = getDSChecksum(entry); if (m_obj.isNew()) { if (LOG.isDebugEnabled()) { LOG.debug("New Object: checking supplied checksum"); } if (checksum != null && !checksum.equals("") && !checksum.equals(Datastream.CHECKSUM_NONE)) { String tmpChecksum = ds.getChecksum(); if (LOG.isDebugEnabled()) { LOG.debug("checksum = " + tmpChecksum); } if (!checksum.equals(tmpChecksum)) { throw new ValidationException("Checksum Mismatch: " + tmpChecksum); } } ds.DSChecksumType = ds.getChecksumType(); } else { ds.DSChecksum = checksum; } } private void setExtProps() { List<Category> epCategories = m_feed.getCategories(MODEL.EXT_PROPERTY.uri); for (Category epCategory : epCategories) { m_obj.setExtProperty(epCategory.getTerm(), epCategory.getLabel()); } } /** * Returns the an Entry's contentSrc as a File relative to {@link #m_tempDir}. * * @param contentSrc * @return the contentSrc as a File relative to m_tempDir. * @throws ObjectIntegrityException */ protected File getContentSrcAsFile(IRI contentSrc) throws ObjectIntegrityException { if (contentSrc.isAbsolute() || contentSrc.isPathAbsolute()) { throw new ObjectIntegrityException("contentSrc must not be absolute"); } try { // Normalize the IRI to resolve percent-encoding and // backtracking (e.g. "../") NormalizedURI nUri = new NormalizedURI(m_tempDir.toURI().toString() + contentSrc.toString()); nUri.normalize(); File f = new File(nUri.toURI()); if (f.getParentFile().equals(m_tempDir)) { return f; } else { throw new ObjectIntegrityException(contentSrc.toString() + " is not a valid path."); } } catch (URISyntaxException e) { throw new ObjectIntegrityException(e.getMessage(), e); } } private static class UpdatedIdComparator implements Comparator<Entry> { private boolean ascending = true; UpdatedIdComparator(boolean ascending) { this.ascending = ascending; } public int compare(Entry o1, Entry o2) { Date d1 = o1.getUpdated(); Date d2 = o2.getUpdated(); String id1 = o1.getId().toString(); String id2 = o2.getId().toString(); int r = d1.compareTo(d2); if (d1.equals(d2)) { r = id1.compareTo(id2); } return (ascending) ? r : -r; } } }