/** * JHOVE2 - Next-generation architecture for format-aware characterization * * Copyright (c) 2009 by The Regents of the University of California, * Ithaka Harbors, Inc., and The Board of Trustees of the Leland Stanford * Junior University. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * o Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * o Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * o Neither the name of the University of California/California Digital * Library, Ithaka Harbors/Portico, or Stanford University, nor the names of * its contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ package org.jhove2.module.format.gzip; import java.io.BufferedInputStream; import java.io.EOFException; import java.io.IOException; import java.io.InputStream; import java.nio.ByteOrder; import java.util.Collection; import java.util.Collections; import java.util.Date; import java.util.LinkedList; import java.util.Map; import java.util.TreeMap; import java.util.zip.DataFormatException; import org.jhove2.annotation.ReportableProperty; import org.jhove2.core.JHOVE2; import org.jhove2.core.JHOVE2Exception; import org.jhove2.core.Message; import org.jhove2.core.Message.Severity; import org.jhove2.core.format.Format; import org.jhove2.core.format.FormatIdentification; import org.jhove2.core.io.Input; import org.jhove2.core.reportable.AbstractReportable; import org.jhove2.core.source.Source; import org.jhove2.core.source.SourceFactory; import org.jhove2.module.format.BaseFormatModule; import org.jhove2.module.format.Validator; import org.jhove2.module.format.gzip.properties.GzipEntryData; import org.jhove2.persist.FormatModuleAccessor; import org.jwat.arc.ArcReader; import org.jwat.common.Diagnosis; import org.jwat.common.Diagnostics; import org.jwat.gzip.GzipConstants; import org.jwat.gzip.GzipEntry; import org.jwat.gzip.GzipReader; import org.jwat.warc.WarcReader; import com.sleepycat.persist.model.NotPersistent; import com.sleepycat.persist.model.Persistent; /** * JHOVE2 GZip module. This module parses and validates GZip files * in compliance with * <a href="http://www.ietf.org/rfc/rfc1952.txt">RFC 1952</a> (GZIP * file format specification version 4.3) and supports multiple member * GZIP files.</p> * * @author lbihanic, selghissassi, nicl */ @Persistent public class GzipModule extends BaseFormatModule implements Validator { /** Module version identifier. */ public final static String VERSION = "2.1.0"; /** Module release date. */ public final static String RELEASE = "2013-02-11"; /** Module validation coverage. */ public static final Coverage COVERAGE = Coverage.Selective; /** Validation status. */ private Validity isValid; /** The name of the GZip file. */ private String gzipFileName; /** The size of the GZip file, in bytes. */ private Long gzipFileSize; /** Last modified date of the GZip file. */ private Date gzipFileLastModified; /** Number of members compressed with the deflate compression method. */ private long deflateMemberCount = 0; /** The amount of bytes consumed by the GZipReader. */ private long gzipReaderConsumedBytes; /** Number of non-valid members. */ private long invalidMembers = 0; /** Validation error messages. */ //private final Collection<Message> validationMessages = // new ConcurrentLinkedQueue<Message>(); private final Collection<Message> validationMessages = new LinkedList<Message>(); /** A parser object to decode woven compressed formats. */ //private Parser wovenFormatParser = null; /** Whether to recursively characterize GZip members. */ private boolean recurse = true; /** Thread pool size for parallel characterization of GZip member. */ //private int nThreads = 0; /** * GZip instance id to GZipModule lookup <code>Map</code>. * Used by ARC/WARC modules to access the ACTUAL parent GZipModule instead * of only getting access to a new instance populated with persisted data. */ @NotPersistent public static final transient Map<Integer, GzipModule> gzipMap = new TreeMap<Integer, GzipModule>(); /** Used to generate a unique id for each file parsed. */ @NotPersistent public static final transient AutoIncrement autoIncId = new AutoIncrement(); /** Id used by this instance of the module. */ public Integer instanceId; /** * ARC/WARC reader set by a child ARC/WARC module in order to use the same * reader for all the entries in the same GZip file. */ @NotPersistent public transient Object reader; @Persistent public static class GZipOffsetProperty extends AbstractReportable { public long offset; public GZipOffsetProperty() { } public GZipOffsetProperty(long offset) { this.offset = offset; } } /** * Presumptive format used to identify subsequent ARC/WARC records which are * not identified by the identifier module. */ @NotPersistent public transient FormatIdentification presumptiveFormat; /** * Instantiate a new <code>ZipModule</code>. * This constructor is used by the Spring framework. * @param format Gzip format * @param formatModuleAccessor FormatModuleAccessor to manage access to Format Profiles */ public GzipModule(Format format, FormatModuleAccessor formatModuleAccessor) { super(VERSION, RELEASE, RIGHTS, format, formatModuleAccessor); isValid = Validity.Undetermined; } /** * Instantiate a new <code>ArcModule</code> instance. * This constructor is used by the persistence layer. */ public GzipModule() { this(null, null); } /** * Method for creating test instances. * @return <code>GzipModule</code> instance */ protected GzipModule getTestInstance() { GzipModule gzipModule = new GzipModule(format, (FormatModuleAccessor)moduleAccessor); gzipModule.isValid = Validity.Undetermined; gzipModule.recurse = recurse; return gzipModule; } //------------------------------------------------------------------------ // BaseFormatModule contract support //------------------------------------------------------------------------ /* * Parse a GZip file/entry. * @see org.jhove2.module.format.BaseFormatModule#parse(org.jhove2.core.JHOVE2, org.jhove2.core.source.Source, org.jhove2.core.io.Input) */ @Override public long parse(final JHOVE2 jhove2, Source source, Input input) throws EOFException, IOException, JHOVE2Exception { /* * Module init. */ long consumed = 0L; deflateMemberCount = 0L; invalidMembers = 0L; validationMessages.clear(); isValid = Validity.Undetermined; // In GZip format, least-significant bytes come first. input.setByteOrder(ByteOrder.LITTLE_ENDIAN); instanceId = autoIncId.get(); // This is done because it is not persisted immediately. // It is needed in recursive calls and not when the gzip module exits. // Each time jhove2 looks up an existing module it actually // instantiates a new class and loads the persisted values. // So a version with the correct instanceId exists on the call stack // but every time someone requests it a new one is created and // populated with persisted data. Epic fail! getModuleAccessor().persistModule(this); synchronized (gzipMap) { gzipMap.put(instanceId, this); } // Characterize each GZip member from the source, validating // the corresponding GZip headers and trailers. GzipReader gzipReader = null; GzipEntry gzipEntry = null; try { source.setIsAggregate(true); SourceFactory factory = jhove2.getSourceFactory(); /* * Reportable: Filename, file size, etc. */ if (!source.isTemp()) { gzipFileName = source.getFile().getName(); gzipFileSize = source.getFile().length(); gzipFileLastModified = new Date(source.getFile().lastModified()); } /* * Read some GZip entries. */ gzipReader = new GzipReader( new BufferedInputStream(source.getInputStream(), 8192)); int memberCount = 0; while ((gzipEntry = gzipReader.getNextEntry()) != null) { // Wrap found member in a JHove2 Source object. InputStream stream = gzipEntry.getInputStream(); String name = gzipEntry.fname; Source src = factory.getSource(jhove2, stream, name, null); if (src != null) { src.setDeleteTempFileOnClose(jhove2.getInvocation().getDeleteTempFilesOnClose()); memberCount++; // Attach member to parent source. source.addChildSource(src); if (presumptiveFormat != null) { src.addPresumptiveFormat(presumptiveFormat); } if (recurse) { // expose offset to ARC/WARC modules. src.addExtraProperties(new GZipOffsetProperty(gzipEntry.getStartOffset())); characterizeMember(jhove2, src); } src.close(); } gzipEntry.close(); /* * Properties. */ GzipEntryData gzipEntryData = new GzipEntryData(gzipEntry); src.addExtraProperties(gzipEntryData.getGzipEntryProperties()); // Check member compression method (always deflate). if (gzipEntry.cm == GzipConstants.CM_DEFLATE) { ++deflateMemberCount; } // Check member validity. if (! gzipEntry.isCompliant()) { ++invalidMembers; isValid = Validity.False; // Report errors on child source object. reportValidationErrors(src, gzipEntry.diagnostics, jhove2); } } // Report reader errors on source object. reportValidationErrors(source, gzipReader.diagnostics, jhove2); if (!gzipReader.isCompliant()) { isValid = Validity.False; } consumed = gzipReader.getConsumed(); gzipReaderConsumedBytes = gzipReader.getConsumed(); if (isValid == Validity.Undetermined) { // No invalid members found and EOF reached without // any exception being thrown => Source is valid. isValid = Validity.True; } } catch (IOException e) { handleError(e, jhove2, gzipEntry.getStartOffset()); if (e.getCause() != null && e.getCause() instanceof DataFormatException) { isValid = Validity.False; source.addMessage(newValidityError(jhove2, Message.Severity.ERROR, "error", new Object[]{"GZip data", e.getMessage()})); } else if (! ((e instanceof EOFException) && (gzipEntry.getStartOffset() != 0L))) { // Not an EOF error occurring before the very first entry. throw e; } } finally { // Close GZip input stream. try { gzipReader.close(); } catch (Exception e) { /* Ignore... */ } } /* * Cleanup. */ if (reader != null) { if (reader instanceof ArcReader) { ((ArcReader)reader).close(); } else if (reader instanceof WarcReader) { ((WarcReader)reader).close(); } } synchronized (gzipMap) { gzipMap.remove(instanceId); } /* * Consumed. */ return consumed; } private void characterizeMember(JHOVE2 jhove2, Source source) throws JHOVE2Exception, IOException { Input input = source.getInput(jhove2); try { jhove2.characterize(source, input); } finally { // Make sure all file descriptors are properly closed. if (input != null) { input.close(); } } } private void handleError(Exception e, JHOVE2 jhove2, long offset) { try { isValid = Validity.False; validationMessages.add( newValidityError(jhove2, Message.Severity.ERROR, "invalidGzipFile", new Object[] {Long.valueOf(offset), e})); } catch (JHOVE2Exception ex) { throw new RuntimeException(ex); } } /** * Checks GZip entry validity and reports validation errors. * @param src GZip source unit * @param entry the GZip entry to characterize. * @param jhove2 the JHove2 characterization context. * @throws IOException if an IO error occurs while processing * @throws JHOVE2Exception if a serious problem needs to be reported */ private void reportValidationErrors(Source src, Diagnostics<Diagnosis> diagnostics, JHOVE2 jhove2) throws JHOVE2Exception { if (diagnostics.hasErrors()) { // Report errors on source object. for (Diagnosis d : diagnostics.getErrors()) { src.addMessage(newValidityError(jhove2, Message.Severity.ERROR, d.type.toString().toLowerCase(), d.getMessageArgs())); } } if (diagnostics.hasWarnings()) { // Report warnings on source object. for (Diagnosis d : diagnostics.getWarnings()) { src.addMessage(newValidityError(jhove2, Message.Severity.WARNING, d.type.toString().toLowerCase(), d.getMessageArgs())); } } } /** * Instantiates a new localized message. * @param jhove2 the JHove2 characterization context. * @param severity message severity * @param id the configuration property relative name. * @param messageArgs the values to add in the message * @return the new localized message * @throws JHOVE2Exception if a serious problem needs to be reported */ private Message newValidityError(JHOVE2 jhove2, Severity severity, String id, Object[] messageArgs) throws JHOVE2Exception { return new Message(severity, Message.Context.OBJECT, this.getClass().getName() + '.' + id, messageArgs, jhove2.getConfigInfo()); } //------------------------------------------------------------------------ // Validator interface support //------------------------------------------------------------------------ /** * Validate the Gzip file. * @param jhove2 JHOVE2 framework object * @param source Gzip file source unit * @param input Gzip file source input * @see org.jhove2.module.format.Validator#validate(org.jhove2.core.JHOVE2, org.jhove2.core.source.Source, org.jhove2.core.io.Input) */ @Override public Validity validate(JHOVE2 jhove2, Source source, Input input) throws JHOVE2Exception { return isValid(); } /** * Get validation coverage. * @return Validation coverage * @see org.jhove2.module.format.Validator#getCoverage() */ @Override public Coverage getCoverage() { return COVERAGE; } /** * Get validity. * @return Validity * @see org.jhove2.module.format.Validator#isValid() */ @Override public Validity isValid() { return isValid; } //------------------------------------------------------------------------ // Reportable properties //------------------------------------------------------------------------ /** * gzipFileName getter. * @return the gzipFileName */ @ReportableProperty(order=1, value="GZip file name") public String getGZipFileName() { return gzipFileName; } /** * gzipFileSize getter. * @return the gzipFileSize */ @ReportableProperty(order=2, value="GZip file size, in bytes") public Long getGZipFileSize() { return gzipFileSize; } /** * Returns GZip file last modified date. * @return GZip file last modified date */ @ReportableProperty(order=3, value="GZip file last modified date") public Date getLastModified() { return gzipFileLastModified; } /** * Returns the number of GZip entries found. * @return the number of GZip entries found */ @ReportableProperty(order = 4, value = "Number of members compressed with the deflate compression method") public long getNumDeflateMembers() { return deflateMemberCount; } /** * gzipReaderConsumedBytes getter. * @return the gzipReaderConsumedBytes */ @ReportableProperty(order=5, value="GZip reader consumed bytes, in bytes") public long getGZipReaderConsumedBytes() { return gzipReaderConsumedBytes; } /** * Returns the number of invalid GZip entries found. * @return the number of invalid GZip entries found */ @ReportableProperty(order = 6, value = "Number of non-valid members") public long getNumInvalidMembers() { return invalidMembers; } /** * Returns the number of GZip entries marked as invalid. * @return the number of invalid GZip entries found */ @ReportableProperty(order = 7, value = "Validation error messages") public Collection<Message> getValidationMessages() { // Return null if the list is empty to prevent the displayer // from rendering this property. return (validationMessages.isEmpty())? null: Collections.unmodifiableCollection(validationMessages); } //------------------------------------------------------------------------ // Specific implementation //------------------------------------------------------------------------ /** * <i>Dependency injection<i/> Sets whether to recursively * characterize GZip members. * @param recurse whether to recursively characterize GZip members */ public void setRecurse(boolean recurse) { this.recurse = recurse; } /** * Returns whether this module recursively characterizes the * found GZip members. * @return <code>true</code> if GZip members are recursively * characterized; <code>false</code> otherwise. Defaults * to <code>true</code> */ public boolean getRecurse() { return recurse; } }