/**
* JHOVE2 - Next-generation architecture for format-aware characterization
*
* Copyright (c) 2009 by The Regents of the University of California,
* Ithaka Harbors, Inc., and The Board of Trustees of the Leland Stanford
* Junior University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* o Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* o Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* o Neither the name of the University of California/California Digital
* Library, Ithaka Harbors/Portico, or Stanford University, nor the names of
* its contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
package org.jhove2.module.aggrefy;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.jhove2.annotation.ReportableProperty;
import org.jhove2.core.I8R;
import org.jhove2.core.JHOVE2;
import org.jhove2.core.JHOVE2Exception;
import org.jhove2.core.format.FormatIdentification;
import org.jhove2.core.format.FormatIdentification.Confidence;
import org.jhove2.core.source.ClumpSource;
import org.jhove2.core.source.Source;
import org.jhove2.persist.ModuleAccessor;
import com.sleepycat.persist.model.Persistent;
/**
* Aggregate identifier that uses filepath globbing to detect instances of Clump Formats
* Each instance of a GlobPathRecognizer can recognize instances of a single format
*
* @author smorrissey
*/
@Persistent
public class GlobPathRecognizer
extends AbstractRecognizer
{
/** Identification module version identifier. */
public static final String VERSION = "2.1.0";
/** Identification module release date. */
public static final String RELEASE = "2012-10-31";
/** Identification module rights statement. */
public static final String RIGHTS = "Copyright 2010 by The Regents of the University of California, "
+ "Ithaka Harbors, Inc., and The Board of Trustees of the Leland "
+ "Stanford Junior University. "
+ "Available under the terms of the BSD license.";
/** Aggregate identifier confidence. */
public static final Confidence GLOB_PATH_CONFIDENCE = Confidence.Tentative;
/** Format which this recognizer can detect*/
protected I8R format;
/** String containing regular expression to group candidate files */
protected String fileGroupingExpr;
/** String containing regular expression to identify required candidate files */
protected String mustHaveExpr;
/** String containing regular expression to identify optional candidate files */
protected String mayHaveExpr;
/** Capture group index in fileGroupingToken which captures the part of
* the file path which indicates related files */
protected int fileGroupingCaptureGroupIndex;
/** Capture group index in fileGroupingToken which captures the part of
* the file path which we will be comparing for "must-have" files */
protected int mustHaveCaptureGroupIndex;
/** Capture group index in fileGroupingToken which captures the part of
* the file path which we will be comparing for "may-have" files */
protected int mayHaveCaptureGroupIndex;
/** Minimum number of files that must match the mustHaveExpr in order for
* a set of Sources to be considered an instance of an aggregate Format.
* Allows us to identify potentially defective instances of a format */
protected int minMustHavesToIdentify;
/** Indicates whether or not to include in the Source that is part of the
* FormatIdentification returned by this class any files which match the grouping expression,
* but do not match either must mustHaveExpr or mayHaveExpr */
protected boolean includeUnmatchedFromGroup;
/** Pattern compiled from the fileGroupingExpr */
protected Pattern fileGroupingPattern;
/** Pattern constructed from mustHaveExpr */
protected Pattern mustHavePattern;
/** Pattern constructed from mayHaveExpr */
protected Pattern mayHavePattern;
/**
* Instantiate a new <code>GlobPathRecognizer</code>.
*/
public GlobPathRecognizer() {
this(null);
}
/**
* Instantiate a new <code>GlobPathRecognizer</code>.
* @param ModuleAccessor to manage persistence
*/
public GlobPathRecognizer(ModuleAccessor moduleAccessor) {
super(VERSION, RELEASE, RIGHTS, Scope.Generic, moduleAccessor);
}
/**
* Instantiate a new <code>GlobPathRecognizer</code>.
* @param moduleAccessor manages persistence
* @param format Format which this class can recognize
* @param fileGroupingExpr String containing regular expression to group candidate files
* @param mustHaveExpr String containing regular expression to identify required candidate files
* @param mayHaveExpr String containing regular expression to identify optional candidate files
* @param fileGroupingCaptureGroupIndex int containing capture group index in fileGroupingExpr that
* captures the part of the file path which indicates related files
* @param mustHaveCaptureGroupIndex int containing capture group index in fileGroupingExpr that
* captures the part of the file path which indicates file is required
* @param mayHaveCaptureGroupIndex int containing capture group index in fileGroupingExpr that
* captures the part of the file path which indicates file is optional
* @param minMustHavesToIdentify int containing minimum number of files that must match the
* mustHaveExpr in order for a set of Sources to be considered an
* instance of an aggregate Format. Allows us to identify potentially
* defective instances
* @param includeUnmatchedFromGroup boolean which indicates whether or not to include in a Source
* that is part of the FormatIdentification returned by this class any
* files which match the grouping expression, but do not match either
* mustHaveExpr or mayHaveExpr
*/
public GlobPathRecognizer(ModuleAccessor moduleAccessor,
I8R format, String fileGroupingExpr,
String mustHaveExpr, String mayHaveExpr,
int fileGroupingCaptureGroupIndex,
int mustHaveCaptureGroupIndex,
int mayHaveCaptureGroupIndex,
int minMustHavesToIdentify,
boolean includeUnmatchedFromGroup) {
this(moduleAccessor);
this.format = format;
this.fileGroupingExpr = fileGroupingExpr;
this.mustHaveExpr = mustHaveExpr;
this.mayHaveExpr = mayHaveExpr;
this.fileGroupingCaptureGroupIndex = fileGroupingCaptureGroupIndex;
this.mustHaveCaptureGroupIndex = mustHaveCaptureGroupIndex;
this.mayHaveCaptureGroupIndex = mayHaveCaptureGroupIndex;
this.minMustHavesToIdentify = minMustHavesToIdentify;
this.includeUnmatchedFromGroup = includeUnmatchedFromGroup;
}
/** Aggregate identifier of the source unit.
* @param jhove2 JHOVE2 core framework
* @param source Source unit
* @return Recognized clump source units
* @throws IOException
* @throws JHOVE2Exception
* @see org.jhove2.module.aggrefy.Recognizer#recognize(org.jhove2.core.JHOVE2, org.jhove2.core.source.Source)
*/
@Override
public Set<ClumpSource> recognize(JHOVE2 jhove2, Source source)
throws IOException, JHOVE2Exception
{
Set<ClumpSource> clumpSources = new TreeSet<ClumpSource>();
this.compilePatterns();
Collection<GlobPathMatchInfoGroup> sourceGroups =
this.groupSources(source);
for (GlobPathMatchInfoGroup sourceGroup : sourceGroups) {
ClumpSource clumpSource =
this.recognizeGroupedSource(sourceGroup, jhove2);
if (clumpSource != null){
clumpSources.add(clumpSource);
}
}
return clumpSources;
}
/**
* Constructs candidate instances of an aggregate format by grouping
* together all children of source parameter that match fileGroupingToken
* @param source Source object whose child Sources are to be explored for groups
* constituting instances of a format
* @return Collection of GlobPathMatchInfoGroup objects, each of which contains a list
* of likely related Sources comprising an instance of a Format,
* and indications as to whether or not each Source in the group is a
* required or optional or unspecified component of that Format instances
* @throws JHOVE2Exception
*/
protected Collection <GlobPathMatchInfoGroup> groupSources(Source source)
throws JHOVE2Exception
{
HashMap<String, GlobPathMatchInfoGroup> groupMap =
new HashMap<String, GlobPathMatchInfoGroup>();
for (Source childSource:source.getChildSources()){
File sourceFile = childSource.getFile();
if (sourceFile != null){
String filePath = childSource.getFile().getPath();
// does the Source file path match the pattern that indicates a related file?
Matcher m = this.fileGroupingPattern.matcher(filePath);
if (m.matches()){
// might have more than one instance of a format in the Source, so
// we have to group related files together
String groupString = null;
String mustHaveString = null;
String mayHaveString = null;
try {
// get the value of the capture group which is the key to a format instance
// (group of files)
groupString = m.group(this.fileGroupingCaptureGroupIndex);
// get the value of the capture group that indicates a file in the group
// is one of the files required by the format definition
mustHaveString = m.group(this.mustHaveCaptureGroupIndex);
// get the value of the capture group that indicates a file in the group
// is one of the files considered optional by the format definition
mayHaveString = m.group(this.mayHaveCaptureGroupIndex);
}
catch (IllegalStateException ise){
// should not occur, we were inside if (m.matches()) statement
throw new JHOVE2Exception("Exception thrown grouping patterns: Check configuration", ise);
}
catch (IndexOutOfBoundsException iob){
throw new JHOVE2Exception("Exception thrown grouping patterns: Check configuration", iob);
}
GlobPathMatchInfo fileInfo = new GlobPathMatchInfo(childSource);
boolean matchesMustHaves = false;
Matcher m2 = null;
if (this.mustHavePattern != null){
m2 = this.mustHavePattern.matcher(mustHaveString);
matchesMustHaves = m2.matches();
}
fileInfo.setMustHave(matchesMustHaves);
boolean matchesMayHaves = false;
if (this.mayHavePattern != null){
m2 = this.mayHavePattern.matcher(mayHaveString);
matchesMayHaves = m2.matches();
}
fileInfo.setMayHave(matchesMayHaves);
GlobPathMatchInfoGroup infoGroup;
//is this the first occurrence of grouping key?
if (!(groupMap.containsKey(groupString))){
//if so, add grouping key and new GlobPathMatchInfoGroup to groupMaP
infoGroup = new GlobPathMatchInfoGroup();
infoGroup.setGroupKey(groupString);
groupMap.put(groupString, infoGroup);
}
else {
// otherwise just retrieve
infoGroup = groupMap.get(groupString);
}
// add information about current Source to list associated with this grouping key
infoGroup.getSourceMatchInfoList().add(fileInfo);
// increment counter information associated with this grouping key
if (matchesMustHaves){
infoGroup.setMustHaveCount(infoGroup.getMustHaveCount()+1);
}
if (matchesMayHaves){
infoGroup.setMayHaveCount(infoGroup.getMayHaveCount()+1);
}
if (!matchesMustHaves && !matchesMayHaves){
infoGroup.setUnmatchedCount(infoGroup.getUnmatchedCount()+1);
}
}// end if (m.matches()){
}//end if sourceFile != null
}// end for (Source childSource:source.getChildSources())
// we don't need the keys to the map any more; just return the values
return groupMap.values();
}
/**
* Inspects candidate group to determine if it comprises instance of Format
* @param jhove2 framework configured with SourceFactory
* @return FormatIdentification for this group if it comprises instance of a Format;
* otherwise returns null;
* @throws JHOVE2Exception
*/
protected ClumpSource recognizeGroupedSource(GlobPathMatchInfoGroup fileGroup,
JHOVE2 jhove2)
throws JHOVE2Exception
{
FormatIdentification fi = null;
ClumpSource clumpSource = null;
if (fileGroup.getMustHaveCount() >= this.minMustHavesToIdentify){
fi = new FormatIdentification(this.format, GLOB_PATH_CONFIDENCE,
this.getReportableIdentifier());
if (jhove2.getSourceFactory() == null){
throw new JHOVE2Exception("JHOVE2 SourceFactory is null");
}
clumpSource = jhove2.getSourceFactory().getClumpSource(jhove2);
clumpSource = (ClumpSource) clumpSource.addPresumptiveFormat(fi);
for (GlobPathMatchInfo sourceInfo:fileGroup.getSourceMatchInfoList()){
if ((sourceInfo.isMustHave() || sourceInfo.isMayHave()) ||
(this.includeUnmatchedFromGroup)) {
Source sourceInfoSource = sourceInfo.getSource();
sourceInfoSource=clumpSource.addChildSource(sourceInfoSource);
sourceInfo.setSource(sourceInfoSource);
}
}
}
return clumpSource;
}
/**
* Compiles regular expression patterns used in globbing
* @throws JHOVE2Exception Any PatternSyntaxException is thrown as JHOVE2Exception and allowed
* to bubble up to stop processing, as configuration needs fixing
*/
protected void compilePatterns() throws JHOVE2Exception{
try {
this.fileGroupingPattern = Pattern.compile(this.fileGroupingExpr);
}
catch (PatternSyntaxException e){
throw new JHOVE2Exception("Exception thrown compiling fileGroupingToken: "
+ this.fileGroupingExpr, e);
}
if (this.mustHaveExpr != null){
try {
this.mustHavePattern = Pattern.compile(this.mustHaveExpr);
}
catch (PatternSyntaxException e){
throw new JHOVE2Exception("Exception thrown compiling mustHaveToken: "
+ this.mustHaveExpr, e);
}
}
if (this.mayHaveExpr != null){
try {
this.mayHavePattern = Pattern.compile(this.mayHaveExpr);
}
catch (PatternSyntaxException e){
throw new JHOVE2Exception("Exception thrown compiling mayHaveToken: "
+ this.fileGroupingExpr, e);
}
}
return;
}
/**
* Get identifier for Format which this recognizer can detect
* @return Format which this recognizer can detect
*/
@ReportableProperty(order = 2, value = "I8R for Format which this recognizer can detect")
public I8R getFormatIdentifier() {
return format;
}
/**
* Sets identifier for Format which this recognizer can detect
* @param format Format which this recognizer can detect
*/
public void setFormatIdentifier(I8R format) {
this.format = format;
}
/**
* Get String containing regular expression to group candidate files
* @return String containing regular expression to group candidate files
*/
@ReportableProperty(order = 2, value = "String containing regular expression to group candidate files")
public String getFileGroupingExpr() {
return fileGroupingExpr;
}
/**
* Set String containing regular expression to group candidate files
* @param groupingExpr Regular expression to group candidate files
*/
public void setFileGroupingExpr(String groupingExpr) {
this.fileGroupingExpr = groupingExpr;
}
/**
* Get String containing regular expression to identify required candidate files
* @return Regular expression to identify required candidate files
*/
@ReportableProperty(order = 2,
value = "String containing regular expression to identify required candidate files")
public String getMustHaveExpr() {
return mustHaveExpr;
}
/**
* Sets String containing regular expression to identify required candidate files
* @param mustHaveExpr Regular expression to identify required candidate files
*/
public void setMustHaveExpr(String mustHaveExpr) {
this.mustHaveExpr = mustHaveExpr;
}
/**
* Gets String containing regular expression to identify optional candidate files
* @return Regular expression to identify optional candidate files
*/
@ReportableProperty(order = 2,
value = "String containing regular expression to identify optional candidate files")
public String getMayHaveExpr() {
return mayHaveExpr;
}
/**
* Sets String containing regular expression to identify optional candidate files
* @param mayHaveExpr Regular expression to identify optional candidate files
*/
public void setMayHaveExpr(String mayHaveExpr) {
this.mayHaveExpr = mayHaveExpr;
}
/**
* Get minimum number of files that must match the mustHaveExpr in order for
* a set of Sources to be considered an instance of an aggregate Format.
* Allows us to identify potentially defective instances of a format
*
* @return minMustHavesToIdentify
*/
@ReportableProperty(order = 2, value="Minimum number of files that must match the mustHaveExp")
public int getMinMustHavesToIdentify() {
return minMustHavesToIdentify;
}
/**
* Sets minimum number of files that must match the mustHaveExpr in order for
* a set of Sources to be considered an instance of an aggregate Format.
* Allows us to identify potentially defective instances of a format
*
* @param minMustHavesToIdentify
*/
public void setMinMustHavesToIdentify(int minMustHavesToIdentify) {
this.minMustHavesToIdentify = minMustHavesToIdentify;
}
/**
* Get indicator as to whether or not to include in the Source that is part of the
* FormatIdentification returned by this class any files which match the grouping expression,
* but do not match either must mustHaveExpr or mayHaveExpr
*
* @return boolean
*/
@ReportableProperty(order = 2,
value="Indicates whether or not to include any files which match the grouping expression but do not match either must mustHaveExpr or mayHaveExpr")
public boolean isIncludeUnmatchedFromGroup() {
return includeUnmatchedFromGroup;
}
/**
* Sets indicator as to whether or not to include in the Source that is part of the
* FormatIdentification returned by this class any files which match the grouping expression,
* but do not match either must mustHaveExpr or mayHaveExpr
*
* @param includeUnmatchedFromGroup
*/
public void setIncludeUnmatchedFromGroup(boolean includeUnmatchedFromGroup) {
this.includeUnmatchedFromGroup = includeUnmatchedFromGroup;
}
/**
* Gets capture group index in fileGroupingToken which captures the part of
* the file path which we will be comparing for "must-have" files
*
* @return mustHaveCaptureGroupIndex
*/
@ReportableProperty(order = 2,
value = "Capture group index in fileGroupingToken for the part of the file path which we will be comparing for must-have files")
public int getMustHaveCaptureGroupIndex() {
return mustHaveCaptureGroupIndex;
}
/**
* Sets capture group index in fileGroupingToken which captures the part of
* the file path which we will be comparing for "must-have" files
*
* @param mustHaveCaptureGroupIndex
*/
public void setMustHaveCaptureGroupIndex(int mustHaveCaptureGroupIndex) {
this.mustHaveCaptureGroupIndex = mustHaveCaptureGroupIndex;
}
/**
* Get capture group index in fileGroupingToken which captures the part of
* the file path which we will be comparing for "may-have" files
*
* @return mayHaveCaptureGroupIndex
*/
@ReportableProperty(order = 2,
value="capture group index in fileGroupingToken which captures the part of the file path which we will be comparing for may-have files")
public int getMayHaveCaptureGroupIndex() {
return mayHaveCaptureGroupIndex;
}
/**
* Set capture group index in fileGroupingToken which captures the part of
* the file path which we will be comparing for "may-have" files
*
* @param mayHaveCaptureGroupIndex
*/
public void setMayHaveCaptureGroupIndex(int mayHaveCaptureGroupIndex) {
this.mayHaveCaptureGroupIndex = mayHaveCaptureGroupIndex;
}
/**
* Get capture group index in fileGroupingToken which captures the part of
* the file path which indicates related files
*
* @return fileGroupingCaptureGroupIndex
*/
@ReportableProperty(order = 2,
value = "Capture group index in fileGroupingToken which captures the part of the file path which indicates related files")
public int getFileGroupingCaptureGroupIndex() {
return fileGroupingCaptureGroupIndex;
}
/**
* Set capture group index in fileGroupingToken which captures the part of
* the file path which indicates related files
*
* @param fileGroupingCaptureGroupIndex
*/
public void setFileGroupingCaptureGroupIndex(int fileGroupingCaptureGroupIndex) {
this.fileGroupingCaptureGroupIndex = fileGroupingCaptureGroupIndex;
}
}