/*
 * Copyright 2012, CMM, University of Queensland.
 *
 * This file is part of Paul.
 *
 * Paul is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Paul is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Paul. If not, see <http://www.gnu.org/licenses/>.
 */
package au.edu.uq.cmm.paul.grabber;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

import javax.persistence.EntityManager;
import javax.persistence.EntityManagerFactory;
import javax.persistence.TypedQuery;

import org.apache.commons.collections.Predicate;
import org.apache.commons.collections.PredicateUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import au.edu.uq.cmm.paul.Paul;
import au.edu.uq.cmm.paul.queue.QueueManager.DateRange;
import au.edu.uq.cmm.paul.status.Facility;
import au.edu.uq.cmm.paul.status.FacilityStatusManager;
import au.edu.uq.cmm.paul.watcher.UncPathnameMapper;

/**
 * This variation on the DataGrabber gathers DatasetMetadata records for all
 * files in a facility's directory tree and compares them against the records
 * in the database.  The analyser also performs some basic integrity checks
 * on the queue.
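 * <p>
 * A sketch of typical usage (the date and range arguments here are
 * illustrative placeholders, assumed to be supplied by the caller):
 * <pre>{@code
 * Analyser analyser = new Analyser(services, facility);
 * analyser.analyse(lwmTimestamp, hwmTimestamp, queueRange, true);
 * Statistics stats = analyser.getAll();
 * Problems problems = analyser.getProblems();
 * }</pre>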
 *
 * @author scrawley
 */
public class Analyser extends AbstractFileGrabber {
    private static Logger LOG = LoggerFactory.getLogger(Analyser.class);

    public enum ProblemType {
        METADATA_MISSING, METADATA_SIZE, FILE_MISSING, FILE_SIZE,
        FILE_SIZE_2, FILE_HASH, FILE_HASH_2, IO_ERROR;
    }

    // Orders datasets by source pathname base, then by last file timestamp.
    private static final Comparator<DatasetMetadata> ORDER_BY_BASE_PATH_AND_TIME =
            new Comparator<DatasetMetadata>() {
        @Override
        public int compare(DatasetMetadata o1, DatasetMetadata o2) {
            int res = o1.getSourceFilePathnameBase().compareTo(
                    o2.getSourceFilePathnameBase());
            if (res == 0) {
                res = Long.compare(
                        o1.getLastFileTimestamp().getTime(),
                        o2.getLastFileTimestamp().getTime());
            }
            return res;
        }
    };

    // As above, but with the dataset id as a final tie-breaker.
    private static final Comparator<DatasetMetadata> ORDER_BY_BASE_PATH_AND_TIME_AND_ID =
            new Comparator<DatasetMetadata>() {
        @Override
        public int compare(DatasetMetadata o1, DatasetMetadata o2) {
            int res = o1.getSourceFilePathnameBase().compareTo(
                    o2.getSourceFilePathnameBase());
            if (res == 0) {
                res = Long.compare(
                        o1.getLastFileTimestamp().getTime(),
                        o2.getLastFileTimestamp().getTime());
            }
            if (res == 0) {
                res = o1.getId().compareTo(o2.getId());
            }
            return res;
        }
    };

    private BlockingQueue<Runnable> queue = new LinkedBlockingQueue<Runnable>();
    private FacilityStatusManager fsm;
    private EntityManagerFactory emf;
    private UncPathnameMapper uncNameMapper;

    private List<Group> grouped;
    private Statistics all;
    private Statistics beforeLWM;
    private Statistics intertidal;
    private Statistics afterHWM;
    private Problems problems;
    private Statistics beforeQStart;
    private Statistics inQueue;
    private Statistics afterQEnd;
    private Date lwm;
    private Date hwm;
    private Date qStart;
    private Date qEnd;
    private Date fStart;
    private Date fEnd;
    private boolean checkHashes;

    public Analyser(Paul services, Facility facility) {
        super(services, facility);
        fsm = services.getFacilityStatusManager();
        uncNameMapper = services.getUncNameMapper();
        emf = services.getEntityManagerFactory();
    }
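    /**
     * Run the analysis: build dataset metadata snapshots from the facility's
     * folder tree and from the database, group them by source pathname base,
     * gather overall statistics plus breakdowns against the low / high water
     * mark timestamps and the queue date range (when supplied), and finally
     * run the queue integrity checks.
     *
     * @param lwmTimestamp the low water mark timestamp, or {@code null}
     * @param hwmTimestamp the high water mark timestamp, or {@code null}
     * @param queueRange the date range covered by the queue, or {@code null}
     * @param checkHashes if {@code true}, also verify file hashes
     * @return this Analyser, to allow call chaining
     */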
    public Analyser analyse(Date lwmTimestamp, Date hwmTimestamp,
            DateRange queueRange, boolean checkHashes) {
        this.lwm = lwmTimestamp;
        this.hwm = hwmTimestamp;
        if (queueRange == null) {
            this.qStart = null;
            this.qEnd = null;
        } else {
            this.qStart = queueRange.getFromDate();
            this.qEnd = queueRange.getToDate();
        }
        this.checkHashes = checkHashes;
        LOG.info("Analysing queues and folders for " + getFacility().getFacilityName());
        SortedSet<DatasetMetadata> inFolder = buildInFolderMetadata();
        SortedSet<DatasetMetadata> inDatabase = buildInDatabaseMetadata();
        LOG.debug("Got " + inFolder.size() + " in folders and " +
                inDatabase.size() + " in database");
        LOG.info("Grouping datasets for " + getFacility().getFacilityName());
        grouped = groupDatasets(inFolder, inDatabase);
        LOG.debug("Got " + grouped.size() + " groups");
        LOG.info("Gathering statistics for " + getFacility().getFacilityName());
        determineFolderRange(inFolder);
        all = gatherStats(grouped, PredicateUtils.truePredicate());
        if (hwmTimestamp == null || lwmTimestamp == null) {
            beforeLWM = null;
            afterHWM = null;
            intertidal = null;
        } else {
            final long lwmTime = lwmTimestamp.getTime();
            beforeLWM = gatherStats(grouped, new Predicate() {
                public boolean evaluate(Object metadata) {
                    return ((DatasetMetadata) metadata).getLastFileTimestamp().getTime() < lwmTime;
                }
            });
            final long hwmTime = hwmTimestamp.getTime();
            afterHWM = gatherStats(grouped, new Predicate() {
                public boolean evaluate(Object metadata) {
                    return ((DatasetMetadata) metadata).getLastFileTimestamp().getTime() > hwmTime;
                }
            });
            intertidal = gatherStats(grouped, new Predicate() {
                public boolean evaluate(Object metadata) {
                    long time = ((DatasetMetadata) metadata).getLastFileTimestamp().getTime();
                    return time >= lwmTime && time <= hwmTime;
                }
            });
        }
        if (queueRange == null) {
            afterQEnd = null;
            beforeQStart = null;
            inQueue = null;
        } else {
            final long qStart = this.qStart.getTime();
            beforeQStart = gatherStats(grouped, new Predicate() {
                public boolean evaluate(Object metadata) {
                    return ((DatasetMetadata) metadata).getLastFileTimestamp().getTime() < qStart;
                }
            });
            final long qEnd = this.qEnd.getTime();
            afterQEnd = gatherStats(grouped, new Predicate() {
                public boolean evaluate(Object metadata) {
                    return ((DatasetMetadata) metadata).getLastFileTimestamp().getTime() > qEnd;
                }
            });
            inQueue = gatherStats(grouped, new Predicate() {
                public boolean evaluate(Object metadata) {
                    long ts = ((DatasetMetadata) metadata).getLastFileTimestamp().getTime();
                    return ts >= qStart && ts <= qEnd;
                }
            });
        }
        LOG.info("Performing queue entry integrity checks for " +
                getFacility().getFacilityName());
        problems = integrityCheck(grouped);
        return this;
    }

    /**
     * Find the earliest and latest last-file timestamps among the datasets
     * found in the folder tree.
     */
    private void determineFolderRange(SortedSet<DatasetMetadata> inFolder) {
        if (inFolder.isEmpty()) {
            fStart = null;
            fEnd = null;
        } else {
            Iterator<DatasetMetadata> it = inFolder.iterator();
            DatasetMetadata ds = it.next();
            fStart = fEnd = ds.getLastFileTimestamp();
            while (it.hasNext()) {
                ds = it.next();
                Date ts = ds.getLastFileTimestamp();
                if (ts.getTime() < fStart.getTime()) {
                    fStart = ts;
                } else if (ts.getTime() > fEnd.getTime()) {
                    fEnd = ts;
                }
            }
        }
    }
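    /**
     * Check each group's most recent queue entry against the file system:
     * the admin metadata file must exist and be non-empty, each captured
     * datafile must exist with the size (and, optionally, hash) recorded in
     * its metadata, and if the original source file still exists its size
     * and hash are checked as well.
     */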
    private Problems integrityCheck(List<Group> grouped) {
        List<Problem> problems = new ArrayList<Problem>();
        for (Group group : grouped) {
            // Check only the latest queue entry.  Older ones are not really
            // relevant, and besides they typically have the "problem" that
            // one or more captured component datafiles no longer matches the
            // in-folder dataset.  (Which has typically been recaptured.)
            DatasetMetadata dataset = group.getLatestInDatabase();
            if (dataset == null) {
                continue;
            }
            File adminFile = new File(dataset.getMetadataFilePathname());
            if (!adminFile.exists()) {
                logProblem(dataset, null, ProblemType.METADATA_MISSING, problems,
                        "Metadata file missing: " + adminFile);
            } else if (adminFile.length() == 0) {
                logProblem(dataset, null, ProblemType.METADATA_SIZE, problems,
                        "Metadata file empty: " + adminFile);
            }
            for (DatafileMetadata datafile : dataset.getDatafiles()) {
                try {
                    String hash = checkHashes ? datafile.getDatafileHash() : null;
                    if (checkHashes) {
                        LOG.debug("stored hash - " + hash);
                    }
                    File file = new File(datafile.getCapturedFilePathname());
                    if (!file.exists()) {
                        logProblem(dataset, datafile, ProblemType.FILE_MISSING, problems,
                                "Data file missing: " + file);
                    } else if (file.length() != datafile.getFileSize()) {
                        logProblem(dataset, datafile, ProblemType.FILE_SIZE, problems,
                                "Data file size mismatch: " + file +
                                ": admin metadata says " + datafile.getFileSize() +
                                " but actual captured file size is " + file.length());
                    } else if (hash != null && !hash.equals(HashUtils.fileHash(file))) {
                        logProblem(dataset, datafile, ProblemType.FILE_HASH, problems,
                                "Data file hash mismatch between metadata and " + file);
                    } else if (checkHashes) {
                        LOG.debug("captured hash - " + HashUtils.fileHash(file));
                    }
                    File source = new File(datafile.getSourceFilePathname());
                    if (source.exists()) {
                        if (source.length() != datafile.getFileSize()) {
                            logProblem(dataset, datafile, ProblemType.FILE_SIZE_2, problems,
                                    "Data file size mismatch: " + file +
                                    ": original file size is " + source.length() +
                                    " but admin metadata says " + datafile.getFileSize());
                        } else if (hash != null && !hash.equals(HashUtils.fileHash(source))) {
                            logProblem(dataset, datafile, ProblemType.FILE_HASH_2, problems,
                                    "Data file hash mismatch between metadata and " + source);
                        } else if (checkHashes) {
                            LOG.debug("source hash - " + HashUtils.fileHash(source));
                        }
                    }
                } catch (IOException ex) {
                    LOG.error("Unexpected IOException while checking hashes", ex);
                    logProblem(dataset, datafile, ProblemType.IO_ERROR, problems,
                            "IO error while checking file hashes - see logs");
                }
            }
        }
        LOG.info("Queue integrity check for '" + getFacility().getFacilityName() +
                "' found " + problems.size() + " problems (listed above)");
        return new Problems(problems);
    }

    private void logProblem(DatasetMetadata dataset, DatafileMetadata datafile,
            ProblemType type, List<Problem> list, String details) {
        LOG.info("Problem in dataset #" + dataset.getId() + ": " + details);
        list.add(new Problem(dataset, datafile, type, details));
    }

    private Statistics gatherStats(List<Group> grouped, Predicate predicate) {
        int datasetsInFolder = 0;
        int datasetsInDatabase = 0;
        int datasetsUnmatchedInFolder = 0;
        int groupsUnmatchedInDatabase = 0;
        int groupsWithDuplicatesInDatabase = 0;
        int groupsInDatabase = 0;
        for (Group group : grouped) {
            if (group.getInFolder() != null && predicate.evaluate(group.getInFolder())) {
                datasetsInFolder++;
                if (group.getAllInDatabase().size() == 0) {
                    datasetsUnmatchedInFolder++;
                }
            }
            int inDatabase = 0;
            boolean matched = false;
            for (DatasetMetadata dataset : group.getAllInDatabase()) {
                if (predicate.evaluate(dataset)) {
                    inDatabase++;
                    if (group.getInFolder() != null && matches(group.getInFolder(), dataset)) {
                        matched = true;
                    }
                }
            }
            datasetsInDatabase += inDatabase;
            if (!matched && group.getInFolder() != null) {
                groupsUnmatchedInDatabase++;
            }
            if (inDatabase > 1) {
                groupsWithDuplicatesInDatabase++;
            }
            if (inDatabase > 0) {
                groupsInDatabase++;
            }
        }
        return new Statistics(datasetsInFolder, datasetsInDatabase,
                groupsInDatabase, groupsWithDuplicatesInDatabase,
                datasetsUnmatchedInFolder, groupsUnmatchedInDatabase);
    }

    static boolean matches(DatasetMetadata d1, DatasetMetadata d2) {
        return d1.getSourceFilePathnameBase().equals(d2.getSourceFilePathnameBase()) &&
                d1.getLastFileTimestamp().getTime() == d2.getLastFileTimestamp().getTime();
    }
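    /**
     * Build the group list: first group the database datasets by source
     * pathname base, then merge in the in-folder datasets.  Both inputs are
     * sorted by base pathname, so the merge is a single linear pass over
     * the two sequences.
     */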
    private List<Group> groupDatasets(
            Collection<DatasetMetadata> inFolder,
            Collection<DatasetMetadata> inDatabase) {
        ArrayList<Group> groups = createGroupsFromDatabase(inDatabase);
        groups = mergeGroupsFromFolder(groups, inFolder);
        return groups;
    }

    private ArrayList<Group> createGroupsFromDatabase(
            Collection<DatasetMetadata> inDatabase) {
        ArrayList<Group> groups = new ArrayList<Group>();
        Group group = null;
        for (DatasetMetadata dataset : inDatabase) {
            if (!intertidal(dataset.getCaptureTimestamp()) &&
                    !intertidal(dataset.getLastFileTimestamp())) {
                continue;
            }
            String pathname = dataset.getSourceFilePathnameBase();
            if (group == null || !group.getBasePathname().equals(pathname)) {
                group = new Group(pathname);
                groups.add(group);
            }
            group.addInDatabase(dataset);
        }
        return groups;
    }

    private boolean intertidal(Date timestamp) {
        return (lwm == null || timestamp.getTime() >= lwm.getTime()) &&
                (hwm == null || timestamp.getTime() <= hwm.getTime());
    }

    private ArrayList<Group> mergeGroupsFromFolder(ArrayList<Group> groups,
            Collection<DatasetMetadata> inFolder) {
        ArrayList<Group> res = new ArrayList<Group>();
        Iterator<Group> git = groups.iterator();
        Iterator<DatasetMetadata> dit = inFolder.iterator();
        Group group = git.hasNext() ? git.next() : null;
        DatasetMetadata dataset = dit.hasNext() ? dit.next() : null;
        while (group != null || dataset != null) {
            if (dataset == null) {
                res.add(group);
                group = git.hasNext() ? git.next() : null;
            } else if (group == null) {
                if (intertidal(dataset.getLastFileTimestamp())) {
                    Group newGroup = new Group(dataset.getSourceFilePathnameBase());
                    newGroup.setInFolder(dataset);
                    res.add(newGroup);
                }
                dataset = dit.hasNext() ? dit.next() : null;
            } else {
                int cmp = group.getBasePathname().compareTo(
                        dataset.getSourceFilePathnameBase());
                if (cmp == 0) {
                    res.add(group);
                    group.setInFolder(dataset);
                    group = git.hasNext() ? git.next() : null;
                    dataset = dit.hasNext() ? dit.next() : null;
                } else if (cmp < 0) {
                    res.add(group);
                    group = git.hasNext() ? git.next() : null;
                } else {
                    if (intertidal(dataset.getLastFileTimestamp())) {
                        Group newGroup = new Group(dataset.getSourceFilePathnameBase());
                        newGroup.setInFolder(dataset);
                        res.add(newGroup);
                    }
                    dataset = dit.hasNext() ? dit.next() : null;
                }
            }
        }
        return res;
    }
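    /**
     * Fetch the facility's DatasetMetadata records (with their datafiles)
     * into a set sorted by base pathname, timestamp and id.  The call to
     * {@code getDatafiles().size()} is presumably there to ensure the
     * datafile collection is initialized before the EntityManager closes.
     */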
    private SortedSet<DatasetMetadata> buildInDatabaseMetadata() {
        TreeSet<DatasetMetadata> inDatabase =
                new TreeSet<DatasetMetadata>(ORDER_BY_BASE_PATH_AND_TIME_AND_ID);
        EntityManager em = emf.createEntityManager();
        try {
            TypedQuery<DatasetMetadata> query = em.createQuery(
                    "from DatasetMetadata m left join fetch m.datafiles " +
                    "where m.facilityName = :name",
                    DatasetMetadata.class);
            query.setParameter("name", getFacility().getFacilityName());
            for (DatasetMetadata ds : query.getResultList()) {
                if (inDatabase.add(ds)) {
                    ds.getDatafiles().size();
                }
            }
        } finally {
            em.close();
        }
        return inDatabase;
    }

    private SortedSet<DatasetMetadata> buildInFolderMetadata() {
        TreeSet<DatasetMetadata> inFolder =
                new TreeSet<DatasetMetadata>(ORDER_BY_BASE_PATH_AND_TIME);
        String folderName = getFacility().getFolderName();
        if (folderName == null) {
            return inFolder;
        }
        File localDir = uncNameMapper.mapUncPathname(folderName);
        if (localDir == null) {
            return inFolder;
        }
        fsm.getStatus(getFacility()).setLocalDirectory(localDir);
        analyseTree(localDir, Long.MIN_VALUE, Long.MAX_VALUE);
        for (Runnable runnable : queue) {
            WorkEntry entry = (WorkEntry) runnable;
            SessionDetails session = fsm.getSessionDetails(
                    getFacility(), entry.getTimestamp().getTime(), entry.getBaseFile());
            entry.pretendToGrabFiles();
            inFolder.add(entry.assembleDatasetMetadata(null, session, new File("")));
        }
        return inFolder;
    }

    @Override
    protected void enqueueWorkEntry(WorkEntry entry) {
        queue.add(entry);
    }

    public final List<Group> getGrouped() {
        return grouped;
    }

    public final Statistics getAll() {
        return all;
    }

    public final Statistics getBeforeLWM() {
        return beforeLWM;
    }

    public final Statistics getIntertidal() {
        return intertidal;
    }

    public final Statistics getAfterHWM() {
        return afterHWM;
    }

    public final Statistics getBeforeQStart() {
        return beforeQStart;
    }

    public final Statistics getInQueue() {
        return inQueue;
    }

    public final Statistics getAfterQEnd() {
        return afterQEnd;
    }

    public final Problems getProblems() {
        return problems;
    }

    public final Date getLWM() {
        return lwm;
    }

    public final Date getHWM() {
        return hwm;
    }

    public final Date getqStart() {
        return qStart;
    }

    public final Date getqEnd() {
        return qEnd;
    }

    public final Date getfStart() {
        return fStart;
    }

    public final Date getfEnd() {
        return fEnd;
    }

    public final void setProblems(Problems problems) {
        this.problems = problems;
    }

    @Override
    protected boolean isShutDown() {
        return false;
    }
}