/* * Copyright 2012-2013, CMM, University of Queensland. * * This file is part of Paul. * * Paul is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Paul is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Paul. If not, see <http://www.gnu.org/licenses/>. */ package au.edu.uq.cmm.paul.grabber; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.nio.channels.FileLock; import java.nio.channels.FileLockInterruptionException; import java.util.ArrayList; import java.util.Date; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.codehaus.jackson.JsonGenerationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import au.edu.uq.cmm.paul.GrabberFacilityConfig.FileArrivalMode; import au.edu.uq.cmm.paul.Paul; import au.edu.uq.cmm.paul.PaulException; import au.edu.uq.cmm.paul.queue.QueueFileException; import au.edu.uq.cmm.paul.queue.QueueFileManager; import au.edu.uq.cmm.paul.queue.QueueManager; import au.edu.uq.cmm.paul.status.DatafileTemplate; import au.edu.uq.cmm.paul.status.Facility; import au.edu.uq.cmm.paul.status.FacilityStatus; import au.edu.uq.cmm.paul.status.FacilityStatusManager; import au.edu.uq.cmm.paul.watcher.FileWatcherEvent; /** * This class represents a unit of work for the {@link FileGrabber} executor; * i.e. a dataset to be "grabbed". 
This class does most of the work of grabbing
 * and creation of the queue entries.
 *
 * @author scrawley
 */
class WorkEntry implements Runnable {
    private static final Logger LOG = LoggerFactory.getLogger(WorkEntry.class);

    private static final long DEFAULT_GRABBER_TIMEOUT = 600000; // milliseconds == 10 minutes

    private final long grabberTimeout;
    private final FileGrabber fileGrabber;
    private final QueueManager queueManager;
    private final QueueFileManager fileManager;
    private final FacilityStatusManager statusManager;
    private final File baseFile;
    private final String instrumentBasePath;
    // ConcurrentHashMap so that addEvent() can add entries while grabFiles()
    // iterates; its iterators are weakly consistent and never throw CME.
    private final Map<File, GrabbedFile> files;
    private final Facility facility;
    private int settling;
    private Date timestamp;
    private long latestFileTimestamp = 0L;
    private final boolean catchup;
    private Thread grabberThread;
    private boolean grabAborted;
    private boolean pretending;

    public WorkEntry(Paul services, FileWatcherEvent event, File baseFile) {
        this.facility = (Facility) event.getFacility();
        this.timestamp = new Date(event.getTimestamp());
        this.statusManager = services.getFacilityStatusManager();
        this.fileGrabber = statusManager.getStatus(facility).getFileGrabber();
        this.queueManager = services.getQueueManager();
        this.fileManager = queueManager.getFileManager();
        this.baseFile = baseFile;
        this.instrumentBasePath = mapToInstrumentPath(facility, baseFile);
        this.files = new ConcurrentHashMap<File, GrabbedFile>();
        long timeout = services.getConfiguration().getGrabberTimeout();
        this.grabberTimeout = timeout == 0 ? DEFAULT_GRABBER_TIMEOUT : timeout;
        this.catchup = event.isCatchup();
        settling = facility.getFileSettlingTime();
        if (settling <= 0) {
            settling = FileGrabber.DEFAULT_FILE_SETTLING_TIME;
        }
        addEvent(event);
    }

    /**
     * Map a pathname under the facility's local (grabber-side) directory to the
     * equivalent Windows-style pathname as seen from the instrument.
     *
     * @param facility the facility whose drive mapping applies
     * @param file a file under the facility's local directory
     * @return the instrument-side pathname, e.g. {@code "X:\\dir\\file"}
     * @throws PaulException if the file is not under the local directory
     */
    private String mapToInstrumentPath(Facility facility, File file) {
        String filePath = file.getAbsolutePath();
        FacilityStatus status = statusManager.getStatus(facility);
        String directoryPath = status.getLocalDirectory().getAbsolutePath();
        if (!filePath.startsWith(directoryPath)) {
            // FIXED: the message previously lacked the closing quote after
            // the directory path.
            throw new PaulException("Bad path base: '" + filePath +
                    "' does not start with '" + directoryPath + "'");
        }
        // This is a hack, but I can't use `File` to generate a Windows-style
        // pathname on Unix / Linux.
        filePath = filePath.substring(directoryPath.length());
        filePath = filePath.replaceAll("/", "\\\\");
        return facility.getDriveName() + ":" + filePath;
    }

    /**
     * Record a file event against this work entry.  If a grab is already in
     * progress, the grabber thread is interrupted so that the grab restarts
     * with the updated file set.  Files are only added if they match one of
     * the facility's datafile templates (or unconditionally if the facility
     * has no templates).
     */
    public void addEvent(FileWatcherEvent event) {
        File file = event.getFile();
        LOG.debug("Processing event for file " + file);
        synchronized (this) {
            if (grabberThread != null) {
                LOG.debug("A late file event arrived for file " + file +
                        ": interrupting the grabber");
                grabberThread.interrupt();
            }
            boolean matched = false;
            List<DatafileTemplate> templates = facility.getDatafileTemplates();
            if (templates.isEmpty()) {
                // No templates configured: grab every file we are told about.
                // Note the GrabbedFile gets a null template in this case.
                if (!files.containsKey(file)) {
                    files.put(file, new GrabbedFile(file, file, null));
                    LOG.debug("Added file " + file + " to map for grabbing");
                } else {
                    LOG.debug("File " + file + " already in map for grabbing");
                }
                updateLatestFileTimestamp(file);
            } else {
                // Grab only files matching one of the facility's templates.
                for (DatafileTemplate template : templates) {
                    Pattern pattern = template.getCompiledFilePattern(
                            facility.isCaseInsensitive());
                    Matcher matcher = pattern.matcher(file.getAbsolutePath());
                    if (matcher.matches()) {
                        if (!files.containsKey(file)) {
                            files.put(file, new GrabbedFile(
                                    new File(matcher.group(1)), file, template));
                            LOG.debug("Added file " + file + " to map for grabbing");
                        } else {
                            LOG.debug("File " + file + " already in map for grabbing");
                        }
                        updateLatestFileTimestamp(file);
                        matched = true;
                        break;
                    }
                }
                if (!matched) {
                    LOG.debug("File " + file + " didn't match any template - ignoring");
                }
            }
        }
    }

    // Track the most recent lastModified() seen across the dataset's files.
    private void updateLatestFileTimestamp(File file) {
        long timestamp = file.lastModified();
        if (timestamp > latestFileTimestamp) {
            latestFileTimestamp = timestamp;
        }
    }

    public long getLatestFileTimestamp() {
        return latestFileTimestamp;
    }

    public final Date getTimestamp() {
        return timestamp;
    }

    public final void setTimestamp(Date timestamp) {
        this.timestamp = timestamp;
    }

    public File getBaseFile() {
        return baseFile;
    }

    /**
     * Process this work entry: wait for the dataset to settle and complete,
     * then grab the files and enqueue the dataset.  On interrupt (a late file
     * event, or shutdown) any partially grabbed files are tidied away and the
     * entry is requeued for another attempt.
     */
    @Override
    public void run() {
        LOG.info("Processing workEntry for " + baseFile);
        try {
            synchronized (this) {
                if (grabberThread == null) {
                    grabAborted = false;
                    grabberThread = Thread.currentThread();
                } else {
                    // Another thread already owns this entry.
                    LOG.debug("WorkEntry already being processed by " +
                            grabberThread.getName());
                    return;
                }
            }
            // Wait for events to settle, and all mandatory files to appear
            if (!datasetCompleted()) {
                return;
            }
            if (grabFiles(false) != null) {
                statusManager.advanceHWMTimestamp(facility, timestamp);
            }
        } catch (InterruptedException ex) {
            LOG.debug("Handling interrupt on workEntry thread", ex);
            grabAborted = true;
            try {
                deleteGrabbedFiles();
            } catch (InterruptedException ex2) {
                LOG.info("Interrupted while tidying up grabbed files");
            }
        } catch (Throwable ex) {
            LOG.error("unexpected exception", ex);
            return;
        } finally {
            synchronized (this) {
                if (grabAborted) {
                    // Requeue so that the grab is retried from scratch.
                    fileGrabber.enqueueWorkEntry(this);
                } else {
                    fileGrabber.remove(baseFile);
                }
                grabberThread = null;
            }
        }
        LOG.debug("Finished processing workEntry for " + baseFile);
    }

    /**
     * Grab all of the dataset's files (optionally under a shared file lock)
     * and save the dataset metadata.
     *
     * @param regrabbing {@code true} when re-grabbing an existing dataset;
     *        pre-grab checks are skipped and no queue entry is added here.
     * @return the dataset metadata, or {@code null} if the entry was not
     *         grabbable.
     */
    public DatasetMetadata grabFiles(boolean regrabbing)
            throws InterruptedException, IOException, QueueFileException {
        // Perform pre-grab checks
        if (!regrabbing && !isGrabbable()) {
            LOG.debug("WorkEntry is not grabbable");
            return null;
        }
        // Prepare for grabbing
        LOG.debug("WorkEntry.grabFiles has " + files.size() + " files to grab");
        SessionDetails session = statusManager.getSessionDetails(
                facility, timestamp.getTime(), baseFile);
        // Optionally lock the files, then grab them.
        for (GrabbedFile file : files.values()) {
            if (Thread.interrupted()) {
                throw new InterruptedException("Interrupted in grabFiles()");
            }
            try (FileInputStream is = new FileInputStream(file.getFile())) {
                if (facility.isUseFileLocks()) {
                    LOG.debug("acquiring lock on " + file);
                    // Shared (read) lock over the whole file.
                    try (FileLock lock = is.getChannel().lock(0, Long.MAX_VALUE, true)) {
                        LOG.debug("locked " + file);
                        doGrabFile(file, is, regrabbing);
                    }
                    LOG.debug("unlocked " + file);
                } else {
                    doGrabFile(file, is, regrabbing);
                }
            } catch (FileLockInterruptionException ex) {
                LOG.debug("Lock interrupted", ex);
                throw new InterruptedException("Interrupted in grabFiles()");
            } catch (IOException ex) {
                // This is probably "mostly harmless"; e.g. a tempfile that
                // disappears, or a file or directory with bad permissions.
                LOG.info("Unexpected IO Error: " + ex);
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Cause of previous exception", ex);
                }
            }
        }
        try {
            if (Thread.interrupted()) {
                throw new InterruptedException("Interrupted in grabFiles()");
            }
            return saveMetadata(timestamp, session, regrabbing);
        } catch (JsonGenerationException ex) {
            throw new PaulException(ex);
        }
    }

    /**
     * Decide whether this entry should be grabbed at all, pruning files that
     * were already captured by a previous dataset with the same base file.
     *
     * @return {@code false} if the entry falls before the grabber LWM (rsync
     *         mode) or if every candidate file is already in a queued dataset.
     */
    private boolean isGrabbable() {
        if (facility.getFileArrivalMode() == FileArrivalMode.DIRECT) {
            return true;
        }
        if (facility.getFileArrivalMode() == FileArrivalMode.RSYNC) {
            // If the latest file modification date in the putative dataset is
            // before the LWM, we are not interested in it.
            // FIXED: 'latest' was initialized to Long.MAX_VALUE, which made
            // 'modified > latest' unsatisfiable, so the LWM check below could
            // never fire.  Initialize to 0L (same convention as
            // latestFileTimestamp) so we actually track the maximum.
            long latest = 0L;
            for (GrabbedFile file : files.values()) {
                long modified = file.getFile().lastModified();
                if (modified > latest) {
                    latest = modified;
                }
            }
            FacilityStatus status = statusManager.getStatus(facility);
            if (latest < status.getGrabberLWMTimestamp().getTime()) {
                LOG.debug("WorkEntry falls before Grabber LWM");
                return false;
            }
        }
        // Look for existing Datasets in the queue with the same baseFile.
        List<DatasetMetadata> possibles = queueManager.lookupDatasets(baseFile.toString());
        if (possibles.size() == 0) {
            LOG.debug("WorkEntry has no possible previous Datasets");
            return true;
        }
        // Trawl through the existing Datasets, knocking out any files in the
        // grab list that are already in a Dataset
        for (DatasetMetadata dm : possibles) {
            for (DatafileMetadata df : dm.getDatafiles()) {
                for (GrabbedFile gf : files.values()) {
                    File file = gf.getFile();
                    LOG.debug("Comparing " + file.toString() + " and " +
                            df.getSourceFilePathname());
                    try {
                        if (!file.toString().equals(df.getSourceFilePathname())) {
                            LOG.debug("No matching file");
                            continue;
                        }
                        if (!df.getDatafileHash().equals(gf.computeFileHash())) {
                            LOG.debug("Hashes don't match");
                            continue;
                        }
                        LOG.debug("Files have same name and hash");
                        // Safe during iteration: ConcurrentHashMap.
                        files.remove(file);
                        if (files.isEmpty()) {
                            return false;
                        }
                        break;
                    } catch (IOException ex) {
                        // We ran into a problem calculating the file's hash ...
                        LOG.warn("Unexpected exception", ex);
                    }
                }
            }
        }
        return true;
    }

    /**
     * This method is used by the CatchupAnalyser to decorate the
     * entry with sufficient information that we can generate a
     * passable metadata record for it.
     */
    public void pretendToGrabFiles() {
        pretending = true;
        for (GrabbedFile file : files.values()) {
            Date now = new Date();
            Date fileTimestamp = new Date(file.getFile().lastModified());
            file.setCopiedFile(file.getFile());
            file.setFileTimestamp(fileTimestamp);
            file.setCopyTimestamp(now);
        }
    }

    /**
     * This method is called on an interrupt to tidy up any data files that were
     * captured or being captured.  If we are 'pretending', don't do anything
     * because the "pretend copied" files are actually the precious original
     * files!
     *
     * @throws InterruptedException propagated from the file manager
     */
    private void deleteGrabbedFiles() throws InterruptedException {
        if (!pretending) {
            for (GrabbedFile file : files.values()) {
                File copied = file.getCopiedFile();
                if (copied != null) {
                    try {
                        fileManager.removeFile(copied);
                    } catch (QueueFileException ex) {
                        LOG.warn("Problem while tidying up grabbed files", ex);
                    }
                }
            }
        }
    }

    /**
     * Wait (up to the grabber timeout) until the dataset has settled and all
     * non-optional template files are present and big enough, then prune any
     * undersized files.
     *
     * @return {@code true} if the dataset is complete and non-empty.
     */
    private boolean datasetCompleted() throws InterruptedException {
        // Wait until the dataset is completed.
        boolean incomplete = true;
        long start = System.currentTimeMillis();
        long limit = grabberTimeout < 0 ? Long.MAX_VALUE : start + grabberTimeout;
        do {
            if (!catchup) {
                // Wait for the file modification interrupts stop arriving ... plus
                // the settling time.
                while (true) {
                    try {
                        Thread.sleep(settling);
                        break;
                    } catch (InterruptedException ex) {
                        // A late file event interrupt: keep waiting unless we
                        // are shutting down.
                        if (fileGrabber.isShutDown()) {
                            throw ex;
                        }
                    }
                }
                // We don't need to abort the grab because we haven't started yet.
                grabAborted = false;
            }
            // Check that the dataset's non-optional files are all present,
            // and that they meet the minimum size requirements.
            incomplete = false;
            for (DatafileTemplate template : facility.getDatafileTemplates()) {
                if (template.isOptional()) {
                    continue;
                }
                Pattern pattern = template.getCompiledFilePattern(
                        facility.isCaseInsensitive());
                boolean satisfied = false;
                for (File file : files.keySet()) {
                    Matcher matcher = pattern.matcher(file.getAbsolutePath());
                    if (matcher.matches()) {
                        if (file.length() >= template.getMinimumSize()) {
                            satisfied = true;
                        } else {
                            LOG.debug("Datafile " + file + " isn't big enough yet");
                        }
                        break;
                    }
                }
                if (!satisfied) {
                    LOG.debug("Datafile for template " + template.getFilePattern() +
                            " isn't ready");
                    incomplete = true;
                    break;
                }
            }
        } while (incomplete && !catchup && limit > System.currentTimeMillis());
        LOG.info("Total waiting time for baseFile: " +
                ((System.currentTimeMillis() - start) / 1000.0) + " seconds");
        if (incomplete) {
            LOG.info("Dataset for baseFile " + baseFile +
                    (catchup ? " is incomplete" : " did not complete within timeout") +
                    ". Dropping it.");
            return false;
        }
        // Prune any files that don't meet their minimum size requirement.
        for (Iterator<Entry<File, GrabbedFile>> it = files.entrySet().iterator();
                it.hasNext(); /* */) {
            Entry<File, GrabbedFile> entry = it.next();
            // FIXED: the template is null when the facility has no datafile
            // templates (see addEvent); previously this dereferenced it
            // unconditionally and threw an NPE.  A null template imposes no
            // minimum size.
            DatafileTemplate template = entry.getValue().getTemplate();
            if (template == null) {
                continue;
            }
            long length = entry.getKey().length();
            if (length < template.getMinimumSize()) {
                LOG.info("Dropping datafile " + entry.getKey() +
                        " with size " + length);
                it.remove();
            }
        }
        // Avoid creating an empty Dataset (containing just an admin metadata file)
        if (files.isEmpty()) {
            LOG.info("Dropping empty dataset for baseFile " + baseFile);
            return false;
        }
        return true;
    }

    /**
     * Copy one datafile into the queue area and record the copy details on the
     * GrabbedFile.  The caller supplies the (possibly locked) open stream.
     */
    private void doGrabFile(GrabbedFile file, FileInputStream is, boolean regrabbing)
            throws InterruptedException, IOException, QueueFileException {
        LOG.debug("Start file grabbing for " + file.getFile());
        Date now = new Date();
        Date fileTimestamp = new Date(file.getFile().lastModified());
        String suffix = (file.getTemplate() == null) ?
                ".data" : file.getTemplate().getSuffix();
        File copiedFile = fileManager.enqueueFile(file.getFile(), suffix, regrabbing);
        file.setCopiedFile(copiedFile);
        file.setFileTimestamp(fileTimestamp);
        file.setCopyTimestamp(now);
        LOG.debug("Done grabbing " + file.getFile() + " -> " + copiedFile);
    }

    /**
     * Assemble the dataset metadata and (unless re-grabbing) add it to the
     * ingestion queue.
     */
    private DatasetMetadata saveMetadata(Date now, SessionDetails session, boolean regrabbing)
            throws IOException, JsonGenerationException, QueueFileException,
            InterruptedException {
        File metadataFile = fileManager.generateUniqueFile(".admin", regrabbing);
        DatasetMetadata dataset = assembleDatasetMetadata(now, session, metadataFile);
        dataset.updateDatasetHash();
        if (!regrabbing) {
            queueManager.addEntry(dataset, false);
        }
        return dataset;
    }

    /**
     * Build the DatasetMetadata record for the grabbed files, skipping any
     * file that was never actually copied.
     */
    public DatasetMetadata assembleDatasetMetadata(
            Date now, SessionDetails session, File metadataFile) {
        List<DatafileMetadata> list = new ArrayList<DatafileMetadata>(files.size());
        for (GrabbedFile g : files.values()) {
            if (g.getCopiedFile() == null) {
                LOG.info("Leaving out file " + g.getFile() +
                        " which wasn't previously enqueued");
                continue;
            }
            String mimeType = (g.getTemplate() == null) ?
                    "application/octet-stream" : g.getTemplate().getMimeType();
            DatafileMetadata d = new DatafileMetadata(
                    g.getFile().getAbsolutePath(),
                    mapToInstrumentPath(facility, g.getFile()),
                    g.getCopiedFile().getAbsolutePath(),
                    g.getFileTimestamp(), g.getCopyTimestamp(), mimeType,
                    g.getCopiedFile().length(), null);
            list.add(d);
        }
        DatasetMetadata dataset = new DatasetMetadata(
                baseFile.getAbsolutePath(), instrumentBasePath,
                metadataFile.getAbsolutePath(),
                session.getUserName(), facility.getFacilityName(),
                facility.getId(), session.getAccount(), session.getEmailAddress(),
                session.getOperatorName(), now,
                session.getSessionUuid(), session.getLoginTime(), list);
        return dataset;
    }

    /**
     * Replace a previously queued dataset's captured files with the re-grabbed
     * versions and persist the updated metadata.
     */
    public void commitRegrabbedDataset(DatasetMetadata dataset)
            throws IOException, QueueFileException, InterruptedException {
        DatasetMetadata originalDataset = queueManager.fetchDataset(dataset.getId());
        dataset.setMetadataFilePathname(originalDataset.getMetadataFilePathname());
        // Delete the original dataset's captured files
        for (DatafileMetadata of : originalDataset.getDatafiles()) {
            new File(of.getCapturedFilePathname()).delete();
        }
        // Rename the new dataset's captured files
        for (DatafileMetadata f : dataset.getDatafiles()) {
            File file = fileManager.renameGrabbedDatafile(
                    new File(f.getCapturedFilePathname()));
            f.setCapturedFilePathname(file.toString());
        }
        // This will save the updated dataset metadata to the database and file system.
        queueManager.addEntry(dataset, true);
    }
}