JobSchedulerStoreImpl.java example

Explorer
activemq-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.activemq.store.kahadb.scheduler;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;
import java.util.UUID;

import org.apache.activemq.broker.scheduler.JobScheduler;
import org.apache.activemq.broker.scheduler.JobSchedulerStore;
import org.apache.activemq.protobuf.Buffer;
import org.apache.activemq.store.kahadb.AbstractKahaDBStore;
import org.apache.activemq.store.kahadb.JournalCommand;
import org.apache.activemq.store.kahadb.KahaDBMetaData;
import org.apache.activemq.store.kahadb.Visitor;
import org.apache.activemq.store.kahadb.data.KahaAddScheduledJobCommand;
import org.apache.activemq.store.kahadb.data.KahaDestroySchedulerCommand;
import org.apache.activemq.store.kahadb.data.KahaRemoveScheduledJobCommand;
import org.apache.activemq.store.kahadb.data.KahaRemoveScheduledJobsCommand;
import org.apache.activemq.store.kahadb.data.KahaRescheduleJobCommand;
import org.apache.activemq.store.kahadb.data.KahaTraceCommand;
import org.apache.activemq.store.kahadb.disk.index.BTreeVisitor;
import org.apache.activemq.store.kahadb.disk.journal.DataFile;
import org.apache.activemq.store.kahadb.disk.journal.Location;
import org.apache.activemq.store.kahadb.disk.page.Page;
import org.apache.activemq.store.kahadb.disk.page.PageFile;
import org.apache.activemq.store.kahadb.disk.page.Transaction;
import org.apache.activemq.store.kahadb.disk.util.VariableMarshaller;
import org.apache.activemq.store.kahadb.scheduler.legacy.LegacyStoreReplayer;
import org.apache.activemq.util.ByteSequence;
import org.apache.activemq.util.IOHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class JobSchedulerStoreImpl extends AbstractKahaDBStore implements JobSchedulerStore {

    private static final Logger LOG = LoggerFactory.getLogger(JobSchedulerStoreImpl.class);

    private JobSchedulerKahaDBMetaData metaData = new JobSchedulerKahaDBMetaData(this);
    private final MetaDataMarshaller metaDataMarshaller = new MetaDataMarshaller(this);
    private final Map<String, JobSchedulerImpl> schedulers = new HashMap<String, JobSchedulerImpl>();
    private File legacyStoreArchiveDirectory;

    /**
     * The Scheduler Token is used to identify base revisions of the Scheduler store.  A store
     * based on the initial scheduler design will not have this tag in it's meta-data and will
     * indicate an update is needed.  Later versions of the scheduler can also change this value
     * to indicate incompatible store bases which require complete meta-data and journal rewrites
     * instead of simpler meta-data updates.
     */
    static final UUID SCHEDULER_STORE_TOKEN = UUID.fromString("57ed642b-1ee3-47b3-be6d-b7297d500409");

    /**
     * The default scheduler store version.  All new store instance will be given this version and
     * earlier versions will be updated to this version.
     */
    static final int CURRENT_VERSION = 1;

    @Override
    public JobScheduler getJobScheduler(final String name) throws Exception {
        this.indexLock.writeLock().lock();
        try {
            JobSchedulerImpl result = this.schedulers.get(name);
            if (result == null) {
                final JobSchedulerImpl js = new JobSchedulerImpl(this);
                js.setName(name);
                getPageFile().tx().execute(new Transaction.Closure<IOException>() {
                    @Override
                    public void execute(Transaction tx) throws IOException {
                        js.createIndexes(tx);
                        js.load(tx);
                        metaData.getJobSchedulers().put(tx, name, js);
                    }
                });
                result = js;
                this.schedulers.put(name, js);
                if (isStarted()) {
                    result.start();
                }
                this.pageFile.flush();
            }
            return result;
        } finally {
            this.indexLock.writeLock().unlock();
        }
    }

    @Override
    public boolean removeJobScheduler(final String name) throws Exception {
        boolean result = false;

        this.indexLock.writeLock().lock();
        try {
            final JobSchedulerImpl js = this.schedulers.remove(name);
            result = js != null;
            if (result) {
                js.stop();
                getPageFile().tx().execute(new Transaction.Closure<IOException>() {
                    @Override
                    public void execute(Transaction tx) throws IOException {
                        metaData.getJobSchedulers().remove(tx, name);
                        js.removeAll(tx);
                    }
                });
            }
        } finally {
            this.indexLock.writeLock().unlock();
        }
        return result;
    }

    /**
     * Sets the directory where the legacy scheduler store files are archived before an
     * update attempt is made.  Both the legacy index files and the journal files are moved
     * to this folder prior to an upgrade attempt.
     *
     * @param directory
     *      The directory to move the legacy Scheduler Store files to.
     */
    public void setLegacyStoreArchiveDirectory(File directory) {
        this.legacyStoreArchiveDirectory = directory;
    }

    /**
     * Gets the directory where the legacy Scheduler Store files will be archived if the
     * broker is started and an existing Job Scheduler Store from an old version is detected.
     *
     * @return the directory where scheduler store legacy files are archived on upgrade.
     */
    public File getLegacyStoreArchiveDirectory() {
        if (this.legacyStoreArchiveDirectory == null) {
            this.legacyStoreArchiveDirectory = new File(getDirectory(), "legacySchedulerStore");
        }

        return this.legacyStoreArchiveDirectory.getAbsoluteFile();
    }

    @Override
    public void load() throws IOException {
        if (opened.compareAndSet(false, true)) {
            getJournal().start();
            try {
                loadPageFile();
            } catch (UnknownStoreVersionException ex) {
                LOG.info("Can't start until store update is performed.");
                upgradeFromLegacy();
                // Restart with the updated store
                getJournal().start();
                loadPageFile();
                LOG.info("Update from legacy Scheduler store completed successfully.");
            } catch (Throwable t) {
                LOG.warn("Index corrupted. Recovering the index through journal replay. Cause: {}", t.toString());
                LOG.debug("Index load failure", t);

                // try to recover index
                try {
                    pageFile.unload();
                } catch (Exception ignore) {
                }
                if (isArchiveCorruptedIndex()) {
                    pageFile.archive();
                } else {
                    pageFile.delete();
                }
                metaData = new JobSchedulerKahaDBMetaData(this);
                pageFile = null;
                loadPageFile();
            }
            startCheckpoint();
            recover();
        }
        LOG.info("{} started.", this);
    }

    @Override
    public void unload() throws IOException {
        if (opened.compareAndSet(true, false)) {
            for (JobSchedulerImpl js : this.schedulers.values()) {
                try {
                    js.stop();
                } catch (Exception e) {
                    throw new IOException(e);
                }
            }
            this.indexLock.writeLock().lock();
            try {
                if (pageFile != null && pageFile.isLoaded()) {
                    metaData.setState(KahaDBMetaData.CLOSED_STATE);

                    if (metaData.getPage() != null) {
                        pageFile.tx().execute(new Transaction.Closure<IOException>() {
                            @Override
                            public void execute(Transaction tx) throws IOException {
                                tx.store(metaData.getPage(), metaDataMarshaller, true);
                            }
                        });
                    }
                }
            } finally {
                this.indexLock.writeLock().unlock();
            }

            checkpointLock.writeLock().lock();
            try {
                if (metaData.getPage() != null) {
                    checkpointUpdate(true);
                }
            } finally {
                checkpointLock.writeLock().unlock();
            }
            synchronized (checkpointThreadLock) {
                if (checkpointThread != null) {
                    try {
                        checkpointThread.join();
                        checkpointThread = null;
                    } catch (InterruptedException e) {
                    }
                }
            }

            if (pageFile != null) {
                pageFile.unload();
                pageFile = null;
            }
            if (this.journal != null) {
                journal.close();
                journal = null;
            }

            metaData = new JobSchedulerKahaDBMetaData(this);
        }
        LOG.info("{} stopped.", this);
    }

    private void loadPageFile() throws IOException {
        this.indexLock.writeLock().lock();
        try {
            final PageFile pageFile = getPageFile();
            pageFile.load();
            pageFile.tx().execute(new Transaction.Closure<IOException>() {
                @Override
                public void execute(Transaction tx) throws IOException {
                    if (pageFile.getPageCount() == 0) {
                        Page<JobSchedulerKahaDBMetaData> page = tx.allocate();
                        assert page.getPageId() == 0;
                        page.set(metaData);
                        metaData.setPage(page);
                        metaData.setState(KahaDBMetaData.CLOSED_STATE);
                        metaData.initialize(tx);
                        tx.store(metaData.getPage(), metaDataMarshaller, true);
                    } else {
                        Page<JobSchedulerKahaDBMetaData> page = null;
                        page = tx.load(0, metaDataMarshaller);
                        metaData = page.get();
                        metaData.setPage(page);
                    }
                    metaData.load(tx);
                    metaData.loadScheduler(tx, schedulers);
                    for (JobSchedulerImpl js : schedulers.values()) {
                        try {
                            js.start();
                        } catch (Exception e) {
                            JobSchedulerStoreImpl.LOG.error("Failed to load " + js.getName(), e);
                        }
                    }
                }
            });

            pageFile.flush();
        } finally {
            this.indexLock.writeLock().unlock();
        }
    }

    private void upgradeFromLegacy() throws IOException {

        journal.close();
        journal = null;
        try {
            pageFile.unload();
            pageFile = null;
        } catch (Exception ignore) {}

        File storeDir = getDirectory().getAbsoluteFile();
        File storeArchiveDir = getLegacyStoreArchiveDirectory();

        LOG.info("Attempting to move old store files from {} to {}", storeDir, storeArchiveDir);

        // Move only the known store files, locks and other items left in place.
        IOHelper.moveFiles(storeDir, storeArchiveDir, new FilenameFilter() {

            @Override
            public boolean accept(File dir, String name) {
                if (name.endsWith(".data") || name.endsWith(".redo") || name.endsWith(".log")) {
                    return true;
                }
                return false;
            }
        });

        // We reset everything to clean state, then we can read from the old
        // scheduler store and replay the scheduled jobs into this one as adds.
        getJournal().start();
        metaData = new JobSchedulerKahaDBMetaData(this);
        pageFile = null;
        loadPageFile();

        LegacyStoreReplayer replayer = new LegacyStoreReplayer(getLegacyStoreArchiveDirectory());
        replayer.load();
        replayer.startReplay(this);

        // Cleanup after replay and store what we've done.
        pageFile.tx().execute(new Transaction.Closure<IOException>() {
            @Override
            public void execute(Transaction tx) throws IOException {
                tx.store(metaData.getPage(), metaDataMarshaller, true);
            }
        });

        checkpointUpdate(true);
        getJournal().close();
        getPageFile().unload();
    }

    @Override
    protected void checkpointUpdate(Transaction tx, boolean cleanup) throws IOException {
        LOG.debug("Job Scheduler Store Checkpoint started.");

        // reflect last update exclusive of current checkpoint
        Location lastUpdate = metaData.getLastUpdateLocation();
        metaData.setState(KahaDBMetaData.OPEN_STATE);
        tx.store(metaData.getPage(), metaDataMarshaller, true);
        pageFile.flush();

        if (cleanup) {
            final TreeSet<Integer> completeFileSet = new TreeSet<Integer>(journal.getFileMap().keySet());
            final TreeSet<Integer> gcCandidateSet = new TreeSet<Integer>(completeFileSet);

            LOG.trace("Last update: {}, full gc candidates set: {}", lastUpdate, gcCandidateSet);

            if (lastUpdate != null) {
                gcCandidateSet.remove(lastUpdate.getDataFileId());
            }

            this.metaData.getJournalRC().visit(tx, new BTreeVisitor<Integer, Integer>() {

                @Override
                public void visit(List<Integer> keys, List<Integer> values) {
                    for (Integer key : keys) {
                        if (gcCandidateSet.remove(key)) {
                            LOG.trace("Removed referenced file: {} from GC set", key);
                        }
                    }
                }

                @Override
                public boolean isInterestedInKeysBetween(Integer first, Integer second) {
                    return true;
                }
            });

            LOG.trace("gc candidates after reference check: {}", gcCandidateSet);

            // If there are GC candidates then check the remove command location to see
            // if any of them can go or if they must stay in order to ensure proper recover.
            //
            // A log containing any remove commands must be kept until all the logs with the
            // add commands for all the removed jobs have been dropped.
            if (!gcCandidateSet.isEmpty()) {
                Iterator<Entry<Integer, List<Integer>>> removals = metaData.getRemoveLocationTracker().iterator(tx);
                List<Integer> orphans = new ArrayList<Integer>();
                while (removals.hasNext()) {
                    boolean orphanedRemove = true;
                    Entry<Integer, List<Integer>> entry = removals.next();

                    // If this log is not a GC candidate then there's no need to do a check to rule it out
                    if (gcCandidateSet.contains(entry.getKey())) {
                        for (Integer addLocation : entry.getValue()) {
                            if (completeFileSet.contains(addLocation)) {
                                LOG.trace("A remove in log {} has an add still in existance in {}.", entry.getKey(), addLocation);
                                orphanedRemove = false;
                                break;
                            }
                        }

                        // If it's not orphaned than we can't remove it, otherwise we
                        // stop tracking it it's log will get deleted on the next check.
                        if (!orphanedRemove) {
                            gcCandidateSet.remove(entry.getKey());
                        } else {
                            LOG.trace("All removes in log {} are orphaned, file can be GC'd", entry.getKey());
                            orphans.add(entry.getKey());
                        }
                    }
                }

                // Drop all orphaned removes from the tracker.
                for (Integer orphan : orphans) {
                    metaData.getRemoveLocationTracker().remove(tx, orphan);
                }
            }

            LOG.trace("gc candidates after removals check: {}", gcCandidateSet);
            if (!gcCandidateSet.isEmpty()) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Cleanup removing the data files: " + gcCandidateSet);
                }
                journal.removeDataFiles(gcCandidateSet);
            }
        }

        LOG.debug("Job Scheduler Store Checkpoint complete.");
    }

    /**
     * Adds a reference for the journal log file pointed to by the given Location value.
     *
     * To prevent log files in the journal that still contain valid data that needs to be
     * kept in order to allow for recovery the logs must have active references.  Each Job
     * scheduler should ensure that the logs are accurately referenced.
     *
     * @param tx
     *      The TX under which the update is to be performed.
     * @param location
     *      The location value to update the reference count of.
     *
     * @throws IOException if an error occurs while updating the journal references table.
     */
    protected void incrementJournalCount(Transaction tx, Location location) throws IOException {
        int logId = location.getDataFileId();
        Integer val = metaData.getJournalRC().get(tx, logId);
        int refCount = val != null ? val.intValue() + 1 : 1;
        metaData.getJournalRC().put(tx, logId, refCount);
    }

    /**
     * Removes one reference for the Journal log file indicated in the given Location value.
     *
     * The references are used to track which log files cannot be GC'd.  When the reference count
     * on a log file reaches zero the file id is removed from the tracker and the log will be
     * removed on the next check point update.
     *
     * @param tx
     *      The TX under which the update is to be performed.
     * @param location
     *      The location value to update the reference count of.
     *
     * @throws IOException if an error occurs while updating the journal references table.
     */
    protected void decrementJournalCount(Transaction tx, Location location) throws IOException {
        int logId = location.getDataFileId();
        Integer refCount = metaData.getJournalRC().get(tx, logId);
        if (refCount != null) {
            int refCountValue = refCount;
            refCountValue--;
            if (refCountValue <= 0) {
                metaData.getJournalRC().remove(tx, logId);
            } else {
                metaData.getJournalRC().put(tx, logId, refCountValue);
            }
        }
    }

    /**
     * Updates the Job removal tracking index with the location of a remove command and the
     * original JobLocation entry.
     *
     * The JobLocation holds the locations in the logs where the add and update commands for
     * a job stored.  The log file containing the remove command can only be discarded after
     * both the add and latest update log files have also been discarded.
     *
     * @param tx
     *      The TX under which the update is to be performed.
     * @param location
     *      The location value to reference a remove command.
     * @param removedJob
     *      The original JobLocation instance that holds the add and update locations
     *
     * @throws IOException if an error occurs while updating the remove location tracker.
     */
    protected void referenceRemovedLocation(Transaction tx, Location location, JobLocation removedJob) throws IOException {
        int logId = location.getDataFileId();
        List<Integer> removed = this.metaData.getRemoveLocationTracker().get(tx, logId);
        if (removed == null) {
            removed = new ArrayList<Integer>();
        }
        removed.add(removedJob.getLocation().getDataFileId());
        this.metaData.getRemoveLocationTracker().put(tx, logId, removed);
    }

    /**
     * Retrieve the scheduled Job's byte blob from the journal.
     *
     * @param location
     *      The location of the KahaAddScheduledJobCommand that originated the Job.
     *
     * @return a ByteSequence containing the payload of the scheduled Job.
     *
     * @throws IOException if an error occurs while reading the payload value.
     */
    protected ByteSequence getPayload(Location location) throws IOException {
        KahaAddScheduledJobCommand job = (KahaAddScheduledJobCommand) this.load(location);
        Buffer payload = job.getPayload();
        return new ByteSequence(payload.getData(), payload.getOffset(), payload.getLength());
    }

    public void readLockIndex() {
        this.indexLock.readLock().lock();
    }

    public void readUnlockIndex() {
        this.indexLock.readLock().unlock();
    }

    public void writeLockIndex() {
        this.indexLock.writeLock().lock();
    }

    public void writeUnlockIndex() {
        this.indexLock.writeLock().unlock();
    }

    @Override
    public String toString() {
        return "JobSchedulerStore: " + getDirectory();
    }

    @Override
    protected String getPageFileName() {
        return "scheduleDB";
    }

    @Override
    protected File getDefaultDataDirectory() {
        return new File(IOHelper.getDefaultDataDirectory(), "delayedDB");
    }

    private class MetaDataMarshaller extends VariableMarshaller<JobSchedulerKahaDBMetaData> {

        private final JobSchedulerStoreImpl store;

        MetaDataMarshaller(JobSchedulerStoreImpl store) {
            this.store = store;
        }

        @Override
        public JobSchedulerKahaDBMetaData readPayload(DataInput dataIn) throws IOException {
            JobSchedulerKahaDBMetaData rc = new JobSchedulerKahaDBMetaData(store);
            rc.read(dataIn);
            return rc;
        }

        @Override
        public void writePayload(JobSchedulerKahaDBMetaData object, DataOutput dataOut) throws IOException {
            object.write(dataOut);
        }
    }

    /**
     * Called during index recovery to rebuild the index from the last known good location.  For
     * entries that occur before the last known good position we just ignore then and move on.
     *
     * @param command
     *        the command read from the Journal which should be used to update the index.
     * @param location
     *        the location in the index where the command was read.
     * @param inDoubtlocation
     *        the location in the index known to be the last time the index was valid.
     *
     * @throws IOException if an error occurs while recovering the index.
     */
    protected void doRecover(JournalCommand<?> data, final Location location, final Location inDoubtlocation) throws IOException {
        if (inDoubtlocation != null && location.compareTo(inDoubtlocation) >= 0) {
            process(data, location);
        }
    }

    /**
     * Called during recovery to allow the store to rebuild from scratch.
     *
     * @param data
     *      The command to process, which was read from the Journal.
     * @param location
     *      The location of the command in the Journal.
     *
     * @throws IOException if an error occurs during command processing.
     */
    @Override
    protected void process(JournalCommand<?> data, final Location location) throws IOException {
        data.visit(new Visitor() {
            @Override
            public void visit(final KahaAddScheduledJobCommand command) throws IOException {
                final JobSchedulerImpl scheduler;

                indexLock.writeLock().lock();
                try {
                    try {
                        scheduler = (JobSchedulerImpl) getJobScheduler(command.getScheduler());
                    } catch (Exception e) {
                        throw new IOException(e);
                    }
                    getPageFile().tx().execute(new Transaction.Closure<IOException>() {
                        @Override
                        public void execute(Transaction tx) throws IOException {
                            scheduler.process(tx, command, location);
                        }
                    });

                    processLocation(location);
                } finally {
                    indexLock.writeLock().unlock();
                }
            }

            @Override
            public void visit(final KahaRemoveScheduledJobCommand command) throws IOException {
                final JobSchedulerImpl scheduler;

                indexLock.writeLock().lock();
                try {
                    try {
                        scheduler = (JobSchedulerImpl) getJobScheduler(command.getScheduler());
                    } catch (Exception e) {
                        throw new IOException(e);
                    }
                    getPageFile().tx().execute(new Transaction.Closure<IOException>() {
                        @Override
                        public void execute(Transaction tx) throws IOException {
                            scheduler.process(tx, command, location);
                        }
                    });

                    processLocation(location);
                } finally {
                    indexLock.writeLock().unlock();
                }
            }

            @Override
            public void visit(final KahaRemoveScheduledJobsCommand command) throws IOException {
                final JobSchedulerImpl scheduler;

                indexLock.writeLock().lock();
                try {
                    try {
                        scheduler = (JobSchedulerImpl) getJobScheduler(command.getScheduler());
                    } catch (Exception e) {
                        throw new IOException(e);
                    }
                    getPageFile().tx().execute(new Transaction.Closure<IOException>() {
                        @Override
                        public void execute(Transaction tx) throws IOException {
                            scheduler.process(tx, command, location);
                        }
                    });

                    processLocation(location);
                } finally {
                    indexLock.writeLock().unlock();
                }
            }

            @Override
            public void visit(final KahaRescheduleJobCommand command) throws IOException {
                final JobSchedulerImpl scheduler;

                indexLock.writeLock().lock();
                try {
                    try {
                        scheduler = (JobSchedulerImpl) getJobScheduler(command.getScheduler());
                    } catch (Exception e) {
                        throw new IOException(e);
                    }
                    getPageFile().tx().execute(new Transaction.Closure<IOException>() {
                        @Override
                        public void execute(Transaction tx) throws IOException {
                            scheduler.process(tx, command, location);
                        }
                    });

                    processLocation(location);
                } finally {
                    indexLock.writeLock().unlock();
                }
            }

            @Override
            public void visit(final KahaDestroySchedulerCommand command) {
                try {
                    removeJobScheduler(command.getScheduler());
                } catch (Exception e) {
                    LOG.warn("Failed to remove scheduler: {}", command.getScheduler());
                }

                processLocation(location);
            }

            @Override
            public void visit(KahaTraceCommand command) {
                processLocation(location);
            }
        });
    }

    protected void processLocation(final Location location) {
        indexLock.writeLock().lock();
        try {
            this.metaData.setLastUpdateLocation(location);
        } finally {
            indexLock.writeLock().unlock();
        }
    }

    /**
     * We recover from the Journal logs as needed to restore the index.
     *
     * @throws IllegalStateException
     * @throws IOException
     */
    private void recover() throws IllegalStateException, IOException {
        this.indexLock.writeLock().lock();
        try {
            long start = System.currentTimeMillis();
            Location lastIndoubtPosition = getRecoveryPosition();
            Location recoveryPosition = lastIndoubtPosition;

            if (recoveryPosition != null) {
                int redoCounter = 0;
                LOG.info("Recovering from the scheduled job journal @" + recoveryPosition);
                while (recoveryPosition != null) {
                    try {
                        JournalCommand<?> message = load(recoveryPosition);
                        metaData.setLastUpdateLocation(recoveryPosition);
                        doRecover(message, recoveryPosition, lastIndoubtPosition);
                        redoCounter++;
                    } catch (IOException failedRecovery) {
                        if (isIgnoreMissingJournalfiles()) {
                            LOG.debug("Failed to recover data at position:" + recoveryPosition, failedRecovery);
                            // track this dud location
                            journal.corruptRecoveryLocation(recoveryPosition);
                        } else {
                            throw new IOException("Failed to recover data at position:" + recoveryPosition, failedRecovery);
                        }
                    }
                    recoveryPosition = journal.getNextLocation(recoveryPosition);
                     if (LOG.isInfoEnabled() && redoCounter % 100000 == 0) {
                         LOG.info("@ {}, {} entries recovered ..", recoveryPosition, redoCounter);
                     }
                }
                long end = System.currentTimeMillis();
                LOG.info("Recovery replayed {} operations from the journal in {} seconds.",
                         redoCounter, ((end - start) / 1000.0f));
            }

            // We may have to undo some index updates.
            pageFile.tx().execute(new Transaction.Closure<IOException>() {
                @Override
                public void execute(Transaction tx) throws IOException {
                    recoverIndex(tx);
                }
            });

        } finally {
            this.indexLock.writeLock().unlock();
        }
    }

    private Location getRecoveryPosition() throws IOException {
        // This loads the first position and we completely rebuild the index if we
        // do not override it with some known recovery start location.
        Location result = null;

        if (!isForceRecoverIndex()) {
            if (metaData.getLastUpdateLocation() != null) {
                result = metaData.getLastUpdateLocation();
            }
        }

        return journal.getNextLocation(result);
    }

    private void recoverIndex(Transaction tx) throws IOException {
        long start = System.currentTimeMillis();

        // It is possible index updates got applied before the journal updates..
        // in that case we need to removed references to Jobs that are not in the journal
        final Location lastAppendLocation = journal.getLastAppendLocation();
        long undoCounter = 0;

        // Go through all the jobs in each scheduler and check if any are added after
        // the last appended location and remove those.  For now we ignore the update
        // location since the scheduled job will update itself after the next fire and
        // a new update will replace any existing update.
        for (Iterator<Map.Entry<String, JobSchedulerImpl>> i = metaData.getJobSchedulers().iterator(tx); i.hasNext();) {
            Map.Entry<String, JobSchedulerImpl> entry = i.next();
            JobSchedulerImpl scheduler = entry.getValue();

            List<JobLocation> jobs = scheduler.getAllScheduledJobs(tx);
            for (JobLocation job : jobs) {
                if (job.getLocation().compareTo(lastAppendLocation) >= 0) {
                    if (scheduler.removeJobAtTime(tx, job.getJobId(), job.getNextTime())) {
                        LOG.trace("Removed Job past last appened in the journal: {}", job.getJobId());
                        undoCounter++;
                    }
                }
            }
        }

        if (undoCounter > 0) {
            // The rolled back operations are basically in flight journal writes.  To avoid getting
            // these the end user should do sync writes to the journal.
            long end = System.currentTimeMillis();
            LOG.info("Rolled back {} messages from the index in {} seconds.", undoCounter, ((end - start) / 1000.0f));
            undoCounter = 0;
        }

        // Now we check for missing and corrupt journal files.

        // 1. Collect the set of all referenced journal files based on the Location of the
        //    the scheduled jobs and the marked last update field.
        HashSet<Integer> missingJournalFiles = new HashSet<Integer>();
        for (Iterator<Map.Entry<String, JobSchedulerImpl>> i = metaData.getJobSchedulers().iterator(tx); i.hasNext();) {
            Map.Entry<String, JobSchedulerImpl> entry = i.next();
            JobSchedulerImpl scheduler = entry.getValue();

            List<JobLocation> jobs = scheduler.getAllScheduledJobs(tx);
            for (JobLocation job : jobs) {
                missingJournalFiles.add(job.getLocation().getDataFileId());
                if (job.getLastUpdate() != null) {
                    missingJournalFiles.add(job.getLastUpdate().getDataFileId());
                }
            }
        }

        // 2. Remove from that set all known data file Id's in the journal and what's left
        //    is the missing set which will soon also contain the corrupted set.
        missingJournalFiles.removeAll(journal.getFileMap().keySet());
        if (!missingJournalFiles.isEmpty()) {
            LOG.info("Some journal files are missing: {}", missingJournalFiles);
        }

        // 3. Now check all references in the journal logs for corruption and add any
        //    corrupt journal files to the missing set.
        HashSet<Location> corruptedLocations = new HashSet<Location>();

        if (isCheckForCorruptJournalFiles()) {
            Collection<DataFile> dataFiles = journal.getFileMap().values();
            for (DataFile dataFile : dataFiles) {
                int id = dataFile.getDataFileId();
                for (long offset : dataFile.getCorruptedBlocks()) {
                    corruptedLocations.add(new Location(id, (int) offset));
                }
            }

            if (!corruptedLocations.isEmpty()) {
                LOG.debug("Found some corrupted data blocks in the journal: {}", corruptedLocations.size());
            }
        }

        // 4. Now we either fail or we remove all references to missing or corrupt journal
        //    files from the various JobSchedulerImpl instances.  We only remove the Job if
        //    the initial Add operation is missing when the ignore option is set, the updates
        //    could be lost but that's price you pay when ignoring the missing logs.
        if (!missingJournalFiles.isEmpty() || !corruptedLocations.isEmpty()) {
            if (!isIgnoreMissingJournalfiles()) {
                throw new IOException("Detected missing/corrupt journal files.");
            }

            // Remove all Jobs that reference an Location that is either missing or corrupt.
            undoCounter = removeJobsInMissingOrCorruptJounralFiles(tx, missingJournalFiles, corruptedLocations);

            // Clean up the Journal Reference count Map.
            removeJournalRCForMissingFiles(tx, missingJournalFiles);
        }

        if (undoCounter > 0) {
            long end = System.currentTimeMillis();
            LOG.info("Detected missing/corrupt journal files.  Dropped {} jobs from the " +
                     "index in {} seconds.", undoCounter, ((end - start) / 1000.0f));
        }
    }

    private void removeJournalRCForMissingFiles(Transaction tx, Set<Integer> missing) throws IOException {
        List<Integer> matches = new ArrayList<Integer>();

        Iterator<Entry<Integer, Integer>> references = metaData.getJournalRC().iterator(tx);
        while (references.hasNext()) {
            int dataFileId = references.next().getKey();
            if (missing.contains(dataFileId)) {
                matches.add(dataFileId);
            }
        }

        for (Integer match : matches) {
            metaData.getJournalRC().remove(tx, match);
        }
    }

    private int removeJobsInMissingOrCorruptJounralFiles(Transaction tx, Set<Integer> missing, Set<Location> corrupted) throws IOException {
        int removed = 0;

        // Remove Jobs that reference missing or corrupt files.
        // Remove Reference counts to missing or corrupt files.
        // Remove and remove command markers to missing or corrupt files.
        for (Iterator<Map.Entry<String, JobSchedulerImpl>> i = metaData.getJobSchedulers().iterator(tx); i.hasNext();) {
            Map.Entry<String, JobSchedulerImpl> entry = i.next();
            JobSchedulerImpl scheduler = entry.getValue();

            List<JobLocation> jobs = scheduler.getAllScheduledJobs(tx);
            for (JobLocation job : jobs) {

                // Remove all jobs in missing log files.
                if (missing.contains(job.getLocation().getDataFileId())) {
                    scheduler.removeJobAtTime(tx, job.getJobId(), job.getNextTime());
                    removed++;
                    continue;
                }

                // Remove all jobs in corrupted parts of log files.
                if (corrupted.contains(job.getLocation())) {
                    scheduler.removeJobAtTime(tx, job.getJobId(), job.getNextTime());
                    removed++;
                }
            }
        }

        return removed;
    }
}