// Copyright 2009 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.enterprise.connector.util.diffing;
import static com.google.enterprise.connector.util.diffing.DocumentSnapshotComparator.COMPARATOR;
import com.google.common.annotations.VisibleForTesting;
import com.google.enterprise.connector.spi.RepositoryException;
import com.google.enterprise.connector.spi.TraversalSchedule;
import com.google.enterprise.connector.spi.TraversalScheduleAware;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
* A service that monitors a {@link SnapshotRepository} and makes callbacks
* when changes occur.
* <p>
* This implementation works as follows. It repeatedly scans all the
* {@link DocumentSnapshot} entries returned by
* {@link SnapshotRepository#iterator()}. On each pass, it compares the current
* contents of the repository to a record of what it saw on the previous pass.
* The record is stored as a file in the local repository. Each discrepancy
* is propagated to the client.
* <p>
* Using a local snapshot of the repository has some serious flaws for
* continuous crawl:
* <ul>
* <li>The local snapshot can diverge from the actual contents of the GSA. This
* can lead to situations where discrepancies are not corrected.</li>
* <li>If the local snapshot gets corrupted, there is no way to recover short of
* deleting everything on the GSA and starting again.</li>
* </ul>
* A much more robust solution is to obtain snapshots directly from the GSA
* at least part of the time. (However, to save bandwidth, it may still be
* useful to keep local snapshots and only get an "authoritative" snapshot
* from the cloud occasionally. E.g., once a week or if the local snapshot
* is corrupted.)
* <p>
* When an API to do that is available, this implementation should be fixed
* to use it.
*
* @since 2.8
*/
// TODO: Retrieve authoritative snapshots from GSA when appropriate.
public class DocumentSnapshotRepositoryMonitor implements Runnable {
/** Logger named after this class. */
private static final Logger LOG = Logger.getLogger(
DocumentSnapshotRepositoryMonitor.class.getName());
/*
* Gross hack: uses Java reflection to set up and tear down the NDC logging
* context. This avoids connector-spi.jar having a compile-time or run-time
* dependency on connector-logging.jar.
*
* Both handles stay null unless initNdcLogging() manages to resolve them;
* invoke() treats a null handle as a no-op.
*/
private static Method ndcPush = null;
private static Method ndcRemove = null;
// Resolve the NDC methods once, when this class is initialized.
static {
initNdcLogging();
}
/*
 * Extracted from the above static block. Looks up NDC.push(String) and
 * NDC.remove() by reflection and stores them in ndcPush and ndcRemove.
 *
 * The two handles are published together, only after both lookups have
 * succeeded. If either lookup fails, both stay null, so the monitor never
 * pushes a logging context it is unable to remove afterwards.
 */
private static void initNdcLogging() {
  try {
    Class<?> ndc =
        Class.forName("com.google.enterprise.connector.logging.NDC");
    Method push = ndc.getMethod("push", String.class);
    // No parameter types: resolves the zero-argument remove().
    Method remove = ndc.getMethod("remove");
    ndcPush = push;
    ndcRemove = remove;
  } catch (LinkageError ignored) {
    // NDC logging is optional; run without it.
  } catch (ClassNotFoundException ignored) {
    // NDC logging is optional; run without it.
  } catch (NoSuchMethodException ignored) {
    // NDC logging is optional; run without it.
  } catch (SecurityException ignored) {
    // NDC logging is optional; run without it.
  }
}
/*
 * Best-effort call of a static method via reflection. A null method is a
 * no-op, and every reflective failure is deliberately ignored so that
 * logging-context problems never disturb the monitor itself.
 */
private static void invoke(Method method, Object... args) {
  if (method == null) {
    return;
  }
  try {
    // Null receiver: the target is invoked as a static method.
    method.invoke(null, args);
  } catch (InvocationTargetException ignored) {
  } catch (IllegalArgumentException ignored) {
  } catch (IllegalAccessException ignored) {
  } catch (LinkageError ignored) {
  }
}
/**
 * The client provides an implementation of this interface to receive
 * notification of changes to the repository.
 */
public interface Callback {
  /** Invoked at the start of each pass, before the snapshot is opened. */
  void passBegin() throws InterruptedException;

  /**
   * Invoked for a document that has no matching entry in the previous
   * snapshot and for which an update handle was produced.
   */
  void newDocument(DocumentHandle documentHandle, MonitorCheckpoint mcp)
      throws InterruptedException;

  /**
   * Invoked for each entry of the previous snapshot that the monitor
   * treats as deleted during the current pass.
   */
  void deletedDocument(DocumentHandle documentHandle, MonitorCheckpoint mcp)
      throws InterruptedException;

  /**
   * Invoked for a document that also appears in the previous snapshot and
   * for which an update handle was produced.
   */
  void changedDocument(DocumentHandle documentHandle, MonitorCheckpoint mcp)
      throws InterruptedException;

  /**
   * Invoked once a pass has consumed the whole previous snapshot and the
   * snapshot reader and writer have been closed.
   */
  void passComplete(MonitorCheckpoint mcp) throws InterruptedException;

  /**
   * Queried after {@link #passComplete}; when this returns {@code false}
   * the monitor deletes the snapshot file it wrote during the pass.
   */
  boolean hasEnqueuedAtLeastOneChangeThisPass();

  /**
   * Invoked instead of a pass when the traversal schedule says the monitor
   * should not run.
   *
   * @param sleepms pause length in milliseconds
   */
  void passPausing(int sleepms) throws InterruptedException;
}
/**
* This connector instance's current traversal schedule. While {@code null},
* the monitor runs passes unconditionally.
*/
private volatile TraversalSchedule traversalSchedule;
/** Store used to open, close and delete snapshots. */
private final SnapshotStore snapshotStore;
/** The repository to monitor; iterated once per pass. */
private final SnapshotRepository<? extends DocumentSnapshot> query;
/** Reader for the snapshot of the previous pass; reset to null after a completed pass. */
private SnapshotReader snapshotReader;
/** Callback to invoke when a change is detected. */
private final Callback callback;
/** Current (most recently read, not yet consumed) record from the snapshot. */
private DocumentSnapshot current;
/** The snapshot we are currently writing; reset to null after a completed pass. */
private OrderedSnapshotWriter snapshotWriter;
/** Name of this monitor; used in log messages and in checkpoints. */
private final String name;
/** Passed to {@code SnapshotStore.stitch} during exception recovery. */
private final DocumentSnapshotFactory documentSnapshotFactory;
/** Receives the ids of documents whose processing failed with a RepositoryException. */
private final DocumentSink documentSink;
/*
* Contains a checkpoint confirmation from CM. Set by the constructor and by
* acceptGuarantee(); read during exception recovery.
* NOTE(review): not volatile - confirm whether acceptGuarantee() can be
* called from a thread other than the one executing run().
*/
private MonitorCheckpoint guaranteeCheckpoint;
/* The monitor should exit voluntarily if set to false */
private volatile boolean isRunning = true;
/**
 * Creates a DocumentSnapshotRepositoryMonitor that monitors the
 * repository exposed by {@code query}. The constructor only stores its
 * arguments; no snapshot is opened until the monitor runs.
 *
 * @param name the name of this monitor (a hash of the start path)
 * @param query query for files
 * @param snapshotStore where snapshots are stored
 * @param callback client callback
 * @param documentSink destination for filtered out file info
 * @param initialCp checkpoint when system initiated, could be {@code null}
 * @param documentSnapshotFactory for un-serializing
 *        {@link DocumentSnapshot} objects.
 */
public DocumentSnapshotRepositoryMonitor(String name,
    SnapshotRepository<? extends DocumentSnapshot> query,
    SnapshotStore snapshotStore, Callback callback,
    DocumentSink documentSink, MonitorCheckpoint initialCp,
    DocumentSnapshotFactory documentSnapshotFactory) {
  this.name = name;
  this.callback = callback;
  this.query = query;
  this.documentSink = documentSink;
  this.snapshotStore = snapshotStore;
  this.documentSnapshotFactory = documentSnapshotFactory;
  this.guaranteeCheckpoint = initialCp;
}
/**
 * Builds a checkpoint from the snapshot number and record number of the
 * current reader and the record count of the current writer.
 *
 * @param readerDelta offset added to the reader's record number; a
 *        negative sum is clamped to zero
 * @return a current checkpoint for this monitor.
 */
private MonitorCheckpoint getCheckpoint(long readerDelta) {
  long snapshotNumber = snapshotReader.getSnapshotNumber();
  long readPosition =
      Math.max(0, snapshotReader.getRecordNumber() + readerDelta);
  long writePosition = snapshotWriter.getRecordCount();
  return new MonitorCheckpoint(name, snapshotNumber, readPosition,
      writePosition);
}

/**
 * @return a current checkpoint for this monitor, with no adjustment of the
 *         reader position.
 */
private MonitorCheckpoint getCheckpoint() {
  return getCheckpoint(0L);
}
/**
 * Thread body: alternates between running passes and recovering from the
 * failure that stopped them, until an {@link InterruptedException} ends
 * the loop. An unchecked exception, including the
 * {@link IllegalStateException} raised by a failed recovery, propagates
 * out of this method.
 */
@Override
public void run() {
  // Call NDC.push() via reflection, if possible.
  invoke(ndcPush, "Monitor " + name);
  try {
    for (;;) {
      // Returns only after a failure has been logged.
      tryToRunForever();
      // TODO: Remove items from this monitor that are in queues.
      // Watch out for race conditions. The queues are potentially
      // giving docs to CM as bad things happen in monitor.
      // This TODO would be mitigated by a reconciliation with GSA.
      performExceptionRecovery();
    }
  } catch (InterruptedException ie) {
    LOG.info("Repository Monitor " + name + " received stop signal. " + this);
  } finally {
    // Call NDC.remove() via reflection, if possible.
    invoke(ndcRemove);
  }
}
/**
 * Runs passes over the repository in an endless loop, pausing via the
 * callback whenever the traversal schedule says the monitor should not
 * run. The loop ends only by exception: snapshot and repository failures
 * are logged here and the method then returns normally, so the caller can
 * attempt recovery.
 *
 * @throws InterruptedException if a pass or a pause is interrupted
 */
private void tryToRunForever() throws InterruptedException {
  try {
    while (true) {
      // Read the volatile field once, so the null check and the call see
      // the same value even if the schedule is replaced concurrently.
      TraversalSchedule schedule = traversalSchedule;
      if (schedule == null || schedule.shouldRun()) {
        // Start traversal
        doOnePass();
      } else {
        LOG.finest("Currently out of traversal window. "
            + "Sleeping for 15 minutes.");
        // TODO(nashi): Calculate when it should wake up while
        // handling TraversalScheduleAware events properly.
        callback.passPausing(15 * 60 * 1000);
      }
    }
  } catch (SnapshotWriterException e) {
    // The writer field is still null when the failure happens before a
    // writer has been assigned for this pass; do not dereference it then.
    String msg = "Failed to write to snapshot file: "
        + (snapshotWriter == null ? "<not opened>" : snapshotWriter.getPath());
    LOG.log(Level.SEVERE, msg, e);
  } catch (SnapshotReaderException e) {
    // Same reasoning as above, for the reader field.
    String msg = "Failed to read snapshot file: "
        + (snapshotReader == null ? "<not opened>" : snapshotReader.getPath());
    LOG.log(Level.SEVERE, msg, e);
  } catch (SnapshotStoreException e) {
    String msg = "Problem with snapshot store.";
    LOG.log(Level.SEVERE, msg, e);
  } catch (SnapshotRepositoryRuntimeException e) {
    String msg = "Failed reading repository.";
    LOG.log(Level.SEVERE, msg, e);
  }
}
/**
 * Call in situations where DocumentSnapshotRepositoryMonitor runs were
 * interfered with and we wish to have the DocumentSnapshotRepositoryMonitor
 * continue running. Closes the reader and writer of the failed pass, then
 * re-stitches the snapshot store from the last guaranteed checkpoint, which
 * brings the system into a state where doOnePass can be invoked.
 * Failures in this method are considered fatal for the thread.
 *
 * @throws IllegalStateException if recovery fails, or if there is no
 *         guaranteed checkpoint to recover from.
 * @throws InterruptedException if the calling thread is interrupted.
 */
private void performExceptionRecovery() throws InterruptedException,
    IllegalStateException {
  // Try to close potentially opened snapshot files.
  final String cleanUpFailure =
      "Repository Monitor " + name + " failed clean up .";
  try {
    snapshotStore.close(snapshotReader, snapshotWriter);
    LOG.info("Repository Monitor " + name + " closed faulty reader and writer.");
  } catch (IOException e) {
    LOG.log(Level.SEVERE, cleanUpFailure, e);
    throw new IllegalStateException(cleanUpFailure, e);
  } catch (SnapshotStoreException e) {
    LOG.log(Level.SEVERE, cleanUpFailure, e);
    throw new IllegalStateException(cleanUpFailure, e);
  }

  if (guaranteeCheckpoint == null) {
    // This monitor was started without state; that is from scratch.
    // TODO: Consider deleting all snapshot state and starting again.
    String noState = "Repository Monitor " + name + " could not start correctly.";
    LOG.severe(noState);
    throw new IllegalStateException(noState);
  }

  try {
    SnapshotStore.stitch(snapshotStore.getDirectory(), guaranteeCheckpoint,
        documentSnapshotFactory);
    LOG.info("Repository Monitor " + name + " restiched snapshot.");
  } catch (IOException e) {
    String stopped = "Repository Monitor " + name + " has failed and stopped.";
    LOG.log(Level.SEVERE, stopped, e);
    throw new IllegalStateException(stopped, e);
  } catch (SnapshotStoreException e) {
    String unfixed = "Repository Monitor " + name + " failed fixing store.";
    LOG.log(Level.SEVERE, unfixed, e);
    throw new IllegalStateException(unfixed, e);
  }
}
/**
 * Makes one pass through the repository, notifying {@code callback} of any
 * changes relative to the most recent snapshot, and writes a new snapshot
 * as it goes. If the callback reports that nothing was enqueued during the
 * pass, the snapshot file just written is deleted again.
 *
 * @throws SnapshotStoreException if reading or writing a snapshot fails
 * @throws InterruptedException if the thread is interrupted or
 *         {@link #shutdown()} was requested while the pass is running
 * @throws IllegalStateException if the previous snapshot was not fully
 *         consumed by the end of the pass
 */
private void doOnePass() throws SnapshotStoreException,
    InterruptedException {
  callback.passBegin();
  try {
    // Open the most recent snapshot and read the first record.
    this.snapshotReader = snapshotStore.openMostRecentSnapshot();
    current = snapshotReader.read();
    // Create a snapshot writer for this pass.
    this.snapshotWriter =
        new OrderedSnapshotWriter(snapshotStore.openNewSnapshotWriter());
    for (DocumentSnapshot ss : query) {
      if (!isRunning) {
        LOG.log(Level.INFO, "Exiting the monitor thread " + name
            + " " + this);
        throw new InterruptedException();
      }
      if (Thread.currentThread().isInterrupted()) {
        throw new InterruptedException();
      }
      // Report the old entries processDeletes() treats as deleted, then
      // handle ss itself.
      processDeletes(ss);
      safelyProcessDocumentSnapshot(ss);
    }
    // Take care of any trailing paths in the snapshot.
    processDeletes(null);
  } finally {
    try {
      snapshotStore.close(snapshotReader, snapshotWriter);
    } catch (IOException e) {
      LOG.log(Level.WARNING, "Failed closing snapshot reader and writer.", e);
      // Try to proceed anyway. Weird they are not closing.
    }
  }
  if (current != null) {
    throw new IllegalStateException(
        "Should not finish pass until entire read snapshot is consumed.");
  }
  callback.passComplete(getCheckpoint(-1));
  snapshotStore.deleteOldSnapshots();
  if (!callback.hasEnqueuedAtLeastOneChangeThisPass()) {
    // No monitor checkpoints from this pass went to queue because
    // there were no changes, so we can delete the snapshot we just wrote.
    java.io.File unneeded = new java.io.File(snapshotWriter.getPath());
    if (!unneeded.delete()) {
      // Not fatal for the pass, but report it rather than fail silently.
      LOG.warning("Repository Monitor " + name
          + " could not delete unneeded snapshot file: " + unneeded);
    }
  }
  snapshotWriter = null;
  snapshotReader = null;
}
/**
 * Process snapshot entries as deletes until {@code current} catches up with
 * {@code documentSnapshot}. Or, if {@code documentSnapshot} is {@code null},
 * process all remaining snapshot entries as deletes.
 *
 * @param documentSnapshot where to stop
 * @throws SnapshotReaderException
 * @throws InterruptedException
 */
private void processDeletes(DocumentSnapshot documentSnapshot)
    throws SnapshotReaderException, InterruptedException {
  while (current != null) {
    // Stop once the old entry no longer sorts before the given document.
    if (documentSnapshot != null
        && COMPARATOR.compare(documentSnapshot, current) <= 0) {
      return;
    }
    callback.deletedDocument(
        new DeleteDocumentHandle(current.getDocumentId()), getCheckpoint());
    current = snapshotReader.read();
  }
}
/**
 * Processes one repository document, tolerating a
 * {@link RepositoryException}. On such a failure the document id is
 * recorded in the document sink with {@code FilterReason.IO_EXCEPTION} and
 * the exception is logged; the pass then carries on.
 *
 * @param snapshot the document to process
 * @throws InterruptedException
 * @throws SnapshotReaderException
 * @throws SnapshotWriterException
 */
private void safelyProcessDocumentSnapshot(DocumentSnapshot snapshot)
    throws InterruptedException, SnapshotReaderException,
    SnapshotWriterException {
  try {
    processDocument(snapshot);
  } catch (RepositoryException re) {
    documentSink.add(snapshot.getDocumentId(), FilterReason.IO_EXCEPTION);
    // Keep the cause visible; it used to be dropped without a trace.
    LOG.log(Level.WARNING, "Repository Monitor " + name
        + " failed to process document " + snapshot.getDocumentId(), re);
  }
}
/**
 * Processes a document found in the document repository. A document that
 * compares equal to {@code current} is handed to
 * {@link #processPossibleChange}; any other document is treated as new.
 *
 * @param documentSnapshot
 * @throws RepositoryException
 * @throws InterruptedException
 * @throws SnapshotReaderException
 * @throws SnapshotWriterException
 */
private void processDocument(DocumentSnapshot documentSnapshot)
    throws InterruptedException, RepositoryException, SnapshotReaderException,
    SnapshotWriterException {
  // At this point 'current' >= 'documentSnapshot', or possibly
  // current == null if we've processed the previous snapshot entirely.
  boolean seenInPreviousScan = current != null
      && COMPARATOR.compare(documentSnapshot, current) == 0;
  if (seenInPreviousScan) {
    processPossibleChange(documentSnapshot);
    return;
  }
  // This document didn't exist during the previous scan.
  DocumentHandle added = documentSnapshot.getUpdate(null);
  snapshotWriter.write(documentSnapshot);
  // Null if filtered due to mime-type.
  if (added != null) {
    callback.newDocument(added, getCheckpoint(-1));
  }
}
/**
 * Processes a document found in the document repository that also appeared
 * in the previous scan. Determines whether the document has changed,
 * propagates changes to the client, writes the snapshot record and then
 * advances {@code current} to the next record of the previous snapshot.
 * If computing the update throws, none of those later steps happen.
 *
 * @param documentSnapshot
 * @throws RepositoryException
 * @throws InterruptedException
 * @throws SnapshotWriterException
 * @throws SnapshotReaderException
 */
private void processPossibleChange(DocumentSnapshot documentSnapshot)
    throws RepositoryException, InterruptedException, SnapshotWriterException,
    SnapshotReaderException {
  DocumentHandle update = documentSnapshot.getUpdate(current);
  snapshotWriter.write(documentSnapshot);
  // A null handle means no change; otherwise send the gsa an update.
  if (update != null) {
    callback.changedDocument(update, getCheckpoint());
  }
  current = snapshotReader.read();
}
/**
 * Forwards the guaranteed checkpoint to the snapshot store and, once the
 * store has accepted it, remembers it for use by exception recovery.
 *
 * @param cp the checkpoint being guaranteed
 */
// Public for DocumentSnapshotRepositoryMonitorTest
@VisibleForTesting
public void acceptGuarantee(MonitorCheckpoint cp) {
  this.snapshotStore.acceptGuarantee(cp);
  this.guaranteeCheckpoint = cp;
}
/**
* Test hook: runs {@code tryToRunForever()} on the calling thread, without
* the NDC setup and exception recovery that {@link #run()} adds around it.
*
* @throws InterruptedException if the traversal loop is interrupted
*/
@VisibleForTesting
public void testTraversalSchedule()
throws NullPointerException, InterruptedException {
tryToRunForever();
}
/**
 * Asks the monitor to stop voluntarily. Only clears the running flag; the
 * pass loop checks that flag before handling each document.
 */
public void shutdown() {
  LOG.warning("Shutdown the monitor thread " + name + " @ " + this);
  isRunning = false;
}
/**
 * Replaces the traversal schedule consulted before each pass and logs the
 * change.
 *
 * @param traversalSchedule the new schedule; may be {@code null}, in which
 *        case the monitor runs passes unconditionally
 */
public synchronized void setTraversalSchedule(
    TraversalSchedule traversalSchedule) {
  this.traversalSchedule = traversalSchedule;
  // Plain concatenation instead of an explicit toString() call, so a null
  // schedule is logged as "null" rather than throwing after the field has
  // already been updated.
  LOG.log(Level.INFO, "Traversal schedule for " + name + " is changed to: "
      + traversalSchedule);
}
}