/* $Id: CrawlerAgent.java 988245 2010-08-23 18:39:35Z kwright $ */ /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.manifoldcf.crawler.system; import org.apache.manifoldcf.core.interfaces.*; import org.apache.manifoldcf.agents.interfaces.*; import org.apache.manifoldcf.crawler.interfaces.*; import java.util.*; /** This is the main agent class for the crawler. */ public class CrawlerAgent implements IAgent { public static final String _rcsid = "@(#)$Id: CrawlerAgent.java 988245 2010-08-23 18:39:35Z kwright $"; // Thread objects. // These get filled in as threads are created. protected JobStartThread jobStartThread = null; protected StufferThread stufferThread = null; protected FinisherThread finisherThread = null; protected JobNotificationThread notificationThread = null; protected StartupThread startupThread = null; protected StartDeleteThread startDeleteThread = null; protected JobDeleteThread jobDeleteThread = null; protected WorkerThread[] workerThreads = null; protected ExpireStufferThread expireStufferThread = null; protected ExpireThread[] expireThreads = null; protected DocumentDeleteStufferThread deleteStufferThread = null; protected DocumentDeleteThread[] deleteThreads = null; protected DocumentCleanupStufferThread cleanupStufferThread = null; protected DocumentCleanupThread[] cleanupThreads = null; protected JobResetThread jobResetThread = null; protected SeedingThread seedingThread = null; protected IdleCleanupThread idleCleanupThread = null; protected SetPriorityThread setPriorityThread = null; protected HistoryCleanupThread historyCleanupThread = null; protected AssessmentThread assessmentThread = null; // Reset managers /** Worker thread pool reset manager */ protected WorkerResetManager workerResetManager = null; /** Delete thread pool reset manager */ protected DocDeleteResetManager docDeleteResetManager = null; /** Cleanup thread pool reset manager */ protected DocCleanupResetManager docCleanupResetManager = null; // Number of worker threads protected int numWorkerThreads = 0; // Number of delete threads protected int numDeleteThreads = 0; // Number of cleanup threads protected int numCleanupThreads = 0; // Number of expiration threads protected int numExpireThreads = 0; // Factor for low water level in queueing protected float lowWaterFactor = 5.0f; // Factor in amount to stuff protected float stuffAmtFactor = 0.5f; /** Process identifier for this agent */ protected String processID = null; /** Constructor. *@param threadContext is the thread context. */ public CrawlerAgent() throws ManifoldCFException { } /** Initialize agent environment. * This is called before any of the other operations are called, and is meant to insure that * the environment is properly initialized. */ public void initialize(IThreadContext threadContext) throws ManifoldCFException { org.apache.manifoldcf.authorities.system.ManifoldCF.localInitialize(threadContext); org.apache.manifoldcf.crawler.system.ManifoldCF.localInitialize(threadContext); } /** Tear down agent environment. * This is called after all the other operations are completed, and is meant to allow * environment resources to be freed. */ public void cleanUp(IThreadContext threadContext) throws ManifoldCFException { org.apache.manifoldcf.crawler.system.ManifoldCF.localCleanup(threadContext); org.apache.manifoldcf.authorities.system.ManifoldCF.localCleanup(threadContext); } /** Install agent. This usually installs the agent's database tables etc. */ @Override public void install(IThreadContext threadContext) throws ManifoldCFException { // Install the system tables for the crawler. ManifoldCF.installSystemTables(threadContext); } /** Uninstall agent. This must clean up everything the agent is responsible for. */ @Override public void deinstall(IThreadContext threadContext) throws ManifoldCFException { ManifoldCF.deinstallSystemTables(threadContext); } /** Called ONLY when no other active services of this kind are running. Meant to be * used after the cluster has been down for an indeterminate period of time. */ @Override public void clusterInit(IThreadContext threadContext) throws ManifoldCFException { IJobManager jobManager = JobManagerFactory.make(threadContext); jobManager.prepareForClusterStart(); } /** Cleanup after ALL agents processes. * Call this method to clean up dangling persistent state when a cluster is just starting * to come up. This method CANNOT be called when there are any active agents * processes at all. *@param processID is the current process ID. */ @Override public void cleanUpAllAgentData(IThreadContext threadContext, String currentProcessID) throws ManifoldCFException { IJobManager jobManager = JobManagerFactory.make(threadContext); jobManager.cleanupProcessData(); // What kind of reprioritization should be done here? // Answer: since we basically keep everything in the database now, the only kind of reprioritization we need // to take care of are dangling ones that won't get done because the process that was doing them went // away. BUT: somebody may have blown away lock info, in which case we won't know anything at all. // So we do everything in that case. ManifoldCF.resetAllDocumentPriorities(threadContext,currentProcessID); } /** Cleanup after agents process. * Call this method to clean up dangling persistent state after agent has been stopped. * This method CANNOT be called when the agent is active, but it can * be called at any time and by any process in order to guarantee that a terminated * agent does not block other agents from completing their tasks. *@param currentProcessID is the current process ID. *@param cleanupProcessID is the process ID of the agent to clean up after. */ @Override public void cleanUpAgentData(IThreadContext threadContext, String currentProcessID, String cleanupProcessID) throws ManifoldCFException { IJobManager jobManager = JobManagerFactory.make(threadContext); jobManager.cleanupProcessData(cleanupProcessID); // If one agents process was starting a reprioritization, it could have started the reprioritization sequence, but // failed to complete it. If so, we may need to reset/complete the reprioritization sequence, which is defined as: // - Resetting prioritization parameters // - Removing all existing document priorities // These must go together in order for the reset to be correct. IReprioritizationTracker rt = ReprioritizationTrackerFactory.make(threadContext); String reproID = rt.isSpecifiedProcessReprioritizing(cleanupProcessID); if (reproID != null) { // We have to take over the prioritization for the process, which apparently died // in the middle. jobManager.clearAllDocumentPriorities(); /* IRepositoryConnectionManager connectionManager = RepositoryConnectionManagerFactory.make(threadContext); // Reprioritize all documents in the jobqueue, 1000 at a time Map<String,IRepositoryConnection> connectionMap = new HashMap<String,IRepositoryConnection>(); Map<Long,IJobDescription> jobDescriptionMap = new HashMap<Long,IJobDescription>(); // Do the 'not yet processed' documents only. Documents that are queued for reprocessing will be assigned // new priorities. Already processed documents won't. This guarantees that our bins are appropriate for current thread // activity. // In order for this to be the correct functionality, ALL reseeding and requeuing operations MUST reset the associated document // priorities. // ??? -- start the process of reprioritization ONLY; don't do the whole thing. while (true) { long startTime = System.currentTimeMillis(); Long currentTimeValue = rt.checkReprioritizationInProgress(); if (currentTimeValue == null) { // Some other process or thread superceded us. return; } long updateTime = currentTimeValue.longValue(); DocumentDescription[] docs = jobManager.getNextNotYetProcessedReprioritizationDocuments(10000); if (docs.length == 0) break; // Calculate new priorities for all these documents ManifoldCF.writeDocumentPriorities(threadContext,docs,connectionMap,jobDescriptionMap); Logging.threads.debug("Reprioritized "+Integer.toString(docs.length)+" not-yet-processed documents in "+new Long(System.currentTimeMillis()-startTime)+" ms"); } */ rt.doneReprioritization(reproID); } } /** Start the agent. This method should spin up the agent threads, and * then return. */ @Override public void startAgent(IThreadContext threadContext, String processID) throws ManifoldCFException { this.processID = processID; startSystem(threadContext); } /** Stop the agent. This should shut down the agent threads etc. */ @Override public void stopAgent(IThreadContext threadContext) throws ManifoldCFException { stopSystem(threadContext); } /** Request permission from agent to delete an output connection. *@param connName is the name of the output connection. *@return true if the connection is in use, false otherwise. */ @Override public boolean isOutputConnectionInUse(IThreadContext threadContext, String connName) throws ManifoldCFException { // Check with job manager. IJobManager jobManager = JobManagerFactory.make(threadContext); return jobManager.checkIfOutputReference(connName); } /** Note the deregistration of a set of output connections. *@param connectionNames are the names of the connections being deregistered. */ @Override public void noteOutputConnectorDeregistration(IThreadContext threadContext, String[] connectionNames) throws ManifoldCFException { // Notify job manager IJobManager jobManager = JobManagerFactory.make(threadContext); jobManager.noteOutputConnectorDeregistration(connectionNames); } /** Note the registration of a set of output connections. *@param connectionNames are the names of the connections being registered. */ @Override public void noteOutputConnectorRegistration(IThreadContext threadContext, String[] connectionNames) throws ManifoldCFException { // Notify job manager IJobManager jobManager = JobManagerFactory.make(threadContext); jobManager.noteOutputConnectorRegistration(connectionNames); } /** Note a change in configuration for an output connection. *@param connectionName is the name of the connections being changed. */ @Override public void noteOutputConnectionChange(IThreadContext threadContext, String connectionName) throws ManifoldCFException { // Notify job manager IJobManager jobManager = JobManagerFactory.make(threadContext); jobManager.noteOutputConnectionChange(connectionName); } /** Request permission from agent to delete a transformation connection. *@param connName is the name of the transformation connection. *@return true if the connection is in use, false otherwise. */ @Override public boolean isTransformationConnectionInUse(IThreadContext threadContext, String connName) throws ManifoldCFException { // Check with job manager. IJobManager jobManager = JobManagerFactory.make(threadContext); return jobManager.checkIfTransformationReference(connName); } /** Note the deregistration of a set of transformation connections. *@param connectionNames are the names of the connections being deregistered. */ @Override public void noteTransformationConnectorDeregistration(IThreadContext threadContext, String[] connectionNames) throws ManifoldCFException { // Notify job manager IJobManager jobManager = JobManagerFactory.make(threadContext); jobManager.noteTransformationConnectorDeregistration(connectionNames); } /** Note the registration of a set of transformation connections. *@param connectionNames are the names of the connections being registered. */ @Override public void noteTransformationConnectorRegistration(IThreadContext threadContext, String[] connectionNames) throws ManifoldCFException { // Notify job manager IJobManager jobManager = JobManagerFactory.make(threadContext); jobManager.noteTransformationConnectorRegistration(connectionNames); } /** Note a change in configuration for a transformation connection. *@param connectionName is the name of the connection being changed. */ @Override public void noteTransformationConnectionChange(IThreadContext threadContext, String connectionName) throws ManifoldCFException { // Notify job manager IJobManager jobManager = JobManagerFactory.make(threadContext); jobManager.noteTransformationConnectionChange(connectionName); } /** Start everything. */ public void startSystem(IThreadContext threadContext) throws ManifoldCFException { Logging.root.info("Starting up pull-agent..."); // Now, start all the threads numWorkerThreads = ManifoldCF.getMaxWorkerThreads(threadContext); if (numWorkerThreads < 1 || numWorkerThreads > 300) throw new ManifoldCFException("Illegal value for the number of worker threads", ManifoldCFException.SETUP_ERROR); numDeleteThreads = ManifoldCF.getMaxDeleteThreads(threadContext); numCleanupThreads = ManifoldCF.getMaxCleanupThreads(threadContext); numExpireThreads = ManifoldCF.getMaxExpireThreads(threadContext); if (numDeleteThreads < 1 || numDeleteThreads > 300) throw new ManifoldCFException("Illegal value for the number of delete threads", ManifoldCFException.SETUP_ERROR); if (numCleanupThreads < 1 || numCleanupThreads > 300) throw new ManifoldCFException("Illegal value for the number of cleanup threads", ManifoldCFException.SETUP_ERROR); if (numExpireThreads < 1 || numExpireThreads > 300) throw new ManifoldCFException("Illegal value for the number of expire threads", ManifoldCFException.SETUP_ERROR); lowWaterFactor = (float)LockManagerFactory.getDoubleProperty(threadContext,ManifoldCF.lowWaterFactorProperty,5.0); if (lowWaterFactor < 1.0 || lowWaterFactor > 1000.0) throw new ManifoldCFException("Illegal value for the low water factor", ManifoldCFException.SETUP_ERROR); stuffAmtFactor = (float)LockManagerFactory.getDoubleProperty(threadContext,ManifoldCF.stuffAmtFactorProperty,2.0); if (stuffAmtFactor < 0.1 || stuffAmtFactor > 1000.0) throw new ManifoldCFException("Illegal value for the stuffing amount factor", ManifoldCFException.SETUP_ERROR); // Create the threads and objects. This MUST be completed before there is any chance of "shutdownSystem" getting called. QueueTracker queueTracker = new QueueTracker(); DocumentQueue documentQueue = new DocumentQueue(); DocumentDeleteQueue documentDeleteQueue = new DocumentDeleteQueue(); DocumentCleanupQueue documentCleanupQueue = new DocumentCleanupQueue(); DocumentCleanupQueue expireQueue = new DocumentCleanupQueue(); BlockingDocuments blockingDocuments = new BlockingDocuments(); workerResetManager = new WorkerResetManager(documentQueue,expireQueue,processID); docDeleteResetManager = new DocDeleteResetManager(documentDeleteQueue,processID); docCleanupResetManager = new DocCleanupResetManager(documentCleanupQueue,processID); jobStartThread = new JobStartThread(processID); startupThread = new StartupThread(new StartupResetManager(processID),processID); startDeleteThread = new StartDeleteThread(new DeleteStartupResetManager(processID),processID); finisherThread = new FinisherThread(processID); notificationThread = new JobNotificationThread(new NotificationResetManager(processID),processID); jobDeleteThread = new JobDeleteThread(processID); stufferThread = new StufferThread(documentQueue,numWorkerThreads,workerResetManager,queueTracker,blockingDocuments,lowWaterFactor,stuffAmtFactor,processID); expireStufferThread = new ExpireStufferThread(expireQueue,numExpireThreads,workerResetManager,processID); setPriorityThread = new SetPriorityThread(numWorkerThreads,blockingDocuments,processID); historyCleanupThread = new HistoryCleanupThread(processID); workerThreads = new WorkerThread[numWorkerThreads]; int i = 0; while (i < numWorkerThreads) { workerThreads[i] = new WorkerThread(Integer.toString(i),documentQueue,workerResetManager,queueTracker,processID); i++; } expireThreads = new ExpireThread[numExpireThreads]; i = 0; while (i < numExpireThreads) { expireThreads[i] = new ExpireThread(Integer.toString(i),expireQueue,workerResetManager,processID); i++; } deleteStufferThread = new DocumentDeleteStufferThread(documentDeleteQueue,numDeleteThreads,docDeleteResetManager,processID); deleteThreads = new DocumentDeleteThread[numDeleteThreads]; i = 0; while (i < numDeleteThreads) { deleteThreads[i] = new DocumentDeleteThread(Integer.toString(i),documentDeleteQueue,docDeleteResetManager,processID); i++; } cleanupStufferThread = new DocumentCleanupStufferThread(documentCleanupQueue,numCleanupThreads,docCleanupResetManager,processID); cleanupThreads = new DocumentCleanupThread[numCleanupThreads]; i = 0; while (i < numCleanupThreads) { cleanupThreads[i] = new DocumentCleanupThread(Integer.toString(i),documentCleanupQueue,docCleanupResetManager,processID); i++; } jobResetThread = new JobResetThread(processID); seedingThread = new SeedingThread(new SeedingResetManager(processID),processID); idleCleanupThread = new IdleCleanupThread(processID); assessmentThread = new AssessmentThread(processID); // Start all the threads jobStartThread.start(); startupThread.start(); startDeleteThread.start(); finisherThread.start(); notificationThread.start(); jobDeleteThread.start(); stufferThread.start(); expireStufferThread.start(); setPriorityThread.start(); historyCleanupThread.start(); i = 0; while (i < numWorkerThreads) { workerThreads[i].start(); i++; } i = 0; while (i < numExpireThreads) { expireThreads[i].start(); i++; } cleanupStufferThread.start(); i = 0; while (i < numCleanupThreads) { cleanupThreads[i].start(); i++; } deleteStufferThread.start(); i = 0; while (i < numDeleteThreads) { deleteThreads[i].start(); i++; } jobResetThread.start(); seedingThread.start(); idleCleanupThread.start(); assessmentThread.start(); Logging.root.info("Pull-agent started"); } /** Stop the system. */ public void stopSystem(IThreadContext threadContext) throws ManifoldCFException { Logging.root.info("Shutting down pull-agent..."); while (jobDeleteThread != null || startupThread != null || startDeleteThread != null || jobStartThread != null || stufferThread != null || finisherThread != null || notificationThread != null || workerThreads != null || expireStufferThread != null || expireThreads != null || deleteStufferThread != null || deleteThreads != null || cleanupStufferThread != null || cleanupThreads != null || jobResetThread != null || seedingThread != null || idleCleanupThread != null || assessmentThread != null || setPriorityThread != null || historyCleanupThread != null) { // Send an interrupt to all threads that are still there. // In theory, this only needs to be done once. In practice, I have seen cases where the thread loses track of the fact that it has been // interrupted (which may be a JVM bug - who knows?), but in any case there's no harm in doing it again. if (historyCleanupThread != null) { historyCleanupThread.interrupt(); } if (setPriorityThread != null) { setPriorityThread.interrupt(); } if (jobStartThread != null) { jobStartThread.interrupt(); } if (jobDeleteThread != null) { jobDeleteThread.interrupt(); } if (startupThread != null) { startupThread.interrupt(); } if (startDeleteThread != null) { startDeleteThread.interrupt(); } if (stufferThread != null) { stufferThread.interrupt(); } if (expireStufferThread != null) { expireStufferThread.interrupt(); } if (finisherThread != null) { finisherThread.interrupt(); } if (notificationThread != null) { notificationThread.interrupt(); } if (workerThreads != null) { int i = 0; while (i < workerThreads.length) { Thread workerThread = workerThreads[i++]; if (workerThread != null) workerThread.interrupt(); } } if (expireThreads != null) { int i = 0; while (i < expireThreads.length) { Thread expireThread = expireThreads[i++]; if (expireThread != null) expireThread.interrupt(); } } if (cleanupStufferThread != null) { cleanupStufferThread.interrupt(); } if (cleanupThreads != null) { int i = 0; while (i < cleanupThreads.length) { Thread cleanupThread = cleanupThreads[i++]; if (cleanupThread != null) cleanupThread.interrupt(); } } if (deleteStufferThread != null) { deleteStufferThread.interrupt(); } if (deleteThreads != null) { int i = 0; while (i < deleteThreads.length) { Thread deleteThread = deleteThreads[i++]; if (deleteThread != null) deleteThread.interrupt(); } } if (jobResetThread != null) { jobResetThread.interrupt(); } if (seedingThread != null) { seedingThread.interrupt(); } if (idleCleanupThread != null) { idleCleanupThread.interrupt(); } if (assessmentThread != null) { assessmentThread.interrupt(); } // Now, wait for all threads to die. try { ManifoldCF.sleep(1000L); } catch (InterruptedException e) { } // Check to see which died. if (historyCleanupThread != null) { if (!historyCleanupThread.isAlive()) historyCleanupThread = null; } if (setPriorityThread != null) { if (!setPriorityThread.isAlive()) setPriorityThread = null; } if (jobDeleteThread != null) { if (!jobDeleteThread.isAlive()) jobDeleteThread = null; } if (startupThread != null) { if (!startupThread.isAlive()) startupThread = null; } if (startDeleteThread != null) { if (!startDeleteThread.isAlive()) startDeleteThread = null; } if (jobStartThread != null) { if (!jobStartThread.isAlive()) jobStartThread = null; } if (stufferThread != null) { if (!stufferThread.isAlive()) stufferThread = null; } if (expireStufferThread != null) { if (!expireStufferThread.isAlive()) expireStufferThread = null; } if (finisherThread != null) { if (!finisherThread.isAlive()) finisherThread = null; } if (notificationThread != null) { if (!notificationThread.isAlive()) notificationThread = null; } if (workerThreads != null) { int i = 0; boolean isAlive = false; while (i < workerThreads.length) { Thread workerThread = workerThreads[i]; if (workerThread != null) { if (!workerThread.isAlive()) workerThreads[i] = null; else isAlive = true; } i++; } if (!isAlive) workerThreads = null; } if (expireThreads != null) { int i = 0; boolean isAlive = false; while (i < expireThreads.length) { Thread expireThread = expireThreads[i]; if (expireThread != null) { if (!expireThread.isAlive()) expireThreads[i] = null; else isAlive = true; } i++; } if (!isAlive) expireThreads = null; } if (cleanupStufferThread != null) { if (!cleanupStufferThread.isAlive()) cleanupStufferThread = null; } if (cleanupThreads != null) { int i = 0; boolean isAlive = false; while (i < cleanupThreads.length) { Thread cleanupThread = cleanupThreads[i]; if (cleanupThread != null) { if (!cleanupThread.isAlive()) cleanupThreads[i] = null; else isAlive = true; } i++; } if (!isAlive) cleanupThreads = null; } if (deleteStufferThread != null) { if (!deleteStufferThread.isAlive()) deleteStufferThread = null; } if (deleteThreads != null) { int i = 0; boolean isAlive = false; while (i < deleteThreads.length) { Thread deleteThread = deleteThreads[i]; if (deleteThread != null) { if (!deleteThread.isAlive()) deleteThreads[i] = null; else isAlive = true; } i++; } if (!isAlive) deleteThreads = null; } if (jobResetThread != null) { if (!jobResetThread.isAlive()) jobResetThread = null; } if (seedingThread != null) { if (!seedingThread.isAlive()) seedingThread = null; } if (idleCleanupThread != null) { if (!idleCleanupThread.isAlive()) idleCleanupThread = null; } if (assessmentThread != null) { if (!assessmentThread.isAlive()) assessmentThread = null; } } // Threads are down; release connectors RepositoryConnectorPoolFactory.make(threadContext).flushUnusedConnectors(); NotificationConnectorPoolFactory.make(threadContext).flushUnusedConnectors(); numWorkerThreads = 0; numDeleteThreads = 0; numExpireThreads = 0; Logging.root.info("Pull-agent successfully shut down"); } }