/* $Id: DocumentDeleteThread.java 988245 2010-08-23 18:39:35Z kwright $ */
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.crawler.system;
import org.apache.manifoldcf.core.interfaces.*;
import org.apache.manifoldcf.agents.interfaces.*;
import org.apache.manifoldcf.crawler.interfaces.*;
import org.apache.manifoldcf.crawler.system.Logging;
import java.util.*;
import java.lang.reflect.*;
/** This class represents a document delete thread. This thread's job is to pull document sets to be deleted off of
* a queue, and kill them. It finishes a delete set by getting rid of the corresponding rows in the job queue.
*
* There are very few decisions that this thread needs to make; essentially all the hard thought went into deciding
* what documents to queue in the first place.
*
* The only caveat is that the ingestion API may not be accepting delete requests at the time that this thread wants it
* to be able to accept them. In that case, it's acceptable for the thread to block until the ingestion service is
* functioning again.
*
* Transactions are not much needed for this class; it simply needs to not fail to remove the appropriate jobqueue
* table rows at the end of the delete.
*/
public class DocumentDeleteThread extends Thread
{
  public static final String _rcsid = "@(#)$Id: DocumentDeleteThread.java 988245 2010-08-23 18:39:35Z kwright $";

  // Local data

  /** Thread ID */
  protected final String id;
  /** This is a reference to the static main document queue */
  protected final DocumentDeleteQueue documentDeleteQueue;
  /** Delete thread pool reset manager */
  protected final DocDeleteResetManager resetManager;
  /** Process ID */
  protected final String processID;

  /** Constructor.  Names the thread after its id and marks it as a daemon thread.
  *@param id is the worker thread id.
  *@param documentDeleteQueue is the queue this thread pulls document delete sets from.
  *@param resetManager is the reset manager this thread registers with and waits on before each pass.
  *@param processID is the process ID; it is stored but not otherwise used by the code in this class.
  */
  public DocumentDeleteThread(String id, DocumentDeleteQueue documentDeleteQueue, DocDeleteResetManager resetManager, String processID)
    throws ManifoldCFException
  {
    super();
    this.id = id;
    this.documentDeleteQueue = documentDeleteQueue;
    this.resetManager = resetManager;
    this.processID = processID;
    setName("Document delete thread '"+id+"'");
    setDaemon(true);
  }

  /** Thread body.  Registers with the reset manager, builds the per-thread manager handles, and then
  * loops: wait for any pending reset, fetch a delete set from the queue, remove the documents through
  * the incremental ingester, and remove the corresponding job queue records.
  *
  * The loop exits when the thread is interrupted.  A failure while setting up the per-thread handles,
  * an out-of-memory error, or a ManifoldCFException with the SETUP_ERROR code shuts the process down
  * via System.exit().
  */
  @Override
  public void run()
  {
    resetManager.registerMe();

    try
    {
      // Create a thread context object.
      IThreadContext threadContext = ThreadContextFactory.make();
      IJobManager jobManager = JobManagerFactory.make(threadContext);
      IIncrementalIngester ingester = IncrementalIngesterFactory.make(threadContext);
      IRepositoryConnectionManager connMgr = RepositoryConnectionManagerFactory.make(threadContext);
      ITransformationConnectionManager transformationConnectionManager = TransformationConnectionManagerFactory.make(threadContext);
      IOutputConnectionManager outputConnectionManager = OutputConnectionManagerFactory.make(threadContext);

      // Loop
      while (true)
      {
        // Do another try/catch around everything in the loop
        try
        {
          // Before we begin, conditionally reset
          resetManager.waitForReset(threadContext);

          // See if there is anything on the queue for me
          DocumentDeleteSet dds = documentDeleteQueue.getDocuments();
          if (dds == null)
            // Reset
            continue;

          if (Logging.threads.isDebugEnabled())
            Logging.threads.debug("Document delete thread received "+Integer.toString(dds.getCount())+" documents to delete for job "+dds.getJobDescription().getID().toString());

          IJobDescription job = dds.getJobDescription();
          String connectionName = job.getConnectionName();
          IPipelineConnections pipelineConnections = new PipelineConnections(new PipelineSpecificationBasic(job),transformationConnectionManager,outputConnectionManager);

          try
          {
            // Do the delete work.

            // Build the parallel arrays the ingester call needs.  Every document in the set belongs to
            // the same job, so the document class is the job's connection name for all of them.
            boolean[] deleteFromQueue = new boolean[dds.getCount()];
            String[] docClassesToRemove = new String[dds.getCount()];
            String[] hashedDocsToRemove = new String[dds.getCount()];
            DeleteQueuedDocument[] docsToDelete = new DeleteQueuedDocument[dds.getCount()];
            for (int j = 0; j < dds.getCount(); j++)
            {
              DeleteQueuedDocument dqd = dds.getDocument(j);
              DocumentDescription ddd = dqd.getDocumentDescription();
              docClassesToRemove[j] = connectionName;
              hashedDocsToRemove[j] = ddd.getDocumentIdentifierHash();
              docsToDelete[j] = dqd;
              deleteFromQueue[j] = false;
            }

            OutputRemoveActivity logger = new OutputRemoveActivity(connectionName,connMgr);

            try
            {
              ingester.documentDeleteMultiple(pipelineConnections,docClassesToRemove,hashedDocsToRemove,logger);
              // The call returned without throwing, so every document is eligible for job queue removal.
              for (int j = 0; j < dds.getCount(); j++)
              {
                deleteFromQueue[j] = true;
              }
            }
            catch (ServiceInterruption e)
            {
              // We don't know which failed, or maybe they all did.
              // Go through the list of documents we just tried, and reset them on the queue based on the
              // ServiceInterruption parameters.  Then we must proceed to delete ONLY the documents that
              // were not part of the index deletion attempt.  Since every document was part of the attempt,
              // deleteFromQueue stays all-false and nothing is removed from the job queue below.
              for (int j = 0; j < dds.getCount(); j++)
              {
                DeleteQueuedDocument cqd = docsToDelete[j];
                DocumentDescription dd = cqd.getDocumentDescription();
                // To recover from an expiration failure, requeue the document to COMPLETED etc.
                jobManager.resetDeletingDocument(dd,e.getRetryTime());
                cqd.setProcessed();
              }
            }

            // Count the records we're actually going to delete
            int recordCount = 0;
            for (int j = 0; j < dds.getCount(); j++)
            {
              if (deleteFromQueue[j])
                recordCount++;
            }

            // Delete the records
            DocumentDescription[] deleteDescriptions = new DocumentDescription[recordCount];
            recordCount = 0;
            for (int j = 0; j < dds.getCount(); j++)
            {
              if (deleteFromQueue[j])
                deleteDescriptions[recordCount++] = docsToDelete[j].getDocumentDescription();
            }
            jobManager.deleteIngestedDocumentIdentifiers(deleteDescriptions);

            // Mark them as gone.  This must set the processed flag (not merely read it); otherwise the
            // finally block below would try to requeue documents whose job queue records were just deleted.
            for (int j = 0; j < dds.getCount(); j++)
            {
              if (deleteFromQueue[j])
                docsToDelete[j].setProcessed();
            }

            // Go around again
          }
          finally
          {
            // Here we should take steps to ensure that the documents that have been handed to us
            // are dealt with appropriately.  This may involve setting the document state to "complete"
            // so that they will be picked up again.
            for (int j = 0; j < dds.getCount(); j++)
            {
              DeleteQueuedDocument dqd = dds.getDocument(j);
              if (dqd.wasProcessed() == false)
              {
                // Pop this document back into the jobqueue in an appropriate state
                DocumentDescription ddd = dqd.getDocumentDescription();
                // Requeue this document!
                jobManager.resetDeletingDocument(ddd,0L);
                dqd.setProcessed();
              }
            }
          }
        }
        catch (ManifoldCFException e)
        {
          if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
            break;

          if (e.getErrorCode() == ManifoldCFException.DATABASE_CONNECTION_ERROR)
          {
            resetManager.noteEvent();

            Logging.threads.error("Document delete thread aborting and restarting due to database connection reset: "+e.getMessage(),e);
            try
            {
              // Give the database a chance to catch up/wake up
              ManifoldCF.sleep(10000L);
            }
            catch (InterruptedException se)
            {
              break;
            }
            continue;
          }

          // Log it, but keep the thread alive
          Logging.threads.error("Exception tossed: "+e.getMessage(),e);

          if (e.getErrorCode() == ManifoldCFException.SETUP_ERROR)
          {
            // Shut the whole system down!
            System.exit(1);
          }
        }
        catch (InterruptedException e)
        {
          // We're supposed to quit
          break;
        }
        catch (OutOfMemoryError e)
        {
          System.err.println("agents process ran out of memory - shutting down");
          e.printStackTrace(System.err);
          System.exit(-200);
        }
        catch (Throwable e)
        {
          // A more severe error - but stay alive
          Logging.threads.fatal("Error tossed: "+e.getMessage(),e);
        }
      }
    }
    catch (Throwable e)
    {
      // Severe error on initialization
      System.err.println("agents process could not start - shutting down");
      Logging.threads.fatal("DocumentDeleteThread initialization error tossed: "+e.getMessage(),e);
      System.exit(-300);
    }
  }

  /** The OutputRemoveActivity class.  Forwards activity records produced during document removal
  * to the repository connection manager's history, under a fixed connection name.
  */
  protected static class OutputRemoveActivity implements IOutputRemoveActivity
  {
    // Connection manager
    protected final IRepositoryConnectionManager connMgr;
    // Name of the connection the history is recorded against (the caller passes the job's connection name)
    protected final String connectionName;

    /** Constructor.
    *@param connectionName is the connection name that every history record is filed under.
    *@param connMgr is the repository connection manager used to record history.
    */
    public OutputRemoveActivity(String connectionName, IRepositoryConnectionManager connMgr)
    {
      this.connectionName = connectionName;
      this.connMgr = connMgr;
    }

    /** Record time-stamped information about the activity of the output connector.
    *@param startTime is either null or the time since the start of epoch in milliseconds (Jan 1, 1970).  Every
    *       activity has an associated time; the startTime field records when the activity began.  A null value
    *       indicates that the start time and the finishing time are the same.
    *@param activityType is a string which is fully interpretable only in the context of the connector involved, which is
    *       used to categorize what kind of activity is being recorded.  For example, a web connector might record a
    *       "fetch document" activity.  Cannot be null.
    *@param dataSize is the number of bytes of data involved in the activity, or null if not applicable.
    *@param entityURI is a (possibly long) string which identifies the object involved in the history record.
    *       The interpretation of this field will differ from connector to connector.  May be null.
    *@param resultCode contains a terse description of the result of the activity.  The description is limited in
    *       size to 255 characters, and can be interpreted only in the context of the current connector.  May be null.
    *@param resultDescription is a (possibly long) human-readable string which adds detail, if required, to the result
    *       described in the resultCode field.  This field is not meant to be queried on.  May be null.
    */
    public void recordActivity(Long startTime, String activityType, Long dataSize,
      String entityURI, String resultCode, String resultDescription)
      throws ManifoldCFException
    {
      // The final argument is passed as null, exactly as the original call did.
      connMgr.recordHistory(connectionName,startTime,activityType,dataSize,entityURI,resultCode,
        resultDescription,null);
    }
  }
}