package ecologylab.bigsemantics.seeding;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Queue;
import ecologylab.bigsemantics.collecting.Seeding;
import ecologylab.bigsemantics.collecting.SemanticsGlobalScope;
import ecologylab.bigsemantics.metadata.builtins.Document;
import ecologylab.bigsemantics.metadata.builtins.DocumentClosure;
import ecologylab.generic.Debug;
import ecologylab.generic.Generic;
/**
* now how SeedDistributor works:
* <p />
* searches: at most 2 searches are being processed (download and parse) at one time, in order not
* to block traffic. when there are already 2 being processed, new searches will be put in a waiting
* list.
* <p />
* search results: will be put into a priority queue when they come. will be processed by a consumer
* thread one group by one group. group size is set to 1 at this time (i.e. each time 1 search
* result is processed).
* <p />
* consumer thread: will be start when the first search request comes, and stopped when all the
* searches and search results are processed. check the search waiting list to process waiting
* searches at proper time. check the search results to process them. check for ending condition in
* order to call endSeeding().
* <p />
* -- above updated 7/24/2010, Yin Qu
*
* <p />
* <p />
* Aggregate all results across multiple searches and feeds. Interleave the results from each Seed.
* Round-robin the scheduling of parsing each.
* <p/>
* We want to keep track of: number of searches that will report to us. has each search (s1)
* started? has each search (s1) finished?
*
* @author eunyee
* @author andruid
* @param <slice>
*/
public class SeedDistributor extends Debug implements Runnable
{
public static interface DistributorContinuation
{
void distribute(DocumentClosure result);
}
/**
* initial size of the search result queue
*/
private static final int INIT_CAPACITY = 128;
private static final int MIN_INTERVAL_BTW_SEARCHES = 1000;
private static final int MIN_INTERVAL_BTW_QUEUE_PROCESSING = 1000;
/**
* to generate surrogates as soon as possible, during each queue processing we will process the
* top NUM_RESULTS_PROCESSED_EACH_TIME search results.
*/
private static final int NUM_RESULTS_PROCESSED_EACH_TIME = 1;
/**
* limit the number of searches being downloaded at one time in order to prevent blocking the
* traffic by searches, since some search engine limits the rate we can access them, and we have
* only 4 threads for downloading during seeding.
*/
private static final int MAX_NUM_SEARCHES_PROCESSING = 2;
private SemanticsGlobalScope infoCollector;
/**
* number of searches that we have to queue and process in total
*/
private int numSearchesToQueue = 0;
/**
* number of searches that have been queued to DownloadMonitor, but not yet finished
*/
private int numSearchesProcessing = 0;
/**
* number of searches that have been finished (will call doneQueueing()). track this number to
* decide when to finish seeding.
*/
private int numSearchesDone = 0;
/**
* a waiting list for search requests, in case that there are already MAX_NUM_SEARCHES_PROCESSING
* searches in processing.
*/
private final Queue<DocumentClosure> waitingSearches = new LinkedList<DocumentClosure>();
/**
* the comparator to decide the order of search results to be processed. can be customized through
* constructor. by default, search results will be ordered according to their ranks in the search
* result list.
*/
private final Comparator<DocumentClosure> comparator;
/**
* The priority queue holding (weighted) search results waiting for downloading and parsing. Note
* that PriorityQueue is not synchronized.
*/
private final PriorityQueue<DocumentClosure> queuedResults;
private final Map<DocumentClosure, DistributorContinuation> callbackMap = new HashMap<DocumentClosure, SeedDistributor.DistributorContinuation>();
private long lastSearchTimestamp;
private long lastQueueProcessingTimestamp;
private boolean started = false;
private boolean stopFlag;
public SeedDistributor(SemanticsGlobalScope infoCollector, Comparator<DocumentClosure> comparator)
{
this.infoCollector = infoCollector;
this.comparator = comparator;
this.queuedResults = new PriorityQueue<DocumentClosure>(INIT_CAPACITY, comparator);
}
public SeedDistributor(SemanticsGlobalScope infoCollector)
{
this(infoCollector, new Comparator<DocumentClosure>()
{
@Override
public int compare(DocumentClosure o1, DocumentClosure o2)
{
int i1 = getRank(o1);
int i2 = getRank(o2);
return i1 - i2;
}
});
}
/**
* Queue a search request to the SeedDistributor. The request might be put to a waiting list in
* order not to block the traffic.
*
* @param searchContainer
*/
public void queueSearchRequest(DocumentClosure searchContainer)
{
debug("search request: " + searchContainer);
numSearchesToQueue++;
if (numSearchesProcessing >= MAX_NUM_SEARCHES_PROCESSING)
{
synchronized (waitingSearches)
{
waitingSearches.offer(searchContainer);
}
}
else
{
downloadSearchRequest(searchContainer);
}
if (!started)
{
start();
}
}
/**
* Actually process a search request (send it to DownloadMonitor). Will wait for some time after
* each processing.
*
* @param searchContainer
*/
public void downloadSearchRequest(DocumentClosure searchContainer)
{
debug("queueing search request to DownloadMonitor: " + searchContainer);
searchContainer.queueDownload();
numSearchesProcessing++;
// debug("sending search request to DownloadMonitor: " + searchContainer);
waitForAtMost(lastSearchTimestamp, MIN_INTERVAL_BTW_SEARCHES);
lastSearchTimestamp = System.currentTimeMillis();
}
/**
* Called whenever a search is downloaded and parsed.
*
* @param srcDocument
* @param searchNum
* @param numResults
*/
public void doneQueueing(Document srcDocument)
{
debug("search parsed: " + srcDocument);
numSearchesProcessing--;
numSearchesDone++;
}
/**
* Queue a search result to the queue. Queued search results are interleaved and processed by the
* consumer thread.
*
* @param resultContainer
*/
public void queueResult(DocumentClosure resultContainer)
{
queueResult(resultContainer, null);
}
/**
* Queue a search result to the queue, with a specific callback method. The callback method will
* be called when the result is distributed (polled from the queue and processed), instead of
* calling queueDownload() on the result.
*
* @param resultContainer
* @param callback
*/
public void queueResult(DocumentClosure resultContainer, DistributorContinuation callback)
{
synchronized (queuedResults)
{
debug("queueing result: " + resultContainer);
queuedResults.offer(resultContainer);
if (callback != null)
{
callbackMap.put(resultContainer, callback);
}
}
}
/**
* Process one group of search results. The size of one group is controlled by
* NUM_RESULTS_PROCESSED_EACH_TIME.
*/
private void downloadResults()
{
int i = 0;
while (queuedResults.size() > 0 && i < NUM_RESULTS_PROCESSED_EACH_TIME)
{
synchronized (queuedResults)
{
if (queuedResults.size() > 0)
{
DocumentClosure downloadable = queuedResults.poll();
/*String query = getQuery(downloadable);
int rank = getRank(downloadable);
debug(String.format("sending container to DownloadMonitor: [%s:%d]%s", query, rank,
downloadable));
// downloadable.setDispatchTarget(this);
if (callbackMap.containsKey(downloadable))
{
callbackMap.get(downloadable).distribute(downloadable);
callbackMap.remove(downloadable);
}
else
{
downloadable.queueDownload();
}
i++;
*
*/
throw new RuntimeException("not implemented");
}
}
}
}
private static int getRank(DocumentClosure downloadable)
{
int r = -1;
// if (downloadable instanceof OldContainerI)
DocumentClosure container = (DocumentClosure) downloadable;
r = container.searchResult() == null ? -2 : container.searchResult().resultNum();
return r;
}
/*
private static String getQuery(DocumentClosure downloadable)
{
String query = null;
if (downloadable instanceof DocumentClosure)
{
DocumentClosure container = (DocumentClosure) downloadable;
query = container.getQuery();
}
return query == null ? "" : query;
}
*/
/**
* Start the consumer thread.
*/
public void start()
{
if (!started)
{
stopFlag = false;
debug("starting seed distributor consumer thread ...");
Thread t = new Thread(this, toString() + " consumer");
t.start();
started = true;
}
}
/**
* Stop the consumer thread.
*/
public void stop()
{
stopFlag = true;
debug("stopping seed distributor consumer thread ...");
}
/**
* Reset the SeedDistributor.
*/
public void reset()
{
this.numSearchesToQueue = 0;
this.numSearchesProcessing = 0;
this.numSearchesDone = 0;
this.queuedResults.clear();
}
private void waitForAtMost(long timestamp, int minMillis)
{
if (timestamp == 0)
{
// first time, no need to wait
return;
}
int wait = (int) (System.currentTimeMillis() - timestamp);
if (wait < 0)
wait = 0;
if (wait < minMillis)
{
int sleepMillis = minMillis - wait;
Generic.sleep((int) sleepMillis);
debug("waiting (in milliseconds): " + sleepMillis);
}
}
/**
* The consumer thread. It will be started when the first search request comes, and activated
* every MIN_INTERVAL_BTW_QUEUE_PROCESSING milliseconds.
* <p />
* During each activation, it will first check if there are waiting search requests that could be
* processed. If there are, it will poll one from the queue and process it. If there are not, it
* will process a group of search results from the priority queue.
* <p />
* At last, it will check for the ending condition in order to call endSeeding() when all the
* seeds have been processed.
*/
@Override
public void run()
{
while (!stopFlag)
{
waitForAtMost(lastQueueProcessingTimestamp, MIN_INTERVAL_BTW_QUEUE_PROCESSING);
lastQueueProcessingTimestamp = System.currentTimeMillis();
if (numSearchesProcessing < MAX_NUM_SEARCHES_PROCESSING && waitingSearches.size() > 0)
{
synchronized (waitingSearches)
{
if (waitingSearches.size() > 0)
{
DocumentClosure search = waitingSearches.poll();
downloadSearchRequest(search);
}
}
}
else
{
downloadResults();
}
checkForEndSeeding();
}
started = false;
}
/**
* Check for ending conditions and call endSeeding() at the right time. The ending condition is:
* all the searches that are queued have been downloaded and parsed, and all the search results
* have been processed.
*/
private void checkForEndSeeding()
{
int numResultsRemaining = 0;
synchronized (queuedResults)
{
numResultsRemaining = queuedResults.size();
}
debug(String.format(
"checking for endSeeding(): toQueue=%d, processing=%d, done=%d, remaining results=%d",
numSearchesToQueue, numSearchesProcessing, numSearchesDone, numResultsRemaining));
if (numSearchesDone >= numSearchesToQueue && numSearchesProcessing <= 0
&& numResultsRemaining == 0)
{
System.out
.println("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@");
System.out
.println("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@");
System.out
.println("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@");
System.out
.println("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@");
Seeding seeding = infoCollector.getSeeding();
if (seeding != null)
seeding.endSeeding();
stop();
}
}
/**
* (Currently for debugging only.)
@Override
public void continuation(AC o)
{
if (!o.isRecycled())
{
debug("done downloading: " + o);
}
}
*/
}