package org.gbif.checklistbank.index.backfill;
import org.gbif.api.service.checklistbank.DescriptionService;
import org.gbif.api.service.checklistbank.DistributionService;
import org.gbif.api.service.checklistbank.SpeciesProfileService;
import org.gbif.api.service.checklistbank.VernacularNameService;
import org.gbif.checklistbank.service.UsageService;
import org.gbif.checklistbank.service.mybatis.DescriptionServiceMyBatis;
import org.gbif.checklistbank.service.mybatis.DistributionServiceMyBatis;
import org.gbif.checklistbank.service.mybatis.SpeciesProfileServiceMyBatis;
import org.gbif.checklistbank.service.mybatis.VernacularNameServiceMyBatis;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.Charset;
import java.text.DecimalFormat;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.Callable;
import java.util.concurrent.atomic.AtomicLong;
import com.google.common.io.Files;
import org.apache.commons.lang3.time.DurationFormatUtils;
import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Checklist Bank multithreaded name usage solr indexer.
* This class creates a pool of configurable <i>threads</i> that concurrently execute a number of jobs
* each processing a configurable number of name usages (<i>batchSize</i>)
* using a configurable number of concurrent lucene <i>writers</i>.
* The indexer makes direct use of the mybatis layer and requires a checklist bank datasource to be configured.
*/
public abstract class NameUsageBatchProcessor extends ThreadPoolRunner<Integer> {
// document counter
protected static AtomicLong counter = new AtomicLong(0L);
private static final Logger LOG = LoggerFactory.getLogger(NameUsageBatchProcessor.class);
protected final int batchSize;
/**
* Log interval in seconds. Use property logInterval to set it, defaults to one minute.
*/
protected final Integer logInterval;
private CountReporter reporterThread;
// mybatis converted services exposing internal methods not avaiable in the service interface
// (would also have to be in clients then)
private final UsageService nameUsageService;
private final VernacularNameServiceMyBatis vernacularNameService;
private final DescriptionServiceMyBatis descriptionService;
private final DistributionServiceMyBatis distributionService;
private final SpeciesProfileServiceMyBatis speciesProfileService;
//
private List<Integer> allIds;
protected int jobCounter = 0;
private class CountReporter extends Thread {
/**
* Timer to measure the total time of execution
*/
private StopWatch stopWatch = new StopWatch();
private final long total;
private final DecimalFormat twoDForm = new DecimalFormat("#.##");
private boolean interrupted = false;
CountReporter(long total) {
this.total = total;
}
@Override
public void run() {
stopWatch.start();
LOG.info("Started reporting thread with expected {} total records.", total);
LOG.info("Logging every {} seconds. Use logInterval property to change interval.", logInterval);
while (!interrupted) {
log();
try {
Thread.sleep(logInterval * 1000);
} catch (InterruptedException e) {
LOG.info("Reporter thread interrupted, exiting");
interrupted = true;
}
}
LOG.info("Reporter thread stopped");
}
public void shutdown() {
interrupted = true;
}
/**
* Log total progress every minute.
*/
private void log() {
long cnt = counter.get();
double percCompleted = (double) cnt / (double) total;
double percRemaining = 1d - percCompleted;
long timeRemaining = (long) (stopWatch.getTime() * (percRemaining / percCompleted));
LOG.info("{} documents ({}%) added in {}",
new Object[] {cnt, twoDForm.format(percCompleted * 100), stopWatch.toString()});
LOG.info("Expected remaining time to finish {}", DurationFormatUtils.formatDurationHMS(timeRemaining));
}
}
public NameUsageBatchProcessor(Integer threads, int batchSize, Integer logInterval,
UsageService nameUsageService,
VernacularNameService vernacularNameService, DescriptionService descriptionService,
DistributionService distributionService, SpeciesProfileService speciesProfileService) {
super(threads);
this.logInterval = logInterval;
this.batchSize = batchSize;
// services
this.nameUsageService = nameUsageService;
this.vernacularNameService = (VernacularNameServiceMyBatis) vernacularNameService;
this.descriptionService = (DescriptionServiceMyBatis) descriptionService;
this.distributionService = (DistributionServiceMyBatis) distributionService;
this.speciesProfileService = (SpeciesProfileServiceMyBatis) speciesProfileService;
}
public static Properties loadProperties(String propertiesFile) throws IOException {
Properties tempProperties;
try (Reader reader = Files.newReader(new File(propertiesFile), Charset.defaultCharset())) {
tempProperties = new Properties();
tempProperties.load(reader);
}
return tempProperties;
}
@Override
public int run() {
int x = super.run();
LOG.info("Time taken run and finish all jobs: {}", reporterThread.stopWatch.toString());
reporterThread.shutdown();
return x;
}
/**
* Creates a list of NameUsageIndexingJob by loading all usage ids and splitting up the jobs between those ids.
*
* @return a {@link List} of {@link NameUsageIndexingJob}.
*/
@Override
protected Callable<Integer> newJob() {
if (allIds == null) {
initKeys();
try {
init();
} catch (Exception e) {
throw new RuntimeException(e);
}
}
// any new job to be created?
if (allIds.size() <= jobCounter * batchSize) {
LOG.info("No more jobs to insert. Created {} jobs in total each processing {} records.", jobCounter, batchSize);
return null;
}
// produce new job with a new slice
final int startKey = allIds.get(jobCounter * batchSize);
int endIdx = (jobCounter + 1) * batchSize - 1;
final int endKey = endIdx > allIds.size() ? allIds.get(allIds.size() - 1) : allIds.get(endIdx);
jobCounter++;
return newBatchJob(startKey, endKey, nameUsageService, vernacularNameService, descriptionService, distributionService, speciesProfileService);
}
protected abstract Callable<Integer> newBatchJob(int startKey, int endKey, UsageService nameUsageService, VernacularNameServiceMyBatis vernacularNameService, DescriptionServiceMyBatis descriptionService, DistributionServiceMyBatis distributionService, SpeciesProfileServiceMyBatis speciesProfileService);
protected abstract void init() throws Exception;
protected abstract void postprocess() throws Exception;
private void initKeys() {
StopWatch stopWatch = new StopWatch();
LOG.debug("Start retrieving all usage ids ...");
stopWatch.start();
allIds = nameUsageService.listAll();
//allIds = Lists.newArrayList(ContiguousSet.create(Range.closed(0, 13), DiscreteDomain.integers()).asList());
LOG.info("Retrieved all {} usage ids in {}", allIds.size(), stopWatch.toString());
stopWatch.reset();
stopWatch.start();
Collections.sort(allIds);
LOG.info("Sorted all {} usage ids in {}", allIds.size(), stopWatch.toString());
LOG.info("{} full jobs each processing {} records to be created.", allIds.size() / batchSize, batchSize);
// start global reporter
reporterThread = new CountReporter(allIds.size());
reporterThread.start();
}
@Override
protected void shutdownService(int tasksCount) {
try {
super.shutdownService(tasksCount);
LOG.info("All jobs completed.");
postprocess();
LOG.info("Indexing completed!");
} catch (Exception e) {
LOG.error("Error shutingdown the indexer", e);
}
}
}