package org.gbif.checklistbank.index.backfill;
import org.gbif.api.service.checklistbank.DescriptionService;
import org.gbif.api.service.checklistbank.DistributionService;
import org.gbif.api.service.checklistbank.SpeciesProfileService;
import org.gbif.api.service.checklistbank.VernacularNameService;
import org.gbif.checklistbank.index.NameUsageDocConverter;
import org.gbif.checklistbank.index.guice.EmbeddedSolrReference;
import org.gbif.checklistbank.index.guice.SolrIndexingModule;
import org.gbif.checklistbank.service.UsageService;
import org.gbif.checklistbank.service.mybatis.DescriptionServiceMyBatis;
import org.gbif.checklistbank.service.mybatis.DistributionServiceMyBatis;
import org.gbif.checklistbank.service.mybatis.SpeciesProfileServiceMyBatis;
import org.gbif.checklistbank.service.mybatis.VernacularNameServiceMyBatis;
import org.gbif.utils.file.ResourcesUtil;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Properties;
import java.util.concurrent.Callable;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import com.google.inject.name.Named;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.response.SolrPingResponse;
import org.apache.solr.core.CoreContainer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Checklist Bank multithreaded name usage solr indexer.
 * This class creates a pool of configurable <i>threads</i> that concurrently execute a number of jobs
 * each processing a configurable number of name usages (<i>batchSize</i>)
 * using a configurable number of concurrent lucene <i>writers</i>.
 * The indexer makes direct use of the mybatis layer and requires a checklist bank datasource to be configured.
 *
 * <p>With a single writer all jobs write straight into the main embedded solr server. With several writers
 * each writer gets its own embedded solr server in a "slice" folder below the main solr home, and the
 * slices are merged into the main lucene index once all jobs have finished.
 */
public class SolrBackfill extends NameUsageBatchProcessor {

  private static final Logger LOG = LoggerFactory.getLogger(SolrBackfill.class);

  /** Number of embedded solr servers that jobs are distributed over. Always at least 1. */
  private final int numWriters;

  // other injected instances
  private final NameUsageDocConverter solrDocumentConverter;

  /** Parent folder of all writer slices, located at {@code <solr home>/parts}. */
  private final File indexDir;

  /** Holder of the main embedded solr server; the server it holds is replaced after a merge. */
  private final EmbeddedSolrReference solrRef;

  /** One embedded solr server per writer, populated by {@link #setupServers()}. */
  private EmbeddedSolrServer[] writers;

  /**
   * Creates a new backfill instance. All arguments are provided by guice.
   *
   * @param solr reference to the main embedded solr server
   * @param threads passed on to the batch processor superclass
   * @param batchSize passed on to the batch processor superclass
   * @param numWriters number of embedded solr servers to write to, must be at least 1
   * @param logInterval passed on to the batch processor superclass
   * @throws IllegalArgumentException if numWriters is null or smaller than 1
   */
  @Inject
  public SolrBackfill(EmbeddedSolrReference solr,
                      @Named(IndexingConfigKeys.THREADS) Integer threads,
                      @Named(IndexingConfigKeys.BATCH_SIZE) Integer batchSize,
                      @Named(IndexingConfigKeys.WRITERS) Integer numWriters,
                      @Named(IndexingConfigKeys.LOG_INTERVAL) Integer logInterval,
                      UsageService nameUsageService, NameUsageDocConverter solrDocumentConverter,
                      VernacularNameService vernacularNameService, DescriptionService descriptionService,
                      DistributionService distributionService, SpeciesProfileService speciesProfileService) {
    super(threads, batchSize, logInterval, nameUsageService, vernacularNameService, descriptionService,
        distributionService, speciesProfileService);
    // fail early: zero writers would otherwise only show up as a division by zero when the first job is created
    if (numWriters == null || numWriters < 1) {
      throw new IllegalArgumentException("Number of writers must be at least 1 but was " + numWriters);
    }
    this.numWriters = numWriters;
    this.solrDocumentConverter = solrDocumentConverter;
    // final solr
    solrRef = solr;
    indexDir = new File(getSolrHome(), "parts");
    LOG.info("Creating solr indices in folder {}", indexDir.getAbsolutePath());
  }

  /**
   * Entry point for execution.
   * Commandline arguments are:
   * 0: required path to property file
   *
   * @throws IllegalArgumentException if no argument is given
   */
  public static void main(String[] args) throws Exception {
    if (args.length == 0) {
      throw new IllegalArgumentException("Path to property file required");
    }
    // Creates the injector
    Properties props = loadProperties(args[0]);
    Injector injector = Guice.createInjector(new SolrIndexingModule(props));
    // Gets the indexer instance
    SolrBackfill nameUsageIndexer = injector.getInstance(SolrBackfill.class);
    nameUsageIndexer.run();
    // This statement is used because the Guice container is not stopped inside the threadpool.
    System.exit(0);
  }

  /**
   * Populates {@link #writers}. A single writer reuses the main solr server, otherwise a dedicated
   * embedded solr server is created for every writer in its own slice folder.
   */
  private void setupServers() {
    writers = new EmbeddedSolrServer[numWriters];
    if (numWriters == 1) {
      // use main server
      writers[0] = solrRef.getSolr();
    } else {
      // create a separate server per writer
      LOG.debug("Setting up {} embedded solr servers ...", numWriters);
      for (int idx = 0; idx < numWriters; idx++) {
        writers[idx] = setupSolr(getWriterHome(idx));
      }
    }
  }

  /**
   * @return the solr home folder of the main embedded solr server
   */
  private File getSolrHome() {
    return new File(solrRef.getSolr().getCoreContainer().getSolrHome());
  }

  /**
   * Finalizes the index. With a single writer the main index is optimized in place. With several writers
   * the main solr is shut down, the lucene index of every writer slice is added to the main lucene index
   * and the main solr is started up again and stored back into the shared reference.
   *
   * NOTE(review): the writer servers are not shut down or committed in this method; confirm that their
   * indices are committed and their write locks released before the merge starts.
   */
  private void mergeIndices() throws IOException, SolrServerException {
    if (numWriters == 1) {
      LOG.info("Optimizing single solr index ...");
      solrRef.getSolr().optimize();
      return;
    }

    File solrHome = getSolrHome();
    // shutdown solr before we can merge into its index
    solrRef.getSolr().getCoreContainer().shutdown();
    Path luceneDir = getLuceneDir(solrHome);
    LOG.debug("Opening main lucene index at {}", luceneDir);

    // there is exactly one slice per writer, regardless of how many jobs have been executed
    Directory[] parts = new Directory[numWriters];
    try (FSDirectory mainDir = FSDirectory.open(luceneDir);
         IndexWriter fsWriter = new IndexWriter(mainDir, new IndexWriterConfig(new StandardAnalyzer()))) {
      LOG.info("Start merging of {} solr indices", numWriters);
      for (int idx = 0; idx < numWriters; idx++) {
        Path threadDir = getLuceneDir(getWriterHome(idx));
        LOG.info("Add lucene dir {} for merging", threadDir);
        parts[idx] = FSDirectory.open(threadDir);
      }
      fsWriter.addIndexes(parts);
    } finally {
      // runs after the index writer and the main directory have been closed
      closeQuietly(parts);
    }

    LOG.info("Lucene dirs merged! Startup main solr again");
    //startup solr again, keeping it in the same singleton wrapper that is accessible to the other tests
    solrRef.setSolr(setupSolr(solrHome));
  }

  /**
   * Closes all non null directories, logging instead of throwing so that a failure to close a source
   * directory never hides an exception raised by the merge itself.
   */
  private static void closeQuietly(Directory[] dirs) {
    for (int idx = 0; idx < dirs.length; idx++) {
      if (dirs[idx] == null) {
        continue;
      }
      try {
        dirs[idx].close();
      } catch (IOException e) {
        LOG.warn("Failed to close lucene dir of writer {}", idx, e);
      }
    }
  }

  /**
   * @param thread zero based index of the writer
   * @return the solr home folder used by the given writer, i.e. {@code <indexDir>/slice<thread>}
   */
  private File getWriterHome(int thread) {
    return new File(indexDir, "slice" + thread);
  }

  /**
   * @return the path {@code data/index} below the given solr home
   */
  private static Path getLuceneDir(File solrHome) {
    return Paths.get(solrHome.getPath(), "data/index");
  }

  /**
   * Sets up an embedded solr server for the given solr home.
   * Copies solr.xml, the default text resources and the checklistbank schema and solr config
   * from the classpath into the solr home, loads a core container on it and pings the new server.
   *
   * @param solrHome the solr home folder to set up and run the server in
   * @return the created server
   * @throws IllegalStateException if anything goes wrong during setup or the ping
   */
  private EmbeddedSolrServer setupSolr(File solrHome) {
    try {
      // copy solr resource files
      ResourcesUtil.copy(solrHome, "solr/", false, "solr.xml");
      // copy default configurations
      File conf = new File(solrHome, "conf");
      ResourcesUtil.copy(conf, "solr/default/", false, "synonyms.txt", "protwords.txt", "stopwords.txt");
      // copy the checklistbank specific schema and solr config into the same conf folder
      ResourcesUtil.copy(conf, "solr/checklistbank/conf/", false, "schema.xml", "solrconfig.xml");
      // create and load the core container
      CoreContainer coreContainer = new CoreContainer(solrHome.getAbsolutePath());
      coreContainer.load();
      EmbeddedSolrServer solrServer = new EmbeddedSolrServer(coreContainer, "");
      LOG.info("Created embedded solr server with solr dir {}", solrHome.getAbsolutePath());
      // test solr
      SolrPingResponse solrPingResponse = solrServer.ping();
      LOG.info("Solr server configured at {}, ping response in {}", solrHome.getAbsolutePath(),
          solrPingResponse.getQTime());
      return solrServer;
    } catch (Exception e) {
      throw new IllegalStateException("Solr unavailable", e);
    }
  }

  /**
   * Creates an indexing job for the given key range, assigning it to one of the configured
   * solr servers in round robin fashion based on the current job counter.
   */
  @Override
  protected Callable<Integer> newBatchJob(int startKey, int endKey, UsageService nameUsageService, VernacularNameServiceMyBatis vernacularNameService, DescriptionServiceMyBatis descriptionService, DistributionServiceMyBatis distributionService, SpeciesProfileServiceMyBatis speciesProfileService) {
    // round robin on configured solr servers
    final SolrClient solrClient = writers[jobCounter % numWriters];
    return new NameUsageIndexingJob(solrClient, nameUsageService, startKey, endKey, solrDocumentConverter,
        vernacularNameService, descriptionService, distributionService, speciesProfileService);
  }

  /**
   * Prepares the solr servers the jobs will write to.
   */
  @Override
  protected void init() throws Exception {
    // set up the solr servers, creating additional ones if multiple writers are configured
    setupServers();
  }

  /**
   * Optimizes or merges the written indices into the main solr index.
   */
  @Override
  protected void postprocess() throws Exception {
    mergeIndices();
  }
}