package org.gbif.checklistbank.index; import org.gbif.api.model.checklistbank.NameUsage; import org.gbif.api.model.checklistbank.ParsedName; import org.gbif.api.model.common.paging.PagingRequest; import org.gbif.api.service.checklistbank.DescriptionService; import org.gbif.api.service.checklistbank.DistributionService; import org.gbif.api.service.checklistbank.SpeciesProfileService; import org.gbif.api.service.checklistbank.VernacularNameService; import org.gbif.checklistbank.index.guice.Solr; import org.gbif.checklistbank.index.model.SolrUsage; import org.gbif.checklistbank.logging.LogContext; import org.gbif.checklistbank.model.UsageExtensions; import org.gbif.checklistbank.model.UsageForeignKeys; import org.gbif.checklistbank.service.DatasetImportService; import org.gbif.checklistbank.service.ImporterCallback; import org.gbif.checklistbank.service.UsageService; import org.gbif.utils.concurrent.ExecutorUtils; import org.gbif.utils.concurrent.NamedThreadFactory; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.UUID; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; import javax.annotation.Nullable; import com.codahale.metrics.Meter; import com.google.common.base.Function; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.inject.Inject; import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.common.SolrInputDocument; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Service that updates a solr checklistbank index in real time. * A maximum of one minute is allowed for a commit to happen. */ public class NameUsageIndexServiceSolr implements DatasetImportService { private static final Logger LOG = LoggerFactory.getLogger(NameUsageIndexServiceSolr.class); private final static String NAME = "sync-solr"; private final NameUsageDocConverter converter = new NameUsageDocConverter(); private final SolrClient solr; private final int batchSize = 25; private final UsageService usageService; private final VernacularNameService vernacularNameService; private final DescriptionService descriptionService; private final DistributionService distributionService; private final SpeciesProfileService speciesProfileService; // consider only some extension records at most private final PagingRequest page = new PagingRequest(0, 500); private final ExecutorService exec; private ConcurrentLinkedQueue<Future<?>> tasks = new ConcurrentLinkedQueue<>(); private final Meter updMeter = new Meter(); private final AtomicInteger updCounter = new AtomicInteger(0); @Inject public NameUsageIndexServiceSolr( SolrClient solr, UsageService usageService, VernacularNameService vernacularNameService, DescriptionService descriptionService, DistributionService distributionService, SpeciesProfileService speciesProfileService, @Solr Integer syncThreads ) { this.solr = solr; this.usageService = usageService; this.vernacularNameService = vernacularNameService; this.descriptionService = descriptionService; this.distributionService = distributionService; this.speciesProfileService = speciesProfileService; exec = Executors.newFixedThreadPool(syncThreads, new NamedThreadFactory(NAME)); } private <T> Future<T> addTask(Callable<T> task) { Future<T> f = exec.submit(task); tasks.add(f); return f; } private void insertOrUpdateByKey(Iterable<Integer> keys) { insertOrUpdate(Iterables.transform(keys, new Function<Integer, SolrUsage>() { @Nullable @Override public SolrUsage apply(Integer id) { int key = id; // we use the list service for just one record cause its more effective // leaving out fields that we do not index in solr List<NameUsage> range = usageService.listRange(key, key); if (range.isEmpty()) { return null; } else { NameUsage u = range.get(0); UsageExtensions ext = new UsageExtensions(); ext.distributions = distributionService.listByUsage(key, page).getResults(); ext.descriptions = descriptionService.listByUsage(key, page).getResults(); ext.vernacularNames = vernacularNameService.listByUsage(key, page).getResults(); ext.speciesProfiles = speciesProfileService.listByUsage(key, page).getResults(); return new SolrUsage(u, usageService.listParents(key), ext); } } })); } public void insertOrUpdate(Iterable<SolrUsage> usages) { UUID datasetKey = null; for (Iterable<SolrUsage> batch : Iterables.partition(usages, batchSize)) { List<SolrInputDocument> docs = Lists.newArrayList(); for (SolrUsage u : batch) { if (u == null) continue; if (datasetKey==null) { datasetKey=u.usage.getDatasetKey(); } docs.add(converter.toDoc(u.usage, u.parents, u.extensions)); } try { if (!docs.isEmpty()) { solr.add(docs); updMeter.mark(); int cnt = updCounter.incrementAndGet(); if (cnt % 10000 == 0) { LogContext.startDataset(datasetKey); LOG.info("Synced {} usages, mean rate={}", cnt, updMeter.getMeanRate()); LogContext.endDataset(); } } } catch (Exception e) { throw new RuntimeException(e); } } } @Override public Future<List<Integer>> updateForeignKeys(UUID datasetKey, List<UsageForeignKeys> fks) { List<Integer> usageKeys = Lists.newArrayList(); for (UsageForeignKeys fk : fks) { usageKeys.add(fk.getUsageKey()); } return addTask(new SolrUpdateMybatis(usageKeys)); } @Override public Future<List<Integer>> sync(UUID datasetKey, ImporterCallback dao, Iterable<Integer> usageNeoIds) { return addTask(new SolrUpdateCallback(dao, usageNeoIds)); } /** * @param names list of names being ignored. Can be null! */ @Override public Future<List<NameUsage>> sync(UUID datasetKey, ImporterCallback dao, List<NameUsage> usages, @Nullable List<ParsedName> names) { return addTask(new SolrUpdateProParte(usages)); } @Override public void insertNubRelations(UUID datasetKey, Map<Integer, Integer> relations) { exec.submit(new SolrUpdateMybatis(Lists.<Integer>newArrayList(relations.keySet()))); } @Override public int deleteDataset(UUID datasetKey) { try { solr.deleteByQuery("dataset_key:"+datasetKey.toString()); return 0; } catch (Exception e) { throw new RuntimeException(e); } } @Override public Future<List<Integer>> deleteUsages(UUID datasetKey, List<Integer> usageKeys) { return addTask(new SolrDelete(usageKeys)); } @Override public boolean isRunning() { Iterator<Future<?>> iter = tasks.iterator(); while(iter.hasNext()) { Future<?> f = iter.next(); if (f.isDone()) { iter.remove(); } else { return true; } } return false; } @Override public void close() throws Exception { ExecutorUtils.stop(exec, 60, TimeUnit.SECONDS); } class SolrUpdateProParte implements Callable<List<NameUsage>> { private final List<NameUsage> usages; public SolrUpdateProParte(List<NameUsage> usages) { this.usages = usages; } @Override public List<NameUsage> call() throws Exception { insertOrUpdate(Lists.transform(usages, new Function<NameUsage, SolrUsage>() { @Override public SolrUsage apply(NameUsage u) { // the pro parte usage itself might not yet be synced... // so we get list of parent ids from parent which must exist in postgres already! List<Integer> parents = Lists.newArrayList(); if (u.getAcceptedKey() != null) { parents.add(u.getAcceptedKey()); parents.addAll(usageService.listParents(u.getAcceptedKey())); } else if (u.getParentKey() != null) { parents.add(u.getParentKey()); parents.addAll(usageService.listParents(u.getParentKey())); } return new SolrUsage(u, parents,null); } })); return usages; } } /** * Updates solr by loading given usage keys from the provided ImporterCallback handler. * This is used by the neo4j backed indexing tools to provide data from neo without coupling the service here to neo4j. */ class SolrUpdateCallback implements Callable<List<Integer>> { private final Iterable<Integer> usages; private final ImporterCallback dao; /** * @param usages usage keys as required by the callback service (usually neo4j ids, NOT postgres usage keys) */ public SolrUpdateCallback(ImporterCallback dao, Iterable<Integer> usages) { this.dao = dao; this.usages = usages; } @Override public List<Integer> call() throws Exception { final List<Integer> ids = Lists.newArrayList(); insertOrUpdate(Iterables.transform(usages, new Function<Integer, SolrUsage>() { @Override public SolrUsage apply(Integer id) { ids.add(id); NameUsage u = dao.readUsage(id); UsageExtensions e = dao.readExtensions(id); return new SolrUsage(u, usageService.listParents(u.getKey()), e); } })); return ids; } } /** * Updates solr by loading given usage keys from mybatis. */ class SolrUpdateMybatis implements Callable<List<Integer>> { private final List<Integer> ids; public SolrUpdateMybatis(List<Integer> ids) { this.ids = ids; } @Override public List<Integer> call() throws Exception { insertOrUpdateByKey(ids); return ids; } } class SolrDelete implements Callable<List<Integer>> { private final List<Integer> ids; public SolrDelete(List<Integer> ids) { this.ids = ids; } @Override public List<Integer> call() throws Exception { if (!ids.isEmpty()) { LOG.info("Deleting {} usages from solr", ids.size()); List<String> idsAsStrings = ids.stream().map(Object::toString).collect(Collectors.toList()); solr.deleteById(idsAsStrings); } return ids; } } }