package org.gbif.checklistbank.service.mybatis; import org.gbif.api.model.checklistbank.NameUsage; import org.gbif.api.model.checklistbank.NameUsageMetrics; import org.gbif.api.model.checklistbank.ParsedName; import org.gbif.api.model.checklistbank.VerbatimNameUsage; import org.gbif.checklistbank.logging.LogContext; import org.gbif.checklistbank.model.UsageExtensions; import org.gbif.checklistbank.model.UsageForeignKeys; import org.gbif.checklistbank.service.DatasetImportService; import org.gbif.checklistbank.service.ImporterCallback; import org.gbif.checklistbank.service.UsageSyncService; import org.gbif.checklistbank.service.mybatis.guice.Mybatis; import org.gbif.utils.concurrent.ExecutorUtils; import org.gbif.utils.concurrent.NamedThreadFactory; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.UUID; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import com.google.common.base.Preconditions; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.inject.Inject; import it.unimi.dsi.fastutil.ints.IntOpenHashSet; import org.apache.ibatis.session.ExecutorType; import org.mybatis.guice.transactional.Transactional; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Concurrent import service for full name usages. */ public class DatasetImportServiceMyBatis implements DatasetImportService, AutoCloseable { private static final Logger LOG = LoggerFactory.getLogger(DatasetImportServiceMyBatis.class); private static final String NAME = "sync-mybatis"; private static final int BATCH_SIZE = 1000; private final UsageSyncService syncService; private ExecutorService exec; private ConcurrentLinkedQueue<Future<?>> tasks = new ConcurrentLinkedQueue<>(); @Inject public DatasetImportServiceMyBatis(UsageSyncService importService, @Mybatis Integer threads) { this.syncService = importService; LOG.info("Starting data import service with {} sync threads.", threads); exec = Executors.newFixedThreadPool(threads, new NamedThreadFactory(NAME)); } private <T> Future<T> addTask(Callable<T> task) { Future<T> f = exec.submit(task); tasks.add(f); return f; } class UsageSync implements Callable<List<Integer>> { final UUID datasetKey; final Iterable<Integer> usages; final ImporterCallback dao; private Map<Integer, Integer> usageKeys; private Set<Integer> inserts; private int firstId = -1; /** * @param dao callback to importer neo4j dao to resolve neo4j ids * @param datasetKey * @param usages list of neo4j node ids to sync from callback */ public UsageSync(ImporterCallback dao, UUID datasetKey, Iterable<Integer> usages) { this.dao = dao; this.datasetKey = datasetKey; this.usages = usages; } @Override public List<Integer> call() throws Exception { LogContext.startDataset(datasetKey); int counter = 0; LOG.debug("Starting usage sync"); usageKeys = Maps.newHashMap(); inserts = new IntOpenHashSet(); List<Integer> neoKeys = Lists.newArrayList(); for (List<Integer> neoBatch : Iterables.partition(usages, BATCH_SIZE)) { if (firstId < 0) { firstId = neoBatch.get(0); } neoKeys.addAll(neoBatch); write(neoBatch); counter = counter + neoBatch.size(); } LOG.info("Completed batch of {} usages, starting with id {}.", counter, firstId); LogContext.endDataset(); // submit extension sync job for all usages ExtensionSync eSync = new ExtensionSync(dao, datasetKey, firstId, usageKeys, inserts); dao.reportNewFuture(addTask(eSync)); return neoKeys; } @Transactional( exceptionMessage = "usage sync job failed", executorType = ExecutorType.REUSE ) private void write(List<Integer> neoNodeIdbatch) throws Exception { for (Integer id : neoNodeIdbatch) { NameUsage u = dao.readUsage(id); ParsedName pn = dao.readName(id); NameUsageMetrics m = dao.readMetrics(id); boolean insert = dao.isInsert(u); syncService.syncUsage(insert, u, pn, m); // remember usageKey and things about this record if (insert) { inserts.add(id); } usageKeys.put(id, u.getKey()); // tell main importer about the new usageKey so we can prepare usages with good foreign keys dao.reportUsageKey(id, u.getKey()); } } } class ProParteSync implements Callable<List<NameUsage>> { final ImporterCallback dao; final UUID datasetKey; final List<NameUsage> usages; final List<ParsedName> names; public ProParteSync(ImporterCallback dao, UUID datasetKey, List<NameUsage> usages, List<ParsedName> names) { this.dao = dao; this.datasetKey = datasetKey; this.usages = usages; this.names = names; Preconditions.checkArgument(usages.size() == names.size()); } @Override @Transactional( exceptionMessage = "usage sync job failed", executorType = ExecutorType.REUSE ) public List<NameUsage> call() throws Exception { LogContext.startDataset(datasetKey); LOG.debug("Starting usage sync with {} usages", usages.size()); Iterator<ParsedName> nIter = names.iterator(); for (NameUsage u : usages) { // pro parte usages are synonyms and do not have any descendants, synonyms, etc NameUsageMetrics m = new NameUsageMetrics(); ParsedName pn = nIter.next(); m.setKey(u.getKey()); m.setNumDescendants(0); boolean insert = dao.isInsert(u); syncService.syncUsage(insert, u, pn, m); } LOG.debug("Completed batch of {} pro parte usages", usages.size()); LogContext.endDataset(); return usages; } } class ExtensionSync implements Callable<List<Integer>> { final UUID datasetKey; final Map<Integer, Integer> usages; final Set<Integer> inserts; final ImporterCallback dao; private int firstId = -1; public ExtensionSync(ImporterCallback dao, UUID datasetKey, int firstId, Map<Integer, Integer> usages, Set<Integer> inserts) { this.dao = dao; this.datasetKey = datasetKey; this.usages = usages; this.inserts = inserts; this.firstId = firstId; } @Override public List<Integer> call() throws Exception { LogContext.startDataset(datasetKey); LOG.debug("Starting extension sync for {} usages", usages.size()); List<Integer> ids = Lists.newArrayList(); for (List<Integer> batch : Iterables.partition(usages.keySet(), BATCH_SIZE)) { write(batch); ids.addAll(batch); } LOG.info("Completed batch of {} usage extensions, starting with id {}.", usages.size(), firstId); LogContext.endDataset(); return ids; } @Transactional( exceptionMessage = "extension sync job failed", executorType = ExecutorType.REUSE ) private void write(List<Integer> ids) throws Exception { for (Integer id : ids) { VerbatimNameUsage v = dao.readVerbatim(id); UsageExtensions e = dao.readExtensions(id); syncService.syncUsageExtras(inserts.contains(id), datasetKey, usages.get(id), v, e); } } } class DeletionSync implements Callable<List<Integer>> { final UUID datasetKey; final List<Integer> usageKeys; public DeletionSync(UUID datasetKey, List<Integer> usageKeys) { this.datasetKey = datasetKey; this.usageKeys = usageKeys; } @Override public List<Integer> call() throws Exception { LogContext.startDataset(datasetKey); LOG.info("Starting deletion for {} usages", usageKeys.size()); for (List<Integer> batch : Lists.partition(usageKeys, BATCH_SIZE)) { deleteBatch(batch); } LOG.debug("Completed batch of {} usage deletions", usageKeys.size()); LogContext.endDataset(); return usageKeys; } @Transactional( exceptionMessage = "usage deletion job failed", executorType = ExecutorType.REUSE ) private void deleteBatch(List<Integer> batch) throws Exception { for (Integer key : batch) { syncService.delete(key); } } } class ForeignKeySync implements Callable<List<Integer>> { final List<UsageForeignKeys> fks; final UUID datasetKey; public ForeignKeySync(UUID datasetKey, List<UsageForeignKeys> fks) { this.fks = fks; this.datasetKey = datasetKey; } @Override public List<Integer> call() throws Exception { LogContext.startDataset(datasetKey); LOG.debug("Starting foreign key updates for {} usages.", fks.size()); List<Integer> ids = Lists.newArrayList(); for (List<UsageForeignKeys> batch : Lists.partition(fks, BATCH_SIZE)) { ids.addAll(updateForeignKeyBatch(batch)); } LOG.debug("Completed batch of {} foreign key updates.", fks.size()); LogContext.endDataset(); return ids; } } @Override public Future<List<Integer>> updateForeignKeys(UUID datasetKey, List<UsageForeignKeys> fks) { return exec.submit(new ForeignKeySync(datasetKey, fks)); } @Transactional( exceptionMessage = "foreign key update job failed", executorType = ExecutorType.REUSE ) private List<Integer> updateForeignKeyBatch(List<UsageForeignKeys> fks) { List<Integer> ids = Lists.newArrayList(); for (UsageForeignKeys fk : fks) { // update usage by usage doing both potential updates in one statement syncService.updateForeignKeys(fk.getUsageKey(), fk.getParentKey(), fk.getBasionymKey()); ids.add(fk.getUsageKey()); } return ids; } @Override public Future<List<Integer>> sync(UUID datasetKey, ImporterCallback dao, Iterable<Integer> usageNeoIds) { return addTask(new UsageSync(dao, datasetKey, usageNeoIds)); } @Override public Future<List<NameUsage>> sync(UUID datasetKey, ImporterCallback dao, List<NameUsage> usages, List<ParsedName> names) { return addTask(new ProParteSync(dao, datasetKey, usages, names)); } @Override public void insertNubRelations(UUID datasetKey, Map<Integer, Integer> relations) { syncService.insertNubRelations(datasetKey, relations); } @Override public int deleteDataset(UUID datasetKey) { return syncService.deleteDataset(datasetKey); } @Override public Future<List<Integer>> deleteUsages(UUID datasetKey, List<Integer> usageKeys) { return addTask(new DeletionSync(datasetKey, usageKeys)); } @Override public boolean isRunning() { Iterator<Future<?>> iter = tasks.iterator(); while(iter.hasNext()) { Future<?> f = iter.next(); if (f.isDone()) { iter.remove(); } else { return true; } } return false; } @Override public void close() throws Exception { ExecutorUtils.stop(exec, 60, TimeUnit.SECONDS); } }