package org.gbif.checklistbank.cli.admin;

import org.gbif.api.model.Constants;
import org.gbif.api.model.crawler.DwcaValidationReport;
import org.gbif.api.model.crawler.GenericValidationReport;
import org.gbif.api.model.registry.Dataset;
import org.gbif.api.service.checklistbank.DatasetMetricsService;
import org.gbif.api.service.registry.DatasetService;
import org.gbif.api.service.registry.InstallationService;
import org.gbif.api.service.registry.NetworkService;
import org.gbif.api.service.registry.NodeService;
import org.gbif.api.service.registry.OrganizationService;
import org.gbif.api.util.iterables.Iterables;
import org.gbif.api.vocabulary.DatasetType;
import org.gbif.checklistbank.authorship.AuthorComparator;
import org.gbif.checklistbank.cli.analysis.DatasetIndexUpdater;
import org.gbif.checklistbank.cli.common.ZookeeperUtils;
import org.gbif.checklistbank.cli.nubchanged.BackboneDatasetUpdater;
import org.gbif.checklistbank.cli.registry.RegistryService;
import org.gbif.checklistbank.model.DatasetCore;
import org.gbif.checklistbank.neo.UsageDao;
import org.gbif.checklistbank.nub.NubDb;
import org.gbif.checklistbank.nub.source.ClbSource;
import org.gbif.checklistbank.nub.validation.NubAssertions;
import org.gbif.checklistbank.nub.validation.NubTreeValidation;
import org.gbif.checklistbank.nub.validation.NubValidation;
import org.gbif.checklistbank.service.ParsedNameService;
import org.gbif.checklistbank.service.mybatis.ParsedNameServiceMyBatis;
import org.gbif.checklistbank.service.mybatis.export.Exporter;
import org.gbif.checklistbank.service.mybatis.guice.ChecklistBankServiceMyBatisModule;
import org.gbif.checklistbank.service.mybatis.guice.InternalChecklistBankServiceMyBatisModule;
import org.gbif.checklistbank.service.mybatis.liquibase.DbSchemaUpdater;
import org.gbif.checklistbank.service.mybatis.mapper.DatasetMapper;
import org.gbif.checklistbank.service.mybatis.tmp.NameUsageReparser;
import org.gbif.cli.BaseCommand;
import org.gbif.cli.Command;
import org.gbif.common.messaging.DefaultMessagePublisher;
import org.gbif.common.messaging.api.Message;
import org.gbif.common.messaging.api.MessagePublisher;
import org.gbif.common.messaging.api.messages.BackboneChangedMessage;
import org.gbif.common.messaging.api.messages.ChecklistNormalizedMessage;
import org.gbif.common.messaging.api.messages.ChecklistSyncedMessage;
import org.gbif.common.messaging.api.messages.DwcaMetasyncFinishedMessage;
import org.gbif.common.messaging.api.messages.MatchDatasetMessage;
import org.gbif.common.messaging.api.messages.StartCrawlMessage;

import java.io.File;
import java.io.IOException;
import java.lang.reflect.Field;
import java.net.URI;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.Date;
import java.util.UUID;

import javax.annotation.Nullable;

import com.google.common.base.Function;
import com.google.common.base.Throwables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.inject.Guice;
import com.google.inject.Injector;
import org.apache.commons.io.FileUtils;
import org.kohsuke.MetaInfServices;
import org.neo4j.graphdb.Transaction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Command that issues new normalize or import messages for manual admin purposes.
 */
@MetaInfServices(Command.class)
public class AdminCommand extends BaseCommand {
  private static final Logger LOG = LoggerFactory.getLogger(AdminCommand.class);
  private static final String DWCA_SUFFIX = ".dwca";

  private final AdminConfiguration cfg = new AdminConfiguration();
  private MessagePublisher publisher;
  private ZookeeperUtils zkUtils;
  private DatasetService datasetService;
  private OrganizationService organizationService;
  private InstallationService installationService;
  private NetworkService networkService;
  private NodeService nodeService;
  private Iterable<Dataset> datasets;
  private Exporter exporter;
  private DatasetIndexUpdater datasetIndexUpdater;

  public AdminCommand() {
    super("admin");
  }

  @Override
  protected Object getConfigurationObject() {
    return cfg;
  }

  private void initRegistry() {
    Injector inj = cfg.registry.createRegistryInjector();
    datasetService = inj.getInstance(DatasetService.class);
    organizationService = inj.getInstance(OrganizationService.class);
    installationService = inj.getInstance(InstallationService.class);
    networkService = inj.getInstance(NetworkService.class);
    nodeService = inj.getInstance(NodeService.class);
  }

  private void initCfg() {
    setKnownKey("col", Constants.COL_DATASET_KEY);
    setKnownKey("nub", Constants.NUB_DATASET_KEY);
    setKnownKey("plazi", UUID.fromString("7ce8aef0-9e92-11dc-8738-b8a03c50a862"));
  }
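  /**
   * Copies a well known dataset key into cfg.key if the boolean shortcut flag with the given
   * name (col, nub or plazi) is set on the configuration. The flag is looked up by field name
   * via reflection, so the name must match a boolean field of AdminConfiguration.
   */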
  private void setKnownKey(String name, UUID key) {
    try {
      Field field = cfg.getClass().getDeclaredField(name);
      if (field.getBoolean(cfg)) {
        if (cfg.key != null) {
          LOG.warn("Explicit dataset key given, ignoring {} flag", name);
        } else {
          cfg.key = key;
        }
      }
    } catch (ReflectiveOperationException e) {
      Throwables.propagate(e);
    }
  }

  private ZookeeperUtils zk() {
    // lazily initialized as zookeeper is not needed for all operations
    if (zkUtils == null && cfg.zookeeper.isConfigured()) {
      try {
        zkUtils = new ZookeeperUtils(cfg.zookeeper.getCuratorFramework());
      } catch (IOException e) {
        Throwables.propagate(e);
      }
    }
    return zkUtils;
  }

  private void send(Message msg) throws IOException {
    if (publisher == null) {
      publisher = new DefaultMessagePublisher(cfg.messaging.getConnectionParameters());
    }
    publisher.send(msg);
  }

  @Override
  protected void doRun() {
    initCfg();
    try {
      if (cfg.operation.global) {
        runGlobalCommands();
      } else {
        initRegistry();
        runDatasetCommands();
      }
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  private void runGlobalCommands() throws Exception {
    switch (cfg.operation) {
      case REPARSE:
        reparseNames();
        break;
      case CLEAN_ORPHANS:
        cleanOrphans();
        break;
      case SYNC_DATASETS:
        initRegistry();
        syncDatasets();
        break;
      case DUMP:
        dumpToNeo();
        break;
      case VALIDATE_NEO:
        verifyNeo();
        break;
      case NUB_CHANGED:
        sendNubChanged();
        break;
      case UPDATE_NUB_DATASET:
        updateNubDataset();
        break;
      case SCHEMA_UPDATE:
        updateSchema();
        break;
      case DATASET_INDEX:
        indexDataset();
        break;
      case REMATCH:
        rematchAll();
        break;
      default:
        throw new UnsupportedOperationException();
    }
  }

  private void rematchAll() throws Exception {
    initRegistry();
    LOG.info("Start sending match dataset messages for all checklists but the Backbone and CoL");
    int counter = 0;
    for (Dataset d : Iterables.datasets(DatasetType.CHECKLIST, datasetService)) {
      // skip the backbone and the Catalogue of Life
      if (Constants.COL_DATASET_KEY.equals(d.getKey()) || Constants.NUB_DATASET_KEY.equals(d.getKey())) {
        continue;
      }
      send(new MatchDatasetMessage(d.getKey()));
      counter++;
    }
    LOG.info("Sent dataset match messages for all {} checklists", counter);
  }
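  /**
   * Updates the dataset index, either for all datasets at once or, when cfg.key or cfg.keys
   * is supplied, only for the selected datasets.
   */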
  private void indexDataset() throws Exception {
    datasetIndexUpdater = new DatasetIndexUpdater(cfg.clb, cfg.dataset);
    // the command allows indexing either all datasets or just the supplied key(s)
    if (cfg.key != null || cfg.keys != null) {
      runDatasetCommands();
    } else {
      datasetIndexUpdater.indexAll();
    }
  }

  private void updateSchema() {
    try (Connection c = cfg.clb.connect()) {
      DbSchemaUpdater.update(c);
    } catch (SQLException e) {
      LOG.error("Failed to update db schema", e);
    }
  }

  private void updateNubDataset() {
    Injector clbInj = Guice.createInjector(ChecklistBankServiceMyBatisModule.create(cfg.clb));
    DatasetMetricsService metricsService = clbInj.getInstance(DatasetMetricsService.class);

    Injector regInj = cfg.registry.createRegistryInjector();
    BackboneDatasetUpdater nubUpdater = new BackboneDatasetUpdater(regInj.getInstance(DatasetService.class),
        regInj.getInstance(OrganizationService.class), regInj.getInstance(NetworkService.class));

    nubUpdater.updateBackboneDataset(metricsService.get(Constants.NUB_DATASET_KEY));
    LOG.info("Backbone dataset metadata updated.");
  }

  private void sendNubChanged() throws IOException {
    Injector inj = Guice.createInjector(ChecklistBankServiceMyBatisModule.create(cfg.clb));
    DatasetMetricsService metricsService = inj.getInstance(DatasetMetricsService.class);
    send(new BackboneChangedMessage(metricsService.get(Constants.NUB_DATASET_KEY)));
  }

  private void syncDatasets() {
    initRegistry();
    Injector inj = Guice.createInjector(InternalChecklistBankServiceMyBatisModule.create(cfg.clb));
    DatasetMapper mapper = inj.getInstance(DatasetMapper.class);
    LOG.info("Start syncing datasets from registry to CLB.");
    int counter = 0;
    Iterable<Dataset> datasets = Iterables.datasets(DatasetType.CHECKLIST, datasetService);
    mapper.truncate();
    for (Dataset d : datasets) {
      mapper.insert(new DatasetCore(d));
      counter++;
    }
    LOG.info("{} checklist titles copied", counter);
  }

  /**
   * Cleans up orphan records in the postgres db.
   */
  private void cleanOrphans() {
    Injector inj = Guice.createInjector(ChecklistBankServiceMyBatisModule.create(cfg.clb));
    ParsedNameServiceMyBatis parsedNameService = (ParsedNameServiceMyBatis) inj.getInstance(ParsedNameService.class);
    LOG.info("Start cleaning up orphan names. This will take a while ...");
    int num = parsedNameService.deleteOrphaned();
    LOG.info("{} orphan names deleted", num);
  }
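  /**
   * Removes all crawl state and artifacts for a dataset: its entry in the zookeeper crawl queue,
   * the downloaded dwca archive and its unpacked folder in the archive repository, and the
   * neo4j storage files.
   */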
  private void cleanup(Dataset d) throws IOException {
    try {
      if (cfg.zookeeper.isConfigured()) {
        zk().delete(ZookeeperUtils.getCrawlInfoPath(d.getKey(), null));
        LOG.info("Removed crawl {} from ZK running queue", d.getKey());
        //TODO: clear pending & running queues
      }

      // cleanup repo files
      final File dwcaFile = new File(cfg.archiveRepository, d.getKey() + DWCA_SUFFIX);
      FileUtils.deleteQuietly(dwcaFile);
      File dir = cfg.archiveDir(d.getKey());
      if (dir.exists() && dir.isDirectory()) {
        FileUtils.deleteDirectory(dir);
      }
      LOG.info("Removed dwca files from repository {}", dwcaFile);

      RegistryService.deleteStorageFiles(cfg.neo, d.getKey());

    } catch (Exception e) {
      LOG.error("Failed to cleanup dataset {}", d.getKey(), e);
    }
  }

  private void runDatasetCommands() throws Exception {
    if (cfg.keys != null) {
      datasets = com.google.common.collect.Iterables.transform(cfg.listKeys(), new Function<UUID, Dataset>() {
        @Nullable
        @Override
        public Dataset apply(UUID key) {
          return datasetService.get(key);
        }
      });
    } else {
      datasets = Iterables.datasets(cfg.key, cfg.type, datasetService, organizationService, installationService,
          networkService, nodeService);
    }

    for (Dataset d : datasets) {
      LOG.info("{} {} dataset {}: {}", cfg.operation, d.getType(), d.getKey(), d.getTitle().replaceAll("\n", " "));
      if (cfg.operation != AdminOperation.CRAWL && cfg.operation != AdminOperation.CLEANUP) {
        // only deal with checklists for most operations
        if (!DatasetType.CHECKLIST.equals(d.getType())) {
          LOG.warn("Cannot {} dataset of type {}: {} {}", cfg.operation, d.getType(), d.getKey(), d.getTitle());
          continue;
        }
      }

      switch (cfg.operation) {
        case CLEANUP:
          cleanup(d);
          break;

        case CRAWL:
          send(new StartCrawlMessage(d.getKey()));
          break;

        case NORMALIZE:
          if (!cfg.archiveDir(d.getKey()).exists()) {
            LOG.info("Missing dwca file. Cannot normalize dataset {}", title(d));
          } else {
            // the validation result is a fake but valid checklist validation
            send(new DwcaMetasyncFinishedMessage(d.getKey(), d.getType(),
                URI.create("http://fake.org"), 1, Maps.<String, UUID>newHashMap(),
                new DwcaValidationReport(d.getKey(),
                    new GenericValidationReport(1, true, Lists.<String>newArrayList(), Lists.<Integer>newArrayList()))
            ));
          }
          break;

        case IMPORT:
          if (!cfg.neo.neoDir(d.getKey()).exists()) {
            LOG.info("Missing neo4j directory. Cannot import dataset {}", title(d));
          } else {
            send(new ChecklistNormalizedMessage(d.getKey()));
          }
          break;

        case ANALYZE:
          send(new ChecklistSyncedMessage(d.getKey(), new Date(), 0, 0));
          break;

        case MATCH:
          send(new MatchDatasetMessage(d.getKey()));
          break;

        case EXPORT:
          export(d);
          break;

        case DATASET_INDEX:
          datasetIndexUpdater.index(d.getKey());
          break;

        default:
          throw new UnsupportedOperationException();
      }
    }
  }

  private void export(Dataset d) {
    if (exporter == null) {
      // lazily init the exporter
      exporter = Exporter.create(cfg.exportRepository, cfg.clb, cfg.registry.wsUrl);
    }
    exporter.export(d);
  }

  /**
   * Reparses all names.
   */
  private void reparseNames() {
    new NameUsageReparser(cfg.clb).run();
  }
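  /**
   * Dumps a single checklist from postgres into a local neo4j database, e.g. as input for
   * the VALIDATE_NEO operation.
   */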
  private void dumpToNeo() throws Exception {
    LOG.info("Start dumping dataset {} from postgres into neo4j", cfg.key);
    ClbSource src = new ClbSource(cfg.clb, cfg.key, "Checklist " + cfg.key);
    src.setNeoRepository(cfg.neo.neoRepository);
    src.init(true, cfg.nubRanksOnly, false, false);
  }

  private void verifyNeo() throws Exception {
    UsageDao dao = null;
    try {
      dao = UsageDao.open(cfg.neo, cfg.key);

      NubDb db = NubDb.open(dao, AuthorComparator.createWithoutAuthormap());
      validate(dao, new NubTreeValidation(db));
      LOG.info("Tree validation passed!");

      validate(dao, new NubAssertions(db));
      LOG.info("Nub assertions passed!");

    } finally {
      if (dao != null) {
        dao.close();
      }
    }
  }

  private void validate(UsageDao dao, NubValidation validator) throws AssertionError {
    try (Transaction tx = dao.beginTx()) {
      boolean valid = validator.validate();
      if (!valid) {
        LOG.error("Backbone is not valid!");
        throw new AssertionError("Backbone is not valid!");
      }
    }
  }

  private String title(Dataset d) {
    return d.getKey() + ": " + d.getTitle().replaceAll("\n", " ");
  }
}
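// A minimal, hypothetical invocation sketch. It assumes the standard gbif-cli launcher jar and a
// YAML config matching AdminConfiguration; the jar name, flag and file name below are
// illustrative only, not verified against the build:
//
//   java -jar checklistbank-cli.jar admin --conf admin.yaml
//
// The "admin" command name itself is grounded in the code: it is registered via the constructor
// (super("admin")) and discovered through the @MetaInfServices(Command.class) annotation.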