package org.gbif.checklistbank.cli.crawler; import org.gbif.api.model.crawler.DwcaValidationReport; import org.gbif.api.model.crawler.GenericValidationReport; import org.gbif.api.model.registry.Dataset; import org.gbif.api.model.registry.Endpoint; import org.gbif.api.service.registry.DatasetService; import org.gbif.api.vocabulary.EndpointType; import org.gbif.checklistbank.cli.common.RabbitBaseService; import org.gbif.common.messaging.api.messages.DwcaMetasyncFinishedMessage; import org.gbif.common.messaging.api.messages.StartCrawlMessage; import org.gbif.dwca.io.ArchiveFactory; import org.gbif.dwca.io.UnsupportedArchiveException; import org.gbif.utils.HttpUtil; import java.io.File; import java.io.IOException; import java.net.URI; import java.util.Optional; import java.util.UUID; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import org.apache.commons.io.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * A service that watches registry changed messages and does deletions of checklists and * updates to the dataset title table in CLB. */ public class CrawlerService extends RabbitBaseService<StartCrawlMessage> { private static final Logger LOG = LoggerFactory.getLogger(CrawlerService.class); private final CrawlerConfiguration cfg; private final DatasetService datasetService; private final HttpUtil http; public CrawlerService(CrawlerConfiguration cfg) { super("clb-crawler", cfg.poolSize, cfg.messaging, cfg.ganglia, cfg.registry.guiceModules()); this.cfg = cfg; http = new HttpUtil(HttpUtil.newMultithreadedClient(cfg.httpTimeout, cfg.poolSize, cfg.poolSize)); // init registry datasetService = getInstance(DatasetService.class); } @Override public void handleMessage(StartCrawlMessage msg) { Dataset d = datasetService.get(msg.getDatasetUuid()); if (d == null) { LOG.warn("No dataset known by key {}", msg.getDatasetUuid()); return; } Optional<Endpoint> dwcaEndpoint = d.getEndpoints() .stream() .filter(e -> EndpointType.DWC_ARCHIVE.equals(e.getType())) .findFirst(); if (!dwcaEndpoint.isPresent()) { LOG.warn("No dwc archive endpoint known for dataset {}: {}", d.getTitle(), d); return; } URI dwcaUri = dwcaEndpoint.get().getUrl(); try { downloadAndExtract(d, dwcaUri); send(new DwcaMetasyncFinishedMessage(d.getKey(), d.getType(), dwcaUri, 1, Maps.<String, UUID>newHashMap(), new DwcaValidationReport(d.getKey(), new GenericValidationReport(1, true, Lists.<String>newArrayList(), Lists.<Integer>newArrayList())) ) ); } catch (Exception e) { LOG.error("Failed to download and extract dwc archive for dataset {} from {}", d.getTitle(), dwcaUri, e); } } private void downloadAndExtract(Dataset d, URI dwcaUri) throws IOException, UnsupportedArchiveException { final File dwca = cfg.archiveFile(d.getKey()); if (dwca.exists()) { dwca.delete(); LOG.debug("Removed previous dwc archive at {}", dwca.getAbsolutePath()); } http.download(dwcaUri, dwca); // success! LOG.info("Downloaded dwc archive for dataset {} from {} to {}", d.getTitle(), dwcaUri, dwca.getAbsolutePath()); // open archive final File archiveDir = cfg.archiveDir(d.getKey()); if (archiveDir.exists()) { FileUtils.deleteDirectory(archiveDir); LOG.debug("Removed previous dwc archive dir {}", dwca.getAbsolutePath()); } ArchiveFactory.openArchive(dwca, archiveDir); LOG.debug("Opened dwc archive successfully for dataset {} at {}", d.getTitle(), dwca, archiveDir.getAbsolutePath()); } @Override public Class<StartCrawlMessage> getMessageClass() { return StartCrawlMessage.class; } }