package org.gbif.checklistbank.nub.source;
import org.apache.commons.lang3.StringUtils;
import org.gbif.api.model.registry.Dataset;
import org.gbif.api.model.registry.Installation;
import org.gbif.api.model.registry.Organization;
import org.gbif.api.service.registry.DatasetService;
import org.gbif.api.service.registry.InstallationService;
import org.gbif.api.service.registry.OrganizationService;
import org.gbif.api.util.iterables.Iterables;
import org.gbif.api.vocabulary.DatasetSubtype;
import org.gbif.api.vocabulary.DatasetType;
import org.gbif.api.vocabulary.Rank;
import org.gbif.checklistbank.cli.nubbuild.NubConfiguration;
import org.gbif.checklistbank.config.ClbConfiguration;
import org.gbif.utils.file.FileUtils;
import org.gbif.utils.file.csv.CSVReader;
import org.gbif.utils.file.csv.CSVReaderFactory;
import java.io.InputStream;
import java.util.List;
import java.util.Set;
import java.util.UUID;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.inject.Injector;
import org.neo4j.helpers.Strings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A source for nub sources backed by usage data from checklistbank.
* The list of source datasets is discovered by reading a configured tab delimited online file.
* The sources are then loaded asynchronously through a single background thread into temporary neo4j databases.
*/
public class ClbSourceList extends NubSourceList {
  private static final Logger LOG = LoggerFactory.getLogger(ClbSourceList.class);
  // The three registry services are null when the list is built from explicit sources.
  private final DatasetService datasetService;
  private final OrganizationService organizationService;
  private final InstallationService installationService;
  private final NubConfiguration cfg;

  /**
   * Creates a source list whose sources are read from the list configured in {@code cfg.sourceList}.
   * The registry services are taken from the injector created by {@code cfg.registry}.
   *
   * @param cfg the nub build configuration
   * @return a new source list with all configured sources submitted
   */
  public static ClbSourceList create(NubConfiguration cfg) {
    Injector regInj = cfg.registry.createRegistryInjector();
    return new ClbSourceList(
        regInj.getInstance(DatasetService.class),
        regInj.getInstance(OrganizationService.class),
        regInj.getInstance(InstallationService.class),
        cfg);
  }

  /**
   * Creates a source list from an explicit list of dataset keys instead of the configured source list.
   * Every source is built with {@link Rank#FAMILY} as the highest rank to use and with synonyms included.
   *
   * @param cfg the nub build configuration
   * @param sourceDatasetKeys keys of the datasets to use as sources, in the order given
   * @return a new source list with one source per given key
   */
  public static ClbSourceList create(NubConfiguration cfg, List<UUID> sourceDatasetKeys) {
    Injector regInj = cfg.registry.createRegistryInjector();
    DatasetService datasetService = regInj.getInstance(DatasetService.class);
    List<NubSource> sources = Lists.newArrayList();
    for (UUID dKey : sourceDatasetKeys) {
      // NOTE(review): a key the registry does not know presumably yields null here and
      // then fails inside buildSource - confirm whether callers guarantee valid keys.
      sources.add(buildSource(datasetService.get(dKey), Rank.FAMILY, cfg.clb, false));
    }
    return new ClbSourceList(cfg, sources);
  }

  /**
   * Builds a source list from already constructed sources and submits them right away.
   * No registry services are kept by this instance.
   *
   * @param cfg the nub build configuration
   * @param sources the sources to submit
   */
  public ClbSourceList(NubConfiguration cfg, List<NubSource> sources) {
    super(false);
    this.cfg = cfg;
    this.datasetService = null;
    this.organizationService = null;
    this.installationService = null;
    submitSources(sources);
  }

  /**
   * Builds a source list by reading {@code cfg.sourceList} and resolving each listed key
   * through the given registry services. The resulting sources are submitted right away.
   *
   * @param datasetService used to look up keys as datasets
   * @param organizationService used to look up keys as organizations and to list their published checklists
   * @param installationService used to look up keys as installations and to list their hosted checklists
   * @param cfg the nub build configuration
   * @throws RuntimeException if the source list cannot be read or resolved
   */
  public ClbSourceList(DatasetService datasetService, OrganizationService organizationService, InstallationService installationService, NubConfiguration cfg) {
    super(false);
    this.cfg = cfg;
    this.datasetService = datasetService;
    this.organizationService = organizationService;
    this.installationService = installationService;
    loadSources();
  }

  /**
   * Creates a single nub source for the given dataset.
   *
   * @param d the registry dataset; its key, title, created date and subtype are used
   * @param rank the highest rank to use from the source; when null the source's existing setting is kept
   * @param cfg the checklistbank configuration handed to the source
   * @param ignoreSynonyms value for the source's ignoreSynonyms flag
   * @return the configured source
   */
  private static NubSource buildSource(Dataset d, Rank rank, ClbConfiguration cfg, boolean ignoreSynonyms) {
    NubSource src = new ClbSource(cfg, d.getKey(), d.getTitle());
    src.created = d.getCreated();
    src.ignoreSynonyms = ignoreSynonyms;
    src.nomenclator = DatasetSubtype.NOMENCLATOR_AUTHORITY == d.getSubtype();
    if (rank != null) {
      src.ignoreRanksAbove = rank;
    }
    return src;
  }

  /**
   * Reads the tab delimited source list and submits one source per resolved checklist dataset.
   * The first column holds the key of a dataset, organization or installation; the optional
   * second column holds a {@link Rank} name. Keys are tried as dataset first, then as
   * organization, then as installation. Unknown keys are logged and skipped.
   * Every dataset is added at most once, keeping its first occurrence.
   *
   * @throws RuntimeException wrapping any failure while reading or resolving the list
   */
  private void loadSources() {
    LOG.info("Loading backbone sources from {}", cfg.sourceList);
    Set<UUID> keys = Sets.newHashSet();
    List<NubSource> sources = Lists.newArrayList();
    // try-with-resources makes sure the URL or classpath stream is released again
    try (InputStream stream = cfg.sourceList.isAbsolute()
        ? cfg.sourceList.toURL().openStream()
        : FileUtils.classpathStream(cfg.sourceList.toString())) {
      CSVReader reader = CSVReaderFactory.build(stream, "UTF-8", "\t", null, 0);
      for (String[] row : reader) {
        // skip empty rows instead of failing the entire load on an unparsable key
        if (row.length < 1 || StringUtils.isBlank(row[0])) {
          continue;
        }
        UUID key = UUID.fromString(row[0].trim());
        // Set.add returns false for keys we have seen before
        if (!keys.add(key)) {
          continue;
        }
        Rank rank = row.length > 1 && !StringUtils.isBlank(row[1]) ? Rank.valueOf(row[1].trim()) : null;

        Dataset d = datasetService.get(key);
        if (d != null) {
          sources.add(buildSource(d, rank, cfg.clb, cfg.ignoreSynonyms.contains(key)));
          continue;
        }

        // try if its an organization
        Organization org = organizationService.get(key);
        if (org != null) {
          int counter = addNewSources(
              Iterables.publishedDatasets(org.getKey(), DatasetType.CHECKLIST, organizationService),
              rank, cfg.ignoreSynonyms.contains(key), keys, sources);
          LOG.info("Found {} new nub sources published by organization {} {}", counter, org.getKey(), org.getTitle());
          continue;
        }

        // try an installation
        Installation inst = installationService.get(key);
        if (inst != null) {
          int counter = addNewSources(
              Iterables.hostedDatasets(inst.getKey(), DatasetType.CHECKLIST, installationService),
              rank, cfg.ignoreSynonyms.contains(key), keys, sources);
          LOG.info("Found {} new nub sources hosted by installation {} {}", counter, inst.getKey(), inst.getTitle());
          continue;
        }

        LOG.warn("Unknown nub source {}. Ignore", key);
      }
    } catch (Exception e) {
      LOG.error("Cannot read nub sources from {}", cfg.sourceList);
      throw new RuntimeException("Cannot read nub sources from " + cfg.sourceList, e);
    }
    submitSources(sources);
  }

  /**
   * Appends a source for every given dataset whose key has not been seen yet and records
   * the key, so a dataset reachable through several list entries is only used once.
   *
   * @param datasets candidate datasets
   * @param rank the highest rank to use for each source, may be null
   * @param ignoreSyns ignoreSynonyms flag applied to each source
   * @param keys keys seen so far; updated with every dataset added
   * @param sources list the new sources are appended to
   * @return number of sources appended
   */
  private int addNewSources(Iterable<Dataset> datasets, Rank rank, boolean ignoreSyns, Set<UUID> keys, List<NubSource> sources) {
    int counter = 0;
    for (Dataset d : datasets) {
      if (keys.add(d.getKey())) {
        sources.add(buildSource(d, rank, cfg.clb, ignoreSyns));
        counter++;
      }
    }
    return counter;
  }
}