package org.gbif.checklistbank.neo;
import org.gbif.api.exception.UnparsableException;
import org.gbif.api.model.checklistbank.NameUsage;
import org.gbif.api.model.checklistbank.ParsedName;
import org.gbif.api.model.checklistbank.VerbatimNameUsage;
import org.gbif.api.service.checklistbank.NameParser;
import org.gbif.api.vocabulary.Extension;
import org.gbif.api.vocabulary.NameType;
import org.gbif.api.vocabulary.NameUsageIssue;
import org.gbif.api.vocabulary.NomenclaturalStatus;
import org.gbif.api.vocabulary.Origin;
import org.gbif.api.vocabulary.Rank;
import org.gbif.api.vocabulary.TaxonomicStatus;
import org.gbif.checklistbank.cli.common.Metrics;
import org.gbif.checklistbank.cli.normalizer.ExtensionInterpreter;
import org.gbif.checklistbank.cli.normalizer.IgnoreNameUsageException;
import org.gbif.checklistbank.cli.normalizer.InsertMetadata;
import org.gbif.checklistbank.cli.normalizer.NormalizationFailedException;
import org.gbif.checklistbank.model.UsageExtensions;
import org.gbif.common.parsers.NomStatusParser;
import org.gbif.common.parsers.RankParser;
import org.gbif.common.parsers.TaxStatusParser;
import org.gbif.common.parsers.UrlParser;
import org.gbif.common.parsers.core.EnumParser;
import org.gbif.common.parsers.core.ParseResult;
import org.gbif.dwc.terms.AcTerm;
import org.gbif.dwc.terms.DcTerm;
import org.gbif.dwc.terms.DwcTerm;
import org.gbif.dwc.terms.GbifTerm;
import org.gbif.dwc.terms.Term;
import org.gbif.dwc.terms.TermFactory;
import org.gbif.dwca.io.Archive;
import org.gbif.dwca.io.ArchiveFactory;
import org.gbif.dwca.record.Record;
import org.gbif.dwca.record.StarRecord;
import org.gbif.nameparser.GBIFNameParser;
import org.gbif.utils.ObjectUtils;
import java.io.File;
import java.io.IOException;
import java.util.Map;
import java.util.UUID;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.commons.io.FileUtils;
import org.neo4j.kernel.api.index.PreexistingIndexEntryConflictException;
import org.neo4j.unsafe.batchinsert.BatchInserter;
import org.neo4j.unsafe.batchinsert.BatchInserters;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.gbif.dwc.terms.GbifTerm.datasetKey;
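/**
 * Reads a Darwin Core archive and inserts each of its star records as a node into a newly created
 * neo4j store using a {@link BatchInserter}. For every inserted record the verbatim data, the
 * interpreted {@link NameUsage} and the interpreted extension data are handed to the {@link UsageDao}.
 * Closing the inserter defines the deferred constraint and indices, shuts the batch inserter down
 * and asks the DAO to open the regular neo db again.
 */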
public class NeoInserter implements AutoCloseable {
private static final Logger LOG = LoggerFactory.getLogger(NeoInserter.class);
private static final Pattern NULL_PATTERN = Pattern.compile("^\\s*(\\\\N|\\\\?NULL)\\s*$");
private static final TermFactory TF = TermFactory.instance();
private Archive arch;
private Map<String, UUID> constituents;
private NameParser nameParser = new GBIFNameParser();
private RankParser rankParser = RankParser.getInstance();
private EnumParser<NomenclaturalStatus> nomStatusParser = NomStatusParser.getInstance();
private EnumParser<TaxonomicStatus> taxStatusParser = TaxStatusParser.getInstance();
private InsertMetadata meta = new InsertMetadata();
private ExtensionInterpreter extensionInterpreter = new ExtensionInterpreter();
private final BatchInserter inserter;
private final int batchSize;
private final Meter insertMeter;
private final Map<Term, Extension> extensions;
private final UsageDao dao;
private NeoInserter(UsageDao dao, File storeDir, int batchSize, @Nullable Meter insertMeter) throws IOException {
Preconditions.checkNotNull(dao, "DAO required");
LOG.info("Creating new neo db at {}", storeDir.getAbsolutePath());
this.dao = dao;
initNeoDir(storeDir);
inserter = BatchInserters.inserter(storeDir);
this.batchSize = batchSize;
this.insertMeter = insertMeter;
extensions = Maps.newHashMap();
for (Extension e : Extension.values()) {
extensions.put(TF.findTerm(e.getRowType()), e);
}
}
public static NeoInserter create(UsageDao dao, File storeDir, int batchSize, @Nullable MetricRegistry registry) throws IOException {
return new NeoInserter(dao, storeDir, batchSize, registry == null ? null : registry.meter(Metrics.INSERT_METER));
}
public InsertMetadata insert(File dwca, Map<String, UUID> constituents) throws NormalizationFailedException {
this.constituents = constituents;
openArchive(dwca);
for (StarRecord star : arch) {
insertStarRecord(star);
}
LOG.info("Data insert completed, {} nodes created", meta.getRecords());
if (insertMeter != null) {
LOG.info("Insert rate: {}", insertMeter.getMeanRate());
}
return meta;
}
@VisibleForTesting
protected void insertStarRecord(StarRecord star) throws NormalizationFailedException {
try {
VerbatimNameUsage v = new VerbatimNameUsage();
// set core props
Record core = star.core();
for (Term t : core.terms()) {
String val = clean(core.value(t));
if (val != null) {
v.setCoreField(t, val);
}
}
// make sure this is last to override already put taxonID keys
v.setCoreField(DwcTerm.taxonID, taxonID(core));
// readUsage extensions data
for (Map.Entry<Term, Extension> ext : extensions.entrySet()) {
if (star.hasExtension(ext.getKey())) {
v.getExtensions().put(ext.getValue(), Lists.<Map<Term, String>>newArrayList());
for (Record eRec : star.extension(ext.getKey())) {
Map<Term, String> data = Maps.newHashMap();
for (Term t : eRec.terms()) {
String val = clean(eRec.value(t));
if (val != null) {
data.put(t, val);
}
}
v.getExtensions().get(ext.getValue()).add(data);
}
}
}
// convert into a NameUsage interpreting all enums and other needed types
NameUsage u = buildUsage(v);
UsageExtensions ext = extensionInterpreter.interpret(u, v);
// and batch insert key neo properties used during normalization
Map<String, Object> props = dao.neoProperties(core.id(), u, v);
long nodeId = inserter.createNode(props, Labels.TAXON, u.isSynonym() ? Labels.SYNONYM : Labels.TAXON);
// store verbatim instance
dao.store(nodeId, v);
dao.store(nodeId, u, false);
dao.store(nodeId, ext);
meta.incRecords();
meta.incRank(u.getRank());
if (insertMeter != null) {
insertMeter.mark();
}
if (meta.getRecords() % (batchSize * 10) == 0) {
LOG.info("Inserts done into neo4j: {}", meta.getRecords());
if (Thread.interrupted()) {
LOG.warn("NeoInserter interrupted, exit {} early with incomplete parsing", datasetKey);
throw new NormalizationFailedException("NeoInserter interrupted");
}
}
} catch (IgnoreNameUsageException e) {
meta.incIgnored();
LOG.info("Ignoring record {}: {}", star.core().id(), e.getMessage());
}
}
private void openArchive(File dwca) throws NormalizationFailedException {
meta = new InsertMetadata();
try {
LOG.info("Reading dwc archive from {}", dwca);
arch = ArchiveFactory.openArchive(dwca);
if (!arch.getCore().hasTerm(DwcTerm.taxonID)) {
LOG.warn("Using core ID for taxonID");
meta.setCoreIdUsed(true);
}
// multi values in use for acceptedID?
for (Term t : arch.getCore().getTerms()) {
String delim = arch.getCore().getField(t).getDelimitedBy();
if (!Strings.isNullOrEmpty(delim)) {
meta.getMultiValueDelimiters().put(t, Splitter.on(delim).omitEmptyStrings());
}
}
for (Term t : DwcTerm.HIGHER_RANKS) {
if (arch.getCore().hasTerm(t)) {
meta.setDenormedClassificationMapped(true);
break;
}
}
if (arch.getCore().hasTerm(DwcTerm.parentNameUsageID) || arch.getCore().hasTerm(DwcTerm.parentNameUsage)) {
meta.setParentNameMapped(true);
}
if (arch.getCore().hasTerm(DwcTerm.acceptedNameUsageID) || arch.getCore().hasTerm(DwcTerm.acceptedNameUsage)) {
meta.setAcceptedNameMapped(true);
}
if (arch.getCore().hasTerm(DwcTerm.originalNameUsageID) || arch.getCore().hasTerm(DwcTerm.originalNameUsage)) {
meta.setOriginalNameMapped(true);
}
} catch (IOException e) {
throw new NormalizationFailedException("IOException opening archive " + dwca.getAbsolutePath(), e);
}
}
private void initNeoDir(File storeDir) {
try {
if (storeDir.exists()) {
FileUtils.forceDelete(storeDir);
}
FileUtils.forceMkdir(storeDir);
} catch (IOException e) {
throw new NormalizationFailedException("Cannot prepare neo db directory " + storeDir.getAbsolutePath(), e);
}
}
private NameUsage buildUsage(VerbatimNameUsage v) throws IgnoreNameUsageException {
NameUsage u = new NameUsage();
u.setTaxonID(v.getCoreField(DwcTerm.taxonID));
u.setOrigin(Origin.SOURCE);
if (constituents != null && v.hasCoreField(DwcTerm.datasetID)) {
UUID cKey = constituents.get(v.getCoreField(DwcTerm.datasetID));
u.setConstituentKey(cKey);
}
// classification
//TODO: interpret classification string if others are not given
// DwcTerm.higherClassification;
u.setKingdom(v.getCoreField(DwcTerm.kingdom));
u.setPhylum(v.getCoreField(DwcTerm.phylum));
u.setClazz(v.getCoreField(DwcTerm.class_));
u.setOrder(v.getCoreField(DwcTerm.order));
u.setFamily(v.getCoreField(DwcTerm.family));
u.setGenus(v.getCoreField(DwcTerm.genus));
u.setSubgenus(v.getCoreField(DwcTerm.subgenus));
// rank
String vRank = firstClean(v, DwcTerm.taxonRank, DwcTerm.verbatimTaxonRank);
if (!Strings.isNullOrEmpty(vRank)) {
ParseResult<Rank> rankParse = rankParser.parse(vRank);
if (rankParse.isSuccessful()) {
u.setRank(rankParse.getPayload());
} else {
u.addIssue(NameUsageIssue.RANK_INVALID);
}
}
final Rank rank = u.getRank();
// build best name
ParsedName pn = setScientificName(u, v, rank);
// tax status
String tstatus = v.getCoreField(DwcTerm.taxonomicStatus);
if (!Strings.isNullOrEmpty(tstatus)) {
ParseResult<TaxonomicStatus> taxParse = taxStatusParser.parse(tstatus);
if (taxParse.isSuccessful()) {
u.setTaxonomicStatus(taxParse.getPayload());
} else {
u.addIssue(NameUsageIssue.TAXONOMIC_STATUS_INVALID);
}
}
// nom status
String nstatus = v.getCoreField(DwcTerm.nomenclaturalStatus);
if (!Strings.isNullOrEmpty(nstatus)) {
ParseResult<NomenclaturalStatus> nsParse = nomStatusParser.parse(nstatus);
if (nsParse.isSuccessful()) {
u.getNomenclaturalStatus().add(nsParse.getPayload());
} else {
u.addIssue(NameUsageIssue.NOMENCLATURAL_STATUS_INVALID);
}
}
if (!Strings.isNullOrEmpty(pn.getNomStatus())) {
ParseResult<NomenclaturalStatus> nsParse = nomStatusParser.parse(pn.getNomStatus());
if (nsParse.isSuccessful()) {
u.getNomenclaturalStatus().add(nsParse.getPayload());
}
}
// other properties
u.setPublishedIn(v.getCoreField(DwcTerm.namePublishedIn));
u.setAccordingTo(v.getCoreField(DwcTerm.nameAccordingTo));
u.setRemarks(v.getCoreField(DwcTerm.taxonRemarks));
u.setAuthorship(v.getCoreField(DwcTerm.scientificNameAuthorship));
u.setReferences(ObjectUtils.coalesce(
UrlParser.parse(v.getCoreField(DcTerm.references)),
UrlParser.parse(v.getCoreField(AcTerm.furtherInformationURL)),
UrlParser.parse(v.getCoreField(DcTerm.source))
));
return u;
}
@VisibleForTesting
protected ParsedName setScientificName(NameUsage u, VerbatimNameUsage v, Rank rank) throws IgnoreNameUsageException {
ParsedName pn = new ParsedName();
final String sciname = clean(v.getCoreField(DwcTerm.scientificName));
try {
if (sciname != null) {
pn = nameParser.parse(sciname, rank);
// append author if its not part of the name yet
String author = v.getCoreField(DwcTerm.scientificNameAuthorship);
if (!Strings.isNullOrEmpty(author) && !sciname.contains(author)
&& (!pn.isAuthorsParsed() || Strings.isNullOrEmpty(pn.getAuthorship()))) {
u.addIssue(NameUsageIssue.SCIENTIFIC_NAME_ASSEMBLED);
pn.setAuthorship(buildAuthorship(v));
}
} else {
String genus = firstClean(v, GbifTerm.genericName, DwcTerm.genus);
if (genus == null) {
// bad atomized name, we can't assemble anything. Ignore this record completely!!!
throw new IgnoreNameUsageException("No name found");
} else {
pn.setGenusOrAbove(genus);
pn.setSpecificEpithet(v.getCoreField(DwcTerm.specificEpithet));
pn.setInfraSpecificEpithet(v.getCoreField(DwcTerm.infraspecificEpithet));
pn.setAuthorship(buildAuthorship(v));
pn.setRank(rank);
pn.setType(NameType.SCIENTIFIC);
u.addIssue(NameUsageIssue.SCIENTIFIC_NAME_ASSEMBLED);
}
}
} catch (UnparsableException e) {
LOG.debug("Unparsable {} name {}", e.type, e.name);
pn = new ParsedName();
pn.setType(e.type);
pn.setScientificName(sciname);
}
if (u.getIssues().contains(NameUsageIssue.SCIENTIFIC_NAME_ASSEMBLED)) {
u.setScientificName(pn.fullName());
} else {
u.setScientificName(sciname);
}
u.setCanonicalName(Strings.emptyToNull(pn.canonicalName()));
//TODO: verify name parts and rank
u.setNameType(pn.getType());
return pn;
}
private static String buildAuthorship(VerbatimNameUsage v) {
StringBuilder sb = new StringBuilder();
if (v.hasCoreField(DwcTerm.scientificNameAuthorship)) {
sb.append(v.getCoreField(DwcTerm.scientificNameAuthorship));
}
if (v.hasCoreField(DwcTerm.namePublishedInYear) && !sb.toString().contains(v.getCoreField(DwcTerm.namePublishedInYear))) {
if (sb.length() > 0) {
sb.append(", ");
}
sb.append(v.getCoreField(DwcTerm.namePublishedInYear));
}
return sb.toString();
}
private static String firstClean(VerbatimNameUsage v, Term... terms) {
for (Term t : terms) {
String x = clean(v.getCoreField(t));
if (x != null) {
return x;
}
}
return null;
}
private String taxonID(Record core) {
if (meta.isCoreIdUsed()) {
return clean(core.id());
} else {
return clean(core.value(DwcTerm.taxonID));
}
}
public static String clean(String x) {
if (Strings.isNullOrEmpty(x) || NULL_PATTERN.matcher(x).find()) {
return null;
}
return Strings.emptyToNull(x.trim());
}
@Override
public void close() throws NotUniqueRuntimeException {
try {
try {
// define indices
LOG.info("Building lucene index taxonID ...");
//TODO: neo4j batchinserter does not seem to evaluate the unique constraint. Duplicates pass thru (see tests) !!!
inserter.createDeferredConstraint(Labels.TAXON).assertPropertyIsUnique(NeoProperties.TAXON_ID).create();
LOG.info("Building lucene index scientific name ...");
inserter.createDeferredSchemaIndex(Labels.TAXON).on(NeoProperties.SCIENTIFIC_NAME).create();
LOG.info("Building lucene index canonical name ...");
inserter.createDeferredSchemaIndex(Labels.TAXON).on(NeoProperties.CANONICAL_NAME).create();
} finally {
// this is when lucene indices are build and thus throws RuntimeExceptions when unique constraints are broken
// we catch these exceptions below
inserter.shutdown();
}
} catch (RuntimeException e) {
Throwable t = e.getCause();
// check if the cause was a broken unique constraint which can only be taxonID in our case
if (t != null && t instanceof PreexistingIndexEntryConflictException) {
PreexistingIndexEntryConflictException pe = (PreexistingIndexEntryConflictException) t;
LOG.error("TaxonID not unique. Value {} used for both node {} and {}", pe.getPropertyValue(), pe.getExistingNodeId(), pe.getAddedNodeId());
throw new NotUniqueRuntimeException("TaxonID", pe.getPropertyValue());
} else {
throw e;
}
}
LOG.info("Neo batch inserter closed, data flushed to disk. Opening regular neo db again ...", meta.getRecords());
dao.openNeo();
}
}