package org.gbif.checklistbank.nub; import org.gbif.api.model.Constants; import org.gbif.api.vocabulary.Kingdom; import org.gbif.api.vocabulary.Rank; import org.gbif.nub.lookup.straight.IdLookup; import org.gbif.nub.lookup.straight.LookupUsage; import java.io.File; import java.io.IOException; import java.io.Writer; import java.sql.Date; import java.util.Collections; import java.util.List; import java.util.Spliterator; import java.util.Spliterators; import java.util.stream.Stream; import java.util.stream.StreamSupport; import com.google.common.base.Joiner; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import it.unimi.dsi.fastutil.ints.IntOpenHashSet; import it.unimi.dsi.fastutil.ints.IntSet; import org.apache.commons.io.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Nub id generator trying to reuse previously existing ids, even if they had been deleted. * It will only ever issue the same id once. */ public class IdGenerator { private static final Logger LOG = LoggerFactory.getLogger(IdGenerator.class); private IdLookup lookup; private int nextId; private IntSet resurrected = new IntOpenHashSet(); private IntSet reissued = new IntOpenHashSet(); private List<LookupUsage> created = Lists.newArrayList(); private final Joiner nameJoiner = Joiner.on(" ").skipNulls(); /** * * @param lookup * @param idStart */ public IdGenerator(IdLookup lookup, int idStart) { this.lookup = lookup; Preconditions.checkArgument(idStart < Constants.NUB_MAXIMUM_KEY, "Lowest current backbone id exceeds maximum nub id limit"); nextId = idStart; } public int issue(String canonicalName, String authorship, String year, Rank rank, Kingdom kingdom) { return issue(canonicalName, authorship, year, rank, kingdom, null); } public int issue(String canonicalName, String authorship, String year, Rank rank, Kingdom kingdom, Integer parentKey) { LookupUsage u = lookup.match(canonicalName, authorship, year, rank, kingdom); int id; if (u == null) { id = create(canonicalName, authorship, year, rank, kingdom); } else { final int matchKey = keyOrProParte(u, parentKey); if (reissued.contains(matchKey) || resurrected.contains(matchKey)) { id = create(canonicalName, authorship, year, rank, kingdom); LOG.warn("{} {} {} was already issued as {}. Generating new id {} instead", kingdom, rank, canonicalName, matchKey, id); } else { id = matchKey; if (u.isDeleted()) { resurrected.add(id); LOG.debug("Resurrected id {} for {} {}", id, rank, name(canonicalName, authorship, year)); } else { reissued.add(id); LOG.debug("Reissued id {} for {} {}", id, rank, name(canonicalName, authorship, year)); } } } // make sure we dont exceed the maximum nub id limit which we use to identify nub usages elsewhere if (id > Constants.NUB_MAXIMUM_KEY) { throw new IllegalStateException("Exceeded maximum nub id limit " + Constants.NUB_MAXIMUM_KEY); } return id; } // select best match from pro parte keys if possible, otherwise return match key private int keyOrProParte(LookupUsage u, Integer parentKey) { if (u.getProParteKeys() != null && parentKey != null && u.getProParteKeys().containsKey(parentKey)) { return Math.abs(u.getProParteKeys().get(parentKey)); } return u.getKey(); } private int create(String canonicalName, String authorship, String year, Rank rank, Kingdom kingdom) { int id = nextId++; LOG.debug("New id {} generated for {} {}", id, rank, name(canonicalName, authorship, year)); created.add(new LookupUsage(id, canonicalName, authorship, year, rank, kingdom, false)); return id; } /** * Forces a reissues of a usage key. Useful if the key is managed outside but the IdGenerator should still keep track of it. */ public int reissue(int id) { reissued.add(id); LOG.debug("Reissued id {} externally", id); return id; } private String name(String canonicalName, String authorship, String year){ StringBuilder sb = new StringBuilder(); sb.append(canonicalName); if (authorship != null){ sb.append(", "); sb.append(authorship); } if (year != null){ sb.append(", "); sb.append(year); } return sb.toString(); } public void writeReports(File reportingDir) throws IOException { // add current date folder reportingDir = new File(reportingDir, new Date(System.currentTimeMillis()).toString()); LOG.info("Writing nub reports to {}", reportingDir.getAbsolutePath()); if (reportingDir.exists()) { FileUtils.deleteDirectory(reportingDir); } FileUtils.forceMkdir(reportingDir); // prepare lists for sorting List<LookupUsage> del = Lists.newArrayList(); List<LookupUsage> res = Lists.newArrayList(); // also include pro parte usages that are hidden in the main usages proParteKeys property streamAll().forEach(u -> { if (u.isDeleted()) { if (resurrected.contains(u.getKey())) { res.add(u); } } else if (!reissued.contains(u.getKey())) { del.add(u); } } ); // write report files print(del, new File(reportingDir, "deleted.txt")); print(res, new File(reportingDir, "resurrected.txt")); print(created, new File(reportingDir, "created.txt")); } private Stream<LookupUsage> streamAll(){ final int characteristics = Spliterator.NONNULL | Spliterator.SIZED | Spliterator.DISTINCT; return StreamSupport.stream(Spliterators.spliterator(lookup.iterator(), lookup.size(), characteristics), false) .flatMap(u -> u.getProParteKeys() == null ? Stream.of(u) : Stream.concat(Stream.of(u), proParteUsages(u))); } private static Stream<LookupUsage> proParteUsages(LookupUsage orig) { return StreamSupport.stream(orig.getProParteKeys().values().spliterator(), false).map(val -> { LookupUsage ppu = new LookupUsage(); ppu.setDeleted(val < 0); ppu.setKey(Math.abs(val)); ppu.setKingdom(orig.getKingdom()); ppu.setRank(orig.getRank()); ppu.setCanonical(orig.getCanonical()); ppu.setAuthorship(orig.getAuthorship()); ppu.setYear(orig.getYear()); return ppu; }); } private void print(List<LookupUsage> usages, File f) throws IOException { try (Writer writer = org.gbif.utils.file.FileUtils.startNewUtf8File(f)){ // sort and write Collections.sort(usages); for (LookupUsage u : usages) { writer.write(Integer.toString(u.getKey())); writer.write('\t'); writer.write(u.getRank().name()); writer.write('\t'); writer.write(nameJoiner.join(u.getCanonical(), u.getAuthorship(), u.getYear())); writer.write('\n'); } } } }