package org.gbif.checklistbank.nub.validation; import org.gbif.api.model.common.LinneanClassification; import org.gbif.api.service.checklistbank.NameUsageService; import org.gbif.api.vocabulary.Kingdom; import org.gbif.api.vocabulary.Rank; import org.gbif.checklistbank.model.Classification; import org.gbif.checklistbank.nub.NubDb; import org.gbif.utils.file.InputStreamUtils; import org.gbif.utils.file.csv.CSVReader; import org.gbif.utils.file.csv.CSVReaderFactory; import java.io.IOException; import java.io.InputStream; import com.google.common.base.Joiner; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Production backbone assertions that have to pass before we can replace the backbone with a newer version. * This acts on name usage used as input to the postgres importer */ public class NubAssertions implements NubValidation { private static final Logger LOG = LoggerFactory.getLogger(NubAssertions.class); private static final String ASSERTION_FILENAME = "backbone/assertions.tsv"; private static final Joiner SEMICOLON_JOIN = Joiner.on("; "); private final AssertionEngine assertionEngine; public NubAssertions(NubDb db) { this.assertionEngine = new NeoAssertionEngine(db); } public NubAssertions(NameUsageService pgService) { this.assertionEngine = new PgAssertionEngine(pgService); } @Override /** * This requires an open neo4j transaction! */ public boolean validate() { // populate the reverse key map LOG.info("Start nub assertions"); // TODO: num accepted in expected range // TODO: num accepted per kingdom in expected range // TODO: num accepted in Asteraceae, Mammalia, Aves ??? // TODO: num accepted families, genera // run simple file based assertions assertFileNames(); // issues spotted by the open tree of life assertOtolIssues(); // old assertions from ECAT times oldEcatVerifications(); return assertionEngine.isValid(); } private void assertFileNames() { InputStream tsv = new InputStreamUtils().classpathStream(ASSERTION_FILENAME); try { CSVReader csv = CSVReaderFactory.buildUtf8TabReader(tsv); for (String[] row : csv) { if (row == null || row.length < 11 || row[0].startsWith("#")) { continue; } assertRow(row); } } catch (IOException e) { LOG.warn("Failed to read assertion resource {}", ASSERTION_FILENAME, e); } assertFileNameHomonyms(); } private void assertFileNameHomonyms() { // http://dev.gbif.org/issues/browse/PF-2445 assertionEngine.assertSearchMatch(1, "Zygophyllum apiculatum"); // http://dev.gbif.org/issues/browse/POR-389 assertionEngine.assertSearchMatch(1, "Albizia", Rank.GENUS); } // ID name rank synonym kingdom phylum class order family private void assertRow(String[] row) { try { int key = Integer.valueOf(row[0]); String name = row[1]; Rank rank = Rank.valueOf(row[2]); String accepted = row[3]; Kingdom kingdom = Kingdom.valueOf(row[4].toUpperCase()); LinneanClassification classification = new Classification(); classification.setKingdom(row[4]); classification.setPhylum(row[5]); classification.setClazz(row[6]); classification.setOrder(row[7]); classification.setFamily(row[8]); classification.setGenus(row[9]); classification.setSpecies(row[10]); assertionEngine.assertUsage(key, rank, name, accepted, kingdom); assertionEngine.assertClassification(key, classification); } catch (RuntimeException e) { LOG.error("Failed assertion for {}", SEMICOLON_JOIN.join(row), e); } } /** * Patches from Jonathan: https://github.com/OpenTreeOfLife/reference-taxonomy/blob/master/taxonomies.py#L562 */ private void assertOtolIssues() { // http://iphylo.blogspot.com/2014/03/gbif-liverwort-taxonomy-broken.html assertionEngine.assertSearchMatch(1, "Jungermanniales", Rank.ORDER); assertionEngine.assertUsage(7219205, Rank.ORDER, "Jungermanniales", null, Kingdom.PLANTAE); assertionEngine.assertClassification(7219205, "Jungermanniopsida", "Bryophyta", "Plantae"); // Joseph 2013-07-23 https://github.com/OpenTreeOfLife/opentree/issues/62 // GBIF has 3 copies of Myospalax, but only one accepted assertionEngine.assertSearchMatch(3, "Myospalax", Rank.GENUS); assertionEngine.assertUsage(7427330, Rank.GENUS, "Myospalax Hermann, 1783", "Spalax", Kingdom.ANIMALIA); assertionEngine.assertUsage(8188734, Rank.GENUS, "Myospalax Blyth, 1846", "Ellobius", Kingdom.ANIMALIA); // accepted assertionEngine.assertUsage(2439119, Rank.GENUS, "Myospalax Laxmann, 1769", null, Kingdom.ANIMALIA); assertionEngine.assertClassification(2439119, "Spalacidae", "Rodentia", "Mammalia", "Chordata", "Animalia"); // Drake-brockmania & Drake-Brockmania should be one assertionEngine.assertNotExisting("Drake-brockmania", Rank.GENUS); assertionEngine.assertUsage(4120905, Rank.GENUS, "Drake-Brockmania", null, Kingdom.PLANTAE); //TODO: find ID assertionEngine.assertUsage(-1, Rank.GENUS, "Drakebrockmania", null, Kingdom.PLANTAE); // Saxo-Fridericia vs Saxo-fridericia (COL!) vs Saxofridericia assertionEngine.assertNotExisting("Saxo-Fridericia", Rank.GENUS); assertionEngine.assertNotExisting("Saxofridericia", Rank.GENUS); assertionEngine.assertUsage(7276512, Rank.GENUS, "Saxo-fridericia", null, Kingdom.PLANTAE); assertionEngine.assertNotExisting("Solms-Laubachia", Rank.GENUS); assertionEngine.assertNotExisting("Solmslaubachia", Rank.GENUS); assertionEngine.assertUsage(3044438, Rank.GENUS, "Solms-laubachia", null, Kingdom.PLANTAE); assertionEngine.assertNotExisting("Cyrto-Hypnum", Rank.GENUS); assertionEngine.assertNotExisting("Cyrtohypnum", Rank.GENUS); assertionEngine.assertUsage(2673193, Rank.GENUS, "Cyrto-hypnum", null, Kingdom.PLANTAE); } /** * From https://gbif-ecat.googlecode.com/svn/trunk/ecat-checklistbank/src/test/java/org/gbif/manualtests/VerifyNub.java */ private void oldEcatVerifications() { LOG.info("Test kingdoms"); assertionEngine.assertUsage(0, Rank.KINGDOM, "incertae sedis", null, Kingdom.INCERTAE_SEDIS); assertionEngine.assertUsage(1, Rank.KINGDOM, "Animalia", null, Kingdom.ANIMALIA); assertionEngine.assertUsage(2, Rank.KINGDOM, "Archaea", null, Kingdom.ARCHAEA); assertionEngine.assertUsage(3, Rank.KINGDOM, "Bacteria", null, Kingdom.BACTERIA); assertionEngine.assertUsage(4, Rank.KINGDOM, "Chromista", null, Kingdom.CHROMISTA); assertionEngine.assertUsage(5, Rank.KINGDOM, "Fungi", null, Kingdom.FUNGI); assertionEngine.assertUsage(6, Rank.KINGDOM, "Plantae", null, Kingdom.PLANTAE); assertionEngine.assertUsage(7, Rank.KINGDOM, "Protozoa", null, Kingdom.PROTOZOA); assertionEngine.assertUsage(8, Rank.KINGDOM, "Viruses", null, Kingdom.VIRUSES); LOG.info("Test Puma"); assertionEngine.assertUsage(2435099, Rank.SPECIES, "Puma concolor (Linnaeus, 1771)", null, Kingdom.ANIMALIA); assertionEngine.assertUsage(2435104, Rank.SPECIES, "Felis concolor Linnaeus, 1771", "Puma concolor", Kingdom.ANIMALIA); // http://dev.gbif.org/issues/browse/CLB-72 LOG.info("Test stable ids issue CLB-72"); // assorted assertionEngine.assertUsage(5304574, Rank.SPECIES, "Dracaena cinnabari Balf.", null, Kingdom.PLANTAE); assertionEngine.assertUsage(5214860, Rank.SPECIES, "Zeus faber sp. mauritanicus Desbrosses, 1937", "Zeus faber Linnaeus, 1758", Kingdom.ANIMALIA); assertionEngine.assertUsage(4404259, Rank.SPECIES, "Hydraena truncata Rey, 1885", null, Kingdom.ANIMALIA); assertionEngine.assertUsage(5346048, Rank.SPECIES, "Astragalus angustifolius Lam.", null, Kingdom.PLANTAE); assertionEngine.assertUsage(5358748, Rank.SPECIES, "Trifolium repens L.", null, Kingdom.PLANTAE); assertionEngine.assertUsage(5219173, Rank.SPECIES, "Canis lupus Linnaeus, 1758", null, Kingdom.ANIMALIA); // species with genus homonyms, aka "partial" homonyms assertionEngine.assertUsage(5277309, Rank.SPECIES, "Acanthophora aokii Okamura", null, Kingdom.PLANTAE); // homonym families assertionEngine.assertSearchMatch(2, "Axiidae ", Rank.FAMILY); assertionEngine.assertUsage(8024, Rank.FAMILY, "Axiidae Rebel", "Cimeliidae", Kingdom.ANIMALIA); assertionEngine.assertUsage(8026, Rank.FAMILY, "Axiidae Huxley", null, Kingdom.ANIMALIA); assertionEngine.assertParentsContain(8024, Rank.CLASS, "Malacostraca"); assertionEngine.assertParentsContain(8026, Rank.CLASS, "Insecta"); // 2 authorities exist L. Agassiz, 1862 AND Berlese, 1896 assertionEngine.assertSearchMatch(2, "Cepheidae ", Rank.FAMILY); assertionEngine.assertUsage(8172, Rank.FAMILY, "Cepheidae", null, Kingdom.ANIMALIA); assertionEngine.assertParentsContain(8172, Rank.PHYLUM, "Cnidaria"); assertionEngine.assertParentsContain(8172, Rank.CLASS, "Scyphozoa"); assertionEngine.assertUsage(7410, Rank.FAMILY, "Cepheidae", null, Kingdom.ANIMALIA); assertionEngine.assertParentsContain(7410, Rank.PHYLUM, "Arthropoda"); assertionEngine.assertParentsContain(7410, Rank.CLASS, "Arachnida"); LOG.info("Test higher taxa"); assertionEngine.assertUsage(212, Rank.CLASS, "Aves", null, Kingdom.ANIMALIA); assertionEngine.assertUsage(358, Rank.CLASS, "Reptilia", null, Kingdom.ANIMALIA); assertionEngine.assertUsage(359, Rank.CLASS, "Mammalia", null, Kingdom.ANIMALIA); assertionEngine.assertUsage(3065, Rank.FAMILY, "Asteraceae", null, Kingdom.PLANTAE); assertionEngine.assertUsage(6070956, Rank.FAMILY, "Compositae", "Asteraceae", Kingdom.PLANTAE); LOG.info("Test Pachycephala"); assertionEngine.assertUsage(6006971, Rank.GENUS, "Pachycephala Lioy, 1864", null, Kingdom.ANIMALIA); assertionEngine.assertClassification(6006971, "Tachinidae", "Diptera", "Insecta", "Arthropoda", "Animalia"); assertionEngine.assertUsage(5959160, Rank.GENUS, "Pachycephala Vigors, 1825", null, Kingdom.ANIMALIA); assertionEngine.assertClassification(5959160, "Corvidae", "Passeriformes", "Aves", "Chordata", "Animalia"); assertionEngine.assertUsage(6007723, Rank.GENUS, "Pachycephala Klug, 1834", "Platychile Macleay", Kingdom.ANIMALIA); assertionEngine.assertClassification(6007723, "Platychile", "Carabidae", "Coleoptera", "Insecta", "Arthropoda", "Animalia"); LOG.info("Test Oenanthes"); assertionEngine.assertUsage(2492483, Rank.GENUS, "Oenanthe Vieillot, 1816", null, Kingdom.ANIMALIA); assertionEngine.assertUsage(3034893, Rank.GENUS, "Oenanthe Linnaeus, 1753", null, Kingdom.PLANTAE); assertionEngine.assertSearchMatch(2, "Oenanthe ", Rank.GENUS); assertionEngine.assertSearchMatch(0, "Oenanthe spec", Rank.SPECIES); LOG.info("Test aves classification"); assertionEngine.assertUsage(212, Rank.CLASS, "Aves", null, Kingdom.ANIMALIA); assertionEngine.assertClassification(212, "Reptilia", "Chordata", "Animalia"); assertionEngine.assertUsage(358, Rank.CLASS, "Reptilia", null, Kingdom.ANIMALIA); assertionEngine.assertUsage(44, Rank.PHYLUM, "Chordata", null, Kingdom.ANIMALIA); LOG.info("Test beetles"); assertionEngine.assertParentsContain("Syntomus", Rank.GENUS, "Carabidae"); assertionEngine.assertParentsContain("Thalassophilus", Rank.GENUS, "Carabidae"); assertionEngine.assertParentsContain("Lionychus", Rank.GENUS, "Carabidae"); assertionEngine.assertParentsContain("Oxypselaphus", Rank.GENUS, "Carabidae"); assertionEngine.assertParentsContain("Bembidion", Rank.GENUS, "Carabidae"); // family unknown, but a beetle assertionEngine.assertParentsContain("Lastia", Rank.GENUS, "Coleoptera"); // http://dev.gbif.org/issues/browse/CLB-83 LOG.info("Test Fucus"); assertionEngine.assertUsage(1010512, Rank.GENUS, "Fucus L.", null, Kingdom.CHROMISTA); assertionEngine.assertClassification(1010512, "Fucaceae", "Fucales", "Phaeophyceae", "Ochrophyta", "Chromista"); LOG.info("Test various"); assertionEngine.assertUsage(2765628, Rank.GENUS, "Astelia Banks & Sol. ex R. Br.", null, Kingdom.PLANTAE); assertionEngine.assertSearchMatch(1, "Astelia( |$)", Rank.GENUS); assertionEngine.assertUsage(6014709, Rank.SPECIES, "Tulostoma exasperatum Mont.", null, Kingdom.FUNGI); // http://dev.gbif.org/issues/browse/CLB-70 assertionEngine.assertSearchMatch(1, "Tulostoma exasperatum", Rank.SPECIES); assertionEngine.assertUsage(2295111, Rank.GENUS, "Chloritis Beck, 1837", null, Kingdom.ANIMALIA); assertionEngine.assertClassification(2295111, "Camaenidae", "Stylommatophora", "Gastropoda", "Mollusca", "Animalia"); // see http://dev.gbif.org/issues/browse/CLB-69 assertionEngine.assertSearchMatch(1, "Trifolium repens", Rank.SPECIES); // Fucus http://dev.gbif.org/issues/browse/CLB-83 assertionEngine.assertSearchMatch(1, "Fucus( |$)", Rank.GENUS); assertionEngine.assertClassification(1010512, "Fucaceae", "Fucales", "Phaeophyceae", "Ochrophyta", "Chromista"); // bird Zonotrichia albicollis http://dev.gbif.org/issues/browse/CLB-119 assertionEngine.assertUsage(5231140, Rank.SPECIES, "Zonotrichia albicollis (Gmelin, 1789)", null, Kingdom.ANIMALIA); assertionEngine.assertClassification(5231140, "Zonotrichia", "Emberizidae", "Passeriformes", "Aves", "Chordata", "Animalia"); LOG.info("Test inter rank homonyms"); // http://dev.gbif.org/issues/browse/CLB-63 assertionEngine.assertSearchMatch(2, "Lobata"); assertionEngine.assertSearchMatch(2, "Coccoidea"); // 2 genera, 1 class assertionEngine.assertSearchMatch(3, "Acantharia"); assertionEngine.assertSearchMatch(1, "Acantharia", Rank.CLASS); assertionEngine.assertSearchMatch(2, "Acantharia", Rank.GENUS); // the subphylum isnt part of the nub assertionEngine.assertSearchMatch(1, "Radiolaria"); LOG.info("Test duplicate names"); // http://dev.gbif.org/issues/browse/CLB-126 assertionEngine.assertSearchMatch(1, "Orbicularia foveolata", Rank.SPECIES); assertionEngine.assertUsage(6638224, Rank.SPECIES, "Orbicularia foveolata Britton", "Phyllanthus myrtilloides subsp. erythrinus", Kingdom.PLANTAE); assertionEngine.assertSearchMatch(1, "Nymania insignis", Rank.SPECIES); assertionEngine.assertUsage(6046058, Rank.SPECIES, "Nymania insignis K.Schum.", "Phyllanthus clamboides", Kingdom.PLANTAE); assertionEngine.assertSearchMatch(1, "Phyllanthodendron lingulatum", Rank.SPECIES); assertionEngine.assertSearchMatch(1, "Acidoton flueggeoides", Rank.SPECIES); assertionEngine.assertSearchMatch(1, "Villanova buxifolia", Rank.SPECIES); assertionEngine.assertSearchMatch(1, "Colmeiroa buxifolia", Rank.SPECIES); assertionEngine.assertSearchMatch(1, "Maschalanthus obovatus", Rank.SPECIES); // homonym genus assertionEngine.assertSearchMatch(3, "Wielandia", Rank.GENUS); assertionEngine.assertSearchMatch(1, "Wielandia danguyana", Rank.SPECIES); LOG.info("Test fungal names"); assertionEngine.assertUsage(6348821, Rank.VARIETY, "Hendersonia sarmentorum var. catalpae Sandu", null, Kingdom.FUNGI); assertionEngine.assertUsage(6332962, Rank.VARIETY, "Ithyphallus aurantiacus var. gracilis E. Fisch.", null, Kingdom.FUNGI); assertionEngine.assertUsage(6336702, Rank.VARIETY, "Microsphaera alphitoides var. chenii U. Braun", null, Kingdom.FUNGI); assertionEngine.assertUsage(6342638, Rank.VARIETY, "Peltidea canina var. glabra Ach.", null, Kingdom.FUNGI); assertionEngine.assertUsage(6774914, Rank.VARIETY, "Placodium boergesenii var. squamosoareolata Vain.", null, Kingdom.FUNGI); assertionEngine.assertUsage(6329816, Rank.VARIETY, "Togaria aurea var. aurea (Matt.) W.G. Sm.", null, Kingdom.FUNGI); assertionEngine.assertUsage(6340009, Rank.VARIETY, "Nesolechia xenophana var. hazslinszkyana Keissl.", null, Kingdom.FUNGI); assertionEngine.assertUsage(6328015, Rank.VARIETY, "Omphalia lignatilis var. albovirens Quél.", null, Kingdom.FUNGI); //ID had changed before, why? assertionEngine.assertUsage(7252140, Rank.SPECIES, "Briarea orbicula (Corda) Bonord.", null, Kingdom.FUNGI); assertionEngine.assertUsage(6012748, Rank.FORM, "Echinoderma acutesquamosum f. giganteum (Pilát) Bon", null, Kingdom.FUNGI); //instable autonym ID: http://dev.gbif.org/issues/browse/CLB-183 LOG.info("Autonym ids"); assertionEngine.assertUsage(7243638, Rank.VARIETY, "Drosophila squamosa var. squamosa", null, Kingdom.FUNGI); // animal homonym genus assertionEngine.assertUsage(6330670, Rank.VARIETY, "Drosophila caput-medusae var. caput-medusae (Fr.) Kühner & Romagn.", null, Kingdom.FUNGI); assertionEngine.assertUsage(6350412, Rank.VARIETY, "Parasterina grewiae var. grewiae (Cooke) Bat. & Maia", null, Kingdom.FUNGI); LOG.info("Test mollusc names"); assertionEngine.assertUsage(4366023, Rank.VARIETY, "Bolinus brandaris var. adunca Coen, 1933", "Bolinus brandaris (Linnaeus, 1758)", Kingdom.ANIMALIA); assertionEngine.assertClassification(4366023, "Bolinus brandaris", "Bolinus", "Muricidae", "Neogastropoda", "Gastropoda", "Mollusca", "Animalia"); assertionEngine.assertUsage(5726859, Rank.VARIETY, "Murex brandaris var. aculeatus Philippi, 1836", "Bolinus brandaris (Linnaeus, 1758)", Kingdom.ANIMALIA); assertionEngine.assertUsage(4371850, Rank.VARIETY, "Ensis arcuatus var. directus", "Ensis directus (Conrad, 1843)", Kingdom.ANIMALIA); LOG.info("Test insect names"); assertionEngine.assertUsage(6097478, Rank.VARIETY, "Atheta elongatula var. balcanica", null, Kingdom.ANIMALIA); assertionEngine.assertClassification(6097478, "Atheta elongatula", "Atheta", "Staphylinidae", "Staphylinoidea", "Coleoptera", "Insecta", "Arthropoda", "Animalia"); LOG.info("Test Asteraceae"); assertionEngine.assertSearchMatch(1, "Asteraceae"); assertionEngine.assertUsage(3065, Rank.FAMILY, "Asteraceae Bercht. & J.Presl", null, Kingdom.PLANTAE); LOG.info("Nub verified!"); } }