package org.gbif.checklistbank.cli.importer; import org.gbif.api.model.Constants; import org.gbif.api.model.checklistbank.NameUsage; import org.gbif.api.model.checklistbank.search.NameUsageSearchParameter; import org.gbif.api.model.checklistbank.search.NameUsageSearchRequest; import org.gbif.api.model.checklistbank.search.NameUsageSearchResult; import org.gbif.api.model.common.paging.PagingRequest; import org.gbif.api.model.common.paging.PagingResponse; import org.gbif.api.model.common.search.Facet; import org.gbif.api.model.common.search.SearchResponse; import org.gbif.api.service.checklistbank.NameUsageSearchService; import org.gbif.api.service.checklistbank.NameUsageService; import org.gbif.api.util.ClassificationUtils; import org.gbif.api.vocabulary.Origin; import org.gbif.api.vocabulary.Rank; import org.gbif.checklistbank.cli.BaseTest; import org.gbif.checklistbank.cli.normalizer.NormalizerStats; import org.gbif.checklistbank.cli.normalizer.NormalizerTest; import org.gbif.checklistbank.index.guice.RealTimeModule; import org.gbif.checklistbank.index.guice.Solr; import org.gbif.checklistbank.index.service.NameUsageSearchServiceImpl; import org.gbif.checklistbank.nub.NubBuilder; import org.gbif.checklistbank.nub.source.ClasspathSourceList; import org.gbif.checklistbank.service.DatasetImportService; import org.gbif.checklistbank.service.UsageService; import org.gbif.checklistbank.service.mybatis.guice.ChecklistBankServiceMyBatisModule; import org.gbif.checklistbank.service.mybatis.guice.InternalChecklistBankServiceMyBatisModule; import org.gbif.checklistbank.service.mybatis.guice.Mybatis; import org.gbif.checklistbank.service.mybatis.postgres.ClbDbTestRule; import org.gbif.nub.lookup.straight.IdLookupImpl; import org.gbif.nub.lookup.straight.LookupUsage; import java.sql.Connection; import java.sql.SQLException; import java.sql.Statement; import java.util.List; import java.util.Map; import java.util.UUID; import com.fasterxml.jackson.databind.ObjectMapper; import 
com.fasterxml.jackson.dataformat.yaml.YAMLFactory;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.Resources;
import com.google.inject.Guice;
import com.google.inject.Injector;
import com.google.inject.Key;
import com.zaxxer.hikari.HikariDataSource;
import org.apache.solr.client.solrj.SolrClient;
import org.junit.After;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

/**
 * Importer tests, using the normalizer test dwcas to first produce a neo4j db and then import that into postgres.
 * By default solr indexing is not tested and a mock service is used instead.
 * This is done because neo4j uses an old version of lucene which conflicts with solr, preventing the use of an embedded solr server for tests.
* An external solr instance can be configured manually in cfg-importer.yaml if wanted */ public class ImporterIT extends BaseTest implements AutoCloseable { private static final ObjectMapper CFG_MAPPER = new ObjectMapper(new YAMLFactory()); private ImporterConfiguration iCfg; private NameUsageService nameUsageService; private UsageService usageService; private DatasetImportService sqlService; private DatasetImportService solrService; private NameUsageSearchService searchService; private HikariDataSource hds; @Rule public ClbDbTestRule dbSetup = ClbDbTestRule.empty(); /** * Uses an internal metrics registry to setup the normalizer */ public Importer build(ImporterConfiguration cfg, UUID datasetKey) throws SQLException { initGuice(cfg); return Importer.create(cfg, datasetKey, nameUsageService, usageService, sqlService, solrService); } private void initGuice(ImporterConfiguration cfg) { if (hds == null) { // init mybatis layer and solr from cfg instance Injector inj = Guice.createInjector(ChecklistBankServiceMyBatisModule.create(cfg.clb), new RealTimeModule(cfg.solr)); hds = (HikariDataSource) inj.getInstance(InternalChecklistBankServiceMyBatisModule.DATASOURCE_KEY); nameUsageService = inj.getInstance(NameUsageService.class); usageService = inj.getInstance(UsageService.class); sqlService = inj.getInstance(Key.get(DatasetImportService.class, Mybatis.class)); solrService = inj.getInstance(Key.get(DatasetImportService.class, Solr.class)); if (!RealTimeModule.empty(iCfg.solr)) { searchService = new NameUsageSearchServiceImpl(inj.getInstance(SolrClient.class)); } } } @Before public void initDwcaRepo() throws Exception { iCfg = CFG_MAPPER.readValue(Resources.getResource("cfg-importer.yaml"), ImporterConfiguration.class); iCfg.chunkMinSize=10; iCfg.chunkSize=50; iCfg.neo = cfg.neo; initGuice(iCfg); // truncate tables try (Connection con = hds.getConnection()) { try (Statement st = con.createStatement()) { st.execute("TRUNCATE citation CASCADE"); st.execute("TRUNCATE name 
CASCADE"); } } } @After public void close() throws Exception { if (sqlService != null) { sqlService.close(); } if (solrService != null) { solrService.close(); } if (hds != null) { hds.close(); } } @Test public void testIdList() throws SQLException { final UUID datasetKey = NormalizerTest.datasetKey(1); // insert neo db insertNeo(datasetKey); // import runImport(datasetKey); // test db, all usages must be accepted and there is one root! PagingResponse<NameUsage> resp = nameUsageService.list(null, datasetKey, null, new PagingRequest(0, 500)); assertEquals(20, resp.getResults().size()); for (NameUsage u : resp.getResults()) { assertEquals("Bad datasetKey", datasetKey, u.getDatasetKey()); if (u.isSynonym()) { assertNotNull(u.getAcceptedKey()); assertNotNull(u.getAccepted()); } else { assertNull(u.getAcceptedKey()); assertNull(u.getAccepted()); } if (u.getRank() != Rank.KINGDOM) { assertNotNull(u.getParentKey()); assertNotNull(u.getParent()); } assertNotNull(u.getOrigin()); assertNotNull(u.getRank()); if (u.getRank().isLinnean()) { if (u.isSynonym()) { assertFalse("Higher classification key for synonym " + u.getScientificName() + " cannot point to itself!", u.getKey().equals(ClassificationUtils.getHigherRankKey(u, u.getRank()))); } else { assertEquals("Bad higher classification key for " + u.getScientificName() + " of rank " + u.getRank(), u.getKey(), ClassificationUtils.getHigherRankKey(u, u.getRank())); } } } } /** * Testing CLIMBER dataset from ZooKeys: * http://www.gbif.org/dataset/e2bcea8c-dfea-475e-a4ae-af282b4ea1c5 * Especially the behavior of acceptedNameUsage (canonical form withut authorship) * pointing to itself (scientificName WITH authorship) indicating this is NOT a synonym. */ @Test public void testVerbatimAccepted() throws Exception { final UUID datasetKey = NormalizerTest.datasetKey(14); // insert neo db insertNeo(datasetKey); // import runImport(datasetKey); // test db, all usages must be accepted and there is one root! 
PagingResponse<NameUsage> resp = nameUsageService.list(null, datasetKey, null, new PagingRequest(0, 100)); assertEquals(16, resp.getResults().size()); for (NameUsage u : resp.getResults()) { assertFalse(u.isSynonym()); assertNull(u.getAcceptedKey()); assertNull(u.getAccepted()); assertNull(u.getBasionymKey()); assertNull(u.getBasionym()); assertNotNull(u.getOrigin()); assertNotNull(u.getRank()); if (u.getScientificName().equals("Animalia")) { assertNull(u.getParentKey()); assertNull(u.getParent()); } else { assertNotNull(u.getParentKey()); assertNotNull(u.getParent()); } if (u.getRank().isLinnean()) { assertEquals("Bad higher classification key for " + u.getScientificName() + " of rank " + u.getRank(), u.getKey(), ClassificationUtils.getHigherRankKey(u, u.getRank())); } } } /** * Reimport the same dataset and make sure ids stay the same. * This test also checks solr if manually configured - default is without solr. */ @Test public void testStableIds() throws Exception { final UUID datasetKey = NormalizerTest.datasetKey(14); // truncate solr solrService.deleteDataset(datasetKey); NameUsageSearchRequest search = new NameUsageSearchRequest(); search.setLimit(1); search.setFacetLimit(100); search.addFacets(NameUsageSearchParameter.HIGHERTAXON_KEY); search.addChecklistFilter(datasetKey); if (!RealTimeModule.empty(iCfg.solr)) { // make sure there are no facets anymore Thread.sleep(1000); SearchResponse<NameUsageSearchResult, NameUsageSearchParameter> srep = searchService.search(search); assertEquals(0, srep.getResults().size()); assertEquals(0, srep.getFacets().get(0).getCounts().size()); } // insert neo db insertNeo(datasetKey); // 1st import, keep neo db runImport(datasetKey); // check higher taxa // http://dev.gbif.org/issues/browse/POR-3204 if (!RealTimeModule.empty(iCfg.solr)) { SearchResponse<NameUsageSearchResult, NameUsageSearchParameter> srep = searchService.search(search); List<Facet.Count> facets = srep.getFacets().get(0).getCounts(); // make sure the key 
actually exists! for (Facet.Count c : facets) { System.out.println(c); int key = Integer.valueOf(c.getName()); NameUsage u = nameUsageService.get(key, null); assertNotNull("Higher taxon key "+key+" in solr does not exist in postgres", u); } } // remember ids Map<Integer, String> ids = Maps.newHashMap(); int sourceCounter = 0; PagingResponse<NameUsage> resp = nameUsageService.list(null, datasetKey, null, new PagingRequest(0, 100)); assertEquals(16, resp.getResults().size()); for (NameUsage u : resp.getResults()) { ids.put(u.getKey(), u.getScientificName()); if (Origin.SOURCE == u.getOrigin()) { sourceCounter++; } } assertEquals(10, sourceCounter); // wait for 2 seconds, we allow a small time difference in old usage deletions Thread.sleep(2000); // 2nd import - there are 10 SOURCE usages with stable ids and 6 HIGHER usages with instable ids runImport(datasetKey); resp = nameUsageService.list(null, datasetKey, null, new PagingRequest(0, 100)); assertEquals(16, resp.getResults().size()); for (NameUsage u : resp.getResults()) { if (Origin.SOURCE == u.getOrigin()) { assertEquals(u.getScientificName(), ids.get(u.getKey())); } else { assertFalse("Usage key " + u.getKey() + " existed before", ids.containsKey(u.getKey())); } } // check higher taxa again, wait a little for solr to catch up if (!RealTimeModule.empty(iCfg.solr)) { Thread.sleep(1000); SearchResponse<NameUsageSearchResult, NameUsageSearchParameter> srep = searchService.search(search); List<Facet.Count> facets = srep.getFacets().get(0).getCounts(); for (Facet.Count c : facets) { System.out.println(c); } // make sure the key actually exists! for (Facet.Count c : facets) { System.out.println(c); int key = Integer.valueOf(c.getName()); NameUsage u = nameUsageService.get(key, null); assertNotNull("Higher taxon key "+key+" in solr does not exist in postgres", u); } } } /** * Test richer nomenclatural data, make sure namePublishedIn is set. 
* See bottom comments on http://dev.gbif.org/issues/browse/POR-2480 * See also http://dev.gbif.org/issues/browse/POR-3213 */ @Test public void testIndexFungorumNomen() throws Exception { final UUID datasetKey = NormalizerTest.datasetKey(6); // insert neo db NormalizerStats stats = insertNeo(datasetKey); assertEquals(1, stats.getRoots()); assertEquals(290, stats.getCount()); assertEquals(36, stats.getSynonyms()); // 1st import runImport(datasetKey); // check first name NameUsage u426221 = getUsageByTaxonID(datasetKey, "426221"); assertEquals("Führ. Pilzk. (Zwickau) 136 (1871)", u426221.getPublishedIn()); PagingResponse<NameUsage> resp = nameUsageService.list(null, datasetKey, null, new PagingRequest(0, 200)); NameUsage expected = null; for (NameUsage u : resp.getResults()) { if (u.getIssues().size() > 0) { System.err.println("Found "+u.toString()); System.err.println("Issues are "+u.getIssues()); expected = u; break; } } NameUsage same = nameUsageService.get(expected.getKey(), null); assertEquals(expected, same); } /** * Import a dataset that has basionym & proparte links to not previously imported usages. * We need to post update those foreign keys after all records have been inserted! 
*/ @Test public void testMissingUsageKeys() throws Exception { final UUID datasetKey = NormalizerTest.datasetKey(16); // insert neo db NormalizerStats stats = insertNeo(datasetKey); assertEquals(20, stats.getCount()); // 6+2 pro parte counts assertEquals(8, stats.getSynonyms()); assertEquals(1, stats.getRoots()); // pro parte records stay as one in the normalized neo db assertEquals(0, stats.getCountByOrigin(Origin.PROPARTE)); assertEquals(20, stats.getCountByOrigin(Origin.SOURCE)); // 1st import runImport(datasetKey); // verify verify16(datasetKey); // wait for 2 seconds, we allow a small time difference in old usage deletions Thread.sleep(2000); // 2nd import to make sure sync updates also work fine runImport(datasetKey); // verify verify16(datasetKey); } /** * Builds a small new nub and imports it, making sure the nub specific data gets through fine */ @Test public void testNubImport() throws Exception { // build nub ClasspathSourceList src = ClasspathSourceList.source(3, 2, 15, 16, 51); src.setSourceRank(3, Rank.KINGDOM); openDb(Constants.NUB_DATASET_KEY); NubBuilder nb = NubBuilder.create(dao, src, IdLookupImpl.temp().load(Lists.<LookupUsage>newArrayList()), 10, 100); nb.run(); dao.close(); // import Importer imp = runImport(Constants.NUB_DATASET_KEY); assertEquals(67, imp.getSyncCounter()); // make sure all usages have preassigned keys, not postgres generated ones! 
assertTrue(usageService.maxUsageKey(Constants.NUB_DATASET_KEY) < Constants.NUB_MAXIMUM_KEY); //test issue for 12 Neotetrastichodes flavus Girault, 1913 [synonym SPECIES] CONFLICTING_BASIONYM_COMBINATION NameUsage u = nameUsageService.listByCanonicalName(null, "Neotetrastichodes flavus", null, null).getResults().get(0); assertEquals("Neotetrastichodes flavus Girault, 1913", u.getScientificName()); assertTrue(u.isSynonym()); assertEquals("Aprostocetus rieki (De Santis, 1979)", u.getAccepted()); NameUsage u2 = nameUsageService.listByCanonicalName(null, "Aprostocetus flavus", null, null).getResults().get(0); assertEquals("Aprostocetus flavus (Girault, 1913)", u2.getScientificName()); assertTrue(u2.isSynonym()); assertEquals(u.getAcceptedKey(), u2.getAcceptedKey()); // make sure get does the same as list u2 = nameUsageService.get(u.getKey(), null); assertEquals(u, u2); } /** * http://dev.gbif.org/issues/browse/POR-2755 */ @Test public void testMissingGenusFloraBrazil() throws Exception { final UUID datasetKey = NormalizerTest.datasetKey(19); // insert neo db NormalizerStats stats = insertNeo(datasetKey); assertEquals(3, stats.getRoots()); assertEquals(151, stats.getCount()); assertEquals(62, stats.getSynonyms()); assertEquals(3, stats.getCountByOrigin(Origin.VERBATIM_PARENT)); assertEquals(1, stats.getCountByOrigin(Origin.VERBATIM_ACCEPTED)); assertEquals(60, stats.getCountByOrigin(Origin.MISSING_ACCEPTED)); assertEquals(87, stats.getCountByOrigin(Origin.SOURCE)); // 1st import runImport(datasetKey); assertTrue(usageService.maxUsageKey(datasetKey) > Constants.NUB_MAXIMUM_KEY); } private void verify16(UUID datasetKey) { PagingResponse<NameUsage> resp = nameUsageService.list(null, datasetKey, null, new PagingRequest(0, 100)); // 18 source ones, 2 pro parte assertEquals(20, resp.getResults().size()); int sources = 0; int proparte = 0; Map<String, String> proParteAcceptedNameGenusMap = Maps.newHashMap(); proParteAcceptedNameGenusMap.put("Quertuga occidentalis", "Quertuga"); 
proParteAcceptedNameGenusMap.put("Crepis occidentalis Nutt.", "Crepis"); for (NameUsage u : resp.getResults()) { switch (u.getOrigin()) { case PROPARTE: proparte++; assertEquals("Leontodon occidentalis", u.getScientificName()); assertNotNull(u.getGenusKey()); assertNotNull(u.getAcceptedKey()); assertTrue(u.isSynonym()); assertNotNull(u.getFamilyKey()); assertEquals("Asteraceae", u.getFamily()); assertTrue(proParteAcceptedNameGenusMap.containsKey(u.getAccepted())); assertEquals(proParteAcceptedNameGenusMap.remove(u.getAccepted()), u.getGenus()); break; case SOURCE: sources++; break; default: fail("Bad origin " + u.getOrigin()); } assertNotNull(u.toString(), u.getKingdomKey()); assertNotNull(u.toString(), u.getKingdom()); if (Rank.KINGDOM != u.getRank()) { assertNotNull(u.toString(), u.getFamilyKey()); assertNotNull(u.toString(), u.getFamily()); if (u.isSynonym()) { assertNotNull(u.toString(), u.getAcceptedKey()); assertNotNull(u.toString(), u.getAccepted()); } else { assertNotNull(u.toString(), u.getParentKey()); assertNotNull(u.toString(), u.getParent()); } } } assertEquals(18, sources); assertEquals(2, proparte); assertTrue(proParteAcceptedNameGenusMap.isEmpty()); NameUsage u = getUsageByTaxonID(datasetKey, "1001"); assertNotNull(u.getBasionymKey()); assertEquals("Kreps bakeri DC.", u.getBasionym()); u = getUsageByTaxonID(datasetKey, "1002"); assertNotNull(u.getBasionymKey()); assertEquals("Leontodon occidentalis", u.getBasionym()); assertEquals("Crepis", u.getParent()); assertEquals("Crepis", u.getGenus()); assertEquals("Asteraceae", u.getFamily()); u = getUsageByTaxonID(datasetKey, "1006-s1"); assertTrue(u.isSynonym()); assertNotNull(u.getAcceptedKey()); assertEquals("Leontodon taraxacoides (Vill.) 
Mérat", u.getAccepted()); assertEquals("Leontodon", u.getGenus()); assertEquals("Asteraceae", u.getFamily()); } private NameUsage getUsageByTaxonID(UUID datasetKey, String taxonID) { PagingResponse<NameUsage> resp = nameUsageService.list(null, datasetKey, taxonID, null); assertEquals("More than one usage have the taxonID " + taxonID, 1, resp.getResults().size()); return resp.getResults().get(0); } private Importer runImport(UUID datasetKey) throws SQLException { Importer importer = build(iCfg, datasetKey); importer.run(); return importer; } }