package org.gbif.checklistbank.authorship;

import org.gbif.api.exception.UnparsableException;
import org.gbif.api.model.checklistbank.ParsedName;
import org.gbif.api.service.checklistbank.NameParser;
import org.gbif.api.vocabulary.Rank;
import org.gbif.nameparser.GBIFNameParser;
import org.gbif.utils.file.csv.CSVReader;
import org.gbif.utils.file.csv.CSVReaderFactory;

import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

import com.google.common.collect.Lists;
import com.google.common.io.Resources;
import org.junit.Test;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

/**
 * Another author comparator test that runs over files of names taken from the real GBIF backbone.
 * Each file contains a group of names that share the same terminal epithet within a family.
 * See http://dev.gbif.org/issues/browse/POR-398 for more.
 */
public class BasionymSorterTest {
  private final NameParser parser = new GBIFNameParser();
  private final BasionymSorter sorter = new BasionymSorter();

  /**
   * Parses all given scientific names with the shared parser, keeping the given order.
   *
   * @param rank            rank passed to the parser for every name, may be null
   * @param scientificNames the name strings to parse
   * @return mutable list of parsed names in input order
   * @throws UnparsableException if the parser rejects any of the names
   */
  private List<ParsedName> parseNames(Rank rank, String... scientificNames) throws UnparsableException {
    List<ParsedName> names = Lists.newArrayList();
    for (String scientificName : scientificNames) {
      names.add(parser.parse(scientificName, rank));
    }
    return names;
  }

  /**
   * Groups botanical names sharing the epithet "abbreviata" and expects four basionym groups,
   * each identified by the bracket authorship of its first recombination.
   */
  @Test
  public void testGroupPlantBasionyms() throws Exception {
    List<ParsedName> names = parseNames(null,
        "Gymnolomia microcephala var. abbreviata (B.L.Rob. & Greenm.) B.L.Rob. & Greenm.",
        "Leucheria abbreviata (Bertero) Steud.",
        "Centaurea phrygia subsp. abbreviata (K. Koch) Dostál",
        "Centaurea abbreviata (K.Koch) Hand.-Mazz.",
        "Jacea abbreviata (K.Koch) Soják",
        "Artemisia abbreviata (Krasch. ex Korobkov) Krasnob.",
        "Artemisia lagopus subsp. abbreviata Krasch. ex Korobkov",
        "Bigelowia leiosperma var. abbreviata M.E.Jones",
        "Brickellia oblongifolia var. abbreviata A.Gray",
        "Calea abbreviata Pruski & Urbatsch",
        "Centaurea salicifolia subsp. abbreviata K. Koch",
        "Chabraea abbreviata Colla ex Bertero",
        "Chaetanthera stuebelii Hieron. var. abbreviata Cabrera",
        "Conyza abbreviata Wall.",
        "Cousinia abbreviata Tscherneva",
        "Gymnolomia patens var. abbreviata B.L.Rob. & Greenm.",
        "Gynura abbreviata F.G.Davies",
        "Jacea abbreviata subsp. abbreviata",
        "Nassauvia abbreviata Dusén",
        "Nassauvia abbreviata var. abbreviata",
        "Scorzonera latifolia var. abbreviata Lipsch.",
        "Vernonia abbreviata DC."
    );

    Collection<BasionymGroup<ParsedName>> groups = sorter.groupBasionyms(names);
    assertEquals(4, groups.size());
    for (BasionymGroup<ParsedName> g : groups) {
      assertFalse(g.getRecombinations().isEmpty());
      switch (g.getRecombinations().get(0).getBracketAuthorship()) {
        case "B.L.Rob. & Greenm.":
          assertEquals(1, g.getRecombinations().size());
          assertNotNull(g.getBasionym());
          break;
        case "Bertero":
          assertEquals(1, g.getRecombinations().size());
          assertNotNull(g.getBasionym());
          break;
        case "K. Koch":
          assertEquals(3, g.getRecombinations().size());
          assertNotNull(g.getBasionym());
          break;
        case "Krasch. ex Korobkov":
          assertEquals(1, g.getRecombinations().size());
          assertNotNull(g.getBasionym());
          break;
        default:
          fail("Unknown basionym group " + g.getRecombinations().get(0));
      }
    }
  }

  /**
   * Here we have a real case from the Asteraceae where 2 different authors with the same surname exist.
   * A.Nelson and E.E.Nelson must be kept separate!
   * http://kiki.huh.harvard.edu/databases/botanist_search.php?botanistid=628
   * http://kiki.huh.harvard.edu/databases/botanist_search.php?botanistid=519
   */
  @Test
  public void testGroupPlantBasionyms2() throws Exception {
    List<ParsedName> names = parseNames(null,
        "Triniteurybia aberrans (A. Nelson) Brouillet, Urbatsch & R.P. Roberts",
        "Haplopappus aberrans (A.Nelson) H.M.Hall",
        "Sideranthus aberrans (A.Nelson) Rydb.",
        "Tonestus aberrans (A.Nelson) G.L.Nesom & D.R.Morgan",
        "Hysterionica aberrans (Cabrera) Cabrera",
        "Antennaria luzuloides ssp. aberrans (E.E. Nelson) Bayer & Stebbins",
        "Logfia aberrans (Wagenitz) Anderb.",
        "Antennaria argentea subsp. aberrans",
        "Filago aberrans Wagenitz",
        "Hysterionica aberrans var. aberrans",
        "Hysterionica bakeri var. aberrans Cabrera",
        "Macronema aberrans A.Nelson",
        "Senecio aberrans Greenm.",
        "Taraxacum aberrans Hagend. & al."
    );

    Collection<BasionymGroup<ParsedName>> groups = sorter.groupBasionyms(names);
    assertEquals(4, groups.size());
    for (BasionymGroup<ParsedName> g : groups) {
      assertFalse(g.getRecombinations().isEmpty());
      switch (g.getRecombinations().get(0).getBracketAuthorship()) {
        case "A. Nelson":
          assertEquals(4, g.getRecombinations().size());
          assertNotNull(g.getBasionym());
          break;
        case "Cabrera":
          assertEquals(1, g.getRecombinations().size());
          assertNotNull(g.getBasionym());
          break;
        case "E.E. Nelson":
          // no original name by E.E. Nelson is in the list, so no basionym is expected
          assertEquals(1, g.getRecombinations().size());
          assertNull(g.getBasionym());
          break;
        case "Wagenitz":
          assertEquals(1, g.getRecombinations().size());
          assertNotNull(g.getBasionym());
          break;
        default:
          fail("Unknown basionym group " + g.getRecombinations().get(0));
      }
    }
  }

  /**
   * Expects differently spelled bracket authorships (G.Kirchn., Kirchner, Booth ex ...) to end up
   * in one single group with six recombinations and the "in" authored name as basionym.
   */
  @Test
  public void testGroupPlantBasionyms3() throws Exception {
    List<ParsedName> names = parseNames(null,
        "Negundo aceroides subsp. violaceus (G.Kirchn.) W.A.Weber",
        "Negundo aceroides subsp. violaceus (Kirchner) W.A. Weber",
        "Negundo aceroides subsp. violaceum (Booth ex G.Kirchn.) Holub",
        "Negundo aceroides subsp. violaceum (Booth ex Kirchner) Holub",
        "Negundo aceroides var. violaceum G.Kirchn. in Petzold & G.Kirchn.",
        "Acer violaceum (Kirchner) Simonkai",
        "Acer negundo var. violaceum (G. Kirchn.) H. Jaeger"
    );

    Collection<BasionymGroup<ParsedName>> groups = sorter.groupBasionyms(names);
    assertEquals(1, groups.size());
    BasionymGroup<ParsedName> g = groups.iterator().next();
    assertFalse(g.getRecombinations().isEmpty());
    assertEquals(6, g.getRecombinations().size());
    assertNotNull(g.getBasionym());
    assertEquals("G.Kirchn. in Petzold & G.Kirchn.", g.getBasionym().authorshipComplete());
  }

  /**
   * Expects authors that differ in their initials or abbreviation to be kept in separate groups.
   */
  @Test
  public void testGroupWithDifferentInitials() throws Exception {
    List<ParsedName> names = parseNames(null,
        "Negundo aceroides subsp. violaceum (Booth ex G.Kirchn.) Holub",
        "Negundo aceroides subsp. violaceum (Booth ex Kirchn.) Holub",
        "Negundo aceroides var. violaceum G.Kirchn. in Petzold & G.Kirchn.",
        "Acer violaceum (T.Kirchn.) Simonkai",
        "Acer negundo var. violaceum (G. Kirchn.) H. Jaeger"
    );

    Collection<BasionymGroup<ParsedName>> groups = sorter.groupBasionyms(names);
    assertEquals(3, groups.size());
    for (BasionymGroup<ParsedName> g : groups) {
      assertFalse(g.getRecombinations().isEmpty());
      switch (g.getRecombinations().get(0).getBracketAuthorship()) {
        case "Booth ex G.Kirchn.":
          assertEquals(2, g.getRecombinations().size());
          assertNotNull(g.getBasionym());
          break;
        case "T.Kirchn.":
          // author comparison has to be very strict and must treat different initials as relevant
          assertEquals(1, g.getRecombinations().size());
          assertNull(g.getBasionym());
          break;
        case "Booth ex Kirchn.":
          // Kirchn. is the abbreviation for Emil Otto Oskar Kirchner
          assertEquals(1, g.getRecombinations().size());
          assertNull(g.getBasionym());
          break;
        default:
          fail("Unknown basionym group " + g.getRecombinations().get(0));
      }
    }
  }

  /**
   * Expects author teams written with different separators and initials to be grouped together.
   */
  @Test
  public void testGroupAuthorTeams() throws Exception {
    List<ParsedName> names = parseNames(null,
        "Negundo aceroides var. californicum (Torr. & A.Gray) Sarg.",
        "Acer negundo var. californicum (Torr. & Gray) Sarg.",
        "Acer californicum Torr et Gray"
    );

    Collection<BasionymGroup<ParsedName>> groups = sorter.groupBasionyms(names);
    assertEquals(1, groups.size());
    BasionymGroup<ParsedName> g = groups.iterator().next();
    assertEquals(2, g.getRecombinations().size());
    assertEquals("Acer californicum Torr et Gray", g.getBasionym().getScientificName());
  }

  /**
   * Expects a zoological recombination without a year to be matched to its basionym with a year.
   */
  @Test
  public void testAtrocincta() throws Exception {
    List<ParsedName> names = parseNames(Rank.SPECIES,
        "Anthophora atrocincta Lepeletier, 1841",
        "Amegilla atrocincta (Lepeletier)"
    );

    Collection<BasionymGroup<ParsedName>> groups = sorter.groupBasionyms(names);
    assertEquals(1, groups.size());
    BasionymGroup<ParsedName> g = groups.iterator().next();
    assertEquals(1, g.getRecombinations().size());
    assertEquals("Anthophora atrocincta Lepeletier, 1841", g.getBasionym().getScientificName());
  }

  /**
   * Expects two recombinations without a year to be matched to the single basionym with a year.
   */
  @Test
  public void testPlumipes() throws Exception {
    List<ParsedName> names = parseNames(Rank.SPECIES,
        "Anthophora plumipes (Fabricius)",
        "Apis plumipes Fabricius, 1781",
        "Centris plumipes (Fabricius)"
    );

    Collection<BasionymGroup<ParsedName>> groups = sorter.groupBasionyms(names);
    assertEquals(1, groups.size());
    BasionymGroup<ParsedName> g = groups.iterator().next();
    assertEquals(2, g.getRecombinations().size());
    assertEquals("Apis plumipes Fabricius, 1781", g.getBasionym().getScientificName());
  }

  /**
   * Test what happens if a group contains 2 or more basionyms.
   */
  @Test
  public void testMultipleBasionyms() throws Exception {
    List<ParsedName> names = parseNames(null,
        "Negundo violaceum G.Kirchn.",
        "Negundo aceroides var. violaceum G.Kirchn. in Petzold & G.Kirchn.",
        "Acer violaceum (G Kirchn.) Simonkai",
        "Acer negundo var. violaceum (G. Kirchn.) H. Jaeger"
    );

    Collection<BasionymGroup<ParsedName>> groups = sorter.groupBasionyms(names);
    assertTrue(groups.isEmpty());
  }

  /**
   * Expects zoological names with author and year in brackets to be grouped with the original name.
   */
  @Test
  public void testGroupAnimalBasionyms() throws Exception {
    List<ParsedName> names = parseNames(null,
        "Microtus parvulus (A. H. Howell, 1916)",
        "Microtus pinetorum parvulus (A. H. Howell, 1916)",
        "Pitymys parvulus A. H. Howell, 1916"
    );

    Collection<BasionymGroup<ParsedName>> groups = sorter.groupBasionyms(names);
    assertEquals(1, groups.size());
    BasionymGroup<ParsedName> g = groups.iterator().next();
    assertEquals(2, g.getRecombinations().size());
    assertNotNull(g.getBasionym());
    assertEquals("A. H. Howell", g.getBasionym().getAuthorship());
    assertEquals("1916", g.getBasionym().getYear());
  }

  /**
   * Several original names by the same author but with different years exist here.
   * Only the one published within a year of the recombination is expected to be its basionym.
   */
  @Test
  public void testGroupAnimalBasionyms2() throws Exception {
    List<ParsedName> names = parseNames(null,
        "Heliodoxa rubinoides aequatorialis (Gould, 1860)",
        "Androdon aequatorialis Gould, 1863",
        "Clementoron aequatorialis Gould, 1864",
        // this one is 1 year apart so it matches the first recombination on top!
        "Campylopterus largipennis aequatorialis Gould, 1861"
    );

    Collection<BasionymGroup<ParsedName>> groups = sorter.groupBasionyms(names);
    // several candidate original names exist, but exactly one group is expected
    assertEquals(1, groups.size());
    BasionymGroup<ParsedName> bg = groups.iterator().next();
    assertEquals("aequatorialis", bg.getEpithet());
    assertEquals("1861", bg.getBasionym().getYear());
    assertEquals("aequatorialis", bg.getBasionym().getInfraSpecificEpithet());
    assertEquals("Gould", bg.getAuthorship());
  }

  /**
   * Runs the sorter over entire name files and checks that the total number of basionym groups
   * found per file stays within an expected range.
   *
   * create test files from current nub with this SQL:
   * \copy (select coalesce(infra_specific_epithet, specific_epithet) as epi, scientific_name from name_usage u join name n on name_fk=n.id where u.dataset_key='d7dddbf4-2cf0-4f39-9b2a-bb099caae36c' and u.family_fk=5386 order by 1,2) to 'fabaceae.txt'
   * \copy (select coalesce(infra_specific_epithet, specific_epithet) as epi, scientific_name from name_usage u join name n on name_fk=n.id where u.dataset_key='d7dddbf4-2cf0-4f39-9b2a-bb099caae36c' and u.family_fk=3065 order by 1,2) to 'asteraceae.txt'
   * \copy (select coalesce(infra_specific_epithet, specific_epithet) as epi, scientific_name from name_usage u join name n on name_fk=n.id where u.dataset_key='d7dddbf4-2cf0-4f39-9b2a-bb099caae36c' and u.family_fk=212 order by 1,2) to 'aves.txt'
   * \copy (select coalesce(infra_specific_epithet, specific_epithet) as epi, scientific_name from name_usage u join name n on name_fk=n.id where u.dataset_key='d7dddbf4-2cf0-4f39-9b2a-bb099caae36c' and u.family_fk=5719 order by 1,2) to 'molossidae.txt'
   * \copy (select coalesce(infra_specific_epithet, specific_epithet) as epi, scientific_name from name_usage u join name n on name_fk=n.id where u.dataset_key='d7dddbf4-2cf0-4f39-9b2a-bb099caae36c' and u.family_fk=5510 order by 1,2) to 'muridae.txt'
   */
  @Test
  public void testGroupBasionymFiles() throws Exception {
    assertInRange(70, 90, "molossidae.txt");
    assertInRange(450, 470, "muridae.txt");
    assertInRange(10070, 10080, "aves.txt");
    assertInRange(14700, 14760, "fabaceae.txt");
    assertInRange(22650, 22725, "asteraceae.txt");
  }

  /**
   * Asserts that the number of basionym groups found in the given names file lies within
   * the inclusive range [min, max].
   *
   * @param min      smallest accepted number of basionym groups
   * @param max      largest accepted number of basionym groups
   * @param filename file name inside the "names/" classpath folder
   */
  private void assertInRange(int min, int max, String filename) throws Exception {
    int count = testGroupBasionymFile("names/" + filename);
    assertTrue(filename + " with too little basionym groups", min <= count);
    assertTrue(filename + " with too many basionym groups", max >= count);
  }

  /**
   * Groups every epithet group of the given classpath resource and sums up the basionym groups.
   *
   * @param filename classpath resource with tab separated rows of epithet and scientific name
   * @return total number of basionym groups found across all epithet groups in the file
   */
  private int testGroupBasionymFile(String filename) throws Exception {
    int epithets = 0;
    int basionyms = 0;
    // close the resource stream once all groups are read instead of leaking the handle
    try (InputStream stream = Resources.getResource(filename).openStream()) {
      EpithetGroupIterator iter = new EpithetGroupIterator(stream);
      while (iter.hasNext()) {
        List<ParsedName> names = iter.next();
        Collection<BasionymGroup<ParsedName>> groups = sorter.groupBasionyms(names);
        epithets++;
        basionyms += groups.size();
      }
    }
    System.out.println("\n\n" + basionyms + " basionym groups found in " + epithets + " epithet groups for file " + filename);
    return basionyms;
  }

  /**
   * Iterates over a tab separated names stream and returns the parsed names in lists,
   * one list per run of consecutive rows that share the same epithet in the first column.
   * Not a static class as it uses the name parser of the enclosing test instance.
   */
  class EpithetGroupIterator implements Iterator<List<ParsedName>> {
    private final CSVReader reader;
    // the group handed out by the following next() call, null once the input is exhausted
    private List<ParsedName> next;
    // first row of the following group, read while detecting the end of the previous group
    private String[] lastRow = null;

    EpithetGroupIterator(InputStream names) throws IOException {
      reader = CSVReaderFactory.buildTabReader(names, "UTF8", 0);
      next = readNextGroup();
    }

    @Override
    public boolean hasNext() {
      return next != null;
    }

    @Override
    public List<ParsedName> next() {
      if (next == null) {
        throw new NoSuchElementException();
      }
      List<ParsedName> curr = next;
      next = readNextGroup();
      return curr;
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }

    /**
     * Reads rows until the epithet changes or the input ends.
     *
     * @return the parsed names of the next epithet group or null if no more names exist
     */
    private List<ParsedName> readNextGroup() {
      List<ParsedName> names = Lists.newArrayList();
      String epithet = null;
      while (reader.hasNext() || lastRow != null) {
        String[] row;
        if (lastRow != null) {
          // continue with the row that ended the previous group
          row = lastRow;
          lastRow = null;
        } else {
          row = reader.next();
        }
        if (row == null || row.length < 2 || row[1].startsWith("?")) {
          // ignore basionym placeholders (?)
          continue;
        }
        try {
          ParsedName p = parser.parse(row[1], null);
          if (epithet != null && !epithet.trim().equalsIgnoreCase(row[0])) {
            // a new group, store this row for next call
            lastRow = row;
            return names.isEmpty() ? null : names;
          }
          if (epithet == null) {
            epithet = row[0];
          }
          names.add(p);
        } catch (UnparsableException e) {
          // skip names of a type that is not parsable, but fail on names that should have parsed
          if (e.type.isParsable()) {
            throw new RuntimeException(e);
          }
        }
      }
      // input exhausted: hand out the last group instead of silently dropping it.
      // NOTE(review): this adds the final epithet group of each file to the counts checked in
      // testGroupBasionymFiles - confirm the expected ranges still hold.
      return names.isEmpty() ? null : names;
    }
  }
}