package org.gbif.dwca.io; import org.gbif.dwc.terms.DcTerm; import org.gbif.dwc.terms.DwcTerm; import org.gbif.dwc.terms.GbifTerm; import org.gbif.dwc.terms.Term; import org.gbif.dwc.terms.TermFactory; import org.gbif.dwca.record.Record; import org.gbif.dwca.record.StarRecord; import org.gbif.utils.file.FileUtils; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import com.google.common.base.Joiner; import com.google.common.collect.Maps; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; public class DwcaWriterTest { private static Logger LOG = LoggerFactory.getLogger(DwcaWriterTest.class); @Test(expected = IllegalStateException.class) public void testAddingCoreIdTermTwice() throws Exception { File dwcaDir = FileUtils.createTempDir(); dwcaDir.deleteOnExit(); DwcaWriter writer = new DwcaWriter(DwcTerm.Taxon, DwcTerm.taxonID, dwcaDir, true); writer.newRecord("dummy1"); writer.addCoreColumn(DwcTerm.taxonID, "dummy1"); } @Test public void testHeaders1() throws Exception { File dwcaDir = FileUtils.createTempDir(); dwcaDir.deleteOnExit(); DwcaWriter writer = new DwcaWriter(DwcTerm.Taxon, dwcaDir, true); writer.newRecord("dummy1"); writer.addCoreColumn(DwcTerm.parentNameUsageID); writer.addCoreColumn(DwcTerm.acceptedNameUsageID); writer.newRecord("dummy2"); writer.addCoreColumn(DwcTerm.parentNameUsageID); writer.addCoreColumn(DwcTerm.acceptedNameUsageID); } @Test(expected = IllegalStateException.class) public void testHeaders2() throws Exception { File dwcaDir = FileUtils.createTempDir(); dwcaDir.deleteOnExit(); DwcaWriter writer = new DwcaWriter(DwcTerm.Taxon, dwcaDir, true); writer.newRecord("dummy1"); writer.addCoreColumn(DwcTerm.parentNameUsageID); writer.addCoreColumn(DwcTerm.acceptedNameUsageID); writer.newRecord("dummy2"); writer.addCoreColumn(DwcTerm.scientificName); } @Test(expected = IllegalStateException.class) public void testHeaders3() throws Exception { File dwcaDir = FileUtils.createTempDir(); dwcaDir.deleteOnExit(); DwcaWriter writer = new DwcaWriter(DwcTerm.Taxon, dwcaDir, true); writer.newRecord("dummy1"); writer.addCoreColumn(DwcTerm.parentNameUsageID); writer.addCoreColumn(DwcTerm.acceptedNameUsageID); // define extension columns Map<Term, String> eData = Maps.newHashMap(); eData.put(DwcTerm.locality, "locality1"); eData.put(DwcTerm.occurrenceStatus, "present"); writer.addExtensionRecord(GbifTerm.Distribution, eData); eData.put(DwcTerm.establishmentMeans, "alien"); writer.addExtensionRecord(GbifTerm.Distribution, eData); } @Test public void testHeaderWriting() throws Exception { File dwcaDir = FileUtils.createTempDir(); dwcaDir.deleteOnExit(); LOG.info("Test archive writer in {}", dwcaDir.getAbsolutePath()); DwcaWriter writer = new DwcaWriter(DwcTerm.Taxon, dwcaDir, true); writer.newRecord("dummy1"); writer.addCoreColumn(DwcTerm.parentNameUsageID); writer.addCoreColumn(DwcTerm.acceptedNameUsageID); writer.addCoreColumn(DwcTerm.scientificName); writer.addCoreColumn(GbifTerm.canonicalName); writer.addCoreColumn(DwcTerm.taxonRank, "species"); writer.addCoreColumn(DwcTerm.taxonomicStatus); writer.addCoreColumn(DwcTerm.kingdom); writer.addCoreColumn(DwcTerm.phylum); writer.addCoreColumn(DwcTerm.class_); writer.addCoreColumn(DwcTerm.order); writer.addCoreColumn(DwcTerm.family); writer.addCoreColumn(GbifTerm.depth); writer.addCoreColumn(GbifTerm.depthAccuracy); writer.newRecord("dummy2"); writer.addCoreColumn(DwcTerm.kingdom, "Plantae"); writer.addCoreColumn(DwcTerm.phylum); writer.addCoreColumn(DwcTerm.class_); writer.addCoreColumn(DwcTerm.order); writer.addCoreColumn(DwcTerm.family, "Asteraceae"); writer.newRecord("dummy3"); writer.addCoreColumn(GbifTerm.depth, "2"); writer.addCoreColumn(GbifTerm.depthAccuracy, "1"); // define extension columns Map<Term, String> eData = Maps.newHashMap(); // distributions eData.put(DwcTerm.locality, "locality1"); eData.put(DwcTerm.occurrenceStatus, "present"); eData.put(DwcTerm.establishmentMeans, "alien"); writer.addExtensionRecord(GbifTerm.Distribution, eData); eData.put(DwcTerm.locality, "locality2"); writer.addExtensionRecord(GbifTerm.Distribution, eData); writer.close(); File cf = new File(dwcaDir, writer.getDataFiles().get(DwcTerm.Taxon)); File df = new File(dwcaDir, writer.getDataFiles().get(GbifTerm.Distribution)); // check if taxon file contains headers String[] headers = getFirstRow(cf); LOG.debug(Joiner.on("; ").useForNull("NULL").join(headers)); assertEquals(14, headers.length); assertEquals("taxonID", headers[0]); assertEquals("parentNameUsageID", headers[1]); assertEquals("kingdom", headers[7]); // check if extension file contains headers headers = getFirstRow(df); LOG.debug(Joiner.on("; ").useForNull("NULL").join(headers)); assertEquals(4, headers.length); assertEquals("taxonID", headers[0]); } private String[] getFirstRow(File f) throws IOException { BufferedReader r = FileUtils.getUtf8Reader(f); String firstRow = r.readLine(); return firstRow.split("\t"); } @Test public void testRoundtrip() throws Exception { try { TermFactory termFactory = TermFactory.instance(); // read taxon archive Archive arch = ArchiveFactory.openArchive(FileUtils.getClasspathFile("archive-dwc")); assertEquals(2, arch.getExtensions().size()); int coreRecords = 0; int allRecords = 0; // write taxon archive File tempArch = FileUtils.createTempDir(); tempArch.deleteOnExit(); System.out.println("Writing temporary test archive to " + tempArch.getAbsolutePath()); DwcaWriter writer = new DwcaWriter(DwcTerm.Taxon, tempArch); for (StarRecord rec : arch) { // core coreRecords++; allRecords += rec.size(); writer.newRecord(rec.core().id()); for (Term term : arch.getCore().getTerms()) { writer.addCoreColumn(term, rec.core().value(term)); } // extensions for (Term rt : rec.extensions().keySet()) { ArchiveFile af = arch.getExtension(rt); // iterate over records for one extension for (Record row : rec.extension(rt)) { writer.addExtensionRecord(rt, DwcaWriter.recordToMap(row, af)); } } } writer.close(); // reread and compare Archive arch2 = ArchiveFactory.openArchive(tempArch); int coreRecords2 = 0; int allRecords2 = 0; for (StarRecord rec : arch2) { // core coreRecords2++; allRecords2 += rec.size(); } // compare assertEquals(coreRecords, coreRecords2); assertEquals(allRecords, allRecords2); } catch (Exception e) { e.printStackTrace(); fail(); } } @Test public void testWriterUsingCoreIdTerm() throws Exception { File dwcaDir = FileUtils.createTempDir(); dwcaDir.deleteOnExit(); LOG.info("Test archive writer in {}", dwcaDir.getAbsolutePath()); DwcaWriter writer = new DwcaWriter(DwcTerm.Taxon, DwcTerm.taxonID, dwcaDir, true); writer.newRecord("dummy1"); writer.addCoreColumn(DwcTerm.parentNameUsageID); writer.addCoreColumn(DwcTerm.acceptedNameUsageID); writer.close(); Archive arch = ArchiveFactory.openArchive(dwcaDir); Iterator<Record> recIt = arch.getCore().iterator(); Record firstRecord = recIt.next(); assertEquals("dummy1", firstRecord.id()); assertEquals("dummy1", firstRecord.value(DwcTerm.taxonID)); } /** * Test the writing of an archive that includes some default values in the core and in one extension. * * @throws Exception */ @Test public void testWriterUsingDefaultValues() throws Exception { File dwcaDir = FileUtils.createTempDir(); dwcaDir.deleteOnExit(); LOG.info("Test archive writer in {}", dwcaDir.getAbsolutePath()); DwcaWriter writer = new DwcaWriter(DwcTerm.Taxon, DwcTerm.taxonID, dwcaDir, true); writer.newRecord("dummy1"); writer.addCoreColumn(DwcTerm.parentNameUsageID, "1"); writer.addCoreColumn(DwcTerm.acceptedNameUsageID, "2"); writer.addCoreColumn(DwcTerm.countryCode); // add a VernacularName extension record Map<Term,String> extensionRecord = new HashMap<Term, String>(); extensionRecord.put(DwcTerm.vernacularName, "Komodo Dragon"); extensionRecord.put(DcTerm.language, null); writer.addExtensionRecord(GbifTerm.VernacularName, extensionRecord); writer.addCoreDefaultValue(DwcTerm.collectionCode, "A2Z"); writer.addCoreDefaultValue(DwcTerm.countryCode, "CA"); writer.addDefaultValue(GbifTerm.VernacularName, DcTerm.language, "en"); // add a second records and overwrite the default value writer.newRecord("dummy2"); writer.addCoreColumn(DwcTerm.parentNameUsageID, "1"); writer.addCoreColumn(DwcTerm.acceptedNameUsageID, "2"); writer.addCoreColumn(DwcTerm.countryCode, "ID"); // add a VernacularName extension record extensionRecord = new HashMap<Term, String>(); extensionRecord.put(DwcTerm.vernacularName, "Varano De Komodo"); extensionRecord.put(DcTerm.language, "es"); writer.addExtensionRecord(GbifTerm.VernacularName, extensionRecord); writer.close(); // validate core content Archive arch = ArchiveFactory.openArchive(dwcaDir); Iterator<Record> recIt = arch.getCore().iterator(); Record firstRecord = recIt.next(); assertEquals("dummy1", firstRecord.id()); assertEquals("dummy1", firstRecord.value(DwcTerm.taxonID)); assertEquals("A2Z", firstRecord.value(DwcTerm.collectionCode)); assertEquals("CA", firstRecord.value(DwcTerm.countryCode)); assertEquals("A2Z", arch.getCore().getField(DwcTerm.collectionCode).getDefaultValue()); assertEquals("CA", arch.getCore().getField(DwcTerm.countryCode).getDefaultValue()); Record secondRecord = recIt.next(); assertEquals("dummy2", secondRecord.id()); assertEquals("dummy2", secondRecord.value(DwcTerm.taxonID)); assertEquals("A2Z", secondRecord.value(DwcTerm.collectionCode)); assertEquals("ID", secondRecord.value(DwcTerm.countryCode)); // validate extension content Iterator<Record> extRecIt = arch.getExtension(GbifTerm.VernacularName).iterator(); assertEquals("en", arch.getExtension(GbifTerm.VernacularName).getField(DcTerm.language).getDefaultValue()); firstRecord = extRecIt.next(); assertEquals("dummy1", firstRecord.id()); assertEquals("en", firstRecord.value(DcTerm.language)); secondRecord = extRecIt.next(); assertEquals("dummy2", secondRecord.id()); assertEquals("es", secondRecord.value(DcTerm.language)); } }