package org.gbif.occurrence.processor; import org.gbif.api.model.crawler.DwcaValidationReport; import org.gbif.api.model.crawler.OccurrenceValidationReport; import org.gbif.api.model.occurrence.Occurrence; import org.gbif.api.vocabulary.BasisOfRecord; import org.gbif.api.vocabulary.Country; import org.gbif.api.vocabulary.EndpointType; import org.gbif.api.vocabulary.MediaType; import org.gbif.api.vocabulary.OccurrenceIssue; import org.gbif.api.vocabulary.OccurrenceSchemaType; import org.gbif.api.vocabulary.TypeStatus; import org.gbif.common.messaging.ConnectionParameters; import org.gbif.common.messaging.DefaultMessagePublisher; import org.gbif.common.messaging.MessageListener; import org.gbif.common.messaging.api.MessagePublisher; import org.gbif.common.messaging.api.messages.OccurrenceFragmentedMessage; import org.gbif.dwc.terms.DwcTerm; import org.gbif.dwc.terms.GbifTerm; import org.gbif.occurrence.persistence.api.FragmentPersistenceService; import org.gbif.occurrence.persistence.api.OccurrenceKeyPersistenceService; import org.gbif.occurrence.persistence.api.OccurrencePersistenceService; import org.gbif.occurrence.processor.guice.ApiClientConfiguration; import org.gbif.occurrence.processor.interpreting.CoordinateInterpreter; import org.gbif.occurrence.processor.interpreting.LocationInterpreter; import org.gbif.occurrence.processor.interpreting.DatasetInfoInterpreter; import org.gbif.occurrence.processor.interpreting.OccurrenceInterpreter; import org.gbif.occurrence.processor.interpreting.TaxonomyInterpreter; import org.gbif.occurrence.processor.messaging.FragmentPersistedListener; import org.gbif.occurrence.processor.messaging.OccurrenceFragmentedListener; import org.gbif.occurrence.processor.messaging.VerbatimPersistedListener; import org.gbif.occurrence.processor.zookeeper.ZookeeperConnector; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.util.Calendar; import java.util.TimeZone; import java.util.UUID; import java.util.concurrent.TimeUnit; import com.google.common.io.Resources; import org.apache.commons.io.Charsets; import org.apache.curator.framework.CuratorFramework; import org.apache.curator.framework.CuratorFrameworkFactory; import org.apache.curator.retry.RetryNTimes; import org.apache.curator.test.TestingServer; import org.junit.After; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Ignore; import org.junit.Test; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; @Ignore("requires live webservices and messaging") public class OccurrenceProcessorIT { private final OccurrenceKeyPersistenceService occurrenceKeyService = new OccurrenceKeyPersistenceServiceMock(); private final FragmentPersistenceService fragmentPersister = new FragmentPersistenceServiceMock(occurrenceKeyService); private FragmentProcessor fragmentProcessor; private VerbatimProcessor verbatimProcessor; private InterpretedProcessor interpretedProcessor; private OccurrenceFragmentedListener occurrenceFragmentedListener; private FragmentPersistedListener fragmentPersistedListener; private VerbatimPersistedListener verbatimPersistedListener; private final OccurrencePersistenceService occurrenceService = new OccurrencePersistenceServiceMock(fragmentPersister); private MessageListener messageListener; private MessagePublisher messagePublisher; private TestingServer zkServer; private CuratorFramework curator; private ZookeeperConnector zookeeperConnector; private static String abcd206Single; private static String abcd206Multi; private static String dwc14; private static String dwc14_modified; private static String dwcaSingle; private static final String BGBM_KEY = "57254bd0-8256-11d8-b7ed-b8a03c50a862"; private static final String BOGART_DATASET_KEY = "85697f04-f762-11e1-a439-00145eb45e9a"; private static final String PONTAURUS_DATASET_KEY = "8575f23e-f762-11e1-a439-00145eb45e9a"; @BeforeClass public static void preClass() throws IOException { abcd206Single = Resources.toString(Resources.getResource("abcd206_single.xml"), Charsets.UTF_8); abcd206Multi = Resources.toString(Resources.getResource("abcd206_multi.xml"), Charsets.UTF_8); dwc14 = Resources.toString(Resources.getResource("dwc14.xml"), Charsets.UTF_8); dwc14_modified = Resources.toString(Resources.getResource("dwc14_modified.xml"), Charsets.UTF_8); dwcaSingle = Resources.toString(Resources.getResource("fragment.json"), Charsets.UTF_8); } @Before public void setUp() throws Exception { ApiClientConfiguration cfg = new ApiClientConfiguration();; cfg.url = URI.create("http://api.gbif-dev.org/v1/"); ConnectionParameters connectionParams = new ConnectionParameters("localhost", 5672, "guest", "guest", "/"); messagePublisher = new DefaultMessagePublisher(connectionParams); messageListener = new MessageListener(connectionParams); zkServer = new TestingServer(); curator = CuratorFrameworkFactory.builder().connectString(zkServer.getConnectString()).namespace("crawlertest") .retryPolicy(new RetryNTimes(1, 1000)).build(); curator.start(); zookeeperConnector = new ZookeeperConnector(curator); long now = System.currentTimeMillis(); fragmentProcessor = new FragmentProcessor(fragmentPersister, occurrenceKeyService, messagePublisher, zookeeperConnector); occurrenceFragmentedListener = new OccurrenceFragmentedListener(fragmentProcessor); messageListener.listen("occ_frag_test_" + now, 1, occurrenceFragmentedListener); verbatimProcessor = new VerbatimProcessor(fragmentPersister, occurrenceService, messagePublisher, zookeeperConnector); fragmentPersistedListener = new FragmentPersistedListener(verbatimProcessor); messageListener.listen("frag_persisted_test_" + now, 1, fragmentPersistedListener); interpretedProcessor = new InterpretedProcessor( new OccurrenceInterpreter(new DatasetInfoInterpreter(cfg.newApiClient()), new TaxonomyInterpreter(cfg.newApiClient()), new LocationInterpreter(new CoordinateInterpreter(cfg.newApiClient()))), fragmentPersister, occurrenceService, messagePublisher, zookeeperConnector ); verbatimPersistedListener = new VerbatimPersistedListener(interpretedProcessor); messageListener.listen("verb_persisted_test_" + now, 1, verbatimPersistedListener); } @After public void tearDown() throws IOException { messageListener.close(); zkServer.stop(); curator.close(); } @Test public void testEndToEndDwca() throws IOException, InterruptedException, URISyntaxException { UUID datasetKey = UUID.fromString(PONTAURUS_DATASET_KEY); OccurrenceSchemaType xmlSchema = OccurrenceSchemaType.DWCA; Integer crawlId = 1; DwcaValidationReport report = new DwcaValidationReport(datasetKey, new OccurrenceValidationReport(1, 1, 0, 1, 0, true)); OccurrenceFragmentedMessage msg = new OccurrenceFragmentedMessage(datasetKey, crawlId, dwcaSingle.getBytes(), xmlSchema, EndpointType.DWC_ARCHIVE, report); messagePublisher.send(msg); TimeUnit.MILLISECONDS.sleep(5000); Occurrence got = occurrenceService.get(1); assertNotNull(got); assertEquals("BGBM", got.getVerbatimField(DwcTerm.institutionCode)); assertEquals("Pontaurus", got.getVerbatimField(DwcTerm.collectionCode)); assertEquals("988", got.getVerbatimField(DwcTerm.catalogNumber)); assertEquals(datasetKey, got.getDatasetKey()); // note: this is set here inside the occ project, but from a ws call the serializer will omit these 'superseded' terms assertEquals("Verbascum cheiranthifolium var. cheiranthifolium", got.getVerbatimField(DwcTerm.scientificName)); assertEquals("Verbascum cheiranthifolium var. cheiranthifolium", got.getScientificName()); assertEquals(37.421230, got.getDecimalLatitude().doubleValue(), 0.000001); assertEquals(34.568123, got.getDecimalLongitude().doubleValue(), 0.000001); assertEquals(Country.fromIsoCode("TR"), got.getCountry()); Calendar c = Calendar.getInstance(TimeZone.getTimeZone("UTC")); c.set(1999, 6, 30); c.set(Calendar.HOUR_OF_DAY, 0); c.set(Calendar.MINUTE, 0); c.set(Calendar.SECOND, 0); c.set(Calendar.MILLISECOND, 0); assertEquals(c.getTime(), got.getEventDate()); assertEquals(BasisOfRecord.PRESERVED_SPECIMEN, got.getBasisOfRecord()); assertEquals("Markus Döring", got.getVerbatimField(DwcTerm.identifiedBy)); assertEquals(BGBM_KEY, got.getPublishingOrgKey().toString()); assertEquals(Country.GERMANY, got.getPublishingCountry()); assertEquals(EndpointType.DWC_ARCHIVE, got.getProtocol()); assertEquals("1", got.getVerbatimField(GbifTerm.gbifID)); assertEquals("ABC123", got.getVerbatimField(DwcTerm.occurrenceID)); // multimedia assertNotNull(got.getMedia()); assertEquals(1, got.getMedia().size()); assertEquals(MediaType.StillImage, got.getMedia().get(0).getType()); assertEquals("http://digit.snm.ku.dk/www/Aves/full/AVES-100348_Caprimulgus_pectoralis_fervidus_ad____f.jpg", got.getMedia().get(0).getIdentifier().toString()); } @Test public void testEndToEndAbcd2() throws IOException, InterruptedException, URISyntaxException { UUID datasetKey = UUID.fromString(BOGART_DATASET_KEY); OccurrenceSchemaType xmlSchema = OccurrenceSchemaType.ABCD_2_0_6; Integer crawlId = 1; OccurrenceFragmentedMessage msg = new OccurrenceFragmentedMessage(datasetKey, crawlId, abcd206Single.getBytes(), xmlSchema, EndpointType.BIOCASE, null); messagePublisher.send(msg); TimeUnit.MILLISECONDS.sleep(10000); Occurrence got = occurrenceService.get(1); assertNotNull(got); assertEquals("BGBM", got.getVerbatimField(DwcTerm.institutionCode)); assertEquals("AlgaTerra", got.getVerbatimField(DwcTerm.collectionCode)); assertEquals("5834", got.getVerbatimField(DwcTerm.catalogNumber)); assertEquals(datasetKey, got.getDatasetKey()); assertEquals("Tetraedron caudatum (Corda) Hansgirg, 1888", got.getScientificName()); assertEquals(52.123456, got.getDecimalLatitude().doubleValue(), 0.000001); assertEquals(13.123456, got.getDecimalLongitude().doubleValue(), 0.000001); assertEquals("WGS84", got.getGeodeticDatum()); assertTrue(got.getIssues().contains(OccurrenceIssue.COORDINATE_REPROJECTED)); assertEquals(450, got.getElevation().intValue()); assertEquals(Country.fromIsoCode("DE"), got.getCountry()); assertEquals("Kusber, W.-H.", got.getVerbatimField(DwcTerm.recordedBy)); assertEquals("Nikolassee, Berlin", got.getVerbatimField(DwcTerm.locality)); Calendar c = Calendar.getInstance(TimeZone.getTimeZone("UTC")); c.set(1987, 3, 13); c.set(Calendar.HOUR_OF_DAY, 0); c.set(Calendar.MINUTE, 0); c.set(Calendar.SECOND, 0); c.set(Calendar.MILLISECOND, 0); assertEquals(c.getTime(), got.getEventDate()); assertEquals(BasisOfRecord.HUMAN_OBSERVATION, got.getBasisOfRecord()); assertEquals("Kusber, W.-H.", got.getVerbatimField(DwcTerm.identifiedBy)); assertEquals(BGBM_KEY, got.getPublishingOrgKey().toString()); assertEquals(Country.GERMANY, got.getPublishingCountry()); assertEquals(EndpointType.BIOCASE, got.getProtocol()); assertEquals("1", got.getVerbatimField(GbifTerm.gbifID)); assertEquals(TypeStatus.HOLOTYPE, got.getTypeStatus()); assertEquals(2, got.getMedia().size()); assertEquals(new URI("http://www.tierstimmenarchiv.de/recordings/Ailuroedus_buccoides_V2010_04_short.mp3"), got.getMedia().get(0).getIdentifier()); } @Test public void testEndToEndZkCounts() throws IOException, InterruptedException { UUID datasetKey = UUID.randomUUID(); // one abcd206 OccurrenceSchemaType schemaType = OccurrenceSchemaType.ABCD_2_0_6; Integer crawlId = 1; OccurrenceFragmentedMessage msg = new OccurrenceFragmentedMessage(datasetKey, crawlId, abcd206Single.getBytes(), schemaType, EndpointType.BIOCASE, null); messagePublisher.send(msg); // two in one abcd 2 schemaType = OccurrenceSchemaType.ABCD_2_0_6; crawlId = 1; msg = new OccurrenceFragmentedMessage(datasetKey, crawlId, abcd206Multi.getBytes(), schemaType, EndpointType.BIOCASE, null); messagePublisher.send(msg); // one dwc 1.4 schemaType = OccurrenceSchemaType.DWC_1_4; crawlId = 1; msg = new OccurrenceFragmentedMessage(datasetKey, crawlId, dwc14.getBytes(), schemaType, EndpointType.BIOCASE, null); messagePublisher.send(msg); // dupe of dwc 1.4 messagePublisher.send(msg); // update of dwc 1.4 schemaType = OccurrenceSchemaType.DWC_1_4; crawlId = 1; msg = new OccurrenceFragmentedMessage(datasetKey, crawlId, dwc14_modified.getBytes(), schemaType, EndpointType.DIGIR, null); messagePublisher.send(msg); TimeUnit.MILLISECONDS.sleep(6000); assertEquals(5l, zookeeperConnector.readCounter(datasetKey, ZookeeperConnector.CounterName.FRAGMENT_RECEIVED).longValue()); assertEquals(5l, zookeeperConnector.readCounter(datasetKey, ZookeeperConnector.CounterName.FRAGMENT_PROCESSED).longValue()); assertEquals(0l, zookeeperConnector.readCounter(datasetKey, ZookeeperConnector.CounterName.RAW_OCCURRENCE_PERSISTED_ERROR) .longValue() ); assertEquals(1l, zookeeperConnector.readCounter(datasetKey, ZookeeperConnector.CounterName.RAW_OCCURRENCE_PERSISTED_UNCHANGED) .longValue() ); assertEquals(1l, zookeeperConnector.readCounter(datasetKey, ZookeeperConnector.CounterName.RAW_OCCURRENCE_PERSISTED_UPDATED) .longValue() ); assertEquals(4l, zookeeperConnector.readCounter(datasetKey, ZookeeperConnector.CounterName.RAW_OCCURRENCE_PERSISTED_NEW) .longValue() ); assertEquals(0l, zookeeperConnector.readCounter(datasetKey, ZookeeperConnector.CounterName.VERBATIM_OCCURRENCE_PERSISTED_ERROR) .longValue() ); assertEquals(5l, zookeeperConnector.readCounter(datasetKey, ZookeeperConnector.CounterName.VERBATIM_OCCURRENCE_PERSISTED_SUCCESS) .longValue() ); assertEquals(0l, zookeeperConnector.readCounter(datasetKey, ZookeeperConnector.CounterName.INTERPRETED_OCCURRENCE_PERSISTED_ERROR) .longValue() ); assertEquals(5l, zookeeperConnector .readCounter(datasetKey, ZookeeperConnector.CounterName.INTERPRETED_OCCURRENCE_PERSISTED_SUCCESS).longValue()); } }