package org.gbif.occurrence.persistence;
import org.gbif.api.vocabulary.EndpointType;
import org.gbif.api.vocabulary.OccurrenceSchemaType;
import org.gbif.dwc.terms.DwcTerm;
import org.gbif.dwc.terms.GbifTerm;
import org.gbif.occurrence.common.config.OccHBaseConfiguration;
import org.gbif.occurrence.common.identifier.HolyTriplet;
import org.gbif.occurrence.common.identifier.PublisherProvidedUniqueIdentifier;
import org.gbif.occurrence.common.identifier.UniqueIdentifier;
import org.gbif.occurrence.persistence.api.Fragment;
import org.gbif.occurrence.persistence.api.FragmentCreationResult;
import org.gbif.dwc.terms.GbifInternalTerm;
import org.gbif.occurrence.persistence.api.OccurrenceKeyPersistenceService;
import org.gbif.occurrence.persistence.guice.ThreadLocalLockProvider;
import org.gbif.occurrence.persistence.hbase.Columns;
import org.gbif.occurrence.persistence.keygen.KeyPersistenceService;
import org.gbif.occurrence.persistence.keygen.ZkLockingKeyService;
import java.util.Date;
import java.util.Random;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import com.google.common.collect.Sets;
import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.retry.RetryNTimes;
import org.apache.curator.test.TestingServer;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
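/**
 * Integration test for {@link FragmentPersistenceServiceImpl}, running against an HBase mini-cluster and an
 * in-process ZooKeeper TestingServer. Every test starts from freshly truncated occurrence, counter and lookup
 * tables, with one XML and one JSON fragment written directly as HBase rows in setUp().
 */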
public class FragmentPersistenceServiceImplTest {
private static final OccHBaseConfiguration CFG = new OccHBaseConfiguration();
static {
CFG.setEnvironment("test");
}
private static final byte[] TABLE = Bytes.toBytes(CFG.occTable);
private static final String CF_NAME = "o";
private static final byte[] CF = Bytes.toBytes(CF_NAME);
private static final byte[] COUNTER_TABLE = Bytes.toBytes(CFG.counterTable);
private static final String COUNTER_CF_NAME = "o";
private static final byte[] COUNTER_CF = Bytes.toBytes(COUNTER_CF_NAME);
private static final byte[] LOOKUP_TABLE = Bytes.toBytes(CFG.lookupTable);
private static final String LOOKUP_CF_NAME = "o";
private static final byte[] LOOKUP_CF = Bytes.toBytes(LOOKUP_CF_NAME);
private int xmlKey;
private int jsonKey;
private static final int BAD_KEY = 2000000;
private static final String CAT = "abc123";
private static final String COL_CODE = "Big cats";
private static final UUID XML_DATASET_KEY = UUID.randomUUID();
private static final String DWC_ID = "asdf-hasdf-234-dwcid";
private static final String INST_CODE = "BGBM";
private static final String UNIT_QUALIFIER = "Panthera onca (Linnaeus, 1758)";
private static final int CRAWL_ID = 234;
private static final Long CREATED = 123456789L;
private static final Date HARVEST_DATE = new Date();
private static final EndpointType ENDPOINT_TYPE = EndpointType.DIGIR;
private static final byte[] XML = "<parsing>not parsing</parsing>".getBytes();
private static final byte[] XML_HASH = DigestUtils.md5(XML);
private static final OccurrenceSchemaType XML_SCHEMA = OccurrenceSchemaType.DWC_1_4;
private static final UUID JSON_DATASET_KEY = UUID.randomUUID();
private static final byte[] JSON = "{ \"pretend\": \"json\" }".getBytes();
private static final byte[] JSON_HASH = DigestUtils.md5(JSON);
private static final EndpointType JSON_ENDPOINT_TYPE = EndpointType.DWC_ARCHIVE;
private Connection connection = null;
private FragmentPersistenceServiceImpl fragmentService;
private OccurrenceKeyPersistenceService occurrenceKeyService;
private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
private static TestingServer zookeeperServer;
private static CuratorFramework curator;
private static ThreadLocalLockProvider zooLockProvider;
@Rule
public ExpectedException exception = ExpectedException.none();
@BeforeClass
public static void beforeClass() throws Exception {
TEST_UTIL.startMiniCluster(1);
TEST_UTIL.createTable(TABLE, CF);
TEST_UTIL.createTable(COUNTER_TABLE, COUNTER_CF);
TEST_UTIL.createTable(LOOKUP_TABLE, LOOKUP_CF);
// set up ZooKeeper for the locking key service
zookeeperServer = new TestingServer();
curator =
CuratorFrameworkFactory.builder().namespace("hbasePersistence").connectString(zookeeperServer.getConnectString())
.retryPolicy(new RetryNTimes(1, 1000)).build();
curator.start();
zooLockProvider = new ThreadLocalLockProvider(curator);
}
@AfterClass
public static void afterClass() throws Exception {
TEST_UTIL.shutdownMiniCluster();
curator.close();
zookeeperServer.stop();
}
@Before
public void setUp() throws Exception {
TEST_UTIL.truncateTable(TABLE);
TEST_UTIL.truncateTable(COUNTER_TABLE);
TEST_UTIL.truncateTable(LOOKUP_TABLE);
connection = ConnectionFactory.createConnection(TEST_UTIL.getConfiguration());
// build the key services against the freshly truncated lookup table
KeyPersistenceService keyPersistenceService =
new ZkLockingKeyService(CFG, connection, zooLockProvider);
occurrenceKeyService = new OccurrenceKeyPersistenceServiceImpl(keyPersistenceService);
Set<UniqueIdentifier> ids = Sets.newHashSet();
HolyTriplet holyTriplet = new HolyTriplet(XML_DATASET_KEY, INST_CODE, COL_CODE, CAT, UNIT_QUALIFIER);
ids.add(holyTriplet);
PublisherProvidedUniqueIdentifier pubId = new PublisherProvidedUniqueIdentifier(XML_DATASET_KEY, DWC_ID);
ids.add(pubId);
xmlKey = occurrenceKeyService.generateKey(ids).getKey();
ids = Sets.newHashSet();
holyTriplet = new HolyTriplet(JSON_DATASET_KEY, INST_CODE, COL_CODE, CAT, UNIT_QUALIFIER);
ids.add(holyTriplet);
pubId = new PublisherProvidedUniqueIdentifier(JSON_DATASET_KEY, DWC_ID);
ids.add(pubId);
jsonKey = occurrenceKeyService.generateKey(ids).getKey();
fragmentService = new FragmentPersistenceServiceImpl(CFG, connection, occurrenceKeyService);
Table table = connection.getTable(TableName.valueOf(CFG.occTable));
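// seed an XML fragment row directly via HBase so the get/update tests start from known data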
Put put = new Put(Bytes.toBytes(xmlKey));
put.addColumn(CF, Bytes.toBytes(Columns.column(DwcTerm.catalogNumber)), Bytes.toBytes(CAT));
put.addColumn(CF, Bytes.toBytes(Columns.column(DwcTerm.collectionCode)), Bytes.toBytes(COL_CODE));
put.addColumn(CF, Bytes.toBytes(Columns.column(GbifInternalTerm.fragmentCreated)), Bytes.toBytes(CREATED));
put.addColumn(CF, Bytes.toBytes(Columns.column(GbifTerm.datasetKey)), Bytes.toBytes(XML_DATASET_KEY.toString()));
put.addColumn(CF, Bytes.toBytes(Columns.column(DwcTerm.institutionCode)), Bytes.toBytes(INST_CODE));
put.addColumn(CF, Bytes.toBytes(Columns.column(GbifInternalTerm.unitQualifier)), Bytes.toBytes(UNIT_QUALIFIER));
put.addColumn(CF, Bytes.toBytes(Columns.column(DwcTerm.occurrenceID)), Bytes.toBytes(DWC_ID));
put.addColumn(CF, Bytes.toBytes(Columns.column(GbifTerm.lastCrawled)), Bytes.toBytes(HARVEST_DATE.getTime()));
put.addColumn(CF, Bytes.toBytes(Columns.column(GbifInternalTerm.crawlId)), Bytes.toBytes(CRAWL_ID));
put.addColumn(CF, Bytes.toBytes(Columns.column(GbifInternalTerm.fragment)), XML);
put.addColumn(CF, Bytes.toBytes(Columns.column(GbifInternalTerm.fragmentHash)), XML_HASH);
put.addColumn(CF, Bytes.toBytes(Columns.column(GbifInternalTerm.xmlSchema)), Bytes.toBytes(XML_SCHEMA.toString()));
put.addColumn(CF, Bytes.toBytes(Columns.column(GbifTerm.protocol)), Bytes.toBytes(ENDPOINT_TYPE.toString()));
table.put(put);
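// seed a matching JSON fragment row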
put = new Put(Bytes.toBytes(jsonKey));
put.addColumn(CF, Bytes.toBytes(Columns.column(DwcTerm.catalogNumber)), Bytes.toBytes(CAT));
put.addColumn(CF, Bytes.toBytes(Columns.column(DwcTerm.collectionCode)), Bytes.toBytes(COL_CODE));
put.addColumn(CF, Bytes.toBytes(Columns.column(GbifTerm.datasetKey)), Bytes.toBytes(JSON_DATASET_KEY.toString()));
put.addColumn(CF, Bytes.toBytes(Columns.column(GbifInternalTerm.fragmentCreated)), Bytes.toBytes(CREATED));
put.addColumn(CF, Bytes.toBytes(Columns.column(DwcTerm.institutionCode)), Bytes.toBytes(INST_CODE));
put.addColumn(CF, Bytes.toBytes(Columns.column(DwcTerm.occurrenceID)), Bytes.toBytes(DWC_ID));
put.addColumn(CF, Bytes.toBytes(Columns.column(GbifTerm.lastCrawled)), Bytes.toBytes(HARVEST_DATE.getTime()));
put.addColumn(CF, Bytes.toBytes(Columns.column(GbifInternalTerm.crawlId)), Bytes.toBytes(CRAWL_ID));
put.addColumn(CF, Bytes.toBytes(Columns.column(GbifInternalTerm.fragment)), JSON);
put.addColumn(CF, Bytes.toBytes(Columns.column(GbifInternalTerm.fragmentHash)), JSON_HASH);
put.addColumn(CF, Bytes.toBytes(Columns.column(GbifTerm.protocol)), Bytes.toBytes(JSON_ENDPOINT_TYPE.toString()));
table.put(put);
table.close();
}
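@After
public void tearDown() throws Exception {
// close the per-test HBase connection opened in setUp
if (connection != null) {
connection.close();
}
}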
@Test
public void testGetFullXml() {
Fragment frag = fragmentService.get(xmlKey);
assertNotNull(frag);
assertEquals(XML_DATASET_KEY, frag.getDatasetKey());
assertEquals(xmlKey, frag.getKey().intValue());
assertEquals(CRAWL_ID, frag.getCrawlId().intValue());
assertEquals(HARVEST_DATE, frag.getHarvestedDate());
assertArrayEquals(XML, frag.getData());
assertArrayEquals(XML_HASH, frag.getDataHash());
assertEquals(XML_SCHEMA, frag.getXmlSchema());
assertEquals(ENDPOINT_TYPE, frag.getProtocol());
assertEquals(Fragment.FragmentType.XML, frag.getFragmentType());
assertEquals(CREATED, frag.getCreated());
}
@Test
public void testGetNull() {
Fragment frag = fragmentService.get(BAD_KEY);
assertNull(frag);
}
@Test
public void testInsertFullXml() {
Fragment got = fragmentService.get(xmlKey);
got.setKey(null);
HolyTriplet triplet = new HolyTriplet(XML_DATASET_KEY, "fake", "fake", "fake", null);
Set<UniqueIdentifier> ids = Sets.newHashSet();
ids.add(triplet);
got = fragmentService.insert(got, ids).getFragment();
Fragment frag = fragmentService.get(got.getKey());
assertNotNull(frag);
assertEquals(got.getKey().intValue(), frag.getKey().intValue());
assertEquals(XML_DATASET_KEY, frag.getDatasetKey());
assertEquals(CRAWL_ID, frag.getCrawlId().intValue());
assertEquals(HARVEST_DATE, frag.getHarvestedDate());
assertArrayEquals(XML, frag.getData());
assertArrayEquals(XML_HASH, frag.getDataHash());
assertEquals(XML_SCHEMA, frag.getXmlSchema());
assertEquals(ENDPOINT_TYPE, frag.getProtocol());
assertEquals(Fragment.FragmentType.XML, frag.getFragmentType());
assertNotNull(frag.getCreated());
}
@Test
public void testInsertEmptyFragment() {
exception.expect(NullPointerException.class);
exception.expectMessage("fragment can't be null");
fragmentService.insert(null, null);
}
@Test
public void testInsertEmptyIds() {
byte[] data = "boo".getBytes();
byte[] dataHash = "far".getBytes();
Fragment frag =
new Fragment(UUID.randomUUID(), data, dataHash, Fragment.FragmentType.JSON, EndpointType.DWC_ARCHIVE, new Date(),
1, null, null, null);
exception.expect(NullPointerException.class);
exception.expectMessage("uniqueIds can't be null");
fragmentService.insert(frag, null);
}
@Test
public void testUpdateFullXml() {
Fragment orig = fragmentService.get(xmlKey);
int crawlId = 567;
Date harvestDate = new Date();
byte[] xml = Bytes.toBytes("<parsing>this is not a love song</parsing>");
byte[] xmlHash = DigestUtils.md5(xml);
OccurrenceSchemaType xmlSchema = OccurrenceSchemaType.ABCD_2_0_6;
EndpointType endpointType = EndpointType.BIOCASE;
String unitQualifier = "Puma concolor";
Long created = System.currentTimeMillis();
Fragment update =
new Fragment(orig.getDatasetKey(), xml, xmlHash, orig.getFragmentType(), endpointType, harvestDate, crawlId,
xmlSchema, unitQualifier, created);
update.setKey(orig.getKey());
fragmentService.update(update);
Fragment frag = fragmentService.get(xmlKey);
assertNotNull(frag);
assertEquals(XML_DATASET_KEY, frag.getDatasetKey());
assertEquals(crawlId, frag.getCrawlId().intValue());
assertEquals(unitQualifier, frag.getUnitQualifier());
assertEquals(harvestDate, frag.getHarvestedDate());
assertArrayEquals(xml, frag.getData());
assertArrayEquals(xmlHash, frag.getDataHash());
assertEquals(xmlSchema, frag.getXmlSchema());
assertEquals(endpointType, frag.getProtocol());
assertEquals(Fragment.FragmentType.XML, frag.getFragmentType());
assertEquals(created, frag.getCreated());
}
@Test
public void testGetFullJson() {
Fragment frag = fragmentService.get(jsonKey);
assertNotNull(frag);
assertEquals(JSON_DATASET_KEY, frag.getDatasetKey());
assertEquals(jsonKey, frag.getKey().intValue());
assertEquals(CRAWL_ID, frag.getCrawlId().intValue());
assertEquals(HARVEST_DATE, frag.getHarvestedDate());
assertArrayEquals(JSON, frag.getData());
assertArrayEquals(JSON_HASH, frag.getDataHash());
assertEquals(OccurrenceSchemaType.DWCA, frag.getXmlSchema());
assertEquals(JSON_ENDPOINT_TYPE, frag.getProtocol());
assertEquals(Fragment.FragmentType.JSON, frag.getFragmentType());
assertEquals(CREATED, frag.getCreated());
}
@Test
public void testInsertFullJson() {
Fragment got = fragmentService.get(jsonKey);
got.setKey(null);
HolyTriplet triplet = new HolyTriplet(JSON_DATASET_KEY, "fake", "fake", "fake", null);
Set<UniqueIdentifier> ids = Sets.newHashSet();
ids.add(triplet);
got = fragmentService.insert(got, ids).getFragment();
Fragment frag = fragmentService.get(got.getKey());
assertNotNull(frag);
assertEquals(got.getKey().intValue(), frag.getKey().intValue());
assertEquals(JSON_DATASET_KEY, frag.getDatasetKey());
assertEquals(CRAWL_ID, frag.getCrawlId().intValue());
assertEquals(HARVEST_DATE, frag.getHarvestedDate());
assertArrayEquals(JSON, frag.getData());
assertArrayEquals(JSON_HASH, frag.getDataHash());
assertEquals(OccurrenceSchemaType.DWCA, frag.getXmlSchema());
assertEquals(JSON_ENDPOINT_TYPE, frag.getProtocol());
assertEquals(Fragment.FragmentType.JSON, frag.getFragmentType());
assertNotNull(frag.getCreated());
}
@Test
public void testUpdateFullJson() {
Fragment orig = fragmentService.get(jsonKey);
int crawlId = 568;
Date harvestDate = new Date();
byte[] json = Bytes.toBytes("{ \"json\" : { \"nested\" : \"looks like this\" } }");
byte[] jsonHash = DigestUtils.md5(json);
Long created = System.currentTimeMillis();
Fragment update =
new Fragment(orig.getDatasetKey(), json, jsonHash, orig.getFragmentType(), orig.getProtocol(), harvestDate,
crawlId, null, null, created);
update.setKey(orig.getKey());
fragmentService.update(update);
Fragment frag = fragmentService.get(jsonKey);
assertNotNull(frag);
assertEquals(JSON_DATASET_KEY, frag.getDatasetKey());
assertEquals(crawlId, frag.getCrawlId().intValue());
assertEquals(harvestDate, frag.getHarvestedDate());
assertArrayEquals(json, frag.getData());
assertArrayEquals(jsonHash, frag.getDataHash());
assertEquals(OccurrenceSchemaType.DWCA, frag.getXmlSchema());
assertEquals(JSON_ENDPOINT_TYPE, frag.getProtocol());
assertNull(frag.getUnitQualifier());
assertEquals(Fragment.FragmentType.JSON, frag.getFragmentType());
assertEquals(created, frag.getCreated());
}
@Test
public void testMultiThreadInsertIdenticalFullJson() throws InterruptedException {
Fragment fragment = fragmentService.get(jsonKey);
fragment.setKey(null);
HolyTriplet triplet = new HolyTriplet(JSON_DATASET_KEY, "fake", "fake", "fake", null);
Set<UniqueIdentifier> ids = Sets.newHashSet();
ids.add(triplet);
int threadCount = 100;
ExecutorService tp = Executors.newFixedThreadPool(threadCount);
for (int i = 0; i < threadCount; i++) {
tp.submit(new FragmentInserter(fragment, ids));
}
tp.shutdown();
tp.awaitTermination(1, TimeUnit.MINUTES);
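// setUp already allocated two keys, so the identical inserts above should all have resolved to key 3;
// this final insert must reuse that key rather than create a new one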
FragmentCreationResult got = fragmentService.insert(fragment, ids);
assertFalse(got.isKeyCreated());
assertEquals(3, got.getFragment().getKey().intValue());
}
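/**
 * Inserts the shared fragment and identifier set; across all threads only one insert should report
 * isKeyCreated() == true, while the rest reuse the same key.
 */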
private class FragmentInserter implements Runnable {
private final Fragment fragment;
private final Set<UniqueIdentifier> ids;
private FragmentInserter(Fragment fragment, Set<UniqueIdentifier> ids) {
this.fragment = fragment;
this.ids = ids;
}
@Override
public void run() {
try {
Thread.sleep(new Random().nextInt(5));
} catch (InterruptedException e) {
// restore the interrupt flag; the insert below should still run
Thread.currentThread().interrupt();
}
FragmentCreationResult result = fragmentService.insert(fragment, ids);
if (result.isKeyCreated()) {
System.out.println(new Date().getTime() + " " + Thread.currentThread().getName() + " Created id [" + result.getFragment().getKey() + "]");
} else {
System.out.println(new Date().getTime() + " " + Thread.currentThread().getName() + " Reusing existing id [" + result.getFragment().getKey() + "]");
}
}
}
}