package org.gbif.occurrence.parsing.xml; import org.gbif.api.vocabulary.OccurrenceSchemaType; import org.gbif.occurrence.common.identifier.HolyTriplet; import org.gbif.occurrence.common.identifier.OccurrenceKeyHelper; import org.gbif.occurrence.common.identifier.PublisherProvidedUniqueIdentifier; import org.gbif.occurrence.common.identifier.UniqueIdentifier; import org.gbif.occurrence.model.RawOccurrenceRecord; import org.gbif.occurrence.parsing.RawXmlOccurrence; import java.io.IOException; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.UUID; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import com.google.common.io.Resources; import org.apache.commons.io.Charsets; import org.junit.Ignore; import org.junit.Test; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; public class XmlFragmentParserTest { @Test public void testUtf8a() throws IOException { // note the collector name has an u umlaut String xml = Resources.toString(Resources.getResource("id_extraction/abcd1_umlaut.xml"), Charsets.UTF_8); RawXmlOccurrence rawRecord = createFakeOcc(xml); List<RawOccurrenceRecord> results = XmlFragmentParser.parseRecord(rawRecord); assertEquals(1, results.size()); // System.out.println("Looking for [Oschütz], got collector name [" + results.get(0).getCollectorName() + "]"); assertTrue(results.get(0).getCollectorName().equals("Oschütz")); } @Ignore("too expensive for constant use") @Test public void testMultiThreadLoad() throws Exception { String xml = Resources.toString(Resources.getResource("id_extraction/abcd1_umlaut.xml"), Charsets.UTF_8); // 5 parsers in 5 threads parse 1000 records each List<RawXmlOccurrence> raws = new ArrayList<RawXmlOccurrence>(); for (int i = 0; i < 5000; i++) { raws.add(createFakeOcc(xml)); } ExecutorService tp = Executors.newFixedThreadPool(5); List<Future<List<RawOccurrenceRecord>>> futures = new ArrayList<Future<List<RawOccurrenceRecord>>>(); for (int i = 0; i < 5; i++) { int start = i * 1000; int end = start + 1000; futures.add(tp.submit(new BatchRecordParser(raws.subList(start, end)))); } List<RawOccurrenceRecord> rors = new ArrayList<RawOccurrenceRecord>(); for (Future<List<RawOccurrenceRecord>> future : futures) { rors.addAll(future.get()); } assertEquals(5000, rors.size()); } private class BatchRecordParser implements Callable<List<RawOccurrenceRecord>> { private List<RawXmlOccurrence> raws; public BatchRecordParser(List<RawXmlOccurrence> raws) { this.raws = raws; } public List<RawOccurrenceRecord> call() { List<RawOccurrenceRecord> rors = new ArrayList<RawOccurrenceRecord>(); for (RawXmlOccurrence raw : raws) { rors.addAll(XmlFragmentParser.parseRecord(raw)); } return rors; } } @Test public void testIdExtractionSimple() throws IOException { String xml = Resources.toString(Resources.getResource("id_extraction/abcd1_simple.xml"), Charsets.UTF_8); UUID datasetKey = UUID.randomUUID(); HolyTriplet target = new HolyTriplet(datasetKey, "TLMF", "Tiroler Landesmuseum Ferdinandeum", "82D45C93-B297-490E-B7B0-E0A9BEED1326", null); byte[] xmlBytes = xml.getBytes(Charset.forName("UTF8")); Set<IdentifierExtractionResult> extractionResults = XmlFragmentParser.extractIdentifiers(datasetKey, xmlBytes, OccurrenceSchemaType.ABCD_1_2, true, true); Set<UniqueIdentifier> ids = extractionResults.iterator().next().getUniqueIdentifiers(); assertEquals(1, ids.size()); UniqueIdentifier id = ids.iterator().next(); assertEquals(datasetKey, id.getDatasetKey()); assertEquals(OccurrenceKeyHelper.buildKey(target), id.getUniqueString()); } @Test public void testIdExtractionMultipleIdsUnitQualifier() throws IOException { String xml = Resources.toString(Resources.getResource("id_extraction/abcd2_multi.xml"), Charsets.UTF_8); UUID datasetKey = UUID.randomUUID(); byte[] xmlBytes = xml.getBytes(Charset.forName("UTF8")); Set<IdentifierExtractionResult> extractionResults = XmlFragmentParser.extractIdentifiers(datasetKey, xmlBytes, OccurrenceSchemaType.ABCD_2_0_6, true, true); HolyTriplet holyTriplet1 = new HolyTriplet(datasetKey, "BGBM", "Bridel Herbar", "Bridel-1-428", "Grimmia alpicola Sw. ex Hedw."); HolyTriplet holyTriplet2 = new HolyTriplet(datasetKey, "BGBM", "Bridel Herbar", "Bridel-1-428", "Schistidium agassizii Sull. & Lesq. in Sull."); assertEquals(2, extractionResults.size()); for (IdentifierExtractionResult result : extractionResults) { String uniqueId = result.getUniqueIdentifiers().iterator().next().getUniqueString(); assertTrue(uniqueId.equals(OccurrenceKeyHelper.buildKey(holyTriplet1)) || uniqueId .equals(OccurrenceKeyHelper.buildKey(holyTriplet2))); } } @Test public void testIdExtractionWithTripletAndDwcOccurrenceId() throws IOException { String xml = Resources.toString(Resources.getResource("id_extraction/triplet_and_dwc_id.xml"), Charsets.UTF_8); UUID datasetKey = UUID.randomUUID(); byte[] xmlBytes = xml.getBytes(Charset.forName("UTF8")); Set<IdentifierExtractionResult> extractionResults = XmlFragmentParser.extractIdentifiers(datasetKey, xmlBytes, OccurrenceSchemaType.DWC_1_4, true, true); Set<UniqueIdentifier> ids = extractionResults.iterator().next().getUniqueIdentifiers(); PublisherProvidedUniqueIdentifier pubProvided = new PublisherProvidedUniqueIdentifier(datasetKey, "UGENT:vertebrata:50058"); HolyTriplet holyTriplet = new HolyTriplet(datasetKey, "UGENT", "vertebrata", "50058", null); assertEquals(2, ids.size()); for (UniqueIdentifier id : ids) { assertTrue(id.getUniqueString().equals(OccurrenceKeyHelper.buildKey(holyTriplet)) || id.getUniqueString() .equals(OccurrenceKeyHelper.buildKey(pubProvided))); } } @Test public void testIdExtractTapir() throws IOException { String xml = Resources.toString(Resources.getResource("id_extraction/tapir_triplet_contains_unrecorded.xml"), Charsets.UTF_8); byte[] xmlBytes = xml.getBytes(Charset.forName("UTF8")); Set<IdentifierExtractionResult> extractionResults = XmlFragmentParser.extractIdentifiers(UUID.randomUUID(), xmlBytes, OccurrenceSchemaType.DWC_1_4, true, true); assertFalse(extractionResults.isEmpty()); } @Test public void testIdExtractManisBlankCC() throws IOException { String xml = Resources.toString(Resources.getResource("id_extraction/manis_no_cc.xml"), Charsets.UTF_8); byte[] xmlBytes = xml.getBytes(Charset.forName("UTF8")); Set<IdentifierExtractionResult> extractionResults = XmlFragmentParser.extractIdentifiers(UUID.randomUUID(), xmlBytes, OccurrenceSchemaType.DWC_MANIS, true, true); assertTrue(extractionResults.isEmpty()); } private RawXmlOccurrence createFakeOcc(String xml) { RawXmlOccurrence rawRecord = new RawXmlOccurrence(); rawRecord.setCatalogNumber("fake catalog"); rawRecord.setCollectionCode("fake collection code"); rawRecord.setInstitutionCode("fake inst"); rawRecord.setResourceName("fake resource name"); rawRecord.setSchemaType(OccurrenceSchemaType.ABCD_1_2); rawRecord.setXml(xml); return rawRecord; } }