/* * Copyright 2011 Global Biodiversity Information Facility (GBIF) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.gbif.occurrence.parsing.xml; import org.gbif.api.vocabulary.OccurrenceSchemaType; import org.gbif.occurrence.model.IdentifierRecord; import org.gbif.occurrence.model.RawOccurrenceRecord; import org.gbif.occurrence.parsing.RawXmlOccurrence; import org.gbif.occurrence.parsing.xml.rules.Abcd12RuleSet; import org.gbif.occurrence.parsing.xml.rules.Abcd206RuleSet; import org.gbif.occurrence.parsing.xml.rules.Dwc10RuleSet; import org.gbif.occurrence.parsing.xml.rules.Dwc14RuleSet; import org.gbif.occurrence.parsing.xml.rules.Dwc2009RuleSet; import org.gbif.occurrence.parsing.xml.rules.DwcManisRuleSet; import org.gbif.occurrence.common.identifier.HolyTriplet; import org.gbif.occurrence.common.identifier.PublisherProvidedUniqueIdentifier; import org.gbif.occurrence.common.identifier.UniqueIdentifier; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.StringReader; import java.util.List; import java.util.Map; import java.util.Set; import java.util.UUID; import com.google.common.collect.Maps; import com.google.common.collect.Sets; import org.apache.commons.digester.Digester; import org.apache.commons.digester.RuleSet; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.InputSource; import org.xml.sax.SAXException; /** * Methods for parsing {@link RawXmlOccurrence}s and {@link UniqueIdentifier}s from xml fragments. */ public class XmlFragmentParser { private static final Logger LOG = LoggerFactory.getLogger(XmlFragmentParser.class); private static final Map<OccurrenceSchemaType, RuleSet> RULE_SETS = Maps.newHashMap(); // static class, should never be instantiated private XmlFragmentParser() { } static { try { RULE_SETS.put(OccurrenceSchemaType.ABCD_1_2, new Abcd12RuleSet()); RULE_SETS.put(OccurrenceSchemaType.ABCD_2_0_6, new Abcd206RuleSet()); RULE_SETS.put(OccurrenceSchemaType.DWC_1_0, new Dwc10RuleSet()); RULE_SETS.put(OccurrenceSchemaType.DWC_1_4, new Dwc14RuleSet()); RULE_SETS.put(OccurrenceSchemaType.DWC_2009, new Dwc2009RuleSet()); RULE_SETS.put(OccurrenceSchemaType.DWC_MANIS, new DwcManisRuleSet()); } catch (IOException e) { LOG.warn("Unable to read properties files for parsing xml", e); } } public static List<RawOccurrenceRecord> parseRecord(RawXmlOccurrence xmlRecord) { return parseRecord(xmlRecord.getXml(), xmlRecord.getSchemaType()); } public static List<RawOccurrenceRecord> parseRecord(String xml, OccurrenceSchemaType schemaType) { LOG.debug("Parsing xml [" + xml + "]"); List<RawOccurrenceRecord> records = null; try { InputSource inputSource = new InputSource(new StringReader(xml)); records = parseRecord(inputSource, schemaType); } catch (IOException e) { LOG.warn("IOException parsing xml string [{}]", xml, e); } catch (SAXException e) { LOG.warn("SAXException parsing xml string [{}]", xml, e); } return records; } public static List<RawOccurrenceRecord> parseRecord(byte[] xml, OccurrenceSchemaType schemaType) { List<RawOccurrenceRecord> records = null; try { InputSource inputSource = new InputSource(new ByteArrayInputStream(xml)); records = parseRecord(inputSource, schemaType); } catch (IOException e) { LOG.warn("IOException parsing xml bytes", e); } catch (SAXException e) { LOG.warn("SAXException parsing xml bytes", e); } return records; } private static List<RawOccurrenceRecord> parseRecord(InputSource inputSource, OccurrenceSchemaType schemaType) throws IOException, SAXException { RawOccurrenceRecordBuilder builder = new RawOccurrenceRecordBuilder(); Digester digester = new Digester(); digester.setNamespaceAware(true); digester.setValidating(false); digester.push(builder); digester.addRuleSet(RULE_SETS.get(schemaType)); digester.parse(inputSource); builder.resolvePriorities(); return builder.generateRawOccurrenceRecords(); } /** * This method is a hack to return a single result where ScientificName matches the given unitQualifier. This * behaviour is only relevant for ABCD 2.06 - the others all produce a single record anyway. * TODO: refactor the parse/builder to return what we want, rather than hacking around */ public static RawOccurrenceRecord parseRecord(byte[] xml, OccurrenceSchemaType schemaType, String unitQualifier) { RawOccurrenceRecord result = null; List<RawOccurrenceRecord> records = parseRecord(xml, schemaType); if (records.isEmpty()) { LOG.warn("Could not parse any records from given xml - returning null."); } else if (records.size() == 1) { result = records.get(0); } else if (unitQualifier == null) { LOG.warn("Got multiple records from given xml, but no unitQualifier set - returning first record as a guess."); result = records.get(0); } else { for (RawOccurrenceRecord record : records) { if (record.getScientificName().equals(unitQualifier)) { result = record; break; } } if (result == null) { LOG.warn("Got multiple records from xml but none matched unitQualifier - returning null"); } } return result; } /** * Extract sets of UniqueIdentifiers from the xml snippet. In the usual case the set will contain a single * result, which will in turn contain 1 or more UniqueIdentifiers for the given xml. In the ABCD 2 case there * may be more than one occurrence represented by the given xml, in which case there will be an * IdentifierExtractionResult (with UniqueIdentifiers) returned for each of the represented occurrences (e.g. if 3 * occurrences are in the xml snippet and each have one UniqueIdentifier the result will be a set of 3 * IdentifierExtractionResults, where each result contains a single UniqueIdentifier). If the passed in xml is * somehow malformed there may be 0 UniqueIdentifiers found, in which case an empty set is returned. * * @param datasetKey UUID for this dataset * @param xml snippet of xml representing one (or more, in ABCD) occurrence * @param schemaType the protocol that produced this xml (e.g. DWC, ABCD) * @param useOccurrenceId @return a set of 0 or more IdentifierExtractionResults containing UniqueIdentifiers as found * in the xml * * @see UniqueIdentifier */ public static Set<IdentifierExtractionResult> extractIdentifiers(UUID datasetKey, byte[] xml, OccurrenceSchemaType schemaType, boolean useTriplet, boolean useOccurrenceId) { Set<IdentifierExtractionResult> results = Sets.newHashSet(); // this is somewhat wasteful, but a whole separate stack of parsing to extract triplet seems excessive List<RawOccurrenceRecord> records = parseRecord(xml, schemaType); if (records != null && !records.isEmpty()) { for (RawOccurrenceRecord record : records) { Set<UniqueIdentifier> ids = Sets.newHashSet(); if (useTriplet) { HolyTriplet holyTriplet = null; try { holyTriplet = new HolyTriplet(datasetKey, record.getInstitutionCode(), record.getCollectionCode(), record.getCatalogueNumber(), record.getUnitQualifier()); } catch (IllegalArgumentException e) { // some of the triplet was null or empty, so it's not valid - that's highly suspicious, but could be ok... LOG.info("No holy triplet for an xml snippet in dataset [{}] and schema [{}], got error [{}]", new String[] {datasetKey.toString(), schemaType.toString(), e.getMessage()}); } if (holyTriplet != null) ids.add(holyTriplet); } if (useOccurrenceId) { if (record.getIdentifierRecords() != null && !record.getIdentifierRecords().isEmpty()) { for (IdentifierRecord idRecord : record.getIdentifierRecords()) { // TODO: this needs much better checking (ie can we trust that guid (type 1) and sourceid (type 7) are // getting set and parsed properly?) // TODO: identifier types need to be enums if (idRecord.getIdentifierType() == 1 || idRecord.getIdentifierType() == 7) { if (idRecord.getIdentifier() != null) { ids.add(new PublisherProvidedUniqueIdentifier(datasetKey, idRecord.getIdentifier())); } } } } } if (!ids.isEmpty()) { results.add(new IdentifierExtractionResult(ids, record.getUnitQualifier())); } } } return results; } }