/*
* Copyright 2011 Global Biodiversity Information Facility (GBIF)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.gbif.occurrence;
import org.gbif.api.exception.ServiceUnavailableException;
import org.gbif.occurrence.constants.ExtractionSimpleXPaths;
import org.gbif.occurrence.model.RawOccurrenceRecord;
import org.gbif.occurrence.parsing.RawXmlOccurrence;
import org.gbif.occurrence.parsing.response_file.ParsedSearchResponse;
import org.gbif.occurrence.parsing.xml.XmlFragmentParser;
import org.gbif.occurrence.util.XmlSanitizingReader;
import org.gbif.utils.file.CharsetDetection;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;
import com.sun.org.apache.xerces.internal.impl.io.MalformedByteSequenceException;
import org.apache.commons.digester.Digester;
import org.apache.commons.digester.NodeCreateRule;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import static com.google.common.base.Preconditions.checkNotNull;
/**
* Entry point into the parsing of raw occurrence records as retrieved from publishers. Will attempt to determine
* both XML encodings and schema type. Parse happens in two steps - first extracts each record element into a
* RawXmlOccurrence, and then parses each of those into RawOccurrenceRecords.
*/
public class OccurrenceParser {
private static final Logger LOG = LoggerFactory.getLogger(OccurrenceParser.class);
private static final String ENCONDING_EQ = "encoding=";
private static final Pattern ENCODING_PATTERN = Pattern.compile(ENCONDING_EQ);
private static final Pattern REPLACE_QUOTES_PAT = Pattern.compile("[\"']");
public static final String ADD_RECORD_AS_XML = "addRecordAsXml";
public static final String SET_ABCD_1_HEADER = "setAbcd1Header";
public List<RawOccurrenceRecord> parseResponseFileToRor(File inputFile) {
List<RawXmlOccurrence> raws = parseResponseFileToRawXml(inputFile);
return parseRawXmlToRor(raws);
}
/**
* This parses a stream of uncompressed ABCD or DwC Occurrences into {@link RawXmlOccurrence}s.
* No care is taken to handle wrong encodings or character sets in general. This might be changed later on.
*
* @param is stream to parse
*
* @return list of parsed occurrences
*
* @throws ParsingException if there were any problems during parsing the stream
*/
// TODO: Optionally handle compressed streams
public List<RawXmlOccurrence> parseStream(InputStream is) throws ParsingException {
checkNotNull(is, "is can't be null");
try {
ParsedSearchResponse responseBody = new ParsedSearchResponse();
InputSource inputSource = new InputSource(is);
Digester digester = new Digester();
digester.setNamespaceAware(true);
digester.setValidating(false);
digester.push(responseBody);
NodeCreateRule rawAbcd = new NodeCreateRule();
digester.addRule(ExtractionSimpleXPaths.ABCD_RECORD_XPATH, rawAbcd);
digester.addSetNext(ExtractionSimpleXPaths.ABCD_RECORD_XPATH, ADD_RECORD_AS_XML);
NodeCreateRule rawAbcd1Header = new NodeCreateRule();
digester.addRule(ExtractionSimpleXPaths.ABCD_HEADER_XPATH, rawAbcd1Header);
digester.addSetNext(ExtractionSimpleXPaths.ABCD_HEADER_XPATH, SET_ABCD_1_HEADER);
NodeCreateRule rawDwc1_0 = new NodeCreateRule();
digester.addRule(ExtractionSimpleXPaths.DWC_1_0_RECORD_XPATH, rawDwc1_0);
digester.addSetNext(ExtractionSimpleXPaths.DWC_1_0_RECORD_XPATH, ADD_RECORD_AS_XML);
NodeCreateRule rawDwc1_4 = new NodeCreateRule();
digester.addRule(ExtractionSimpleXPaths.DWC_1_4_RECORD_XPATH, rawDwc1_4);
digester.addSetNext(ExtractionSimpleXPaths.DWC_1_4_RECORD_XPATH, ADD_RECORD_AS_XML);
// NodeCreateRule rawDwcManis = new NodeCreateRule();
// digester.addRule(ExtractionSimpleXPaths.DWC_MANIS_RECORD_XPATH, rawDwcManis);
// digester.addSetNext(ExtractionSimpleXPaths.DWC_MANIS_RECORD_XPATH, "addRecordAsXml");
NodeCreateRule rawDwc2009 = new NodeCreateRule();
digester.addRule(ExtractionSimpleXPaths.DWC_2009_RECORD_XPATH, rawDwc2009);
digester.addSetNext(ExtractionSimpleXPaths.DWC_2009_RECORD_XPATH, ADD_RECORD_AS_XML);
digester.parse(inputSource);
return responseBody.getRecords();
} catch (ParserConfigurationException | TransformerException e) {
throw new ServiceUnavailableException("Error setting up Commons Digester", e);
} catch (SAXException | IOException e) {
throw new ParsingException("Parsing failed", e);
}
}
/**
* Parses a single response gzipFile and returns a List of the contained RawXmlOccurrences.
*/
public List<RawXmlOccurrence> parseResponseFileToRawXml(File gzipFile) {
if (LOG.isDebugEnabled()) LOG.debug(">> parseResponseFileToRawXml [{}]", gzipFile.getAbsolutePath());
ParsedSearchResponse responseBody = null;
try {
responseBody = new ParsedSearchResponse();
List<String> charsets = getCharsets(gzipFile);
String goodCharset = null;
boolean encodingError = false;
for (String charsetName : charsets) {
LOG.debug("Trying charset [{}]", charsetName);
try (FileInputStream fis = new FileInputStream(gzipFile);
GZIPInputStream inputStream = new GZIPInputStream(fis);
BufferedReader inputReader =
new BufferedReader(new XmlSanitizingReader(new InputStreamReader(inputStream, charsetName)));) {
InputSource inputSource = new InputSource(inputReader);
Digester digester = new Digester();
digester.setNamespaceAware(true);
digester.setValidating(false);
digester.push(responseBody);
NodeCreateRule rawAbcd = new NodeCreateRule();
digester.addRule(ExtractionSimpleXPaths.ABCD_RECORD_XPATH, rawAbcd);
digester.addSetNext(ExtractionSimpleXPaths.ABCD_RECORD_XPATH, ADD_RECORD_AS_XML);
NodeCreateRule rawAbcd1Header = new NodeCreateRule();
digester.addRule(ExtractionSimpleXPaths.ABCD_HEADER_XPATH, rawAbcd1Header);
digester.addSetNext(ExtractionSimpleXPaths.ABCD_HEADER_XPATH, SET_ABCD_1_HEADER);
NodeCreateRule rawDwc1_0 = new NodeCreateRule();
digester.addRule(ExtractionSimpleXPaths.DWC_1_0_RECORD_XPATH, rawDwc1_0);
digester.addSetNext(ExtractionSimpleXPaths.DWC_1_0_RECORD_XPATH, ADD_RECORD_AS_XML);
NodeCreateRule rawDwc1_4 = new NodeCreateRule();
digester.addRule(ExtractionSimpleXPaths.DWC_1_4_RECORD_XPATH, rawDwc1_4);
digester.addSetNext(ExtractionSimpleXPaths.DWC_1_4_RECORD_XPATH, ADD_RECORD_AS_XML);
// TODO: dwc_manis appears to work without a NodeCreateRule here - why?
NodeCreateRule rawDwc2009 = new NodeCreateRule();
digester.addRule(ExtractionSimpleXPaths.DWC_2009_RECORD_XPATH, rawDwc2009);
digester.addSetNext(ExtractionSimpleXPaths.DWC_2009_RECORD_XPATH, ADD_RECORD_AS_XML);
digester.parse(inputSource);
LOG.debug("Success with charset [{}] - skipping any others", charsetName);
goodCharset = charsetName;
break;
} catch (SAXException e) {
String msg = "SAX exception when parsing parsing from response gzipFile [" + gzipFile.getAbsolutePath()
+ "] using encoding [" + charsetName + "] - trying another charset";
LOG.debug(msg, e);
} catch (MalformedByteSequenceException e) {
LOG.debug("Malformed utf-8 byte when parsing with encoding [{}] - trying another charset", charsetName);
encodingError = true;
} catch (IOException ex) {
LOG.warn("Error reading input files",ex);
}
}
if (goodCharset == null) {
if (encodingError) {
LOG.warn(
"Could not parse gzipFile - all encoding attempts failed with malformed utf8 - skipping gzipFile [{}]",
gzipFile.getAbsolutePath());
} else {
LOG.warn("Could not parse gzipFile (malformed parsing) - skipping gzipFile [{}]", gzipFile.getAbsolutePath());
}
}
} catch (IOException e) {
LOG.warn("Could not find response gzipFile [{}] - skipping gzipFile", gzipFile.getAbsolutePath(), e);
} catch (TransformerException e) {
LOG.warn("Could not create parsing transformer for [{}] - skipping gzipFile", gzipFile.getAbsolutePath(), e);
} catch (ParserConfigurationException e) {
LOG.warn("Failed to pull raw parsing from response gzipFile [{}] - skipping gzipFile",
gzipFile.getAbsolutePath(), e);
}
if (LOG.isDebugEnabled()) LOG.debug("<< parseResponseFileToRawXml [{}]", gzipFile.getAbsolutePath());
return (responseBody == null) ? null : responseBody.getRecords();
}
/**
* Utility method to extract character encondings from a gzip file.
* Charsets are a nightmare and users can't be trusted, so strategy
* is try these encodings in order until one of them (hopefully) works
* (note the last two could be repeats of the first two):
* - utf-8
* - latin1 (iso-8859-1)
* - the declared encoding from the parsing itself
* - a guess at detecting the charset from the raw gzipFile bytes
*/
private static List<String> getCharsets(File gzipFile) throws IOException {
List<String> charsets = new ArrayList<String>();
charsets.add("UTF-8");
charsets.add("ISO-8859-1");
// read parsing declaration
try (FileInputStream fis = new FileInputStream(gzipFile);
GZIPInputStream inputStream = new GZIPInputStream(fis);
InputStreamReader inputStreamReader = new InputStreamReader(inputStream);
BufferedReader bufferedReader = new BufferedReader(inputStreamReader)) {
boolean gotEncoding = false;
String encoding;
int lineCount = 0;
while (bufferedReader.ready() && !gotEncoding && lineCount < 5) {
String line = bufferedReader.readLine();
lineCount++;
if (line != null && line.contains(ENCONDING_EQ)) {
encoding = ENCODING_PATTERN.split(line,0)[1];
// drop trailing ?>
encoding = encoding.substring(0, encoding.length() - 2);
// drop quotes
encoding = REPLACE_QUOTES_PAT.matcher(encoding).replaceAll("").trim();
LOG.debug("Found encoding [{}] in parsing declaration", encoding);
try {
Charset.forName(encoding);
charsets.add(encoding);
} catch (Exception e) {
LOG.debug(
"Could not find supported charset matching detected encoding of [{}] - trying other guesses instead",
encoding);
}
gotEncoding = true;
}
}
}
// attempt detection from bytes
charsets.add(CharsetDetection.detectEncoding(gzipFile).name());
return charsets;
}
public List<RawOccurrenceRecord> parseRawXmlToRor(List<RawXmlOccurrence> raws) {
List<RawOccurrenceRecord> rors = new ArrayList<RawOccurrenceRecord>();
for (RawXmlOccurrence raw : raws) {
List<RawOccurrenceRecord> innerRors = XmlFragmentParser.parseRecord(raw);
rors.addAll(innerRors);
}
return rors;
}
}