package org.gbif.dwca.io; import org.gbif.dwc.terms.Term; import org.gbif.dwc.terms.TermFactory; import com.google.common.base.Strings; import org.xml.sax.Attributes; import org.xml.sax.SAXException; /** * SAX handler to parse a meta.xml descriptor for dwc archives. It populates a given archive instance and ignores * namespaces. The parser needs to be namespace aware! */ class MetaXMLSaxHandler extends SimpleSaxHandler { private static final TermFactory TERM_FACTORY = TermFactory.instance(); private static final String NS_DWCA = "http://rs.tdwg.org/dwc/text/"; private final Archive archive; private ArchiveFile af; MetaXMLSaxHandler(Archive archive) { this.archive = archive; } private static Character getFirstChar(String x) throws UnsupportedArchiveException { if (x == null || x.length() == 0) { return null; } if (x.length() == 1) { return x.charAt(0); } if (x.equalsIgnoreCase("\\t")) { return '\t'; } if (x.equalsIgnoreCase("\\n")) { return '\n'; } if (x.equalsIgnoreCase("\\r")) { return '\r'; } if (x.length() > 1) { throw new UnsupportedArchiveException( "Only darwin core archives with a single quotation character are supported, but found >>>" + x + "<<<"); } return ' '; } private static String unescapeBackslash(String x) { if (x == null || x.length() == 0) { return null; } return x.replaceAll("\\\\t", String.valueOf('\t')).replaceAll("\\\\n", String.valueOf('\n')) .replaceAll("\\\\r", String.valueOf('\r')).replaceAll("\\\\f", String.valueOf('\f')); } private ArchiveFile buildArchiveFile(Attributes attr) throws UnsupportedArchiveException { ArchiveFile dwcFile = new ArchiveFile(); // extract the File attributes if (getAttr(attr, "encoding") != null) { dwcFile.setEncoding(getAttr(attr, "encoding")); } if (getAttr(attr, "fieldsTerminatedBy") != null) { dwcFile.setFieldsTerminatedBy(unescapeBackslash(getAttr(attr, "fieldsTerminatedBy"))); } if (getAttr(attr, "fieldsEnclosedBy") != null) { dwcFile.setFieldsEnclosedBy(getFirstChar(getAttr(attr, "fieldsEnclosedBy"))); } if (getAttr(attr, "linesTerminatedBy") != null) { dwcFile.setLinesTerminatedBy(unescapeBackslash(getAttr(attr, "linesTerminatedBy"))); } if (getAttr(attr, "rowType") != null) { dwcFile.setRowType(TERM_FACTORY.findTerm(getAttr(attr, "rowType"))); } String ignoreHeaderLines = getAttr(attr, "ignoreHeaderLines"); try { dwcFile.setIgnoreHeaderLines(Integer.parseInt(ignoreHeaderLines)); } catch (NumberFormatException ignored) { // swallow null or bad value } return dwcFile; } /** * Build an ArchiveField object based on xml attributes. */ private ArchiveField buildField(Attributes attributes) { // build field Term term = TERM_FACTORY.findTerm(getAttr(attributes, "term")); String defaultValue = getAttr(attributes, "default"); String vocabulary = getAttr(attributes, "vocabulary"); ArchiveField.DataType type = ArchiveField.DataType.findByXmlSchemaType(getAttr(attributes, "type")); if (type == null) { type = ArchiveField.DataType.string; } String indexAsString = getAttr(attributes, "index"); Integer index = null; if (indexAsString != null) { // let bad errors be thrown up try { index = Integer.parseInt(indexAsString); } catch (NumberFormatException e) { throw new UnsupportedArchiveException(e); } } String delimiter = getAttr(attributes, "delimitedBy"); return new ArchiveField(index, term, defaultValue, type, delimiter, vocabulary); } @Override public void endElement(String uri, String localName, String qName) throws SAXException { // calling the super method to stringify the character buffer super.endElement(uri, localName, qName); if (localName.equalsIgnoreCase("archive")) { // archive } else if (localName.equalsIgnoreCase("core")) { // update location to absolute path incl archive path // makeLocationPathsAbsolute(af, archive.getLocation()); archive.setCore(af); } else if (localName.equalsIgnoreCase("extension")) { // update location to absolute path incl archive path // makeLocationPathsAbsolute(af, archive.getLocation()); if (af.getId() != null && af.getId().getIndex() != null) { archive.addExtension(af); } else { log.warn("Skipping extension [" + af.getRowType() + "] with no index attribute"); } } else if (localName.equalsIgnoreCase("location")) { // a file location af.addLocation(content); } } /** * Get attribute from a key * * @param attributes * @param key * @return attributes value or null */ private String getAttr(Attributes attributes, String key) { String val = null; if (attributes != null) { // try without NS val = attributes.getValue("", key); if (val == null) { // try with dwca NS if nothing found val = attributes.getValue(NS_DWCA, key); } } return Strings.isNullOrEmpty(val) ? null : val; } @Override public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { super.startElement(uri, localName, qName, attributes); if (localName.equalsIgnoreCase("archive") || localName.equalsIgnoreCase("stararchive")) { // metadata location archive.setMetadataLocation(getAttr(attributes, "metadata")); } else if (localName.equalsIgnoreCase("core") || localName.equalsIgnoreCase("extension")) { // archive/extension af = new ArchiveFile(); if (localName.equalsIgnoreCase("core") || localName.equalsIgnoreCase("extension")) { // archive/core or archive/extension af = buildArchiveFile(attributes); } } else if (localName.equalsIgnoreCase("coreid") || localName.equalsIgnoreCase("id")) { ArchiveField field = buildField(attributes); if (af != null) { af.setId(field); } else { log.warn(localName + " field found outside of an archive file"); } } else if (localName.equalsIgnoreCase("field")) { ArchiveField field = buildField(attributes); if (af != null) { af.addField(field); } else { log.warn("field found outside of an archive file"); } } } }