package org.gbif.ipt.task; import org.gbif.api.model.common.DOI; import org.gbif.dwc.terms.DwcTerm; import org.gbif.dwc.terms.Term; import org.gbif.dwc.terms.TermFactory; import org.gbif.dwca.io.Archive; import org.gbif.dwca.io.ArchiveFactory; import org.gbif.dwca.io.ArchiveField; import org.gbif.dwca.io.ArchiveFile; import org.gbif.dwca.io.MetaDescriptorWriter; import org.gbif.ipt.config.AppConfig; import org.gbif.ipt.config.Constants; import org.gbif.ipt.config.DataDir; import org.gbif.ipt.model.Extension; import org.gbif.ipt.model.ExtensionMapping; import org.gbif.ipt.model.ExtensionProperty; import org.gbif.ipt.model.PropertyMapping; import org.gbif.ipt.model.RecordFilter; import org.gbif.ipt.model.Resource; import org.gbif.ipt.service.admin.VocabulariesManager; import org.gbif.ipt.service.manage.SourceManager; import org.gbif.ipt.utils.MapUtils; import org.gbif.utils.file.ClosableReportingIterator; import org.gbif.utils.file.CompressionUtil; import org.gbif.utils.file.csv.CSVReader; import org.gbif.utils.file.csv.CSVReaderFactory; import org.gbif.utils.text.LineComparator; import java.io.File; import java.io.FileFilter; import java.io.IOException; import java.io.PrintWriter; import java.io.StringWriter; import java.io.Writer; import java.math.BigDecimal; import java.util.ArrayList; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.UUID; import java.util.concurrent.Callable; import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Pattern; import javax.annotation.Nullable; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Joiner; import com.google.common.base.Preconditions; import com.google.common.base.Strings; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Maps; import com.google.common.collect.Ordering; import com.google.inject.Inject; import com.google.inject.assistedinject.Assisted; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOCase; import org.apache.commons.io.filefilter.WildcardFileFilter; import org.apache.commons.lang3.StringUtils; import org.apache.log4j.Level; public class GenerateDwca extends ReportingTask implements Callable<Map<String, Integer>> { private enum STATE { WAITING, STARTED, DATAFILES, METADATA, BUNDLING, COMPLETED, ARCHIVING, VALIDATING, CANCELLED, FAILED } private static final Pattern escapeChars = Pattern.compile("[\t\n\r]"); private final Resource resource; // record counts by extension <rowType, count> private Map<String, Integer> recordsByExtension = Maps.newHashMap(); private Archive archive; private File dwcaFolder; // status reporting private int currRecords = 0; private int currRecordsSkipped = 0; private String currExtension; private STATE state = STATE.WAITING; private final SourceManager sourceManager; private final VocabulariesManager vocabManager; private Map<String, String> basisOfRecords; private Exception exception; private AppConfig cfg; private static final int ID_COLUMN_INDEX = 0; public static final String CHARACTER_ENCODING = "UTF-8"; private static final TermFactory TERM_FACTORY = TermFactory.instance(); private static final String SORTED_FILE_PREFIX = "sorted_"; private static final org.gbif.utils.file.FileUtils GBIF_FILE_UTILS = new org.gbif.utils.file.FileUtils(); public static final String CANCELLED_STATE_MSG = "Archive generation cancelled"; public static 
final String ID_COLUMN_NAME = "id"; public static final String TEXT_FILE_EXTENSION = ".txt"; public static final String WILDCARD_CHARACTER = "*"; public static final Set<DwcTerm> DWC_MULTI_VALUE_TERMS = ImmutableSet.of(DwcTerm.recordedBy, DwcTerm.preparations, DwcTerm.associatedMedia, DwcTerm.associatedReferences, DwcTerm.associatedSequences, DwcTerm.associatedTaxa, DwcTerm.otherCatalogNumbers, DwcTerm.associatedOccurrences, DwcTerm.associatedOrganisms, DwcTerm.previousIdentifications, DwcTerm.higherGeography, DwcTerm.georeferencedBy, DwcTerm.georeferenceSources, DwcTerm.typeStatus, DwcTerm.identifiedBy, DwcTerm.identificationReferences, DwcTerm.higherClassification, DwcTerm.measurementDeterminedBy); private static final Comparator<String> IGNORE_CASE_COMPARATOR = Ordering.from(new Comparator<String>() { public int compare(String o1, String o2) { return o1.compareToIgnoreCase(o2); } }).nullsFirst(); @Inject public GenerateDwca(@Assisted Resource resource, @Assisted ReportHandler handler, DataDir dataDir, SourceManager sourceManager, AppConfig cfg, VocabulariesManager vocabManager) throws IOException { super(1000, resource.getShortname(), handler, dataDir); this.resource = resource; this.sourceManager = sourceManager; this.cfg = cfg; this.vocabManager = vocabManager; } /** * Adds a single data file for a list of extension mappings that must all be mapped to the same extension. * </br> * The ID column is always the 1st column (index 0) and is always equal to the core record identifier that has been * mapped (e.g. occurrenceID, taxonID, etc). * * @param mappings list of ExtensionMapping * @param rowLimit maximum number of rows to write * @throws IllegalArgumentException if not all mappings are mapped to the same extension * @throws InterruptedException if the thread was interrupted * @throws IOException if problems occurred while persisting new data files * @throws GeneratorException if any problem was encountered writing data file */ public void addDataFile(List<ExtensionMapping> mappings, @Nullable Integer rowLimit) throws IOException, IllegalArgumentException, InterruptedException, GeneratorException { checkForInterruption(); if (mappings == null || mappings.isEmpty()) { return; } // update reporting currRecords = 0; currRecordsSkipped = 0; Extension ext = mappings.get(0).getExtension(); currExtension = ext.getTitle(); // verify that all mappings share this extension for (ExtensionMapping m : mappings) { if (!ext.equals(m.getExtension())) { throw new IllegalArgumentException( "All mappings for a single data file need to be mapped to the same extension: " + ext.getRowType()); } } // create new tab file with the help of the Archive class representing the core file or an extension ArchiveFile af = ArchiveFile.buildTabFile(); af.setRowType(TERM_FACTORY.findTerm(ext.getRowType())); af.setEncoding(CHARACTER_ENCODING); af.setDateFormat("YYYY-MM-DD"); // in the generated file column 0 will be the id column ArchiveField idField = new ArchiveField(); idField.setIndex(ID_COLUMN_INDEX); af.setId(idField); // find the union of all terms mapped and make them a field in the final archive Set<Term> mappedConceptTerms = addFieldsToArchive(mappings, af); // retrieve the ordered list of mapped ExtensionProperty List<ExtensionProperty> propertyList = getOrderedMappedExtensionProperties(ext, mappedConceptTerms); // reassign indexes ordered by Extension assignIndexesOrderedByExtension(propertyList, af); // total column count is equal to id column + mapped columns int totalColumns = 1 + propertyList.size(); // 
create file name from extension name, with incremental suffix to resolve name conflicts (e.g. taxon.txt, // taxon2.txt, taxon3.txt) String extensionName = (ext.getName() == null) ? "f" : ext.getName().toLowerCase().replaceAll("\\s", "_"); String fn = createFileName(dwcaFolder, extensionName); // open new file writer for single data file File dataFile = new File(dwcaFolder, fn); Writer writer = org.gbif.utils.file.FileUtils.startNewUtf8File(dataFile); // add source file location af.addLocation(dataFile.getName()); // ready to go through each mapping and dump the data addMessage(Level.INFO, "Start writing data file for " + currExtension); try { boolean headerWritten = false; for (ExtensionMapping m : mappings) { // prepare index ordered list of all output columns apart from id column PropertyMapping[] inCols = new PropertyMapping[totalColumns]; for (ArchiveField f : af.getFields().values()) { if (f.getIndex() != null && f.getIndex() > ID_COLUMN_INDEX) { inCols[f.getIndex()] = m.getField(f.getTerm().qualifiedName()); } } // write header line 1 time only to file if (!headerWritten) { writeHeaderLine(propertyList, totalColumns, af, writer); headerWritten = true; } // write data (records) to file dumpData(writer, inCols, m, totalColumns, rowLimit, resource.getDoi()); // store record number by extension rowType recordsByExtension.put(ext.getRowType(), currRecords); } } catch (IOException e) { // some error writing this file, report log.error("Fatal DwC-A Generator Error encountered while writing header line to data file", e); // set last error report! setState(e); throw new GeneratorException("Error writing header line to data file", e); } finally { writer.close(); } // add archive file to archive if (resource.getCoreRowType() != null && resource.getCoreRowType().equalsIgnoreCase(ext.getRowType())) { archive.setCore(af); } else { archive.addExtension(af); } // final reporting addMessage(Level.INFO, "Data file written for " + currExtension + " with " + currRecords + " records and " + totalColumns + " columns"); // how many records were skipped? if (currRecordsSkipped > 0) { addMessage(Level.WARN, "!!! " + currRecordsSkipped + " records were skipped for " + currExtension + " due to errors interpreting line, or because the line was empty"); } } /** * Write the header column line to file. * * @param propertyList ordered list of all ExtensionProperty that have been mapped across all mappings for a single * Extension * @param totalColumns total number of columns in header * @param af tab file representing the core file or an extension * @param writer file writer * @throws IOException if writing the header line failed */ private void writeHeaderLine(List<ExtensionProperty> propertyList, int totalColumns, ArchiveFile af, Writer writer) throws IOException { String[] headers = new String[totalColumns]; // reserve 1st column for "id" headers[ID_COLUMN_INDEX] = ID_COLUMN_NAME; // add remaining mapped-column names int c = 1; for (ExtensionProperty property : propertyList) { headers[c] = property.simpleName(); c++; } // write the single header line String headerLine = tabRow(headers); af.setIgnoreHeaderLines(1); writer.write(headerLine); } /** * Adds EML file to DwC-A folder. 
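* The EML file is copied from the resource's data directory into the DwC-A folder and registered as the archive's metadata location.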
* * @throws GeneratorException if EML file could not be copied to DwC-A folder * @throws InterruptedException if executing thread was interrupted */ private void addEmlFile() throws GeneratorException, InterruptedException { checkForInterruption(); setState(STATE.METADATA); try { FileUtils.copyFile(dataDir.resourceEmlFile(resource.getShortname()), new File(dwcaFolder, DataDir.EML_XML_FILENAME)); archive.setMetadataLocation(DataDir.EML_XML_FILENAME); } catch (IOException e) { throw new GeneratorException("Problem occurred while adding EML file to DwC-A folder", e); } // final reporting addMessage(Level.INFO, "EML file added"); } /** * Build a new ArchiveField having a ConceptTerm, plus optional multi-value delimiter. * </br> * Since all default values will be written in the data file, they won't be expressed in the archive file (meta.xml). * That's why the default value is always set to null. * * @param term ConceptTerm * @param delimitedBy multi-value delimiter * * @return ArchiveField created */ private ArchiveField buildField(Term term, @Nullable String delimitedBy) { ArchiveField f = new ArchiveField(); f.setTerm(term); f.setDefaultValue(null); // is this term a multi-value field, and has a multi-value delimiter been configured? if (delimitedBy != null && term instanceof DwcTerm && DWC_MULTI_VALUE_TERMS.contains(term)) { f.setDelimitedBy(delimitedBy); } return f; } /** * Zips the DwC-A folder. A temp version is created first, and when successful, it is moved into the resource's * data directory. * * @throws GeneratorException if DwC-A could not be zipped or moved * @throws InterruptedException if executing thread was interrupted */ private void bundleArchive() throws GeneratorException, InterruptedException { checkForInterruption(); setState(STATE.BUNDLING); File zip = null; BigDecimal version = resource.getEmlVersion(); try { // create zip zip = dataDir.tmpFile("dwca", ".zip"); CompressionUtil.zipDir(dwcaFolder, zip); if (zip.exists()) { // move to data dir with versioned name File versionedFile = dataDir.resourceDwcaFile(resource.getShortname(), version); if (versionedFile.exists()) { FileUtils.forceDelete(versionedFile); } FileUtils.moveFile(zip, versionedFile); } else { throw new GeneratorException("Archive bundling failed: temp archive not created: " + zip.getAbsolutePath()); } } catch (IOException e) { throw new GeneratorException("Problem occurred while bundling DwC-A", e); } finally { // clean up the temp zip file, if compression was incomplete for example due to Exception // if moving zip to data dir was successful, it won't exist any more and cleanup will be skipped if (zip != null && zip.exists()) { FileUtils.deleteQuietly(zip); } } // final reporting addMessage(Level.INFO, "Archive has been compressed"); } /** * Validate the DwC-A: * -ensure that if the core record identifier is mapped (e.g. 
occurrenceID, taxonID, etc) it is present on all * rows, and is unique * * @throws GeneratorException if DwC-A could not be validated * @throws InterruptedException if executing thread was interrupted */ private void validate() throws GeneratorException, InterruptedException { checkForInterruption(); setState(STATE.VALIDATING); try { // retrieve newly generated archive - decompressed Archive arch = ArchiveFactory.openArchive(dwcaFolder); // populate basisOfRecord lookup HashMap loadBasisOfRecordMapFromVocabulary(); // perform validation on core file (includes core ID and basisOfRecord validation) validateCoreDataFile(arch.getCore(), !arch.getExtensions().isEmpty()); // extra check for event core - publish warning if there aren't any associated occurrences if (isEventCore(arch)) { validateEventCore(arch); } // perform validation on extension files if (!arch.getExtensions().isEmpty()) { validateExtensionDataFiles(arch.getExtensions()); } } catch (IOException e) { throw new GeneratorException("Problem occurred while validating DwC-A", e); } // final reporting addMessage(Level.INFO, "Archive validated"); } /** * Sort the data file of a Darwin Core Archive by a column. Sorting is case insensitive. * * @param file unsorted file * @param column column to sort the file by * * @return the data file of the Archive sorted by column * @throws IOException if the sort fails for whatever reason */ private File sortCoreDataFile(ArchiveFile file, int column) throws IOException { // retrieve the core file File unsorted = file.getLocationFile(); // create a new file that will store the records sorted by column File sorted = new File(unsorted.getParentFile(), SORTED_FILE_PREFIX + unsorted.getName()); // get the number of header rows to ignore, the column delimiter, enclosing character and newline character int headerLines = file.getIgnoreHeaderLines(); String columnDelimiter = file.getFieldsTerminatedBy(); Character enclosedBy = file.getFieldsEnclosedBy(); String newlineDelimiter = file.getLinesTerminatedBy(); // keep track of how long the sort takes long time = System.currentTimeMillis(); // sort by column LineComparator lineComparator = new LineComparator(column, columnDelimiter, enclosedBy, IGNORE_CASE_COMPARATOR); GBIF_FILE_UTILS .sort(unsorted, sorted, CHARACTER_ENCODING, column, columnDelimiter, enclosedBy, newlineDelimiter, headerLines, lineComparator, true); log.debug("Finished sorting file " + unsorted.getAbsolutePath() + " in " + String .valueOf((System.currentTimeMillis() - time) / 1000) + " secs, check: " + sorted.getAbsoluteFile().toString()); return sorted; } /** * For each extension data file: * </br> * -validate each record has an id * -validate basisOfRecord in extensions having occurrence rowType * -validate occurrenceId in extensions having occurrence rowType (if mapped) * * @param extensions Set of Archive extension data files (not core data files) * * @throws InterruptedException * @throws GeneratorException * @throws IOException */ private void validateExtensionDataFiles(Set<ArchiveFile> extensions) throws InterruptedException, GeneratorException, IOException { for (ArchiveFile extension: extensions) { validateExtensionDataFile(extension); } } /** * Populate basisOfRecords map from XML vocabulary, used to validate basisOfRecord values. 
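* Keys are converted to lower case so that basisOfRecord values can be compared case-insensitively.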
*/ private void loadBasisOfRecordMapFromVocabulary() { if (basisOfRecords == null) { basisOfRecords = new HashMap<String, String>(); basisOfRecords .putAll(vocabManager.getI18nVocab(Constants.VOCAB_URI_BASIS_OF_RECORDS, Locale.ENGLISH.getLanguage(), false)); basisOfRecords = MapUtils.getMapWithLowercaseKeys(basisOfRecords); } } /** * Validates that each record has a non-empty ID, which is used to link the extension record and core record together. * </br> * Validates that each occurrence record has an occurrenceID, and that each occurrenceID is unique. * Performs this check only if the occurrenceID term has actually been mapped. * </br> * Validates that each occurrence record has a basisOfRecord, and that each basisOfRecord matches the * DwC Type Vocabulary. * * @param extFile extension file to validate * * @throws GeneratorException if validation was interrupted due to an error * @throws InterruptedException if the thread was interrupted * @throws java.io.IOException if a problem occurred sorting file, or opening iterator on it for example */ private void validateExtensionDataFile(ArchiveFile extFile) throws GeneratorException, InterruptedException, IOException { Preconditions.checkNotNull(resource.getCoreRowType()); addMessage(Level.INFO, "Validating the extension file: " + extFile.getTitle() + ". Depending on the number of records, this can take a while."); // get the core record ID term Term id = TERM_FACTORY.findTerm(AppConfig.coreIdTerm(resource.getCoreRowType())); Term occurrenceId = TERM_FACTORY.findTerm(Constants.DWC_OCCURRENCE_ID); Term basisOfRecord = TERM_FACTORY.findTerm(Constants.DWC_BASIS_OF_RECORD); int basisOfRecordIndex = -1; if (isOccurrenceFile(extFile)) { // fail immediately if occurrence extension doesn't contain basisOfRecord mapping if (!extFile.hasTerm(basisOfRecord)) { addMessage(Level.ERROR, "Archive validation failed, because required term basisOfRecord was not mapped in the occurrence extension data file: " + extFile.getTitle()); throw new GeneratorException("Can't validate DwC-A for resource " + resource.getShortname() + ". Required term basisOfRecord was not mapped in the occurrence extension data file: " + extFile.getTitle()); } addMessage(Level.INFO, "? Validating the basisOfRecord in the occurrence extension data file is always present and its " + "value matches the Darwin Core Type Vocabulary."); if (extFile.hasTerm(occurrenceId)) { addMessage(Level.INFO, "? Validating the occurrenceId in occurrence extension data file is always present and unique. "); } else { addMessage(Level.WARN, "No occurrenceId found in occurrence extension. To be indexed by GBIF, each occurrence record within a resource must have a unique record level identifier."); } // find index of basisOfRecord basisOfRecordIndex = extFile.getField(basisOfRecord).getIndex(); } // validate the extension ID has been mapped if (extFile.getId() == null) { addMessage(Level.ERROR, "Archive validation failed, because the ID field " + id.simpleName() + " was not mapped in the extension data file: " + extFile.getTitle()); throw new GeneratorException("Can't validate DwC-A for resource " + resource.getShortname() + ". The ID field was not mapped in the extension data file: " + extFile.getTitle()); } addMessage(Level.INFO, "? Validating the ID field " + id.simpleName() + " is always present in extension data file. 
"); // find index of column to sort file by - use occurrenceId term index if mapped, ID column otherwise int sortColumnIndex = (extFile.hasTerm(occurrenceId) && extFile.getField(occurrenceId).getIndex() != null) ? extFile.getField(occurrenceId).getIndex() : ID_COLUMN_INDEX; // create a sorted data file File sortedFile = sortCoreDataFile(extFile, sortColumnIndex); // create an iterator on the new sorted data file CSVReader reader = CSVReaderFactory .build(sortedFile, CHARACTER_ENCODING, extFile.getFieldsTerminatedBy(), extFile.getFieldsEnclosedBy(), extFile.getIgnoreHeaderLines()); // metrics int recordsWithNoId = 0; AtomicInteger recordsWithNoOccurrenceId = new AtomicInteger(0); AtomicInteger recordsWithDuplicateOccurrenceId = new AtomicInteger(0); AtomicInteger recordsWithNoBasisOfRecord = new AtomicInteger(0); AtomicInteger recordsWithNonMatchingBasisOfRecord = new AtomicInteger(0); AtomicInteger recordsWithAmbiguousBasisOfRecord = new AtomicInteger(0); ClosableReportingIterator<String[]> iter = null; int line = 0; String lastId = null; try { iter = reader.iterator(); while (iter.hasNext()) { line++; if (line % 1000 == 0) { checkForInterruption(line); reportIfNeeded(); } String[] record = iter.next(); if (record == null || record.length == 0) { continue; } // Exception on reading row was encountered if (iter.hasRowError() && iter.getException() != null) { throw new GeneratorException( "A fatal error was encountered while trying to validate sorted extension data file: " + iter .getErrorMessage(), iter.getException()); } else { // check id exists if (Strings.isNullOrEmpty(record[ID_COLUMN_INDEX])) { recordsWithNoId++; } if (isOccurrenceFile(extFile)) { if (extFile.hasTerm(occurrenceId)) { lastId = validateIdentifier(record[sortColumnIndex], lastId, recordsWithNoOccurrenceId, recordsWithDuplicateOccurrenceId); } validateBasisOfRecord(record[basisOfRecordIndex], line, recordsWithNoBasisOfRecord, recordsWithNonMatchingBasisOfRecord, recordsWithAmbiguousBasisOfRecord); } } } } catch (InterruptedException e) { // set last error report! setState(e); throw e; } catch (Exception e) { // some error validating this file, report log.error("Exception caught while validating extension file", e); // set last error report! setState(e); throw new GeneratorException("Error while validating extension file occurred on line " + line, e); } finally { if (iter != null) { // Exception on advancing cursor was encountered? if (!iter.hasRowError() && iter.getErrorMessage() != null) { writePublicationLogMessage("Error reading data: " + iter.getErrorMessage()); } iter.close(); } // always cleanup the sorted file, it must not be included in the dwca directory when compressed FileUtils.deleteQuietly(sortedFile); } // some final reporting.. if (recordsWithNoId > 0) { addMessage(Level.ERROR, String.valueOf(recordsWithNoId) + " line(s) in extension missing an ID " + id.simpleName() + ", which is required when linking the extension record and core record together"); throw new GeneratorException( "Can't validate DwC-A for resource " + resource.getShortname() + ". 
Each line in extension must have an ID " + id.simpleName() + ", which is required in order to link the extension to the core "); } else { addMessage(Level.INFO, "\u2713 Validated each line in extension has an ID " + id.simpleName()); writePublicationLogMessage("No lines in extension are missing an ID " + id.simpleName()); } if (isOccurrenceFile(extFile)) { if (extFile.hasTerm(occurrenceId)) { summarizeIdentifierValidation(recordsWithNoOccurrenceId, recordsWithDuplicateOccurrenceId, occurrenceId.simpleName()); } summarizeBasisOfRecordValidation(recordsWithNoBasisOfRecord, recordsWithNonMatchingBasisOfRecord, recordsWithAmbiguousBasisOfRecord); } } /** * Validate the Archive's core data file has an ID for each row, and that each ID is unique. Perform this check * only if the core record ID term (e.g. occurrenceID, taxonID, etc) has actually been mapped, or if the archive has * extensions (the ID is then required to link extension records to core records). * </br> * If the core has rowType occurrence, validate the core data file has a basisOfRecord for each row, and * that each basisOfRecord matches the DwC Type Vocabulary. * </br> * If the core has rowType event, validate there are associated occurrences. * * @param coreFile core ArchiveFile * @param archiveHasExtensions true if Archive has extensions, false otherwise * * @throws GeneratorException if validation was interrupted due to an error * @throws InterruptedException if the thread was interrupted * @throws java.io.IOException if a problem occurred sorting core file, or opening iterator on it for example */ private void validateCoreDataFile(ArchiveFile coreFile, boolean archiveHasExtensions) throws GeneratorException, InterruptedException, IOException { Preconditions.checkNotNull(resource.getCoreRowType()); addMessage(Level.INFO, "Validating the core file: " + coreFile.getTitle() + ". Depending on the number of records, this can take a while."); // get the core record ID term Term id = TERM_FACTORY.findTerm(AppConfig.coreIdTerm(resource.getCoreRowType())); Term basisOfRecord = TERM_FACTORY.findTerm(Constants.DWC_BASIS_OF_RECORD); int basisOfRecordIndex = -1; if (isOccurrenceFile(coreFile)) { // fail immediately if occurrence core doesn't contain basisOfRecord mapping if (!coreFile.hasTerm(basisOfRecord)) { addMessage(Level.ERROR, "Archive validation failed, because required term basisOfRecord was not mapped in the occurrence core"); throw new GeneratorException("Can't validate DwC-A for resource " + resource.getShortname() + ". Required term basisOfRecord was not mapped in the occurrence core"); } addMessage(Level.INFO, "? Validating the core basisOfRecord is always present and its " + "value matches the Darwin Core Type Vocabulary."); // find index of basisOfRecord basisOfRecordIndex = coreFile.getField(basisOfRecord).getIndex(); } // validate the core ID / record identifier (e.g. occurrenceID, taxonID) if it has been mapped if (coreFile.hasTerm(id) || archiveHasExtensions) { String msg = "? Validating the core ID field " + id.simpleName() + " is always present and unique."; if (archiveHasExtensions) { msg = msg + " Note: the core ID field is required to link core records and extension records together. 
"; } addMessage(Level.INFO, msg); } // create a new core data file sorted by ID column 0 File sortedCore = sortCoreDataFile(coreFile, ID_COLUMN_INDEX); // create an iterator on the new sorted core data file CSVReader reader = CSVReaderFactory .build(sortedCore, CHARACTER_ENCODING, coreFile.getFieldsTerminatedBy(), coreFile.getFieldsEnclosedBy(), coreFile.getIgnoreHeaderLines()); // metrics AtomicInteger recordsWithNoId = new AtomicInteger(0); AtomicInteger recordsWithDuplicateId = new AtomicInteger(0); AtomicInteger recordsWithNoBasisOfRecord = new AtomicInteger(0); AtomicInteger recordsWithNonMatchingBasisOfRecord = new AtomicInteger(0); AtomicInteger recordsWithAmbiguousBasisOfRecord = new AtomicInteger(0); ClosableReportingIterator<String[]> iter = null; int line = 0; String lastId = null; try { iter = reader.iterator(); while (iter.hasNext()) { line++; if (line % 1000 == 0) { checkForInterruption(line); reportIfNeeded(); } String[] record = iter.next(); if (record == null || record.length == 0) { continue; } // Exception on reading row was encountered if (iter.hasRowError() && iter.getException() != null) { throw new GeneratorException( "A fatal error was encountered while trying to validate sorted core data file: " + iter.getErrorMessage(), iter.getException()); } else { // validate record id if it is mapped, or if archive has extensions (required to link core to extension) if (coreFile.hasTerm(id) || archiveHasExtensions) { lastId = validateIdentifier(record[ID_COLUMN_INDEX], lastId, recordsWithNoId, recordsWithDuplicateId); } if (isOccurrenceFile(coreFile)) { validateBasisOfRecord(record[basisOfRecordIndex], line, recordsWithNoBasisOfRecord, recordsWithNonMatchingBasisOfRecord, recordsWithAmbiguousBasisOfRecord); } } } } catch (InterruptedException e) { // set last error report! setState(e); throw e; } catch (Exception e) { // some error validating this file, report log.error("Exception caught while validating archive", e); // set last error report! setState(e); throw new GeneratorException("Error while validating archive occurred on line " + line, e); } finally { if (iter != null) { // Exception on advancing cursor was encountered? if (!iter.hasRowError() && iter.getErrorMessage() != null) { writePublicationLogMessage("Error reading data: " + iter.getErrorMessage()); } iter.close(); } // always cleanup the sorted file, it must not be included in the dwca directory when compressed FileUtils.deleteQuietly(sortedCore); } // some final reporting.. if (coreFile.hasTerm(id) || archiveHasExtensions) { summarizeIdentifierValidation(recordsWithNoId, recordsWithDuplicateId, id.simpleName()); } if (isOccurrenceFile(coreFile)) { summarizeBasisOfRecordValidation(recordsWithNoBasisOfRecord, recordsWithNonMatchingBasisOfRecord, recordsWithAmbiguousBasisOfRecord); } } /** * Check id exists, and check that the id is unique, using case insensitive comparison against another id, * e.g. FISHES:1 and fishes:1 are equal. * * @param id identifier value * @param lastId identifier value from last iteration * @param recordsWithNoId number of records with no id so far * @param recordsWithDuplicateId number of records with duplicate ids so far * * @return identifier value */ private String validateIdentifier(String id, String lastId, AtomicInteger recordsWithNoId, AtomicInteger recordsWithDuplicateId) { // check id exists if (Strings.isNullOrEmpty(id)) { recordsWithNoId.getAndIncrement(); } // check id is unique, using case insensitive comparison. E.g. 
FISHES:1 and fishes:1 are equal if (!Strings.isNullOrEmpty(lastId) && !Strings.isNullOrEmpty(id)) { if (id.equalsIgnoreCase(lastId)) { writePublicationLogMessage("Duplicate id found: " + id); recordsWithDuplicateId.getAndIncrement(); } } // set so id gets compared on next iteration return id; } /** * Check basisOfRecord exists, and check basisOfRecord matches vocabulary (lower case comparison), * e.g. 'specimen' and 'Specimen' are considered equal. Lastly, check basisOfRecord matches ambiguous "occurrence" * (lower case comparison). * * @param bor basisOfRecord value * @param line line/record number * @param recordsWithNoBasisOfRecord number of records with no basisOfRecord so far * @param recordsWithNonMatchingBasisOfRecord number of records with basisOfRecord not matching vocabulary so far * @param recordsWithAmbiguousBasisOfRecord number of records with ambiguous basisOfRecord so far */ private void validateBasisOfRecord(String bor, int line, AtomicInteger recordsWithNoBasisOfRecord, AtomicInteger recordsWithNonMatchingBasisOfRecord, AtomicInteger recordsWithAmbiguousBasisOfRecord) { // check basisOfRecord exists if (Strings.isNullOrEmpty(bor)) { recordsWithNoBasisOfRecord.getAndIncrement(); } else { // check basisOfRecord matches vocabulary (lower case comparison), e.g. 'specimen' and 'Specimen' are considered equal if (!basisOfRecords.containsKey(bor.toLowerCase())) { writePublicationLogMessage("Line #" + String.valueOf(line) + " has basisOfRecord [" + bor + "] that does not match the Darwin Core Type Vocabulary"); recordsWithNonMatchingBasisOfRecord.getAndIncrement(); } // check basisOfRecord matches ambiguous "occurrence" (lower case comparison) else if (bor.equalsIgnoreCase("occurrence")) { recordsWithAmbiguousBasisOfRecord.getAndIncrement(); } } } /** * Check if event core has an occurrence mapping, with at least one associated occurrence. Otherwise publish * warning message. * * @param arch Archive */ private void validateEventCore(Archive arch) throws GeneratorException { boolean validEventCore = true; // test if occurrence extension mapped ArchiveFile occurrenceExtension = arch.getExtension(DwcTerm.Occurrence); if (occurrenceExtension == null) { validEventCore = false; } // test if it has at least one record else { if (!occurrenceExtension.iterator().hasNext()) { validEventCore = false; } } if (!validEventCore) { addMessage(Level.WARN, "The sampling event resource has no associated occurrences."); } } /** * Report basisOfRecord validation (shared by two methods: 1. validateExtensionDataFile * 2. validateCoreDataFile). 
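* Validation fails if any record is missing a basisOfRecord or has a basisOfRecord outside the Darwin Core Type Vocabulary; the ambiguous value "occurrence" only triggers a warning.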
* * @param recordsWithNoBasisOfRecord number of records with no basisOfRecord * @param recordsWithNonMatchingBasisOfRecord number of records with basisOfRecord not matching DwC Type Vocabulary * @param recordsWithAmbiguousBasisOfRecord number of records with basisOfRecord equal to 'occurrence' * * @throws GeneratorException if validation threshold exceeded */ private void summarizeBasisOfRecordValidation(AtomicInteger recordsWithNoBasisOfRecord, AtomicInteger recordsWithNonMatchingBasisOfRecord, AtomicInteger recordsWithAmbiguousBasisOfRecord) throws GeneratorException { // add empty BoR user message if (recordsWithNoBasisOfRecord.get() > 0) { addMessage(Level.ERROR, String.valueOf(recordsWithNoBasisOfRecord) + " line(s) are missing a basisOfRecord"); } else { writePublicationLogMessage("No lines are missing a basisOfRecord"); } // add non matching BoR user message if (recordsWithNonMatchingBasisOfRecord.get() > 0) { addMessage(Level.ERROR, String.valueOf(recordsWithNonMatchingBasisOfRecord) + " line(s) have basisOfRecord that does not match the Darwin Core Type Vocabulary " + "(please note comparisons are case insensitive)"); } else { writePublicationLogMessage("All lines have basisOfRecord that matches the Darwin Core Type Vocabulary"); } // add ambiguous BoR user message if (recordsWithAmbiguousBasisOfRecord.get() > 0) { addMessage(Level.WARN, String.valueOf(recordsWithAmbiguousBasisOfRecord) + " line(s) use ambiguous basisOfRecord 'occurrence'. It is advised that occurrence be " + "reserved for cases when the basisOfRecord is unknown. Otherwise, a more specific " + "basisOfRecord should be chosen."); } else { writePublicationLogMessage("No lines have ambiguous basisOfRecord 'occurrence'."); } // if there was 1 or more records missing a basisOfRecord, or having a non matching basisOfRecord, validation fails if (recordsWithNoBasisOfRecord.get() == 0 && recordsWithNonMatchingBasisOfRecord.get() == 0) { addMessage(Level.INFO, "✓ Validated each line has a basisOfRecord, and each basisOfRecord matches the Darwin Core Type Vocabulary"); } else { addMessage(Level.ERROR, "Archive validation failed, because not every row in the occurrence file(s) has a valid basisOfRecord " + "(please note all basisOfRecord must match Darwin Core Type Vocabulary, and comparisons are case " + "insensitive)"); throw new GeneratorException("Can't validate DwC-A for resource " + resource.getShortname() + ". Each row in the occurrence file(s) must have a basisOfRecord, and each " + "basisOfRecord must match the Darwin Core Type Vocabulary (please note " + "comparisons are case insensitive)"); } } /** * Report identifier validation (shared by two methods 1. validateOccurrenceDataFile(ArchiveFile archiveFile) * 2. validateCoreDataFile(Archive arch). 
* * @param recordsWithNoId number of records with no id * @param recordsWithDuplicateId number of records with duplicate ids * @param term name of identifier term being validated * * @throws GeneratorException if validation threshold exceeded */ private void summarizeIdentifierValidation(AtomicInteger recordsWithNoId, AtomicInteger recordsWithDuplicateId, String term) throws GeneratorException { // add empty ids user message if (recordsWithNoId.get() > 0) { addMessage(Level.ERROR, String.valueOf(recordsWithNoId) + " line(s) missing " + term); } else { writePublicationLogMessage("No lines are missing " + term); } // add duplicate ids user message if (recordsWithDuplicateId.get() > 0) { addMessage(Level.ERROR, String.valueOf(recordsWithDuplicateId) + " line(s) having a duplicate " + term + " (please note comparisons are case insensitive)"); } else { writePublicationLogMessage("No lines have duplicate " + term); } // if there was 1 or more records missing an ID, or having a duplicate ID, validation fails if (recordsWithNoId.get() == 0 && recordsWithDuplicateId.get() == 0) { addMessage(Level.INFO, "✓ Validated each line has a " + term + ", and each " + term + " is unique"); } else { addMessage(Level.ERROR, "Archive validation failed, because not every line has a unique " + term + " (please note comparisons are case insensitive)"); throw new GeneratorException( "Can't validate DwC-A for resource " + resource.getShortname() + ". Each line must have a " + term + ", and each " + term + " must be unique (please note comparisons are case insensitive)"); } } /** * @return true if the file has occurrence rowType. */ private boolean isOccurrenceFile(ArchiveFile archiveFile) { return archiveFile.getRowType().equals(DwcTerm.Occurrence); } /** * @return true if the archive core file has event rowType. */ private boolean isEventCore(Archive arch) { return arch.getCore().getRowType().equals(DwcTerm.Event); } /** * Method responsible for all stages of DwC-A file generation. * * @return number of records published in core file * @throws GeneratorException if DwC-A generation fails for any reason */ public Map<String, Integer> call() throws Exception { try { checkForInterruption(); setState(STATE.STARTED); // initial reporting addMessage(Level.INFO, "Archive generation started for version #" + String.valueOf(resource.getEmlVersion())); // create a temp dir to copy all dwca files to dwcaFolder = dataDir.tmpDir(); archive = new Archive(); // create data files createDataFiles(); // copy eml file addEmlFile(); // create meta.xml createMetaFile(); // perform some validation, e.g. ensure all core record identifiers are present and unique validate(); // zip archive and copy to resource folder bundleArchive(); // reporting addMessage(Level.INFO, "Archive version #" + String.valueOf(resource.getEmlVersion()) + " generated successfully!"); // set final state setState(STATE.COMPLETED); return recordsByExtension; } catch (GeneratorException e) { // set last error report! 
setState(e); // write exception to publication log file when IPT is in debug mode, otherwise just log it if (cfg.debug()) { writeFailureToPublicationLog(e); } else { log.error( "Exception occurred trying to generate Darwin Core Archive for resource " + resource.getTitleAndShortname() + ": " + e.getMessage(), e); } // rethrow exception, which gets wrapped in an ExecutionException and re caught when calling Future.get throw e; } catch (InterruptedException e) { setState(e); writeFailureToPublicationLog(e); throw e; } catch (Exception e) { setState(e); writeFailureToPublicationLog(e); throw new GeneratorException(e); } finally { // cleanup temp dir that was used to store dwca files if (dwcaFolder != null && dwcaFolder.exists()) { FileUtils.deleteQuietly(dwcaFolder); } // ensure publication log writer is closed closePublicationLogWriter(); } } /** * Checks if the executing thread has been interrupted, i.e. DwC-A generation was cancelled. * * @throws InterruptedException if the thread was found to be interrupted */ private void checkForInterruption() throws InterruptedException { if (Thread.interrupted()) { StatusReport report = report(); String msg = "Interrupting dwca generator. Last status: " + report.getState(); log.info(msg); throw new InterruptedException(msg); } } /** * Checks if the executing thread has been interrupted, i.e. DwC-A generation was cancelled. * * @param line number of lines currently processed at the time of the check * @throws InterruptedException if the thread was found to be interrupted */ private void checkForInterruption(int line) throws InterruptedException { if (Thread.interrupted()) { StatusReport report = report(); String msg = "Interrupting dwca generator at line " + line + ". Last status: " + report.getState(); log.info(msg); throw new InterruptedException(msg); } } @Override protected boolean completed() { return STATE.COMPLETED == this.state; } /** * Create data files. * * @throws GeneratorException if the resource had no core file that was mapped * @throws InterruptedException if the thread was interrupted */ private void createDataFiles() throws GeneratorException, InterruptedException { checkForInterruption(); setState(STATE.DATAFILES); if (!resource.hasCore() || resource.getCoreRowType() == null || resource.getCoreMappings().get(0).getSource() == null) { throw new GeneratorException("Core is not mapped"); } for (Extension ext : resource.getMappedExtensions()) { report(); try { addDataFile(resource.getMappings(ext.getRowType()), null); } catch (IOException e) { throw new GeneratorException("Problem occurred while writing data file", e); } catch (IllegalArgumentException e) { throw new GeneratorException("Problem occurred while writing data file", e); } } // final reporting addMessage(Level.INFO, "All data files completed"); report(); } /** * Create meta.xml file. 
* * @throws GeneratorException if meta.xml file creation failed * @throws InterruptedException if the thread was interrupted */ private void createMetaFile() throws GeneratorException, InterruptedException { checkForInterruption(); setState(STATE.METADATA); try { MetaDescriptorWriter.writeMetaFile(new File(dwcaFolder, "meta.xml"), archive); } catch (IOException e) { throw new GeneratorException("Meta.xml file could not be written", e); } // final reporting addMessage(Level.INFO, "meta.xml archive descriptor written"); } /* * (non-Javadoc) * @see org.gbif.ipt.task.ReportingTask#currentException() */ @Override protected Exception currentException() { return exception; } @Override protected String currentState() { switch (state) { case WAITING: return "Not started yet"; case STARTED: return "Starting archive generation"; case DATAFILES: return "Processing record " + currRecords + " for data file <em>" + currExtension + "</em>"; case METADATA: return "Creating metadata files"; case BUNDLING: return "Compressing archive"; case COMPLETED: return "Archive generated!"; case VALIDATING: return "Validating archive"; case ARCHIVING: return "Archiving version of archive"; case CANCELLED: return CANCELLED_STATE_MSG; case FAILED: return "Failed. Fatal error!"; default: return "You should never see this"; } } /** * Write data file for mapping. * * @param writer file writer for single data file * @param inCols index ordered list of all output columns apart from id column * @param mapping mapping * @param dataFileRowSize number of columns in data file * @param rowLimit maximum number of rows to write * @throws GeneratorException if there was an error writing data file for mapping. * @throws InterruptedException if the thread was interrupted */ private void dumpData(Writer writer, PropertyMapping[] inCols, ExtensionMapping mapping, int dataFileRowSize, @Nullable Integer rowLimit, @Nullable DOI doi) throws GeneratorException, InterruptedException { final String idSuffix = StringUtils.trimToEmpty(mapping.getIdSuffix()); final RecordFilter filter = mapping.getFilter(); // get maximum column index to check incoming rows for correctness int maxColumnIndex = mapping.getIdColumn() == null ? -1 : mapping.getIdColumn(); for (PropertyMapping pm : mapping.getFields()) { if (pm.getIndex() != null && maxColumnIndex < pm.getIndex()) { maxColumnIndex = pm.getIndex(); } } int recordsWithError = 0; int linesWithWrongColumnNumber = 0; int recordsFiltered = 0; int emptyLines = 0; ClosableReportingIterator<String[]> iter = null; int line = 0; try { // get the source iterator iter = sourceManager.rowIterator(mapping.getSource()); while (iter.hasNext()) { line++; if (line % 1000 == 0) { checkForInterruption(line); reportIfNeeded(); } String[] in = iter.next(); if (in == null || in.length == 0) { continue; } // Exception on reading row was encountered, meaning record is incomplete and not written if (iter.hasRowError()) { writePublicationLogMessage("Error reading line #" + line + "\n" + iter.getErrorMessage()); recordsWithError++; currRecordsSkipped++; } // empty line was encountered, meaning record only contains empty values and not written else if (isEmptyLine(in)) { writePublicationLogMessage("Empty line was skipped. SourceBase:" + mapping.getSource().getName() + " Line #" + line + ": " + printLine(in)); emptyLines++; currRecordsSkipped++; } else { if (in.length <= maxColumnIndex) { writePublicationLogMessage("Line with fewer columns than mapped. 
SourceBase:" + mapping.getSource().getName() + " Line #" + line + " has " + in.length + " Columns: " + printLine(in)); // input row is smaller than the highest mapped column. Resize array by adding nulls String[] in2 = new String[maxColumnIndex + 1]; System.arraycopy(in, 0, in2, 0, in.length); in = in2; linesWithWrongColumnNumber++; } String[] record = new String[dataFileRowSize]; // filter this record? boolean alreadyTranslated = false; if (filter != null && filter.getColumn() != null && filter.getComparator() != null && filter.getParam() != null) { boolean matchesFilter; if (filter.getFilterTime() == RecordFilter.FilterTime.AfterTranslation) { applyTranslations(inCols, in, record, mapping.isDoiUsedForDatasetId(), doi); matchesFilter = filter.matches(in); alreadyTranslated = true; } else { matchesFilter = filter.matches(in); } if (!matchesFilter) { writePublicationLogMessage("Line did not match the filter criteria and was skipped. SourceBase:" + mapping.getSource().getName() + " Line #" + line + ": " + printLine(in)); recordsFiltered++; continue; } } // add id column - either an existing column or the line number if (mapping.getIdColumn() == null) { record[ID_COLUMN_INDEX] = null; } else if (mapping.getIdColumn().equals(ExtensionMapping.IDGEN_LINE_NUMBER)) { record[ID_COLUMN_INDEX] = line + idSuffix; } else if (mapping.getIdColumn().equals(ExtensionMapping.IDGEN_UUID)) { record[ID_COLUMN_INDEX] = UUID.randomUUID().toString(); } else if (mapping.getIdColumn() >= 0) { record[ID_COLUMN_INDEX] = (Strings.isNullOrEmpty(in[mapping.getIdColumn()])) ? idSuffix : in[mapping.getIdColumn()] + idSuffix; } // go through all archive fields if (!alreadyTranslated) { applyTranslations(inCols, in, record, mapping.isDoiUsedForDatasetId(), doi); } String newRow = tabRow(record); if (newRow != null) { writer.write(newRow); currRecords++; // don't exceed row limit (e.g. only want to write X number of rows used to preview first X rows of file) if (rowLimit != null && currRecords >= rowLimit) { break; } } } } } catch (InterruptedException e) { // set last error report! setState(e); throw e; } catch (Exception e) { // some error writing this file, report log.error("Fatal DwC-A Generator Error encountered", e); // set last error report! setState(e); throw new GeneratorException("Error writing data file for mapping " + mapping.getExtension().getTitle() + " in source " + mapping.getSource().getName() + ", line " + line, e); } finally { if (iter != null) { // Exception on advancing cursor encountered? 
if (!iter.hasRowError() && iter.getErrorMessage() != null) { writePublicationLogMessage("Error reading data: " + iter.getErrorMessage()); } iter.close(); } } // common message part used in constructing all reporting messages below String mp = " for mapping " + mapping.getExtension().getTitle() + " in source " + mapping.getSource().getName(); // add lines incomplete message if (recordsWithError > 0) { addMessage(Level.WARN, String.valueOf(recordsWithError) + " record(s) skipped due to errors" + mp); } else { writePublicationLogMessage("No lines were skipped due to errors" + mp); } // add empty lines message if (emptyLines > 0) { addMessage(Level.WARN, String.valueOf(emptyLines) + " empty line(s) skipped" + mp); } else { writePublicationLogMessage("No empty lines were skipped" + mp); } // add wrong lines user message if (linesWithWrongColumnNumber > 0) { addMessage(Level.WARN, String.valueOf(linesWithWrongColumnNumber) + " line(s) with fewer columns than mapped" + mp); } else { writePublicationLogMessage("No lines with fewer columns than mapped" + mp); } // add filter message if (recordsFiltered > 0) { addMessage(Level.INFO, String.valueOf(recordsFiltered) + " line(s) did not match the filter criteria and got skipped" + mp); } else { writePublicationLogMessage("All lines match the filter criteria" + mp); } } /** * Sets an exception and state of the worker to FAILED. The final StatusReport is generated at the end. * * @param e exception */ private void setState(Exception e) { exception = e; state = (exception instanceof InterruptedException) ? STATE.CANCELLED : STATE.FAILED; report(); } /** * Sets only the state of the worker. The final StatusReport is generated at the end. * * @param s STATE of worker */ private void setState(STATE s) { state = s; report(); } /** * Generates a single tab delimited row from the list of values of the provided array. * </br> * Note all tab and line breaking characters in the value get replaced with a space before it is added to the row. * </br> * The row ends in a newline character. * * @param columns the array of values to join together, may not be null * * @return the tab delimited String, {@code null} if provided array only contained null values */ @VisibleForTesting protected String tabRow(String[] columns) { Preconditions.checkNotNull(columns); boolean empty = true; for (int i = 0; i < columns.length; i++) { if (columns[i] != null) { empty = false; columns[i] = StringUtils.trimToNull(escapeChars.matcher(columns[i]).replaceAll(" ")); } } if (empty) { return null; } return StringUtils.join(columns, '\t') + "\n"; } /** * Apply translations or default values to row, for all mapped properties. * </br> * The method starts by iterating through all mapped properties, checking whether each one has a translation or a * default value provided. The original value in the row is then replaced with the translated or default value. * A record array representing the values to be written to the data file is also updated. 
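* If a property maps the datasetID term and the resource DOI is configured to be used as its value, the DOI is written instead.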
* * @param inCols values array, of columns in row that have been mapped * @param in values array, of all columns in row * @param doiUsedForDatasetId true if mapping should use resource DOI as datasetID, false otherwise * @param doi DOI assigned to resource */ private void applyTranslations(PropertyMapping[] inCols, String[] in, String[] record, boolean doiUsedForDatasetId, DOI doi) { for (int i = 1; i < inCols.length; i++) { PropertyMapping pm = inCols[i]; String val = null; if (pm != null) { if (pm.getIndex() != null) { val = in[pm.getIndex()]; // translate value? if (pm.getTranslation() != null && pm.getTranslation().containsKey(val)) { val = pm.getTranslation().get(val); // update value in original record in[pm.getIndex()] = val; } } // use default value for null values if (val == null) { val = pm.getDefaultValue(); } // use DOI for datasetID property? if (pm.getTerm().qualifiedName().equalsIgnoreCase(Constants.DWC_DATASET_ID) && doiUsedForDatasetId && doi != null) { val = doi.toString(); } } // add value to data file record record[i] = val; } } /** * Print a line representation of a string array used for logging. * * @param in String array * @return line */ private String printLine(String[] in) { StringBuilder sb = new StringBuilder(); sb.append("["); for (int i = 0; i < in.length; i++) { sb.append(in[i]); if (i != in.length - 1) { sb.append("; "); } } sb.append("]"); return sb.toString(); } /** * Write message from exception to publication log file as a new line but suffocate any exception thrown. * * @param e exception to write message from */ private void writeFailureToPublicationLog(Throwable e) { StringBuilder sb = new StringBuilder(); sb.append("Archive generation failed!\n"); // write exception as nicely formatted string StringWriter sw = new StringWriter(); e.printStackTrace(new PrintWriter(sw)); sb.append(sw.toString()); // write to publication log file writePublicationLogMessage(sb.toString()); } /** * First we need to find the union of all terms mapped (in all files) for a single Extension. Then make each mapped * term a field in the final archive. Static/default mappings are not stored for a field, since they are not * expressed in meta.xml but instead get written to the data file. * * @param mappings list of ExtensionMapping * @param af ArchiveFile * * @return set of conceptTerms that have been mapped (in all files) for a single Extension */ private Set<Term> addFieldsToArchive(List<ExtensionMapping> mappings, ArchiveFile af) throws GeneratorException{ Set<Term> mappedConceptTerms = new HashSet<Term>(); for (ExtensionMapping m : mappings) { // multi-value field delimiter, part of each source data configuration String delimitedBy = StringUtils.trimToNull(m.getSource().getMultiValueFieldsDelimitedBy()); for (PropertyMapping pm : m.getFields()) { Term term = TERM_FACTORY.findTerm(pm.getTerm().qualifiedName()); // ensure Extension has concept term if (term != null && m.getExtension().getProperty(term) != null) { if (af.hasTerm(term)) { ArchiveField field = af.getField(term); mappedConceptTerms.add(term); // multi-value delimiter must be same across all sources if (field.getDelimitedBy() != null && !field.getDelimitedBy().equals(delimitedBy)) { throw new GeneratorException( "More than one type of multi-value field delimiter is being used in the source files mapped to the " + m.getExtension().getName() + " extension. 
Please either ensure all source files mapped to this extension use the same delimiter, or just leave the delimiter blank."); } } else { if ((pm.getIndex() != null && pm.getIndex() >= 0) || pm.getIndex() == null) { log.debug( "Handling property mapping for term: " + term.qualifiedName() + " (index " + pm.getIndex() + ")"); af.addField(buildField(term, delimitedBy)); mappedConceptTerms.add(term); } } } } // if Extension has datasetID concept term, check if resource DOI should be used as value for mapping ExtensionProperty ep = m.getExtension().getProperty(DwcTerm.datasetID.qualifiedName()); if (ep != null && m.isDoiUsedForDatasetId()) { log.debug("Detected that resource DOI is to be used as value for datasetID mapping.."); // include datasetID field in ArchiveFile ArchiveField f = buildField(DwcTerm.datasetID, null); af.addField(f); // include datasetID field mapping in ExtensionMapping PropertyMapping pm = new PropertyMapping(f); pm.setTerm(ep); m.getFields().add(pm); // include datasetID in set of all terms mapped for Extension mappedConceptTerms.add(DwcTerm.datasetID); } } return mappedConceptTerms; } /** * Iterate through ordered list of those ExtensionProperty that have been mapped, and reassign the ArchiveFile * ArchiveField indexes, based on the order of their appearance in the ordered list, being careful to reserve index 0 * for the ID column. * * @param propertyList ordered list of those ExtensionProperty that have been mapped * @param af ArchiveFile */ private void assignIndexesOrderedByExtension(List<ExtensionProperty> propertyList, ArchiveFile af) { for (int propertyIndex = 0; propertyIndex < propertyList.size(); propertyIndex++) { ExtensionProperty extensionProperty = propertyList.get(propertyIndex); // retrieve the dwc-api Term corresponding to ExtensionProperty Term term = TERM_FACTORY.findTerm(extensionProperty.getQualname()); // lookup ArchiveField using dwc-api Term ArchiveField f = af.getField(term); if (f != null && f.getIndex() == null) { // create new field index corresponding to its position in ordered list of columns indexed // +1 because index 0 is reserved for ID column int fieldIndex = propertyIndex + 1; // assign ArchiveField new index so that meta.xml file mirrors the ordered field order f.setIndex(fieldIndex); } else { log.warn("Skipping ExtensionProperty: " + extensionProperty.getQualname()); } } } /** * Retrieve the ordered list of all Extension's mapped ExtensionProperty. Ordering is done according to Extension. 
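* The resulting order determines the column order of the generated data file (see assignIndexesOrderedByExtension).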
* * @param ext Extension * @param mappedConceptTerms set of all mapped ConceptTerm * @return ordered list of mapped ExtensionProperty */ private List<ExtensionProperty> getOrderedMappedExtensionProperties(Extension ext, Set<Term> mappedConceptTerms) { List<ExtensionProperty> propertyList = new ArrayList<ExtensionProperty>(); // start with all Extension's ExtensionProperty, in natural order propertyList.addAll(ext.getProperties()); // matching (below) should be done on the qualified Normalised Name Set<String> names = new HashSet<String>(); for (Term conceptTerm : mappedConceptTerms) { names.add(conceptTerm.qualifiedName()); } // remove all ExtensionProperty that have not been mapped, leaving the ordered list of those that have been for (Iterator<ExtensionProperty> iterator = propertyList.iterator(); iterator.hasNext();) { ExtensionProperty extensionProperty = iterator.next(); if (extensionProperty.qualifiedName() != null) { if (!names.contains(extensionProperty.qualifiedName())) { iterator.remove(); } } } return propertyList; } /** * This method checks whether a competing file name exists in the folder where DwC-A files are written to. * If a competing file name exists, a numerical suffix is appended to the file name, to differentiate it from the * existing files' names. The numerical suffix is incrementing, and is equal to the number of existing files with * this name. * </br> * E.g. the initial name has no suffix (taxon.txt), but subsequent names look like (taxon2.txt, taxon3.txt, etc). * * Before IPT v2.2 the DwC-A file name has been determined from the extension name. When two extensions had the same * name, this caused one file to be overwritten - see Issue 1087. * * @param dwcaFolder folder where DwC-A files are written to * @param extensionName name of extension writing file for * * @return name of file for DwC-A file to be written */ protected String createFileName(File dwcaFolder, String extensionName) { String wildcard = extensionName + WILDCARD_CHARACTER + TEXT_FILE_EXTENSION; FileFilter fileFilter = new WildcardFileFilter(wildcard, IOCase.INSENSITIVE); File[] files = dwcaFolder.listFiles(fileFilter); if (files.length > 0) { int max = 1; String fileName = null; for (File file: files) { try { fileName = file.getName(); int suffixEndIndex = fileName.indexOf(TEXT_FILE_EXTENSION); String suffix = file.getName().substring(extensionName.length(), suffixEndIndex); int suffixInt = Integer.valueOf(suffix); if (suffixInt >= max) { max = suffixInt; } } catch (NumberFormatException e) { log.debug("No numerical suffix could be parsed from file name: " + Strings.nullToEmpty(fileName)); } } return extensionName + String.valueOf(max + 1) + TEXT_FILE_EXTENSION; } return extensionName + TEXT_FILE_EXTENSION; } /** * Required for preview mapping feature, on manage resource page. * * @param dwcaFolder DwC-A directory */ public void setDwcaFolder(File dwcaFolder) { this.dwcaFolder = dwcaFolder; } /** * Required for preview mapping feature, on manage resource page. * * @param archive DwC Archive */ public void setArchive(Archive archive) { this.archive = archive; } /** * Check if each string in array is empty. Method joins each string together and then checks if it is blank. A * blank string represents an empty line in a source data file. * * @param line string array * * @return true if each string in array is empty, false otherwise */ private boolean isEmptyLine(String[] line) { String joined = Joiner.on("").useForNull("").join(line); return StringUtils.isBlank(joined); } }
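// --------------------------------------------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original source). The @Inject/@Assisted constructor implies this
// task is created through a Guice assisted-inject factory and run on an executor; the factory name, its create()
// signature and the executor wiring below are assumptions for illustration only.
//
//   GenerateDwca task = generateDwcaFactory.create(resource, reportHandler);
//   ExecutorService executor = Executors.newSingleThreadExecutor();
//   Future<Map<String, Integer>> future = executor.submit(task);
//   // call() returns record counts keyed by extension rowType; a GeneratorException thrown during generation
//   // is rethrown wrapped in an ExecutionException when Future.get() is called
//   Map<String, Integer> recordsByRowType = future.get();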