package org.gbif.dwca.io;

import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.Maps;
import org.gbif.api.model.registry.Dataset;
import org.gbif.dwc.terms.Term;
import org.gbif.io.TabWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.util.Map;

/**
 * An archive writer that writes entire data files at once and does not check the integrity of core ids.
 * For large archives using extensions this yields much better performance than writing star record by star record.
 */
public class DwcaStreamWriter implements AutoCloseable {
  private static final Logger LOG = LoggerFactory.getLogger(DwcaStreamWriter.class);

  private final File dir;
  private final Term core;
  private final Term coreIdTerm;
  private final boolean useHeaders;
  private final Archive archive = new Archive();
  private Dataset metadata;
  private Map<String, Dataset> constituents = Maps.newHashMap();

  /**
   * @param dir the directory to use as the archive
   * @param coreRowType the archive's core row type
   * @param coreIdTerm if given, used to map the id column of the core
   * @param useHeaders if true, write a single header row for each data file
   */
  public DwcaStreamWriter(File dir, Term coreRowType, @Nullable Term coreIdTerm, boolean useHeaders) {
    this.dir = dir;
    this.core = coreRowType;
    this.coreIdTerm = coreIdTerm;
    this.useHeaders = useHeaders;
    archive.setLocation(dir);
  }

  private File dataFile(Term rowType) {
    return new File(dir, rowType.simpleName() + ".tsv");
  }

  private static ArchiveField idField(int column) {
    ArchiveField field = new ArchiveField();
    field.setIndex(column);
    return field;
  }

  /**
   * Writes an entire data file for the given row type at once.
   *
   * @param coreIdColumn zero-based index of the row's core id column
   * @param mapping zero-based column index for each term of the rows
   */
  public void write(Term rowType, int coreIdColumn, Map<Term, Integer> mapping, Iterable<String[]> rows) {
    Preconditions.checkNotNull(rows);
    final int maxMapping = maxMappingColumn(mapping);
    try (TabWriter writer = addArchiveFile(rowType, coreIdColumn, mapping, maxMapping)) {
      // write data
      for (String[] row : rows) {
        write(writer, row, maxMapping);
      }
    } catch (IOException e) {
      Throwables.propagate(e);
    }
  }
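  /*
   * A minimal usage sketch for the bulk write method above (not part of the original class).
   * It assumes a writable, empty tmpDir and an Iterable<String[]> rows; apart from this class
   * and the standard Darwin Core terms, all names here are hypothetical.
   *
   *   Map<Term, Integer> mapping = new HashMap<>();
   *   mapping.put(DwcTerm.scientificName, 1);
   *   mapping.put(DwcTerm.taxonRank, 2);
   *   try (DwcaStreamWriter dwca = new DwcaStreamWriter(tmpDir, DwcTerm.Taxon, DwcTerm.taxonID, true)) {
   *     // column 0 holds the core id, so each row needs at least 3 columns
   *     dwca.write(DwcTerm.Taxon, 0, mapping, rows);
   *   }
   */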
  private static void write(TabWriter writer, String[] row, int maxMapping) throws IOException {
    // maxMapping is the highest zero-based column index, so rows need at least maxMapping+1 values
    if (row != null && row.length <= maxMapping) {
      throw new IllegalArgumentException("Input rows are smaller than the defined mapping of " + (maxMapping + 1) + " columns.");
    }
    writer.write(row);
  }

  private int maxMappingColumn(Map<Term, Integer> mapping) {
    return mapping.values().stream().max(Integer::compareTo).get();
  }

  private TabWriter addArchiveFile(Term rowType, int coreIdColumn, Map<Term, Integer> mapping, int maxMapping)
      throws IOException {
    Preconditions.checkNotNull(rowType);
    Preconditions.checkNotNull(mapping);
    Preconditions.checkArgument(!mapping.isEmpty());
    Preconditions.checkArgument(coreIdColumn >= 0);

    final File dataFile = dataFile(rowType);
    ArchiveFile af = ArchiveFile.buildTabFile();
    af.setEncoding("UTF8");
    af.setRowType(rowType);
    af.addLocation(dataFile.getName());
    af.setIgnoreHeaderLines(useHeaders ? 1 : 0);
    af.setId(idField(coreIdColumn));
    for (Map.Entry<Term, Integer> entry : mapping.entrySet()) {
      ArchiveField field = new ArchiveField();
      field.setTerm(entry.getKey());
      field.setIndex(entry.getValue());
      af.addField(field);
    }
    if (core.equals(rowType)) {
      if (coreIdTerm != null) {
        af.getId().setTerm(coreIdTerm);
      }
      archive.setCore(af);
    } else {
      archive.addExtension(af);
    }

    // write headers
    TabWriter writer = TabWriter.fromFile(dataFile);
    if (useHeaders) {
      String[] header = new String[maxMapping + 1];
      mapping.entrySet().stream().sorted(Map.Entry.comparingByValue())
          .forEach(e -> header[e.getValue()] = e.getKey().simpleName());
      if (coreIdTerm != null) {
        // label the id column with the coreIdTerm if given
        header[coreIdColumn] = coreIdTerm.simpleName();
      }
      writer.write(header);
    }
    return writer;
  }

  public interface RowWriteHandler extends AutoCloseable {
    void write(String[] row);
  }

  private class RowWriteHandlerImpl implements RowWriteHandler {
    private final TabWriter writer;
    private final int minColumns;

    RowWriteHandlerImpl(TabWriter writer, int minColumns) {
      this.writer = writer;
      this.minColumns = minColumns;
    }

    @Override
    public void write(String[] row) {
      if (row != null && row.length < minColumns) {
        throw new IllegalArgumentException("Input rows are smaller than the defined mapping of " + minColumns + " columns.");
      }
      try {
        writer.write(row);
      } catch (IOException e) {
        throw new IllegalStateException(e);
      }
    }

    @Override
    public void close() throws IOException {
      writer.close();
    }
  }

  /**
   * Useful for gluing MyBatis result handlers directly into the dwca writer.
   * Make sure to close the returned handler properly!
   *
   * @param rowType the row type of the data file to write
   * @param coreIdColumn zero-based index of the row's core id column
   * @param mapping zero-based column index for each term of the rows
   * @return a handler accepting single rows to write into the data file
   */
  public RowWriteHandler writeHandler(Term rowType, int coreIdColumn, Map<Term, Integer> mapping) {
    try {
      final int maxMapping = maxMappingColumn(mapping);
      TabWriter writer = addArchiveFile(rowType, coreIdColumn, mapping, maxMapping);
      // minColumns is a column count, i.e. the highest zero-based index plus one
      return new RowWriteHandlerImpl(writer, maxMapping + 1);
    } catch (IOException e) {
      throw new IllegalStateException(e);
    }
  }

  public void setMetadata(Dataset d) {
    metadata = d;
  }

  /**
   * Adds a constituent dataset using the dataset key as the datasetID.
   */
  public void addConstituent(Dataset eml) {
    addConstituent(eml.getKey().toString(), eml);
  }

  /**
   * Adds a constituent dataset.
   * The eml file will be named after the datasetID, which therefore has to be unique.
   */
  public void addConstituent(String datasetID, Dataset eml) {
    this.constituents.put(datasetID, eml);
  }

  /**
   * Writes meta.xml and eml.xml to the archive.
   */
  @Override
  public void close() throws IOException {
    checkCoreRowType();
    addEml();
    addConstituents();
    MetaDescriptorWriter.writeMetaFile(archive);
    LOG.info("Wrote archive to {}", archive.getLocation().getAbsolutePath());
  }

  /**
   * Checks that the core data file has been written.
   */
  private void checkCoreRowType() {
    if (archive.getCore() == null) {
      throw new IllegalStateException("The core data file has not yet been written for " + core.qualifiedName());
    }
  }

  private void addEml() throws IOException {
    if (metadata != null) {
      DwcaWriter.writeEml(metadata, new File(dir, "eml.xml"));
      archive.setMetadataLocation("eml.xml");
    }
  }

  private void addConstituents() throws IOException {
    if (!constituents.isEmpty()) {
      File ddir = new File(dir, Archive.CONSTITUENT_DIR);
      ddir.mkdirs();
      for (Map.Entry<String, Dataset> de : constituents.entrySet()) {
        DwcaWriter.writeEml(de.getValue(), new File(ddir, de.getKey() + ".xml"));
      }
    }
  }
}
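/*
 * A minimal sketch of how the RowWriteHandler above might be glued into a streaming
 * source such as a MyBatis ResultHandler (not part of the original class; "dwca",
 * "session" and the statement id are hypothetical).
 *
 *   Map<Term, Integer> mapping = new HashMap<>();
 *   mapping.put(DwcTerm.scientificName, 1);
 *   try (DwcaStreamWriter.RowWriteHandler handler = dwca.writeHandler(DwcTerm.Taxon, 0, mapping)) {
 *     session.select("TaxonMapper.streamAll", ctx -> handler.write((String[]) ctx.getResultObject()));
 *   } catch (Exception e) {
 *     throw new IllegalStateException(e);
 *   }
 */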