package org.gbif.occurrence.download.file.dwca;
import org.gbif.api.model.common.MediaObject;
import org.gbif.api.vocabulary.MediaType;
import org.gbif.dwc.terms.GbifTerm;
import org.gbif.dwc.terms.Term;
import org.gbif.occurrence.common.TermUtils;
import org.gbif.occurrence.common.download.DownloadUtils;
import org.gbif.occurrence.download.file.DownloadFileWork;
import org.gbif.occurrence.download.file.OccurrenceMapReader;
import org.gbif.occurrence.download.file.Result;
import org.gbif.occurrence.download.file.common.DatasetUsagesCollector;
import org.gbif.occurrence.download.file.common.SolrQueryProcessor;
import org.gbif.occurrence.persistence.util.OccurrenceBuilder;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.net.URI;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Map;
import javax.annotation.Nullable;
import akka.actor.UntypedActor;
import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.base.Objects;
import com.google.common.base.Predicate;
import com.google.common.base.Throwables;
import com.google.common.collect.Lists;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.commons.beanutils.ConvertUtils;
import org.apache.commons.beanutils.converters.DateConverter;
import org.apache.commons.io.output.FileWriterWithEncoding;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.supercsv.cellprocessor.constraint.NotNull;
import org.supercsv.cellprocessor.ift.CellProcessor;
import org.supercsv.io.CsvBeanWriter;
import org.supercsv.io.CsvMapWriter;
import org.supercsv.io.ICsvBeanWriter;
import org.supercsv.io.ICsvMapWriter;
import org.supercsv.prefs.CsvPreference;
import org.supercsv.util.CsvContext;
import static org.gbif.occurrence.common.download.DownloadUtils.DELIMETERS_MATCH_PATTERN;
/**
* Actor that creates the part files for the DwcA download format.
*/
public class DownloadDwcaActor extends UntypedActor {
private static final Logger LOG = LoggerFactory.getLogger(DownloadDwcaActor.class);

static {
  // Registers a Date converter whose default value is null.
  // Background: https://issues.apache.org/jira/browse/BEANUTILS-387
  ConvertUtils.register(new DateConverter(null), Date.class);
}

/**
 * Function that maps a term to its simple name.
 */
private static final Function<Term, String> SIMPLE_NAME_FUNC = new Function<Term, String>() {
  @Nullable
  @Override
  public String apply(@Nullable Term term) {
    return term.simpleName();
  }
};

/**
 * Simple names of the interpreted terms, used as columns of the interpreted file.
 */
private static final String[] INT_COLUMNS =
  Lists.transform(Lists.newArrayList(TermUtils.interpretedTerms()), SIMPLE_NAME_FUNC)
    .toArray(new String[0]);

/**
 * Simple names of the verbatim terms, used as columns of the verbatim file.
 */
private static final String[] VERB_COLUMNS =
  Lists.transform(Lists.newArrayList(TermUtils.verbatimTerms()), SIMPLE_NAME_FUNC)
    .toArray(new String[0]);

/**
 * Simple names of the multimedia terms, used as columns of the multimedia file.
 */
private static final String[] MULTIMEDIA_COLUMNS =
  Lists.transform(Lists.newArrayList(TermUtils.multimediaTerms()), SIMPLE_NAME_FUNC)
    .toArray(new String[0]);

/**
 * Cell processors handed to the multimedia writer together with MULTIMEDIA_COLUMNS.
 * The trailing comment on each entry names the column the processor is intended for.
 */
private static final CellProcessor[] MEDIA_CELL_PROCESSORS = {
  new NotNull(),              // coreid
  new MediaTypeProcessor(),   // type
  new CleanStringProcessor(), // format
  new URIProcessor(),         // identifier
  new URIProcessor(),         // references
  new CleanStringProcessor(), // title
  new CleanStringProcessor(), // description
  new DateProcessor(),        // created
  new CleanStringProcessor(), // creator
  new CleanStringProcessor(), // contributor
  new CleanStringProcessor(), // publisher
  new CleanStringProcessor(), // audience
  new CleanStringProcessor(), // source
  new CleanStringProcessor(), // license
  new CleanStringProcessor()  // rightsHolder
};
/**
 * Builds the media objects of an HBase result and writes each of them as one row through
 * multimediaCsvWriter. Nothing is written when no media list is built for the result.
 *
 * @param multimediaCsvWriter writer that receives the multimedia rows
 * @param result              HBase result from which the media objects are built
 * @param occurrenceKey       occurrence key attached to every written row (exposed as gbifID)
 * @throws IOException if the writer fails to write a row
 */
private static void writeMediaObjects(ICsvBeanWriter multimediaCsvWriter,
                                      org.apache.hadoop.hbase.client.Result result,
                                      Integer occurrenceKey) throws IOException {
  final List<MediaObject> mediaObjects = OccurrenceBuilder.buildMedia(result);
  if (mediaObjects == null) {
    // no media list for this result: nothing to export
    return;
  }
  for (MediaObject media : mediaObjects) {
    final InnerMediaObject row = new InnerMediaObject(media, occurrenceKey);
    multimediaCsvWriter.write(row, MULTIMEDIA_COLUMNS, MEDIA_CELL_PROCESSORS);
  }
}
/**
 * Executes the query of the work item and writes its records into three tab separated, UTF-8 encoded
 * part files: interpreted, verbatim and multimedia. The file names are derived from
 * work.getJobDataFileName() plus the corresponding TableSuffixes suffix.
 * <p>
 * The lock held by the work item is always released once processing ends, whether it succeeded or
 * failed. On success a Result carrying the collected dataset usages is sent back to the sender.
 *
 * @param work job description: query, output file name, occurrence reader and lock
 * @throws IOException if a part file cannot be created or closed
 */
public void doWork(final DownloadFileWork work) throws IOException {
  final DatasetUsagesCollector datasetUsagesCollector = new DatasetUsagesCollector();
  try (
    ICsvMapWriter intCsvWriter =
      new CsvMapWriter(new FileWriterWithEncoding(work.getJobDataFileName() + TableSuffixes.INTERPRETED_SUFFIX,
                                                  Charsets.UTF_8),
                       CsvPreference.TAB_PREFERENCE);
    ICsvMapWriter verbCsvWriter =
      new CsvMapWriter(new FileWriterWithEncoding(work.getJobDataFileName() + TableSuffixes.VERBATIM_SUFFIX,
                                                  Charsets.UTF_8),
                       CsvPreference.TAB_PREFERENCE);
    ICsvBeanWriter multimediaCsvWriter =
      new CsvBeanWriter(new FileWriterWithEncoding(work.getJobDataFileName() + TableSuffixes.MULTIMEDIA_SUFFIX,
                                                   Charsets.UTF_8),
                        CsvPreference.TAB_PREFERENCE)) {
    SolrQueryProcessor.processQuery(work, new Predicate<Integer>() {
      /**
       * Writes one occurrence into the three part files.
       *
       * @return true if the record was written, false if no interpreted record was found for the key
       */
      @Override
      public boolean apply(@Nullable Integer occurrenceKey) {
        try {
          // Reads the occurrence record from HBase and converts it into Map<String,String> views.
          org.apache.hadoop.hbase.client.Result result = work.getOccurrenceMapReader().get(occurrenceKey);
          Map<String, String> occurrenceRecordMap = OccurrenceMapReader.buildInterpretedOccurrenceMap(result);
          Map<String, String> verbOccurrenceRecordMap = OccurrenceMapReader.buildVerbatimOccurrenceMap(result);
          if (occurrenceRecordMap == null) {
            // Parameterized logging: the message is only assembled when the level is enabled.
            LOG.error("Occurrence id {} not found!", occurrenceKey);
            return false;
          }
          datasetUsagesCollector.incrementDatasetUsage(occurrenceRecordMap.get(GbifTerm.datasetKey.simpleName()));
          intCsvWriter.write(occurrenceRecordMap, INT_COLUMNS);
          verbCsvWriter.write(verbOccurrenceRecordMap, VERB_COLUMNS);
          writeMediaObjects(multimediaCsvWriter, result, occurrenceKey);
          return true;
        } catch (Exception e) {
          // The Predicate contract has no checked exceptions: rethrow as unchecked.
          throw Throwables.propagate(e);
        }
      }
    });
  } finally {
    // Unlock the assigned lock, even when the processing failed.
    work.getLock().unlock();
    LOG.info("Lock released, job detail: {} ", work);
  }
  // Only reached when no exception was thrown above.
  getSender().tell(new Result(work, datasetUsagesCollector.getDatasetUsages()), getSelf());
}
/**
 * Entry point of the actor: DownloadFileWork messages are processed by doWork,
 * any other message is reported as unhandled.
 */
@Override
public void onReceive(Object message) throws Exception {
  if (!(message instanceof DownloadFileWork)) {
    unhandled(message);
    return;
  }
  doWork((DownloadFileWork) message);
}
/**
 * Media object extended with the gbifID of its occurrence; used to export rows of the multimedia.txt files.
 * The bean properties must match the headers defined in MULTIMEDIA_COLUMNS.
 */
public static class InnerMediaObject extends MediaObject {

  private Integer gbifID;

  /**
   * No-argument constructor.
   * Required by CSV serialization.
   */
  public InnerMediaObject() {
    // intentionally empty
  }

  /**
   * Copy constructor.
   * Copies the bean properties of the given media object and assigns the gbifID (core id).
   *
   * @param mediaObject source of the copied properties
   * @param gbifID      identifier written in the id column
   */
  public InnerMediaObject(MediaObject mediaObject, Integer gbifID) {
    try {
      BeanUtils.copyProperties(this, mediaObject);
    } catch (IllegalAccessException | InvocationTargetException e) {
      throw Throwables.propagate(e);
    }
    this.gbifID = gbifID;
  }

  /**
   * Id column for the multimedia.txt file.
   */
  public Integer getGbifID() {
    return gbifID;
  }

  public void setGbifID(Integer gbifID) {
    this.gbifID = gbifID;
  }

  @Override
  public String toString() {
    return Objects.toStringHelper(this)
      .addValue(super.toString())
      .add("gbifID", gbifID)
      .toString();
  }
}
/**
 * Renders a MediaType value as its enum constant name.
 * If the value is null an empty string is returned.
 */
private static class MediaTypeProcessor implements CellProcessor {

  @Override
  public String execute(Object value, CsvContext context) {
    if (value == null) {
      return "";
    }
    final MediaType mediaType = (MediaType) value;
    return mediaType.name();
  }
}
/**
 * Produces a String in which every match of DELIMETERS_MATCH_PATTERN is replaced by a single space.
 * If the value is null an empty string is returned.
 */
private static class CleanStringProcessor implements CellProcessor {

  @Override
  public String execute(Object value, CsvContext context) {
    if (value == null) {
      return "";
    }
    final String text = (String) value;
    return DELIMETERS_MATCH_PATTERN.matcher(text).replaceAll(" ");
  }
}
/**
 * Renders a URI value as its string form.
 * If the value is null an empty string is returned.
 */
private static class URIProcessor implements CellProcessor {

  @Override
  public String execute(Object value, CsvContext context) {
    if (value == null) {
      return "";
    }
    final URI uri = (URI) value;
    return uri.toString();
  }
}
/**
 * Renders a Date value as text using the DownloadUtils.ISO_8601_FORMAT pattern.
 * If the value is null an empty string is returned.
 */
private static class DateProcessor implements CellProcessor {

  @Override
  public String execute(Object value, CsvContext context) {
    if (value == null) {
      return "";
    }
    // A fresh formatter per call: SimpleDateFormat instances are not safe to share between threads.
    final SimpleDateFormat isoFormat = new SimpleDateFormat(DownloadUtils.ISO_8601_FORMAT);
    return isoFormat.format((Date) value);
  }
}
}