package org.gbif.occurrence.search;
import org.gbif.api.model.common.MediaObject;
import org.gbif.api.model.occurrence.Occurrence;
import org.gbif.api.util.VocabularyUtils;
import org.gbif.api.vocabulary.BasisOfRecord;
import org.gbif.api.vocabulary.Continent;
import org.gbif.api.vocabulary.Country;
import org.gbif.api.vocabulary.EndpointType;
import org.gbif.api.vocabulary.EstablishmentMeans;
import org.gbif.api.vocabulary.License;
import org.gbif.api.vocabulary.OccurrenceIssue;
import org.gbif.api.vocabulary.TypeStatus;
import org.gbif.dwc.terms.DwcTerm;
import org.gbif.dwc.terms.GbifInternalTerm;
import org.gbif.dwc.terms.Term;
import org.gbif.occurrence.common.json.MediaSerDeserUtils;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.UUID;
import com.google.common.base.Predicate;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.io.Closeables;
import com.google.common.io.Resources;
import org.apache.commons.beanutils.PropertyUtils;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.type.TypeReference;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.supercsv.cellprocessor.Optional;
import org.supercsv.cellprocessor.ParseDate;
import org.supercsv.cellprocessor.ParseDouble;
import org.supercsv.cellprocessor.ParseInt;
import org.supercsv.cellprocessor.ift.CellProcessor;
import org.supercsv.io.CsvMapReader;
import org.supercsv.io.ICsvMapReader;
import org.supercsv.prefs.CsvPreference;
import org.supercsv.util.CsvContext;
/**
* Utility class that loads and processes occurrence records read from a CSV file.
* The expected columns in that file are:
* "key","elevation","basisOfRecord","catalogNumber","classKey","clazz","collectionCode","datasetKey",
* "depth","occurrenceId","family","familyKey","genus","genusKey","institutionCode","country","kingdom",
* "kingdomKey","decimalLatitude","decimalLongitude","lastInterpreted","month","taxonKey","eventDate",
* "order","orderKey","publishingOrgKey","phylum","phylumKey","scientificName","species","speciesKey",
* "unitQualifier","year","locality","county","stateProvince","continent","recordedBy","recordNumber",
* "identifiedBy","dateIdentified","typeStatus","media","establishmentMeans","issues","organismID",
* "waterBody","protocol","license","crawlId".
* Each CSV line is interpreted as an Occurrence object; to process each object, a predicate or list of
* predicates is passed as parameter to the function processOccurrences.
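* <p>
* A minimal usage sketch (the file name and the check inside the predicate are hypothetical):
* <pre>{@code
* OccurrenceDataLoader.processOccurrences("occurrences-test.csv", new Predicate<Occurrence>() {
*   @Override
*   public boolean apply(Occurrence occurrence) {
*     // consume each parsed record, e.g. collect it or assert on its fields
*     return occurrence.getKey() != null;
*   }
* });
* }</pre>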
*/
public class OccurrenceDataLoader {
/**
* Produces an EstablishmentMeans instance.
*/
private static class EstablishmentMeansProcessor implements CellProcessor {
@Override
public EstablishmentMeans execute(Object value, CsvContext context) {
Enum<?> establishmentMeans = VocabularyUtils.lookupEnum((String) value, EstablishmentMeans.class);
if (establishmentMeans != null) {
return (EstablishmentMeans) establishmentMeans;
}
return null;
}
}
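/**
* Produces a set of OccurrenceIssue instances from a JSON array of issue names.
*/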
private static class IssueProcessor implements CellProcessor {
private static final ObjectMapper MAPPER = new ObjectMapper();
@Override
public Set<OccurrenceIssue> execute(Object value, CsvContext context) {
Set<OccurrenceIssue> occurrenceIssues = Sets.newHashSet();
try {
Set<String> issues = MAPPER.readValue(value.toString(), new TypeReference<Set<String>>() {
});
if(issues != null && !issues.isEmpty()) {
for (String issueStr : issues) {
occurrenceIssues.add(VocabularyUtils.lookupEnum(issueStr, OccurrenceIssue.class));
}
}
} catch (IOException e) {
LOG.error("Error deserializing occurrence issues from JSON", e);
}
return occurrenceIssues;
}
}
/**
* Produces a Basis of Record instance.
*/
private static class BasisOfRecordProcessor implements CellProcessor {
@Override
public BasisOfRecord execute(Object value, CsvContext context) {
Enum<?> basisOfRecord = VocabularyUtils.lookupEnum((String) value, BasisOfRecord.class);
if (basisOfRecord != null) {
return (BasisOfRecord) basisOfRecord;
}
return null;
}
}
/**
* Produces a TypeStatus instance.
*/
private static class TypeStatusProcessor implements CellProcessor {
@Override
public TypeStatus execute(Object value, CsvContext context) {
Enum<?> typeStatus = VocabularyUtils.lookupEnum((String) value, TypeStatus.class);
if (typeStatus != null) {
return (TypeStatus) typeStatus;
}
return null;
}
}
/**
* Produces a list of MediaObject instances from a JSON string.
*/
private static class MediaListProcessor implements CellProcessor {
@Override
public List<MediaObject> execute(Object value, CsvContext context) {
if (value != null) {
return MediaSerDeserUtils.fromJson((String) value);
}
return null;
}
}
/**
* Produces a Continent instance.
*/
private static class ContinentProcessor implements CellProcessor {
@Override
public Continent execute(Object value, CsvContext context) {
Enum<?> continent = VocabularyUtils.lookupEnum((String) value, Continent.class);
if (continent != null) {
return (Continent) continent;
}
return null;
}
}
/**
* Produces a Country instance from an ISO country code.
*/
private static class CountryProcessor implements CellProcessor {
@Override
public Country execute(Object value, CsvContext context) {
return Country.fromIsoCode((String) value);
}
}
/**
* Produces a UUID instance from a string value.
*/
private static class UUIDProcessor implements CellProcessor {
@Override
public UUID execute(Object value, CsvContext context) {
return UUID.fromString((String) value);
}
}
/**
* Produces an EndpointType instance.
*/
private static class EndpointTypeProcessor implements CellProcessor {
@Override
public EndpointType execute(Object value, CsvContext context) {
Enum<?> endpointType = VocabularyUtils.lookupEnum((String) value, EndpointType.class);
if (endpointType != null) {
return (EndpointType) endpointType;
}
return null;
}
}
/**
* Produces a License instance.
*/
private static class LicenseProcessor implements CellProcessor {
@Override
public License execute(Object value, CsvContext context) {
Enum<?> license = VocabularyUtils.lookupEnum((String) value, License.class);
if (license != null) {
return (License) license;
}
return null;
}
}
private static final Logger LOG = LoggerFactory.getLogger(OccurrenceDataLoader.class);
// Date format used in the CSV file, e.g. 1954-11-23.
private static final String DATE_FORMAT = "yyyy-MM-dd";
// List of cell processors; one processor is defined per column, in the same order as HEADER
private final static CellProcessor[] CELL_PROCESSORS = new CellProcessor[] {
new ParseInt(), // key
new Optional(new ParseDouble()), // elevation
new Optional(new BasisOfRecordProcessor()), // basisOfRecord
new Optional(), // catalogNumber
new Optional(new ParseInt()), // classKey
new Optional(), // clazz
new Optional(), // collectionCode
new Optional(new UUIDProcessor()), // datasetKey
new Optional(new ParseDouble()),// depth
new Optional(),// occurrenceId
new Optional(),// family
new Optional(new ParseInt()),// familyKey
new Optional(),// genus
new Optional(new ParseInt()),// genusKey
new Optional(),// institutionCode
new Optional(new CountryProcessor()),// country
new Optional(),// kingdom
new Optional(new ParseInt()),// kingdomKey
new Optional(new ParseDouble()),// decimalLatitude
new Optional(new ParseDouble()),// decimalLongitude
new Optional(new ParseDate(DATE_FORMAT)),// lastInterpreted
new Optional(new ParseInt()),// month
new Optional(new ParseInt()),// taxonKey
new Optional(new ParseDate(DATE_FORMAT)),// eventDate
new Optional(),// order
new Optional(new ParseInt()),// orderKey
new Optional(new UUIDProcessor()),// publishingOrgKey
new Optional(),// phylum
new Optional(new ParseInt()),// phylumKey
new Optional(),// scientificName
new Optional(),// species
new Optional(new ParseInt()),// speciesKey
new Optional(),// unitQualifier
new Optional(new ParseInt()),// year
new Optional(),// locality
new Optional(),// county
new Optional(),// stateProvince
new Optional(new ContinentProcessor()),// continent
new Optional(),// recordedBy
new Optional(),// recordNumber
new Optional(),// identifiedBy
new Optional(new ParseDate(DATE_FORMAT)),// dateIdentified
new Optional(new TypeStatusProcessor()),// typeStatus
new Optional(new MediaListProcessor()),// List<Media> in JSON
new Optional(new EstablishmentMeansProcessor()),// establishmentMeans.
new Optional(new IssueProcessor()),// issues.
new Optional(),// organismID
new Optional(),// waterBody
new Optional(new EndpointTypeProcessor()), // protocol
new Optional(new LicenseProcessor()), // license
new Optional(new ParseInt()) // crawlId
};
// Column headers
private final static String[] HEADER = new String[] {
"key",
"elevation",
"basisOfRecord",
"catalogNumber",
"classKey",
"clazz",
"collectionCode",
"datasetKey",
"depth",
"occurrenceId",
"family",
"familyKey",
"genus",
"genusKey",
"institutionCode",
"country",
"kingdom",
"kingdomKey",
"decimalLatitude",
"decimalLongitude",
"lastInterpreted",
"month",
"taxonKey",
"eventDate",
"order",
"orderKey",
"publishingOrgKey",
"phylum",
"phylumKey",
"scientificName",
"species",
"speciesKey",
"unitQualifier",
"year",
"locality",
"county",
"stateProvince",
"continent",
"recordedBy",
"recordNumber",
"identifiedBy",
"dateIdentified",
"typeStatus",
"media",
"establishmentMeans",
"issues",
"organismID",
"waterBody",
"protocol",
"license",
"crawlId"
};
// Fields stored as verbatim Darwin Core terms rather than as interpreted properties
private final static Set<String> VERBATIM_FIELDS = new ImmutableSet.Builder<String>().add(
"catalogNumber",
"collectionCode",
"occurrenceId",
"institutionCode",
"unitQualifier",
"locality",
"county",
"stateProvince",
"recordedBy",
"recordNumber",
"identifiedBy",
"organismID").build();
/**
* Reads a CSV file and produces an occurrence record for each line.
* Each occurrence object is then handed to every predicate in the processors list.
*
* @param fileName name of the CSV file, loaded as a classpath resource
* @param processors predicates that consume the occurrence objects
*/
public static void processOccurrences(String fileName, Predicate<Occurrence>... processors) {
ICsvMapReader reader = null;
int line = 1;
try {
reader =
new CsvMapReader(new FileReader(new File(Resources.getResource(fileName).toURI())),
CsvPreference.STANDARD_PREFERENCE);
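// consume the header row; data lines are then mapped against HEADER and CELL_PROCESSORS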
reader.getHeader(true);
Map<String, Object> occurrenceMap;
while ((occurrenceMap = reader.read(HEADER, CELL_PROCESSORS)) != null) {
Occurrence occurrence = convertMap(occurrenceMap);
for (Predicate<Occurrence> predicate : processors) {
predicate.apply(occurrence);
}
line++;
}
} catch (Exception e) {
LOG.error(String.format("Error parsing occurrence object at line %d of file %s", line, fileName), e);
} finally {
try {
Closeables.close(reader, false);
} catch (IOException io) {
LOG.warn("Failed to close reader", io);
}
}
}
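/**
* Converts a map of column names to parsed values into an Occurrence object: verbatim columns are
* stored as verbatim term values, all other columns are set as interpreted bean properties.
*/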
private static Occurrence convertMap(Map<String, Object> occurrenceMap) {
Occurrence occurrence = new Occurrence();
for (Entry<String, Object> field : occurrenceMap.entrySet()) {
if (VERBATIM_FIELDS.contains(field.getKey())) {
Entry<? extends Term, String> verbatimField = toTermEntry(field);
// toTermEntry returns null for column names that do not map to a known term
if (verbatimField != null) {
occurrence.setVerbatimField(verbatimField.getKey(), verbatimField.getValue());
}
} else {
setInterpretedField(field, occurrence);
}
}
return occurrence;
}
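/**
* Maps a raw column entry onto a Darwin Core or GBIF internal term entry.
* Returns null when the column name does not correspond to a known term.
*/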
private static Entry<? extends Term, String> toTermEntry(Entry<String, Object> field) {
String strValue = null;
if (field.getValue() != null) {
strValue = (String) field.getValue();
}
if (field.getKey().equals(GbifInternalTerm.unitQualifier.name())) {
return Maps.immutableEntry(GbifInternalTerm.unitQualifier, strValue);
} else {
Enum<?> term = VocabularyUtils.lookupEnum(field.getKey(), DwcTerm.class);
if (term != null) {
return Maps.immutableEntry((DwcTerm) term, strValue);
}
}
return null;
}
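/**
* Sets an interpreted field on the occurrence via reflection, using the column name as the bean property name.
*/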
private static void setInterpretedField(Entry<String, Object> rawField, Occurrence occurrence) {
try {
PropertyUtils.setProperty(occurrence, rawField.getKey(), rawField.getValue());
} catch (IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
throw Throwables.propagate(e);
}
}
}