package org.gbif.occurrence.download.citations;
import org.gbif.api.model.registry.Dataset;
import org.gbif.api.model.registry.DatasetOccurrenceDownloadUsage;
import org.gbif.api.service.registry.DatasetOccurrenceDownloadUsageService;
import org.gbif.api.service.registry.DatasetService;
import org.gbif.occurrence.download.file.common.DownloadFileUtils;
import org.gbif.occurrence.download.inject.DownloadWorkflowModule;
import org.gbif.occurrence.download.util.RegistryClientUtil;
import org.gbif.utils.file.properties.PropertiesUtil;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Iterator;
import java.util.Properties;
import java.util.UUID;
import javax.annotation.Nullable;
import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Reads dataset citation files and hands every dataset usage found in them to the supplied predicates,
 * for example {@link PersistUsage}, which persists the usages into the Registry.
 */
public final class CitationsFileReader {

  private static final Logger LOG = LoggerFactory.getLogger(CitationsFileReader.class);

  // Citation lines have the form 'datasetKey<TAB>numberOfRecords'; surrounding whitespace of each column is trimmed.
  private static final Splitter TAB_SPLITTER = Splitter.on('\t').trimResults();

  /**
   * Transforms a tab-separated line into a DatasetOccurrenceDownloadUsage instance.
   * The first column is parsed as the dataset key (UUID) and the second one as the number of records.
   *
   * @param tsvLine     line with the form 'datasetKey<TAB>numberOfRecords'
   * @param downloadKey occurrence download key set on the created usage
   *
   * @return a new usage instance populated with the dataset key, the download key and the number of records
   */
  private static DatasetOccurrenceDownloadUsage toDatasetOccurrenceDownloadUsage(String tsvLine, String downloadKey) {
    Iterator<String> tsvLineIterator = TAB_SPLITTER.split(tsvLine).iterator();
    DatasetOccurrenceDownloadUsage datasetUsage = new DatasetOccurrenceDownloadUsage();
    datasetUsage.setDatasetKey(UUID.fromString(tsvLineIterator.next()));
    datasetUsage.setDownloadKey(downloadKey);
    datasetUsage.setNumberRecords(Long.parseLong(tsvLineIterator.next()));
    return datasetUsage;
  }

  /**
   * Reads the dataset citation files with the form 'datasetKeyTABnumberOfRecords' and applies the listed predicates.
   * Each non-empty line read from the files is transformed into a DatasetOccurrenceDownloadUsage.
   * A line that can not be processed is logged and skipped, the remaining lines are still read.
   *
   * @param nameNode     Hadoop name node uri
   * @param citationPath path to the directory that contains the citation table files
   * @param downloadKey  occurrence download key
   * @param predicates   list of predicates to apply while reading the file
   *
   * @throws IOException if the citation directory can not be listed or one of its files can not be read
   */
  @SafeVarargs
  public static void readCitations(String nameNode, String citationPath, String downloadKey,
                                   Predicate<DatasetOccurrenceDownloadUsage>... predicates) throws IOException {
    FileSystem hdfs = DownloadFileUtils.getHdfs(nameNode);
    for (FileStatus fs : hdfs.listStatus(new Path(citationPath))) {
      // only the files directly under the citation path are read, sub-directories are ignored
      if (!fs.isDirectory()) {
        try (BufferedReader citationReader = new BufferedReader(new InputStreamReader(hdfs.open(fs.getPath()),
                                                                                      Charsets.UTF_8))) {
          for (String tsvLine = citationReader.readLine(); tsvLine != null; tsvLine = citationReader.readLine()) {
            if (!Strings.isNullOrEmpty(tsvLine)) {
              // catch all error to avoid breaking the loop
              try {
                for (Predicate<DatasetOccurrenceDownloadUsage> predicate : predicates) {
                  // every predicate gets its own instance: changes made by one predicate are not seen by the others
                  predicate.apply(toDatasetOccurrenceDownloadUsage(tsvLine, downloadKey));
                }
              } catch (Exception e) {
                LOG.error("Error processing citation line: {}", tsvLine, e);
              }
            }
          }
        }
      }
    }
  }

  /**
   * Reads the citation files and persists the dataset usages using the name node and registry url
   * taken from the download workflow configuration file.
   *
   * @param args expects 2 elements: the path to the citations directory and the occurrence download key
   *
   * @throws IOException if the configuration or the citation files can not be read
   */
  public static void main(String[] args) throws IOException {
    // validate the number of arguments explicitly, accessing args[n] would fail before any null check is evaluated
    Preconditions.checkArgument(args != null && args.length >= 2,
                                "Usage: CitationsFileReader <citationPath> <downloadKey>");
    Properties properties = PropertiesUtil.loadProperties(DownloadWorkflowModule.CONF_FILE);
    readCitations(properties.getProperty(DownloadWorkflowModule.DefaultSettings.NAME_NODE_KEY),
                  Preconditions.checkNotNull(args[0], "citationPath"),
                  Preconditions.checkNotNull(args[1], "downloadKey"),
                  new PersistUsage(properties.getProperty(DownloadWorkflowModule.DefaultSettings.REGISTRY_URL_KEY)));
  }

  /**
   * Private constructor.
   */
  private CitationsFileReader() {
    //empty constructor
  }

  /**
   * Persists the dataset usage into the Registry data base.
   */
  public static class PersistUsage implements Predicate<DatasetOccurrenceDownloadUsage> {

    private final DatasetService datasetService;

    private final DatasetOccurrenceDownloadUsageService datasetUsageService;

    /**
     * Creates an instance whose services are set up by a RegistryClientUtil for the given url.
     *
     * @param registryWsUrl url of the Registry web services
     */
    public PersistUsage(String registryWsUrl) {
      RegistryClientUtil registryClientUtil = new RegistryClientUtil();
      datasetService = registryClientUtil.setupDatasetService(registryWsUrl);
      datasetUsageService = registryClientUtil.setupDatasetUsageService(registryWsUrl);
    }

    /**
     * Creates an instance that uses the given services.
     *
     * @param datasetService      service used to get the dataset of each usage
     * @param datasetUsageService service used to create the dataset usages
     */
    public PersistUsage(DatasetService datasetService, DatasetOccurrenceDownloadUsageService datasetUsageService) {
      this.datasetService = datasetService;
      this.datasetUsageService = datasetUsageService;
    }

    /**
     * Completes the usage with the DOI, citation text and title of its dataset and creates it using the
     * dataset usage service. Nothing is created if the dataset service returns no dataset for the usage.
     *
     * @param input dataset usage to persist
     *
     * @return false if the input is null or an error occurred, true otherwise (even if nothing was created because
     *         the dataset was not found)
     */
    @Override
    public boolean apply(@Nullable DatasetOccurrenceDownloadUsage input) {
      if (input == null) {
        LOG.warn("Ignoring null dataset usage, nothing to persist");
        return false;
      }
      try {
        Dataset dataset = datasetService.get(input.getDatasetKey());
        if (dataset != null) { //the dataset still exists
          input.setDatasetDOI(dataset.getDoi());
          if (dataset.getCitation() != null && dataset.getCitation().getText() != null) {
            input.setDatasetCitation(dataset.getCitation().getText());
          }
          input.setDatasetTitle(dataset.getTitle());
          datasetUsageService.create(input);
        }
      } catch (Exception e) {
        LOG.error("Error persisting dataset usage information {}", input, e);
        return false;
      }
      return true;
    }
  }
}