package eu.dnetlib.iis.wf.importer;
import static eu.dnetlib.iis.common.WorkflowRuntimeParameters.DEFAULT_CSV_DELIMITER;
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.Logger;
import com.google.common.base.Preconditions;
import com.google.common.collect.Sets;
import eu.dnetlib.iis.common.WorkflowRuntimeParameters;
import eu.dnetlib.iis.common.java.PortBindings;
import eu.dnetlib.iis.common.java.Process;
import eu.dnetlib.iis.common.java.io.DataStore;
import eu.dnetlib.iis.common.java.io.FileSystemPath;
import eu.dnetlib.iis.common.java.porttype.AvroPortType;
import eu.dnetlib.iis.common.java.porttype.PortType;
import eu.dnetlib.iis.common.schemas.Identifier;
/**
* Process module writing identifiers provided in input parameter as {@link Identifier} avro records.
*
* This step is required for further records retrieval parallelization.
*
* @author mhorst
*
*/
public abstract class AbstractIdentifierDatastoreBuilder implements Process {
protected static final String PORT_OUT_IDENTIFIER = "identifier";
private final Logger log = Logger.getLogger(this.getClass());
private final Map<String, PortType> outputPorts = new HashMap<String, PortType>();
/**
* Parameter name holding CSV of identifiers to be written.
*/
private final String identifiersParamName;
/**
* Parameter name holding CSV of identifiers to be excluded.
*/
private final String blacklistedIdentifiersParamName;
// ------------------------ CONSTRUCTOR --------------------------
/**
* @param identifiersParamName parameter name holding CSV of identifiers to be written
* @param blacklistedIdentifiersParamName parameter name holding CSV of identifiers to be excluded, functionality disabled when set to null
*/
public AbstractIdentifierDatastoreBuilder(String identifiersParamName, String blacklistedIdentifiersParamName) {
this.identifiersParamName = identifiersParamName;
this.blacklistedIdentifiersParamName = blacklistedIdentifiersParamName;
this.outputPorts.put(PORT_OUT_IDENTIFIER, new AvroPortType(Identifier.SCHEMA$));
}
// ------------------------ LOGIC --------------------------
@Override
public Map<String, PortType> getInputPorts() {
return Collections.emptyMap();
}
@Override
public Map<String, PortType> getOutputPorts() {
return outputPorts;
}
@Override
public void run(PortBindings portBindings, Configuration conf, Map<String, String> parameters) throws Exception {
Preconditions.checkArgument(parameters.containsKey(identifiersParamName),
"unspecified identifiers, required parameter '%s' is missing!", identifiersParamName);
Set<String> blacklistedIdentifiers = Collections.emptySet();
if (StringUtils.isNotBlank(blacklistedIdentifiersParamName)) {
String blacklistedIdentifiersCSV = parameters.get(blacklistedIdentifiersParamName);
if (StringUtils.isNotBlank(blacklistedIdentifiersCSV)) {
blacklistedIdentifiers = Sets.newHashSet(StringUtils.split(blacklistedIdentifiersCSV, DEFAULT_CSV_DELIMITER));
}
}
FileSystemPath identifierOutput = new FileSystemPath(FileSystem.get(conf),
portBindings.getOutput().get(PORT_OUT_IDENTIFIER));
int counter = 0;
String identifiersCSV = parameters.get(identifiersParamName);
if (StringUtils.isNotBlank(identifiersCSV) && !WorkflowRuntimeParameters.UNDEFINED_NONEMPTY_VALUE.equals(identifiersCSV)) {
String[] identifiers = StringUtils.split(identifiersCSV, WorkflowRuntimeParameters.DEFAULT_CSV_DELIMITER);
for (String currentId : identifiers) {
if (!blacklistedIdentifiers.contains(currentId)) {
try (DataFileWriter<Identifier> mdStoreIdentifierWriter = createWriter(
identifierOutput, Identifier.SCHEMA$, DataStore.generateFileName(counter++))) {
Identifier.Builder identifierBuilder = Identifier.newBuilder();
identifierBuilder.setId(currentId);
mdStoreIdentifierWriter.append(identifierBuilder.build());
}
} else {
log.info("skipping blacklisted id: " + currentId);
}
}
}
if (counter==0) {
// writing empty datastore required for further processing
createEmptyDataStore(identifierOutput, Identifier.SCHEMA$);
}
}
/**
* Provides avro writer.
* @throws IOException
*/
protected DataFileWriter<Identifier> createWriter(FileSystemPath path,
Schema schema, String dataStoreFileName) throws IOException {
return DataStore.create(path, schema, dataStoreFileName);
}
/**
* Creates empty datastore.
* @throws IOException
*/
protected DataFileWriter<Identifier> createEmptyDataStore(
FileSystemPath path, Schema schema) throws IOException {
return DataStore.create(path, schema);
}
}