package eu.dnetlib.iis.wf.importer.stream.project;
import static eu.dnetlib.iis.common.WorkflowRuntimeParameters.OOZIE_ACTION_OUTPUT_FILENAME;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Scanner;
import org.apache.avro.file.DataFileWriter;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.Logger;
import eu.dnetlib.iis.common.counter.NamedCounters;
import eu.dnetlib.iis.common.counter.NamedCountersFileWriter;
import eu.dnetlib.iis.common.java.PortBindings;
import eu.dnetlib.iis.common.java.Process;
import eu.dnetlib.iis.common.java.io.DataStore;
import eu.dnetlib.iis.common.java.io.FileSystemPath;
import eu.dnetlib.iis.common.java.porttype.AvroPortType;
import eu.dnetlib.iis.common.java.porttype.PortType;
import eu.dnetlib.iis.importer.schemas.Project;
import eu.dnetlib.iis.wf.importer.facade.ServiceFacadeUtils;
import eu.dnetlib.openaire.exporter.model.ProjectDetail;
/**
* {@link Project} importer reading data from stream.
*
* @author mhorst
*
*/
public class StreamingProjectImporter implements Process {
protected static final String PORT_OUT_PROJECT = "project";
protected static final String PROJECT_COUNTER_NAME = "PROJECT_COUNTER";
private static final Logger log = Logger.getLogger(StreamingProjectImporter.class);
private static final int progressLogInterval = 100000;
private final NamedCountersFileWriter countersWriter = new NamedCountersFileWriter();
private final Map<String, PortType> outputPorts = new HashMap<String, PortType>();
//------------------------ CONSTRUCTORS -------------------
public StreamingProjectImporter() {
outputPorts.put(PORT_OUT_PROJECT, new AvroPortType(Project.SCHEMA$));
}
//------------------------ LOGIC --------------------------
@Override
public Map<String, PortType> getInputPorts() {
return Collections.emptyMap();
}
@Override
public Map<String, PortType> getOutputPorts() {
return outputPorts;
}
@Override
public void run(PortBindings portBindings, Configuration conf, Map<String, String> parameters) throws Exception {
FileSystem fs = FileSystem.get(conf);
try (DataFileWriter<Project> projectWriter = getWriter(fs, portBindings)) {
NamedCounters counters = new NamedCounters(new String[] { PROJECT_COUNTER_NAME });
StreamingFacade streamingFacade = ServiceFacadeUtils.instantiate(parameters);
try (Scanner scanner = new Scanner(new BufferedInputStream(streamingFacade.getStream()), "utf8")) {
ProjectDetailConverter converter = new ProjectDetailConverter();
int currentCount = 0;
long startTime = System.currentTimeMillis();
while (scanner.hasNext()) {
String line = scanner.nextLine();
if (StringUtils.isNotBlank(line)) {
ProjectDetail project = ProjectDetail.fromJson(line);
projectWriter.append(converter.convert(project));
counters.increment(PROJECT_COUNTER_NAME);
currentCount++;
if (currentCount%progressLogInterval==0) {
log.info("current progress: " + currentCount + ", last package of " + progressLogInterval +
" processed in " + ((System.currentTimeMillis()-startTime)/1000) + " secs");
startTime = System.currentTimeMillis();
}
}
}
}
countersWriter.writeCounters(counters, System.getProperty(OOZIE_ACTION_OUTPUT_FILENAME));
}
}
/**
* Provides {@link Project} writer consuming records.
*/
protected DataFileWriter<Project> getWriter(FileSystem fs, PortBindings portBindings) throws IOException {
return DataStore.create(
new FileSystemPath(fs, portBindings.getOutput().get(PORT_OUT_PROJECT)), Project.SCHEMA$);
}
}