package eu.dnetlib.iis.wf.importer.concept;
import static eu.dnetlib.iis.common.WorkflowRuntimeParameters.OOZIE_ACTION_OUTPUT_FILENAME;
import java.io.IOException;
import java.io.StringReader;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import javax.xml.XMLConstants;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.avro.file.DataFileWriter;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.Logger;
import org.xml.sax.InputSource;
import com.google.common.base.Preconditions;
import eu.dnetlib.iis.common.counter.NamedCounters;
import eu.dnetlib.iis.common.counter.NamedCountersFileWriter;
import eu.dnetlib.iis.common.java.PortBindings;
import eu.dnetlib.iis.common.java.Process;
import eu.dnetlib.iis.common.java.io.DataStore;
import eu.dnetlib.iis.common.java.io.FileSystemPath;
import eu.dnetlib.iis.common.java.porttype.AvroPortType;
import eu.dnetlib.iis.common.java.porttype.PortType;
import eu.dnetlib.iis.importer.schemas.Concept;
import eu.dnetlib.iis.wf.importer.DataFileRecordReceiverWithCounter;
import eu.dnetlib.iis.wf.importer.facade.ISLookupFacade;
import eu.dnetlib.iis.wf.importer.facade.ServiceFacadeUtils;
/**
* {@link ISLookupFacade} based concept importer.
* @author mhorst
*
*/
public class ISLookupServiceBasedConceptImporter implements Process {
public static final String PARAM_IMPORT_CONTEXT_IDS_CSV = "import.context.ids.csv";
protected static final String CONCEPT_COUNTER_NAME = "CONCEPT_COUNTER";
private static final Logger log = Logger.getLogger(ISLookupServiceBasedConceptImporter.class);
private final NamedCountersFileWriter countersWriter = new NamedCountersFileWriter();
protected static final String PORT_OUT_CONCEPTS = "concepts";
private final Map<String, PortType> outputPorts = new HashMap<String, PortType>();
//------------------------ CONSTRUCTORS -------------------
public ISLookupServiceBasedConceptImporter() {
outputPorts.put(PORT_OUT_CONCEPTS, new AvroPortType(Concept.SCHEMA$));
}
//------------------------ LOGIC --------------------------
@Override
public Map<String, PortType> getInputPorts() {
return Collections.emptyMap();
}
@Override
public Map<String, PortType> getOutputPorts() {
return outputPorts;
}
@Override
public void run(PortBindings portBindings, Configuration conf,
Map<String, String> parameters) throws Exception {
Preconditions.checkArgument(parameters.containsKey(PARAM_IMPORT_CONTEXT_IDS_CSV),
"unknown context identifier, required parameter '%s' is missing!", PARAM_IMPORT_CONTEXT_IDS_CSV);
String contextIdsCSV = parameters.get(PARAM_IMPORT_CONTEXT_IDS_CSV);
// initializing ISLookup
ISLookupFacade isLookupFacade = ServiceFacadeUtils.instantiate(parameters);
try (DataFileWriter<Concept> conceptWriter = getWriter(FileSystem.get(conf), portBindings)) {
SAXParserFactory parserFactory = SAXParserFactory.newInstance();
SAXParser saxParser = parserFactory.newSAXParser();
NamedCounters counters = new NamedCounters(new String[] { CONCEPT_COUNTER_NAME });
int count = 0;
for (String contextXML : isLookupFacade.searchProfile(buildQuery(contextIdsCSV))) {
count++;
if (!StringUtils.isEmpty(contextXML)) {
DataFileRecordReceiverWithCounter<Concept> conceptReciever = new DataFileRecordReceiverWithCounter<Concept>(conceptWriter);
saxParser.parse(new InputSource(new StringReader(contextXML)), new ConceptXmlHandler(conceptReciever));
counters.increment(CONCEPT_COUNTER_NAME, conceptReciever.getReceivedCount());
} else {
log.error("got empty context when looking for for context ids: " + contextIdsCSV);
}
}
if (count==0) {
log.warn("got 0 profiles when looking for context ids: " + contextIdsCSV);
}
countersWriter.writeCounters(counters, System.getProperty(OOZIE_ACTION_OUTPUT_FILENAME));
}
}
/**
* Provides {@link Concept} writer consuming records.
*/
protected DataFileWriter<Concept> getWriter(FileSystem fs, PortBindings portBindings) throws IOException {
return DataStore.create(
new FileSystemPath(fs, portBindings.getOutput().get(PORT_OUT_CONCEPTS)), Concept.SCHEMA$);
}
//------------------------ PRIVATE --------------------------
/**
* Builds profile lookup query for given context identifiers.
* @param contextIdsCSV set of context identifiers for which profiles should be found
*/
private static String buildQuery(String contextIdsCSV) {
String[] contextIds = StringUtils.split(contextIdsCSV, ',');
if (contextIds.length==0) {
throw new RuntimeException("got 0 context ids, "
+ "unable to tokenize context identifiers: " + contextIdsCSV);
}
StringBuilder query = new StringBuilder("//BODY/CONFIGURATION/context[");
int tokensCount = 0;
for (String contextId : contextIds) {
if (tokensCount>0) {
query.append(" or ");
}
query.append("@id=\"" + contextId + "\"");
tokensCount++;
}
query.append(']');
return query.toString();
}
}