package eu.dnetlib.iis.wf.ingest.webcrawl.fundings;

import java.io.IOException;
import java.io.StringReader;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.avro.mapred.AvroKey;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;

import eu.dnetlib.iis.metadataextraction.schemas.DocumentText;

/**
 * Module ingesting fundings details from webcrawl XML documents.
 *
 * <p>Each input record carries an XML document in its text field. The document is parsed
 * with a non-validating SAX parser using {@link WebcrawlFundingsHandler}; when the handler
 * yields a non-blank funding text, a new record with the same id and that text is emitted.
 * Records with blank text, records whose funding text is blank and records that fail to
 * parse produce no output.
 *
 * @author mhorst
 */
public class WebcrawlFundingsIngester
        extends Mapper<AvroKey<DocumentText>, NullWritable, AvroKey<DocumentText>, NullWritable> {

    private static final Logger log = Logger.getLogger(WebcrawlFundingsIngester.class);

    /**
     * Extracts the funding text from a single XML document and writes it to the output.
     *
     * @param key wrapper around the input record holding the document id and its XML text
     * @param value unused placeholder value
     * @param context mapper context the resulting record is written to
     * @throws IOException when writing the output record fails
     * @throws InterruptedException when writing the output record is interrupted
     */
    @Override
    protected void map(AvroKey<DocumentText> key, NullWritable value, Context context)
            throws IOException, InterruptedException {
        DocumentText xmlText = key.datum();
        if (StringUtils.isBlank(xmlText.getText())) {
            return;
        }
        DocumentText fundingRecord = buildFundingRecord(xmlText);
        if (fundingRecord != null) {
            // Written outside of the best-effort extraction block on purpose: a failed or
            // interrupted write has to reach the framework instead of being logged as an
            // extraction problem and silently dropping the record.
            context.write(new AvroKey<DocumentText>(fundingRecord), NullWritable.get());
        }
    }

    /**
     * Parses the XML text of the given record and builds the output record.
     *
     * <p>Extraction is best effort: any failure is logged and reported as {@code null}
     * so that a single malformed document does not fail the whole task.
     *
     * @param xmlText input record with non-blank XML text
     * @return record holding the input id and the extracted funding text, or {@code null}
     *         when the funding text is blank or the extraction failed
     */
    private DocumentText buildFundingRecord(DocumentText xmlText) {
        try {
            // disabling validation
            SAXParserFactory saxFactory = SAXParserFactory.newInstance();
            saxFactory.setValidating(false);
            SAXParser saxParser = saxFactory.newSAXParser();
            XMLReader reader = saxParser.getXMLReader();
            reader.setFeature("http://xml.org/sax/features/validation", false);
            reader.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
            reader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
            // Crawled documents are untrusted input: external entities are disabled so that
            // a document cannot make the parser read local files or remote resources (XXE).
            reader.setFeature("http://xml.org/sax/features/external-general-entities", false);
            reader.setFeature("http://xml.org/sax/features/external-parameter-entities", false);

            WebcrawlFundingsHandler xmlHandler = new WebcrawlFundingsHandler();
            saxParser.parse(new InputSource(new StringReader(xmlText.getText().toString())), xmlHandler);

            if (StringUtils.isBlank(xmlHandler.getFundingText())) {
                return null;
            }
            DocumentText.Builder output = DocumentText.newBuilder();
            output.setId(xmlText.getId());
            output.setText(xmlHandler.getFundingText());
            return output.build();
        } catch (Exception e) {
            log.error("Fundings text extraction failed for id " + xmlText.getId()
                    + " and text: " + xmlText.getText(), e);
            return null;
        }
    }
}