package eu.dnetlib.iis.wf.metadataextraction; import static eu.dnetlib.iis.wf.metadataextraction.MetadataExtractorMapper.EXCLUDED_IDS; import static eu.dnetlib.iis.wf.metadataextraction.MetadataExtractorMapper.FAULT_CODE_PROCESSING_TIME_THRESHOLD_EXCEEDED; import static eu.dnetlib.iis.wf.metadataextraction.MetadataExtractorMapper.FAULT_SUPPLEMENTARY_DATA_PROCESSING_TIME; import static eu.dnetlib.iis.wf.metadataextraction.MetadataExtractorMapper.INTERRUPT_PROCESSING_TIME_THRESHOLD_SECS; import static eu.dnetlib.iis.wf.metadataextraction.MetadataExtractorMapper.LOG_FAULT_PROCESSING_TIME_THRESHOLD_SECS; import static eu.dnetlib.iis.wf.metadataextraction.MetadataExtractorMapper.NAMED_OUTPUT_FAULT; import static eu.dnetlib.iis.wf.metadataextraction.MetadataExtractorMapper.NAMED_OUTPUT_META; import static eu.dnetlib.iis.wf.metadataextraction.MetadataExtractorMapper.InvalidRecordCounters.INVALID_PDF_HEADER; import static eu.dnetlib.iis.wf.metadataextraction.NlmToDocumentWithBasicMetadataConverter.EMPTY_META; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import static org.mockito.Matchers.any; import static org.mockito.Mockito.doReturn; import static org.mockito.Mockito.never; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; import java.io.IOException; import java.nio.ByteBuffer; import org.apache.avro.mapred.AvroKey; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Mapper.Context; import org.junit.Before; import org.junit.Test; import org.junit.runner.RunWith; import org.mockito.ArgumentCaptor; import org.mockito.Captor; import org.mockito.Mock; import org.mockito.runners.MockitoJUnitRunner; import com.itextpdf.text.exceptions.InvalidPdfException; import eu.dnetlib.iis.audit.schemas.Fault; import eu.dnetlib.iis.common.javamapreduce.MultipleOutputs; import eu.dnetlib.iis.importer.schemas.DocumentContent; import eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata; import pl.edu.icm.cermine.tools.timeout.TimeoutException; /** * @author mhorst * */ @RunWith(MockitoJUnitRunner.class) @SuppressWarnings({"rawtypes", "unchecked"}) public class MetadataExtractorMapperTest { private static final String PDF_FILE = "/eu/dnetlib/iis/wf/metadataextraction/pdf-example.pdf"; private static final String NON_PDF_FILE = "/eu/dnetlib/iis/wf/metadataextraction/nlm-example.xml"; @Mock private Context context; @Mock private MultipleOutputs multipleOutputs; @Captor private ArgumentCaptor<String> mosKeyCaptor; @Captor private ArgumentCaptor<AvroKey<?>> mosValueCaptor; @Mock private Counter invalidPdfCounter; private MetadataExtractorMapper mapper; @Before public void init() throws Exception { mapper = new MetadataExtractorMapper() { @Override protected MultipleOutputs instantiateMultipleOutputs(Context context) { return multipleOutputs; } }; } // ------------------------------------- TESTS ----------------------------------- @Test(expected=RuntimeException.class) public void testSetupWithoutNamedOutputMeta() throws Exception { // given Configuration conf = new Configuration(); doReturn(conf).when(context).getConfiguration(); // execute mapper.setup(context); } @Test(expected=RuntimeException.class) public void testSetupWithoutNamedOutputFault() throws Exception { // given Configuration conf = new Configuration(); conf.set(NAMED_OUTPUT_META, "meta"); doReturn(conf).when(context).getConfiguration(); // execute mapper.setup(context); } @Test public void testMap() throws Exception { // given Configuration conf = new Configuration(); conf.set(NAMED_OUTPUT_META, "meta"); conf.set(NAMED_OUTPUT_FAULT, "fault"); doReturn(conf).when(context).getConfiguration(); doReturn(invalidPdfCounter).when(context).getCounter(INVALID_PDF_HEADER); mapper.setup(context); String id = "id"; DocumentContent.Builder docContentBuilder = DocumentContent.newBuilder(); docContentBuilder.setId(id); docContentBuilder.setPdf(ByteBuffer.wrap(getContent(PDF_FILE))); // execute mapper.map(new AvroKey<>(docContentBuilder.build()), null, context); // assert verify(context, never()).write(any(), any()); verify(multipleOutputs, times(1)).write(mosKeyCaptor.capture(), mosValueCaptor.capture()); // doc meta assertEquals(conf.get(NAMED_OUTPUT_META), mosKeyCaptor.getValue()); ExtractedDocumentMetadata docMeta = (ExtractedDocumentMetadata) mosValueCaptor.getValue().datum(); assertNotNull(docMeta); assertEquals(id, docMeta.getId()); verify(invalidPdfCounter, never()).increment(1); } @Test public void testMapWithExcludedIds() throws Exception { // given String id = "id"; DocumentContent.Builder docContentBuilder = DocumentContent.newBuilder(); docContentBuilder.setId(id); docContentBuilder.setPdf(ByteBuffer.wrap(getContent(PDF_FILE))); Configuration conf = new Configuration(); conf.set(NAMED_OUTPUT_META, "meta"); conf.set(NAMED_OUTPUT_FAULT, "fault"); conf.set(EXCLUDED_IDS, id); doReturn(conf).when(context).getConfiguration(); doReturn(invalidPdfCounter).when(context).getCounter(INVALID_PDF_HEADER); mapper.setup(context); // execute mapper.map(new AvroKey<>(docContentBuilder.build()), null, context); // assert verify(context, never()).write(any(), any()); verify(multipleOutputs, never()).write(any(), any()); verify(invalidPdfCounter, never()).increment(1); } @Test public void testMapWithNullContent() throws Exception { // given String id = "id"; DocumentContent.Builder docContentBuilder = DocumentContent.newBuilder(); docContentBuilder.setId(id); Configuration conf = new Configuration(); conf.set(NAMED_OUTPUT_META, "meta"); conf.set(NAMED_OUTPUT_FAULT, "fault"); doReturn(conf).when(context).getConfiguration(); doReturn(invalidPdfCounter).when(context).getCounter(INVALID_PDF_HEADER); mapper.setup(context); // execute mapper.map(new AvroKey<>(docContentBuilder.build()), null, context); // assert verify(context, never()).write(any(), any()); verify(multipleOutputs, never()).write(any(), any()); verify(invalidPdfCounter, never()).increment(1); } @Test public void testMapWithIvalidPdf() throws Exception { // given Configuration conf = new Configuration(); conf.set(NAMED_OUTPUT_META, "meta"); conf.set(NAMED_OUTPUT_FAULT, "fault"); doReturn(conf).when(context).getConfiguration(); doReturn(invalidPdfCounter).when(context).getCounter(INVALID_PDF_HEADER); mapper.setup(context); String id = "id"; DocumentContent.Builder docContentBuilder = DocumentContent.newBuilder(); docContentBuilder.setId(id); docContentBuilder.setPdf(ByteBuffer.wrap(getContent(NON_PDF_FILE))); // execute mapper.map(new AvroKey<>(docContentBuilder.build()), null, context); // assert verify(context, never()).write(any(), any()); verify(multipleOutputs, times(2)).write(mosKeyCaptor.capture(), mosValueCaptor.capture()); // doc meta assertEquals(conf.get(NAMED_OUTPUT_META), mosKeyCaptor.getAllValues().get(0)); ExtractedDocumentMetadata docMeta = (ExtractedDocumentMetadata) mosValueCaptor.getAllValues().get(0).datum(); assertNotNull(docMeta); assertEquals(id, docMeta.getId()); assertEquals("", docMeta.getText()); assertEquals(EMPTY_META, docMeta.getPublicationTypeName()); // fault assertEquals(conf.get(NAMED_OUTPUT_FAULT), mosKeyCaptor.getAllValues().get(1)); Fault fault = (Fault) mosValueCaptor.getAllValues().get(1).datum(); assertNotNull(fault); assertEquals(id, fault.getInputObjectId()); assertEquals(InvalidPdfException.class.getName(), fault.getCode()); assertTrue(fault.getTimestamp() > 0); verify(invalidPdfCounter, times(1)).increment(1); } @Test public void testMapWithInterruption() throws Exception { // given Configuration conf = new Configuration(); conf.set(NAMED_OUTPUT_META, "meta"); conf.set(NAMED_OUTPUT_FAULT, "fault"); conf.set(INTERRUPT_PROCESSING_TIME_THRESHOLD_SECS, String.valueOf(1)); doReturn(conf).when(context).getConfiguration(); doReturn(invalidPdfCounter).when(context).getCounter(INVALID_PDF_HEADER); mapper.setup(context); String id = "id"; DocumentContent.Builder docContentBuilder = DocumentContent.newBuilder(); docContentBuilder.setId(id); docContentBuilder.setPdf(ByteBuffer.wrap(getContent(PDF_FILE))); // execute mapper.map(new AvroKey<>(docContentBuilder.build()), null, context); // assert verify(context, never()).write(any(), any()); verify(multipleOutputs, times(2)).write(mosKeyCaptor.capture(), mosValueCaptor.capture()); // doc meta assertEquals(conf.get(NAMED_OUTPUT_META), mosKeyCaptor.getAllValues().get(0)); ExtractedDocumentMetadata docMeta = (ExtractedDocumentMetadata) mosValueCaptor.getAllValues().get(0).datum(); assertNotNull(docMeta); assertEquals(id, docMeta.getId()); assertEquals("", docMeta.getText()); assertEquals(EMPTY_META, docMeta.getPublicationTypeName()); // fault assertEquals(conf.get(NAMED_OUTPUT_FAULT), mosKeyCaptor.getAllValues().get(1)); Fault fault = (Fault) mosValueCaptor.getAllValues().get(1).datum(); assertNotNull(fault); assertEquals(id, fault.getInputObjectId()); assertEquals(TimeoutException.class.getName(), fault.getCode()); assertTrue(fault.getTimestamp() > 0); } @Test public void testMapWithProcessingTimeExceeded() throws Exception { // given Configuration conf = new Configuration(); conf.set(NAMED_OUTPUT_META, "meta"); conf.set(NAMED_OUTPUT_FAULT, "fault"); conf.set(LOG_FAULT_PROCESSING_TIME_THRESHOLD_SECS, String.valueOf(1)); doReturn(conf).when(context).getConfiguration(); doReturn(invalidPdfCounter).when(context).getCounter(INVALID_PDF_HEADER); mapper.setup(context); String id = "id"; DocumentContent.Builder docContentBuilder = DocumentContent.newBuilder(); docContentBuilder.setId(id); docContentBuilder.setPdf(ByteBuffer.wrap(getContent(PDF_FILE))); // execute mapper.map(new AvroKey<>(docContentBuilder.build()), null, context); // assert verify(context, never()).write(any(), any()); verify(multipleOutputs, times(2)).write(mosKeyCaptor.capture(), mosValueCaptor.capture()); // doc meta assertEquals(conf.get(NAMED_OUTPUT_META), mosKeyCaptor.getAllValues().get(0)); ExtractedDocumentMetadata docMeta = (ExtractedDocumentMetadata) mosValueCaptor.getAllValues().get(0).datum(); assertNotNull(docMeta); assertEquals(id, docMeta.getId()); assertTrue(StringUtils.isNotBlank(docMeta.getText())); // fault assertEquals(conf.get(NAMED_OUTPUT_FAULT), mosKeyCaptor.getAllValues().get(1)); Fault fault = (Fault) mosValueCaptor.getAllValues().get(1).datum(); assertNotNull(fault); assertEquals(id, fault.getInputObjectId()); assertEquals(FAULT_CODE_PROCESSING_TIME_THRESHOLD_EXCEEDED, fault.getCode()); assertTrue(fault.getTimestamp() > 0); assertNotNull(fault.getSupplementaryData()); assertEquals(1, fault.getSupplementaryData().size()); assertNotNull(fault.getSupplementaryData().get(FAULT_SUPPLEMENTARY_DATA_PROCESSING_TIME)); } @Test public void testCleanup() throws Exception { // given Configuration conf = new Configuration(); conf.set(NAMED_OUTPUT_META, "meta"); conf.set(NAMED_OUTPUT_FAULT, "fault"); conf.set(NAMED_OUTPUT_FAULT, "fault"); doReturn(conf).when(context).getConfiguration(); doReturn(invalidPdfCounter).when(context).getCounter(INVALID_PDF_HEADER); mapper.setup(context); // execute mapper.cleanup(context); // assert verify(multipleOutputs, times(1)).close(); } // --------------------------------------- PRIVATE ---------------------------------------- private byte[] getContent(String location) throws IOException { return IOUtils.toByteArray(MetadataExtractorMapper.class.getResourceAsStream(location)); } }