package org.icij.extract.extractor; import org.apache.tika.io.TemporaryResources; import org.icij.extract.document.Document; import org.icij.extract.document.DocumentFactory; import org.icij.extract.document.PathIdentifier; import org.icij.extract.spewer.Spewer; import org.icij.extract.test.*; import java.io.InputStreamReader; import java.io.Reader; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.NoSuchFileException; import java.nio.file.Paths; import org.apache.tika.metadata.Metadata; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.junit.Ignore; import org.junit.Test; import org.junit.Assert; import org.junit.Rule; import org.junit.rules.ExpectedException; public class ExtractorTest { private final DocumentFactory factory = new DocumentFactory().withIdentifier(new PathIdentifier()); @Rule public final ExpectedException thrown = ExpectedException.none(); @Test public void testOcr() throws Throwable { final Extractor extractor = new Extractor(); final Document document = factory.create(getClass().getResource("/documents/ocr/simple.tiff")); String text; try (final TemporaryResources tmp = new TemporaryResources(); Reader reader = extractor.extract(document, tmp)) { text = Spewer.toString(reader); } Assert.assertEquals("image/tiff", document.getMetadata().get(Metadata.CONTENT_TYPE)); Assert.assertEquals("HEAVY\nMETAL", text.trim()); } @Test public void testDisableOcr() throws Throwable { final Extractor extractor = new Extractor(); extractor.disableOcr(); final Document document = factory.create(getClass().getResource("/documents/ocr/simple.tiff")); final TemporaryResources tmp = new TemporaryResources(); final Reader reader = extractor.extract(document, tmp); final int read = reader.read(); tmp.close(); Assert.assertEquals("image/tiff", document.getMetadata().get(Metadata.CONTENT_TYPE)); Assert.assertEquals(-1, read); } @Test public void testFileNotFound() throws Throwable { final Extractor extractor = new Extractor(); final Document document = factory.create(Paths.get("nothing")); thrown.expect(NoSuchFileException.class); thrown.expectMessage("nothing"); try (TemporaryResources tmp = new TemporaryResources()) { extractor.extract(document, tmp); } } @Test public void testEncryptedPdf() throws Throwable { final Extractor extractor = new Extractor(); final Document document = factory.create(getClass().getResource("/documents/pdf/encrypted.pdf")); thrown.expect(IOException.class); thrown.expectMessage(""); thrown.expectCause(new CauseMatcher(EncryptedDocumentException.class, "Unable to process: document is encrypted")); final int read; try (final TemporaryResources tmp = new TemporaryResources(); Reader reader = extractor.extract(document, tmp)) { read = reader.read(); } catch (IOException e) { Assert.assertEquals("application/pdf", document.getMetadata().get(Metadata.CONTENT_TYPE)); throw e; } Assert.fail(String.format("Read \"%d\" while expecting exception.", read)); } @Test public void testGarbage() throws Throwable { final Extractor extractor = new Extractor(); final Document document = factory.create(getClass().getResource("/documents/garbage.bin")); thrown.expect(IOException.class); thrown.expectMessage(""); thrown.expectCause(new CauseMatcher(TikaException.class, "Parse error")); final int read; try (final TemporaryResources tmp = new TemporaryResources(); Reader reader = extractor.extract(document, tmp)) { read = reader.read(); } catch (IOException e) { Assert.assertEquals("application/octet-stream", document.getMetadata().get(Metadata.CONTENT_TYPE)); throw e; } Assert.fail(String.format("Read \"%d\" while expecting exception.", read)); } @Test public void testEmbeds() throws Throwable { final Extractor extractor = new Extractor(); final Document document = factory.create(getClass().getResource("/documents/ocr/embedded.pdf")); String text; try (final TemporaryResources tmp = new TemporaryResources(); Reader reader = extractor.extract(document, tmp)) { text = Spewer.toString(reader); } Assert.assertEquals("application/pdf", document.getMetadata().get(Metadata.CONTENT_TYPE)); //Assert.assertEquals("HEAVY\nMETAL\n\n\n\n\n\nHEAVY\nMETAL", text.trim()); Assert.assertThat(text, RegexMatcher.matchesRegex("^\\s+HEAVY\\sMETAL\\s+HEAVY\\sMETAL\\s+$")); } @Test public void testIgnoreEmbeds() throws Throwable { final Extractor extractor = new Extractor(); extractor.setEmbedHandling(Extractor.EmbedHandling.IGNORE); Assert.assertEquals(extractor.getEmbedHandling(), Extractor.EmbedHandling.IGNORE); final Document document = factory.create(getClass().getResource("/documents/ocr/embedded.pdf")); String text; try (final TemporaryResources tmp = new TemporaryResources(); Reader reader = extractor.extract(document, tmp)) { text = Spewer.toString(reader); } Assert.assertEquals("application/pdf", document.getMetadata().get(Metadata.CONTENT_TYPE)); Assert.assertEquals("\n\n\n\n", text); } @Test public void testDisableOcrOnEmbed() throws Throwable { final Extractor extractor = new Extractor(); extractor.disableOcr(); final Document document = factory.create(getClass().getResource("/documents/ocr/embedded.pdf")); String text; try (final TemporaryResources tmp = new TemporaryResources(); Reader reader = extractor.extract(document, tmp)) { text = Spewer.toString(reader); } Assert.assertEquals("application/pdf", document.getMetadata().get(Metadata.CONTENT_TYPE)); Assert.assertEquals("\n\n\n\n", text); } @Test @Ignore public void testHtmlOutput() throws Throwable { final Extractor extractor = new Extractor(); extractor.setOutputFormat(Extractor.OutputFormat.HTML); final Document document = factory.create(getClass().getResource("/documents/text/utf16.txt")); String text; try (final TemporaryResources tmp = new TemporaryResources(); Reader reader = extractor.extract(document, tmp)) { text = Spewer.toString(reader); } Assert.assertEquals("text/plain; charset=UTF-16LE", document.getMetadata().get(Metadata.CONTENT_TYPE)); Assert.assertEquals(getExpected("/expected/text/utf16-txt.html"), text); } @Test @Ignore public void testHtmlOutputWithEmbeds() throws Throwable { final Extractor extractor = new Extractor(); extractor.setOutputFormat(Extractor.OutputFormat.HTML); final Document document = factory.create(getClass().getResource("/documents/ocr/embedded.pdf")); String text; try (final TemporaryResources tmp = new TemporaryResources(); Reader reader = extractor.extract(document, tmp)) { text = Spewer.toString(reader); } Assert.assertEquals("application/pdf", document.getMetadata().get(Metadata.CONTENT_TYPE)); Assert.assertEquals(getExpected("/expected/text/embedded-pdf.html"), text); } @Test @Ignore public void testHtmlOutputWithEmbeddedEmbeds() throws Throwable { final Extractor extractor = new Extractor(); extractor.setOutputFormat(Extractor.OutputFormat.HTML); Assert.assertEquals(extractor.getOutputFormat(), Extractor.OutputFormat.HTML); //extractor.setEmbedHandling(Extractor.EmbedHandling.EMBED); //Assert.assertEquals(extractor.getEmbedHandling(), Extractor.EmbedHandling.EMBED); final Document document = factory.create(getClass().getResource("/documents/ocr/embedded.pdf")); String text; try (final TemporaryResources tmp = new TemporaryResources(); Reader reader = extractor.extract(document, tmp)) { text = Spewer.toString(reader); } Assert.assertEquals("application/pdf", document.getMetadata().get(Metadata.CONTENT_TYPE)); Assert.assertEquals(getExpected("/expected/text/embedded-data-uri-pdf.html"), text); } private String getExpected(final String file) throws IOException { try (final Reader input = new InputStreamReader(getClass().getResourceAsStream(file), StandardCharsets.UTF_8)) { return Spewer.toString(input); } } }