/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.cli; import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import java.io.ByteArrayOutputStream; import java.io.OutputStream; import java.io.PrintStream; import java.io.Reader; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.List; import org.apache.commons.io.FileUtils; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.serialization.JsonMetadataList; import org.apache.tika.parser.RecursiveParserWrapper; import org.junit.After; import org.junit.Before; import org.junit.Test; public class TikaCLIBatchIntegrationTest { private Path testInputDir = Paths.get("src/test/resources/test-data"); private String testInputDirForCommandLine; private Path tempOutputDir; private String tempOutputDirForCommandLine; private OutputStream out = null; private OutputStream err = null; private ByteArrayOutputStream outBuffer = null; @Before public void setup() throws Exception { tempOutputDir = Files.createTempDirectory("tika-cli-test-batch-"); outBuffer = new ByteArrayOutputStream(); PrintStream outWriter = new PrintStream(outBuffer, true, UTF_8.name()); ByteArrayOutputStream errBuffer = new ByteArrayOutputStream(); PrintStream errWriter = new PrintStream(errBuffer, true, UTF_8.name()); out = System.out; err = System.err; System.setOut(outWriter); System.setErr(errWriter); testInputDirForCommandLine = testInputDir.toAbsolutePath().toString(); tempOutputDirForCommandLine = tempOutputDir.toAbsolutePath().toString(); } @After public void tearDown() throws Exception { System.setOut(new PrintStream(out, true, UTF_8.name())); System.setErr(new PrintStream(err, true, UTF_8.name())); //TODO: refactor to use our deleteDirectory with straight path FileUtils.deleteDirectory(tempOutputDir.toFile()); } @Test public void testSimplestBatchIntegration() throws Exception { String[] params = {testInputDirForCommandLine, tempOutputDirForCommandLine}; TikaCLI.main(params); assertFileExists(tempOutputDir.resolve("bad_xml.xml.xml")); assertFileExists(tempOutputDir.resolve("coffee.xls.xml")); } @Test public void testBasicBatchIntegration() throws Exception { String[] params = {"-i", testInputDirForCommandLine, "-o", tempOutputDirForCommandLine, "-numConsumers", "2" }; TikaCLI.main(params); assertFileExists(tempOutputDir.resolve("bad_xml.xml.xml")); assertFileExists(tempOutputDir.resolve("coffee.xls.xml")); } @Test public void testJsonRecursiveBatchIntegration() throws Exception { String[] params = {"-i", testInputDirForCommandLine, "-o", tempOutputDirForCommandLine, "-numConsumers", "10", "-J", //recursive Json "-t" //plain text in content }; TikaCLI.main(params); Path jsonFile = tempOutputDir.resolve("test_recursive_embedded.docx.json"); try (Reader reader = Files.newBufferedReader(jsonFile, UTF_8)) { List<Metadata> metadataList = JsonMetadataList.fromJson(reader); assertEquals(12, metadataList.size()); assertTrue(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).contains("human events")); } } @Test public void testProcessLogFileConfig() throws Exception { String[] params = {"-i", testInputDirForCommandLine, "-o", tempOutputDirForCommandLine, "-numConsumers", "2", "-JDlog4j.configuration=log4j_batch_process_test.properties"}; TikaCLI.main(params); assertFileExists(tempOutputDir.resolve("bad_xml.xml.xml")); assertFileExists(tempOutputDir.resolve("coffee.xls.xml")); String sysOutString = new String(outBuffer.toByteArray(), UTF_8); assertTrue(sysOutString.contains("MY_CUSTOM_LOG_CONFIG")); } @Test public void testDigester() throws Exception { /* try { String[] params = {"-i", escape(testDataFile.getAbsolutePath()), "-o", escape(tempOutputDir.getAbsolutePath()), "-numConsumers", "10", "-J", //recursive Json "-t" //plain text in content }; TikaCLI.main(params); reader = new InputStreamReader( new FileInputStream(new File(tempOutputDir, "test_recursive_embedded.docx.json")), UTF_8); List<Metadata> metadataList = JsonMetadataList.fromJson(reader); assertEquals(12, metadataList.size()); assertEquals("59f626e09a8c16ab6dbc2800c685f772", metadataList.get(0).get("X-TIKA:digest:MD5")); assertEquals("22e6e91f408d018417cd452d6de3dede", metadataList.get(5).get("X-TIKA:digest:MD5")); } finally { IOUtils.closeQuietly(reader); } */ String[] params = {"-i", testInputDirForCommandLine, "-o", tempOutputDirForCommandLine, "-numConsumers", "10", "-J", //recursive Json "-t", //plain text in content "-digest", "sha512" }; TikaCLI.main(params); Path jsonFile = tempOutputDir.resolve("test_recursive_embedded.docx.json"); try (Reader reader = Files.newBufferedReader(jsonFile, UTF_8)) { List<Metadata> metadataList = JsonMetadataList.fromJson(reader); assertEquals(12, metadataList.size()); assertNotNull(metadataList.get(0).get("X-TIKA:digest:SHA512")); assertTrue(metadataList.get(0).get("X-TIKA:digest:SHA512").startsWith("ee46d973ee1852c01858")); } } private void assertFileExists(Path path) { assertTrue("File doesn't exist: "+path.toAbsolutePath(), Files.isRegularFile(path)); } }