/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.cli;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.PrintStream;
import java.net.URI;
import org.apache.commons.io.FileUtils;
import org.apache.tika.exception.TikaException;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
/**
 * Tests for the Tika command-line interface.
 */
public class TikaCLITest {

    /** Directory holding the test documents, relative to the working directory. */
    private final File testDataFile = new File("src/test/resources/test-data");

    /** URI form of {@link #testDataFile}; resource arguments for the CLI are built from it. */
    private final URI testDataURI = testDataFile.toURI();

    /** Buffer that captures everything written to {@code System.out} during a test. */
    private ByteArrayOutputStream outContent;

    /** The {@code System.out} that was installed before the test; restored in {@link #tearDown()}. */
    private PrintStream stdout;

    /**
     * String form of {@link #testDataURI}. {@code File.toURI()} appends a trailing
     * slash for an existing directory, so file names are appended to it directly.
     */
    private String resourcePrefix;

    /**
     * Redirects {@code System.out} into an in-memory, UTF-8 encoded buffer.
     *
     * @throws Exception if the capturing stream cannot be created
     */
    @Before
    public void setUp() throws Exception {
        outContent = new ByteArrayOutputStream();
        resourcePrefix = testDataURI.toString();
        stdout = System.out;
        System.setOut(new PrintStream(outContent, true, UTF_8.name()));
    }

    /**
     * Tears down the test. Restores the original {@code System.out}.
     *
     * @throws Exception never thrown by the current implementation
     */
    @After
    public void tearDown() throws Exception {
        System.setOut(stdout);
    }

    /**
     * Runs the CLI with the given arguments and returns only the output of this run.
     * The capture buffer is cleared first so that assertions made after a second
     * invocation inside one test cannot be satisfied by output of the first one.
     *
     * @param args command line arguments passed to {@code TikaCLI.main}
     * @return everything written to {@code System.out} by this invocation, decoded as UTF-8
     * @throws Exception whatever {@code TikaCLI.main} throws
     */
    private String runCli(String... args) throws Exception {
        outContent.reset();
        TikaCLI.main(args);
        return outContent.toString(UTF_8.name());
    }

    /**
     * Asserts the legacy alphabetic ordering of metadata keys in JSON output:
     * {@code Content-Encoding} before {@code fb:admins} before {@code title}.
     *
     * @param json the JSON text to inspect
     */
    private static void assertLegacyKeyOrder(String json) {
        int enc = json.indexOf("\"Content-Encoding\"");
        int fb = json.indexOf("fb:admins");
        int title = json.indexOf("\"title\"");
        assertTrue(enc > -1 && fb > -1 && enc < fb);
        assertTrue(fb > -1 && title > -1 && fb < title);
    }

    /**
     * Asserts that a file exists, is a regular (non-directory) file and is not empty.
     *
     * @param f        the file expected to have been extracted
     * @param allFiles listing of the extraction directory, used in the failure message
     */
    protected static void assertExtracted(File f, String allFiles) {
        assertTrue("File " + f.getName() + " not found in " + allFiles, f.exists());
        assertFalse("File " + f.getName() + " is a directory!", f.isDirectory());
        assertTrue("File " + f.getName() + " wasn't extracted with contents", f.length() > 0);
    }

    /**
     * Tests --list-parser-detail option of the cli
     *
     * @throws Exception
     */
    @Test
    public void testListParserDetail() throws Exception {
        String output = runCli("--list-parser-detail");
        assertTrue(output.contains("application/vnd.oasis.opendocument.text-web"));
    }

    /**
     * Tests --list-parser option of the cli
     *
     * @throws Exception
     */
    @Test
    public void testListParsers() throws Exception {
        String output = runCli("--list-parser");
        // The stricter check for org.apache.tika.parser.iwork.IWorkPackageParser was
        // disabled temporarily while a problem was investigated; until it is restored
        // at least verify that the option prints something.
        assertTrue("--list-parser produced no output", output.length() > 0);
    }

    /**
     * Tests -x option of the cli
     *
     * @throws Exception
     */
    @Test
    public void testXMLOutput() throws Exception {
        String xml = runCli("-x", resourcePrefix + "alice.cli.test");
        assertTrue(xml.contains("?xml version=\"1.0\" encoding=\"UTF-8\"?"));

        xml = runCli("-x", "--digest=SHA256", resourcePrefix + "alice.cli.test");
        assertTrue(xml.contains("<meta name=\"X-TIKA:digest:SHA256\" content=\"e90779adbac09c4ee"));
    }

    /**
     * Tests a -h option of the cli
     *
     * @throws Exception
     */
    @Test
    public void testHTMLOutput() throws Exception {
        String html = runCli("-h", resourcePrefix + "alice.cli.test");
        assertTrue(html.contains("html xmlns=\"http://www.w3.org/1999/xhtml"));
        assertTrue("Expanded <title></title> element should be present",
                html.contains("<title></title>"));

        html = runCli("-h", "--digest=SHA384", resourcePrefix + "alice.cli.test");
        assertTrue(html.contains("<meta name=\"X-TIKA:digest:SHA384\" content=\"c69ea023f5da95a026"));
    }

    /**
     * Tests -t option of the cli
     *
     * @throws Exception
     */
    @Test
    public void testTextOutput() throws Exception {
        String text = runCli("-t", resourcePrefix + "alice.cli.test");
        assertTrue(text.contains("finished off the cake"));
    }

    /**
     * Tests -m option of the cli
     *
     * @throws Exception
     */
    @Test
    public void testMetadataOutput() throws Exception {
        String metadata = runCli("-m", resourcePrefix + "alice.cli.test");
        assertTrue(metadata.contains("text/plain"));

        // runCli clears the buffer, so these checks see the digest run only
        metadata = runCli("-m", "--digest=SHA512", resourcePrefix + "alice.cli.test");
        assertTrue(metadata.contains("text/plain"));
        assertTrue(metadata.contains("X-TIKA:digest:SHA512: dd459d99bc19ff78fd31fbae46e0"));
    }

    /**
     * Basic tests for -json option
     *
     * @throws Exception
     */
    @Test
    public void testJsonMetadataOutput() throws Exception {
        String json = runCli("--json", "--digest=MD2", resourcePrefix + "testJsonMultipleInts.html");
        //TIKA-1310
        assertTrue(json.contains("\"fb:admins\":\"1,2,3,4\","));
        //test legacy alphabetic sort of keys
        assertLegacyKeyOrder(json);
        assertTrue(json.contains("\"X-TIKA:digest:MD2\":"));
    }

    /**
     * Test for -json with prettyprint option
     *
     * @throws Exception
     */
    @Test
    public void testJsonMetadataPrettyPrintOutput() throws Exception {
        String json = runCli("--json", "-r", resourcePrefix + "testJsonMultipleInts.html");
        assertTrue(json.contains(" \"X-Parsed-By\": [\n" +
                " \"org.apache.tika.parser.DefaultParser\",\n" +
                " \"org.apache.tika.parser.html.HtmlParser\"\n" +
                " ],\n"));
        //test legacy alphabetic sort of keys
        assertLegacyKeyOrder(json);
    }

    /**
     * Tests -l option of the cli
     *
     * @throws Exception
     */
    @Test
    public void testLanguageOutput() throws Exception {
        String language = runCli("-l", resourcePrefix + "alice.cli.test");
        assertTrue(language.contains("en"));
    }

    /**
     * Tests -d option of the cli
     *
     * @throws Exception
     */
    @Test
    public void testDetectOutput() throws Exception {
        String detected = runCli("-d", resourcePrefix + "alice.cli.test");
        assertTrue(detected.contains("text/plain"));
    }

    /**
     * Tests --list-met-models option of the cli
     *
     * @throws Exception
     */
    @Test
    public void testListMetModels() throws Exception {
        String models = runCli("--list-met-models", resourcePrefix + "alice.cli.test");
        assertTrue(models.contains("text/plain"));
    }

    /**
     * Tests --list-supported-types option of the cli
     *
     * @throws Exception
     */
    @Test
    public void testListSupportedTypes() throws Exception {
        String types = runCli("--list-supported-types", resourcePrefix + "alice.cli.test");
        assertTrue(types.contains("supertype: application/octet-stream"));
    }

    /**
     * Tests -z with --extract-dir: the resources embedded in coffee.xls must be
     * written as non-empty files into the extraction directory.
     *
     * @throws Exception
     */
    @Test
    public void testExtract() throws Exception {
        // createTempFile only reserves a unique name as a regular file; replace it
        // with a directory of the same name. Not really a good method for production
        // usage, but ok for tests.
        File extractDir = File.createTempFile("tika-test-", "");
        assertTrue("Could not delete temp file " + extractDir, extractDir.delete());
        assertTrue("Could not create temp directory " + extractDir, extractDir.mkdir());

        try {
            runCli("--extract-dir=" + extractDir.getAbsolutePath(), "-z", resourcePrefix + "coffee.xls");

            // Directory listing is only used to make assertion failures informative;
            // File.list() may return null on an I/O error.
            StringBuilder allFiles = new StringBuilder();
            String[] names = extractDir.list();
            if (names != null) {
                for (String name : names) {
                    if (allFiles.length() > 0) {
                        allFiles.append(" : ");
                    }
                    allFiles.append(name);
                }
            }
            String listing = allFiles.toString();

            // ChemDraw file
            assertExtracted(new File(extractDir, "MBD002B040A.cdx"), listing);
            // Image of the ChemDraw molecule
            assertExtracted(new File(extractDir, "file4.png"), listing);
            // OLE10Native
            assertExtracted(new File(extractDir, "MBD002B0FA6_file5.bin"), listing);
            // Something that really isnt a text file... Not sure what it is???
            assertExtracted(new File(extractDir, "MBD00262FE3.txt"), listing);
            // Image of one of the embedded resources
            assertExtracted(new File(extractDir, "file0.emf"), listing);
        } finally {
            FileUtils.deleteDirectory(extractDir);
        }
    }

    // TIKA-920
    @Test
    public void testMultiValuedMetadata() throws Exception {
        String content = runCli("-m", resourcePrefix + "testMultipleSheets.numbers");
        assertTrue(content.contains("sheetNames: Checking"));
        assertTrue(content.contains("sheetNames: Secon sheet"));
        assertTrue(content.contains("sheetNames: Logical Sheet 3"));
        assertTrue(content.contains("sheetNames: Sheet 4"));
    }

    // TIKA-1031
    @Test
    public void testZipWithSubdirs() throws Exception {
        // The archive is extracted below "target", so that is where leftovers of a
        // previous run live and where this test has to clean up.
        File extractedFile = new File("target/subdir/foo.txt");
        File extractedDir = new File("target/subdir");
        extractedFile.delete();
        extractedDir.delete();
        try {
            String content = runCli("-z", "--extract-dir=target", resourcePrefix + "testWithSubdirs.zip");
            assertTrue(content.contains("Extracting 'subdir/foo.txt'"));
        } finally {
            // clean up even when the assertion above fails
            extractedFile.delete();
            extractedDir.delete();
        }
    }

    @Test
    public void testDefaultConfigException() throws Exception {
        //default xml parser will throw TikaException
        //this and testConfig() are broken into separate tests so that
        //setUp and tearDown() are called each time
        boolean tikaEx = false;
        try {
            runCli(resourcePrefix + "bad_xml.xml");
        } catch (TikaException expected) {
            tikaEx = true;
        }
        assertTrue("Expected a TikaException for bad_xml.xml with the default config", tikaEx);
    }

    @Test
    public void testConfig() throws Exception {
        String content = runCli("--config=" + testDataFile.toString() + "/tika-config1.xml",
                resourcePrefix + "bad_xml.xml");
        assertTrue(content.contains("apple"));
        assertTrue(content.contains("org.apache.tika.parser.html.HtmlParser"));
    }

    @Test
    public void testJsonRecursiveMetadataParserMetadataOnly() throws Exception {
        String content = runCli("-m", "-J", "-r", resourcePrefix + "test_recursive_embedded.docx");
        assertTrue(content.contains("[\n" +
                " {\n" +
                " \"Application-Name\": \"Microsoft Office Word\",\n" +
                " \"Application-Version\": \"15.0000\",\n" +
                " \"Character Count\": \"28\",\n" +
                " \"Character-Count-With-Spaces\": \"31\","));
        assertTrue(content.contains("\"X-TIKA:embedded_resource_path\": \"/embed1.zip\""));
        assertFalse(content.contains("X-TIKA:content"));
    }

    @Test
    public void testJsonRecursiveMetadataParserDefault() throws Exception {
        String content = runCli("-J", "-r", resourcePrefix + "test_recursive_embedded.docx");
        assertTrue(content.contains("\"X-TIKA:content\": \"\\u003chtml xmlns\\u003d\\\"http://www.w3.org/1999/xhtml"));
    }

    @Test
    public void testJsonRecursiveMetadataParserText() throws Exception {
        String content = runCli("-J", "-r", "-t", resourcePrefix + "test_recursive_embedded.docx");
        assertTrue(content.contains("\\n\\nembed_4\\n"));
        assertTrue(content.contains("\\n\\nembed_0"));
    }

    @Test
    public void testDigestInJson() throws Exception {
        String content = runCli("-J", "-r", "-t", "--digest=MD5",
                resourcePrefix + "test_recursive_embedded.docx");
        assertTrue(content.contains("\"X-TIKA:digest:MD5\": \"59f626e09a8c16ab6dbc2800c685f772\","));
        assertTrue(content.contains("\"X-TIKA:digest:MD5\": \"f9627095ef86c482e61d99f0cc1cf87d\""));
    }

    @Test
    public void testConfigSerializationStaticAndCurrent() throws Exception {
        String content = runCli("--dump-static-config");
        //make sure at least one detector is there
        assertTrue(content.contains("<detector class=\"org.apache.tika.parser.microsoft.POIFSContainerDetector\"/>"));
        //make sure Executable is there because follow on tests of custom config
        //test that it has been turned off.
        assertTrue(content.contains("<parser class=\"org.apache.tika.parser.executable.ExecutableParser\"/>"));

        // runCli clears the buffer, so the checks below see the current-config dump only
        content = runCli("--dump-current-config");
        //make sure at least one detector is there
        assertTrue(content.contains("<detector class=\"org.apache.tika.parser.microsoft.POIFSContainerDetector\"/>"));
        //and at least one parser
        assertTrue(content.contains("<parser class=\"org.apache.tika.parser.executable.ExecutableParser\"/>"));
    }

    @Test
    public void testConfigSerializationCustomMinimal() throws Exception {
        // collapse all whitespace runs so the comparison is independent of pretty-printing
        String content = runCli(
                "--config=" + testDataFile.toString() + "/tika-config2.xml",
                "--dump-minimal-config").replaceAll("[\r\n\t ]+", " ");
        String expected =
                "<parser class=\"org.apache.tika.parser.DefaultParser\">" +
                " <mime-exclude>application/pdf</mime-exclude>" +
                " <mime-exclude>image/jpeg</mime-exclude> " +
                "</parser> " +
                "<parser class=\"org.apache.tika.parser.EmptyParser\">" +
                " <mime>application/pdf</mime> " +
                "</parser>";
        assertTrue(content.contains(expected));
    }

    @Test
    public void testConfigSerializationCustomStatic() throws Exception {
        String content = runCli(
                "--config=" + testDataFile.toString() + "/tika-config2.xml", "--dump-static-config");
        assertFalse(content.contains("org.apache.tika.parser.executable.Executable"));
    }
}