package org.apache.tika.batch.fs; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.net.URISyntaxException; import java.nio.charset.Charset; import java.nio.file.DirectoryStream; import java.nio.file.FileVisitResult; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.SimpleFileVisitor; import java.nio.file.attribute.BasicFileAttributes; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import org.apache.commons.io.IOUtils; import org.apache.tika.TikaTest; import org.apache.tika.batch.BatchProcess; import org.apache.tika.batch.BatchProcessDriverCLI; import org.apache.tika.batch.ParallelFileProcessingResult; import org.apache.tika.batch.builders.BatchProcessBuilder; import org.junit.AfterClass; import org.junit.BeforeClass; /** * This is the base class for file-system batch tests. * <p/> * There are a few areas for improvement in this test suite. * <ol> * <li>For the heavy load tests, the test cases leave behind files that * cannot be deleted from within the same jvm. A thread is still actively writing to an * OutputStream when tearDown() is called. The current solution is to create * the temp dir within the target/tika-batch/test-classes so that they will at least * be removed during each maven "clean"</li> * <li>The "mock" tests are time-based. This is not * extremely reliable across different machines with different number/power of cpus. * </li> * </ol> */ public abstract class FSBatchTestBase extends TikaTest { private static Path outputRoot = null; @BeforeClass public static void setUp() throws Exception { Path testOutput = Paths.get("target/test-classes/test-output"); Files.createDirectories(testOutput); outputRoot = Files.createTempDirectory(testOutput, "tika-batch-output-root-"); } @AfterClass public static void tearDown() throws Exception { //not ideal, but should be ok for testing //see caveat in TikaCLITest's textExtract try { deleteDirectory(outputRoot); } catch (IOException e) { e.printStackTrace(); } } protected void destroyProcess(Process p) { if (p == null) return; try { p.exitValue(); } catch (IllegalThreadStateException e) { p.destroy(); } } Path getNewOutputDir(String subdirPrefix) throws IOException { Path outputDir = Files.createTempDirectory(outputRoot, subdirPrefix); assert(countChildren(outputDir) == 0); return outputDir; } Map<String, String> getDefaultArgs(String inputSubDir, Path outputDir) throws Exception { Map<String, String> args = new HashMap<>(); args.put("inputDir", "\""+getInputRoot(inputSubDir).toString()+"\""); if (outputDir != null) { args.put("outputDir", "\""+outputDir.toString()+"\""); } return args; } public String[] getDefaultCommandLineArgsArr(String inputSubDir, Path outputDir, Map<String, String> commandLine) throws Exception { List<String> args = new ArrayList<>(); //need to include "-" because these are going to the commandline! if (inputSubDir != null) { args.add("-inputDir"); args.add(getInputRoot(inputSubDir).toAbsolutePath().toString()); } if (outputDir != null) { args.add("-outputDir"); args.add(outputDir.toAbsolutePath().toString()); } if (commandLine != null) { for (Map.Entry<String, String> e : commandLine.entrySet()) { args.add(e.getKey()); args.add(e.getValue()); } } return args.toArray(new String[args.size()]); } public Path getInputRoot(String subdir) throws Exception { String path = (subdir == null || subdir.length() == 0) ? "/test-input" : "/test-input/"+subdir; return Paths.get(this.getClass().getResource(path).toURI()); } BatchProcess getNewBatchRunner(String testConfig, Map<String, String> args) throws IOException { InputStream is = this.getClass().getResourceAsStream(testConfig); BatchProcessBuilder b = new BatchProcessBuilder(); BatchProcess runner = b.build(is, args); IOUtils.closeQuietly(is); return runner; } public ProcessBuilder getNewBatchRunnerProcess(String testConfig, String loggerProps, Map<String, String> args) { List<String> argList = new ArrayList<>(); for (Map.Entry<String, String> e : args.entrySet()) { argList.add("-"+e.getKey()); argList.add(e.getValue()); } String[] fullCommandLine = commandLine(testConfig, loggerProps, argList.toArray(new String[argList.size()])); return new ProcessBuilder(fullCommandLine); } private String[] commandLine(String testConfig, String loggerProps, String[] args) { List<String> commandLine = new ArrayList<>(); commandLine.add("java"); commandLine.add("-Dlog4j.configuration=file:"+ this.getClass().getResource(loggerProps).getFile()); commandLine.add("-Xmx128m"); commandLine.add("-cp"); String cp = System.getProperty("java.class.path"); //need to test for " " on *nix, can't just add double quotes //across platforms. if (cp.contains(" ")){ cp = "\""+cp+"\""; } commandLine.add(cp); commandLine.add("org.apache.tika.batch.fs.FSBatchProcessCLI"); String configFile = null; try { configFile = Paths.get(this.getClass().getResource(testConfig).toURI()).toAbsolutePath().toString(); } catch (URISyntaxException e) { e.printStackTrace(); } commandLine.add("-bc"); commandLine.add(configFile); for (String s : args) { commandLine.add(s); } return commandLine.toArray(new String[commandLine.size()]); } public BatchProcessDriverCLI getNewDriver(String testConfig, String[] args) throws Exception { List<String> commandLine = new ArrayList<>(); commandLine.add("java"); commandLine.add("-Xmx128m"); commandLine.add("-cp"); String cp = System.getProperty("java.class.path"); //need to test for " " on *nix, can't just add double quotes //across platforms. if (cp.contains(" ")){ cp = "\""+cp+"\""; } commandLine.add(cp); commandLine.add("org.apache.tika.batch.fs.FSBatchProcessCLI"); String configFile = Paths.get( this.getClass().getResource(testConfig).toURI()).toAbsolutePath().toString(); commandLine.add("-bc"); commandLine.add(configFile); for (String s : args) { commandLine.add(s); } BatchProcessDriverCLI driver = new BatchProcessDriverCLI( commandLine.toArray(new String[commandLine.size()])); driver.setRedirectChildProcessToStdOut(false); return driver; } protected ParallelFileProcessingResult run(BatchProcess process) throws Exception { ExecutorService executor = Executors.newSingleThreadExecutor(); Future<ParallelFileProcessingResult> futureResult = executor.submit(process); return futureResult.get(10, TimeUnit.SECONDS); } /** * Counts immediate children only, does not work recursively * @param p * @return * @throws IOException */ public static int countChildren(Path p) throws IOException { int i = 0; try (DirectoryStream<Path> ds = Files.newDirectoryStream(p)) { Iterator<Path> it = ds.iterator(); while (it.hasNext()) { i++; it.next(); } } return i; } //REMOVE THIS AND USE FileUtils, once a java 7 option has been added. public static String readFileToString(Path p, Charset cs) throws IOException { StringBuilder sb = new StringBuilder(); try (BufferedReader r = Files.newBufferedReader(p, cs)) { String line = r.readLine(); while (line != null) { sb.append(line).append("\n"); line = r.readLine(); } } return sb.toString(); } //TODO: move this into FileUtils public static void deleteDirectory(Path dir) throws IOException { Files.walkFileTree(dir, new SimpleFileVisitor<Path>() { @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { Files.delete(file); return FileVisitResult.CONTINUE; } @Override public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { Files.delete(dir); return FileVisitResult.CONTINUE; } }); } /** * helper method equivalent to File#listFiles() * grabs children only, does not walk recursively * @param p * @return */ public static List<Path> listPaths(Path p) throws IOException { List<Path> list = new ArrayList<>(); try (DirectoryStream<Path> ds = Files.newDirectoryStream(p)) { Iterator<Path> it = ds.iterator(); while (it.hasNext()) { list.add(it.next()); } } return list; } }