/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.eval;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.tika.eval.db.Cols;
import org.apache.tika.eval.db.H2Util;
import org.apache.tika.eval.db.TableInfo;
import org.apache.tika.eval.io.ExtractReaderException;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
/**
 * Reads back the tables defined by {@link ExtractProfiler} from an H2 database
 * and checks their contents.
 * <p>
 * The batch run that would populate the database is currently commented out in
 * {@link #setUp()}, so the queries below run against whatever {@link H2Util}
 * opens at the temporary path. NOTE(review): this is presumably why the class
 * is marked {@code @Ignore} -- confirm before re-enabling.
 */
@Ignore
public class ProfilerBatchTest {

    /** Fully qualified name of the batch CLI class used by the (disabled) batch run. */
    public final static String COMPARER_PROCESS_CLASS = "org.apache.tika.batch.fs.FSBatchProcessCLI";

    /** Temporary directory holding the test database; created in {@link #setUp()}. */
    private static Path dbDir;

    /** Connection shared by every test in this class; opened in {@link #setUp()}. */
    private static Connection conn;

    private final static String profileTable = ExtractProfiler.PROFILE_TABLE.getName();
    private final static String exTable = ExtractProfiler.EXCEPTION_TABLE.getName();
    private final static String fpCol = Cols.FILE_PATH.name();

    /**
     * Creates a temporary database directory inside the {@code /test-dirs/extractsA}
     * test resource directory and opens the shared connection.
     *
     * @throws Exception if the resource cannot be resolved, the directory cannot be
     *                   created, or the connection cannot be opened
     */
    @BeforeClass
    public static void setUp() throws Exception {
        // An absolute resource name does not depend on which class performs the
        // lookup, so use this class rather than instantiating another test class.
        Path inputRoot = Paths.get(
                ProfilerBatchTest.class.getResource("/test-dirs/extractsA").toURI());
        dbDir = Files.createTempDirectory(inputRoot, "tika-test-db-dir-");
        Map<String, String> args = new HashMap<>();
        Path db = dbDir.resolve("profiler_test");
        args.put("-db", db.toString());

        //for debugging, you can use this to select only one file pair to load
        //args.put("-includeFilePat", "file8.*");

        // Disabled batch run that consumes args; kept for when the test is re-enabled.
        // BatchProcessTestExecutor ex = new BatchProcessTestExecutor(COMPARER_PROCESS_CLASS, args,
        //         "/single-file-profiler-crawl-input-config.xml");
        // StreamStrings streamStrings = ex.execute();
        // System.out.println(streamStrings.getErrString());
        // System.out.println(streamStrings.getOutString());

        H2Util dbUtil = new H2Util(db);
        conn = dbUtil.getConnection();
    }

    /**
     * Closes the shared connection and removes the temporary database directory.
     * The directory is removed even when closing the connection fails, and nothing
     * is attempted for resources that {@link #setUp()} never got to create.
     *
     * @throws IOException      if the temporary directory cannot be cleaned up
     * @throws RuntimeException wrapping the {@link SQLException} if the connection
     *                          cannot be closed
     */
    @AfterClass
    public static void tearDown() throws IOException {
        try {
            // conn stays null if setUp() failed before opening the connection
            if (conn != null) {
                conn.close();
            }
        } catch (SQLException e) {
            throw new RuntimeException(e);
        } finally {
            deleteDbDir();
        }
    }

    /**
     * Deletes every entry directly inside {@link #dbDir} and then the directory
     * itself. Does nothing if the directory was never created.
     */
    private static void deleteDbDir() throws IOException {
        if (dbDir == null) {
            return;
        }
        // try-with-resources releases the directory handle even if a delete fails
        try (DirectoryStream<Path> entries = Files.newDirectoryStream(dbDir)) {
            for (Path entry : entries) {
                Files.delete(entry);
            }
        }
        Files.delete(dbDir);
    }

    /**
     * Collects the file path column of every row in the container table and checks
     * the row count and the presence of several expected file names.
     */
    @Test
    public void testSimpleDBWriteAndRead() throws Exception {
        List<String> fNameList = new ArrayList<>();
        String sql = "select * from "+ ExtractProfiler.CONTAINER_TABLE.getName();
        try (Statement st = conn.createStatement();
             ResultSet rs = st.executeQuery(sql)) {
            while (rs.next()) {
                fNameList.add(rs.getString(fpCol));
            }
        }
        debugTable(ExtractProfiler.CONTAINER_TABLE);
        debugTable(ExtractProfiler.PROFILE_TABLE);
        debugTable(ExtractProfiler.CONTENTS_TABLE);
        debugTable(ExtractProfiler.EXCEPTION_TABLE);
        debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
        assertEquals(10, fNameList.size());
        assertTrue("file1.pdf", fNameList.contains("file1.pdf"));
        assertTrue("file2_attachANotB.doc", fNameList.contains("file2_attachANotB.doc"));
        assertTrue("file3_attachBNotA.doc", fNameList.contains("file3_attachBNotA.doc"));
        assertTrue("file4_emptyB.pdf", fNameList.contains("file4_emptyB.pdf"));
        assertTrue("file7_badJson.pdf", fNameList.contains("file7_badJson.pdf"));
    }

    /**
     * Checks the extract exception id recorded for three specific files.
     * <p>
     * NOTE(review): the first query reads from {@code extract_exceptions} while the
     * other two read from {@code errors}; table names are hard-coded here rather
     * than taken from {@link ExtractProfiler} -- verify against the current schema.
     */
    @Test
    public void testExtractErrors() throws Exception {
        String sql = "select EXTRACT_EXCEPTION_ID from extract_exceptions e" +
                " join containers c on c.container_id = e.container_id "+
                " where c.file_path='file9_noextract.txt'";
        assertEquals("missing extract: file9_noextract.txt", "0",
                getSingleResult(sql));

        debugTable(ExtractProfiler.CONTAINER_TABLE);
        debugTable(ExtractProfiler.PROFILE_TABLE);
        debugTable(ExtractProfiler.CONTENTS_TABLE);
        debugTable(ExtractProfiler.EXCEPTION_TABLE);
        debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);

        sql = "select EXTRACT_EXCEPTION_ID from errors e" +
                " join containers c on c.container_id = e.container_id "+
                " where c.file_path='file5_emptyA.pdf'";
        assertEquals("empty extract: file5_emptyA.pdf", "1",
                getSingleResult(sql));

        sql = "select EXTRACT_EXCEPTION_ID from errors e" +
                " join containers c on c.container_id = e.container_id "+
                " where c.file_path='file7_badJson.pdf'";
        assertEquals("extract error:file7_badJson.pdf", "2",
                getSingleResult(sql));
    }

    /**
     * Checks the rows recorded for the hanging file and the out-of-memory file.
     * <p>
     * NOTE(review): table and column names in these queries are hard-coded --
     * verify against the current schema.
     */
    @Test
    public void testParseErrors() throws Exception {
        debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
        String sql = "select file_path from errors where container_id is null";
        assertEquals("file10_permahang.txt",
                getSingleResult(sql));

        sql = "select extract_error_id from extract_exceptions " +
                "where file_path='file11_oom.txt'";
        assertEquals(Integer.toString(
                        ExtractReaderException.TYPE.ZERO_BYTE_EXTRACT_FILE.ordinal()),
                getSingleResult(sql));

        sql = "select parse_error_id from extract_exceptions where file_path='file11_oom.txt'";
        assertEquals(Integer.toString(AbstractProfiler.
                        PARSE_ERROR_TYPE.
                        OOM.ordinal()),
                getSingleResult(sql));
    }

    /** Currently only dumps the exception table to standard output. */
    @Test
    public void testParseExceptions() throws Exception {
        debugTable(ExtractProfiler.EXCEPTION_TABLE);
    }

    /**
     * Runs a query that must yield exactly one row with exactly one column and
     * returns that value as a string.
     *
     * @param sql the query to execute on the shared connection
     * @return the single value, read with {@link ResultSet#getString(int)}
     * @throws Exception if the query fails; an assertion error is raised if the
     *                   result does not have exactly one column and one row
     */
    private String getSingleResult(String sql) throws Exception {
        // Both resources are closed here; previously the statement was never closed.
        try (Statement st = conn.createStatement();
             ResultSet rs = st.executeQuery(sql)) {
            // The column count is a property of the query, so check it once.
            assertEquals("must have only one column in result",
                    1, rs.getMetaData().getColumnCount());
            int hits = 0;
            String val = "";
            while (rs.next()) {
                val = rs.getString(1);
                hits++;
            }
            assertEquals("must have only one hit", 1, hits);
            return val;
        }
    }

    //TODO: lots more testing!

    /**
     * Prints the full contents of a table to standard output: the table name, a
     * header line of column names, then one line per row with values separated by
     * {@code " | "}. Prints a notice if the table has no rows.
     *
     * @param table the table to dump
     * @throws Exception if the query fails
     */
    public void debugTable(TableInfo table) throws Exception {
        String sql = "select * from "+table.getName();
        try (Statement st = conn.createStatement();
             ResultSet rs = st.executeQuery(sql)) {
            int colCount = rs.getMetaData().getColumnCount();
            System.out.println("TABLE: "+table.getName());
            for (int i = 1; i <= colCount; i++) {
                if (i > 1) {
                    System.out.print(" | ");
                }
                System.out.print(rs.getMetaData().getColumnName(i));
            }
            System.out.println("");

            int rowCount = 0;
            while (rs.next()) {
                for (int i = 1; i <= colCount; i++) {
                    if (i > 1) {
                        System.out.print(" | ");
                    }
                    System.out.print(rs.getString(i));
                }
                System.out.println("");
                // count once per row, not once per printed cell
                rowCount++;
            }
            if (rowCount == 0) {
                System.out.println(table.getName() + " was empty");
            }
        }
    }
}