/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.eval;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.tika.batch.fs.FSBatchTestBase;
import org.apache.tika.eval.db.Cols;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
/**
 * Database-level checks for the extract comparison tables.
 *
 * <p>The whole class is disabled via {@link Ignore}; per that annotation's message the
 * tests are meant to be moved to TikaEvalCLITest. The code in {@link #setUp()} that would
 * run the batch process and open the database connection is commented out, so
 * {@code conn} is never assigned in this class as it stands. The helper methods below
 * therefore only work once that initialization is restored.
 */
@Ignore("move these tests to TikaEvalCLITest")
public class ComparerBatchTest extends FSBatchTestBase {

    public static final String COMPARER_PROCESS_CLASS = "org.apache.tika.batch.fs.FSBatchProcessCLI";

    /** Temporary directory created in {@link #setUp()}; removed in {@link #tearDown()}. */
    private static Path dbDir;

    /** Shared connection; only assigned by the (currently commented-out) code in setUp. */
    private static Connection conn;

    // Placeholder for the comparisons/containers join used by the disabled tests below;
    // the original expression is preserved in the comment that follows.
    private static final String compJoinCont = "";
    /*ExtractComparer.COMPARISONS_TABLE+" cmp " +
    "join "+ExtractComparer.CONTAINERS_TABLE + " cnt "+
    "on cmp."+AbstractProfiler.CONTAINER_HEADERS.CONTAINER_ID+
    " = cnt."+AbstractProfiler.CONTAINER_HEADERS.CONTAINER_ID;*/

    /**
     * Creates a temporary database directory under the {@code /test-dirs} test resource
     * and prepares the {@code -db} argument. The batch execution and the connection
     * setup are commented out.
     *
     * @throws Exception if the resource cannot be resolved or the directory cannot be created
     */
    @BeforeClass
    public static void setUp() throws Exception {
        File inputRoot = new File(ComparerBatchTest.class.getResource("/test-dirs").toURI());
        dbDir = Files.createTempDirectory(inputRoot.toPath(), "tika-test-db-dir-");
        Map<String, String> args = new HashMap<>();
        Path db = FileSystems.getDefault().getPath(dbDir.toString(), "comparisons_test");
        args.put("-db", db.toString());

        //for debugging, you can use this to select only one file pair to load
        //args.put("-includeFilePat", "file8.*");

        /*
        BatchProcessTestExecutor ex = new BatchProcessTestExecutor(COMPARER_PROCESS_CLASS, args,
        "/tika-batch-comparison-eval-config.xml");
        StreamStrings streamStrings = ex.execute();
        System.out.println(streamStrings.getErrString());
        System.out.println(streamStrings.getOutString());
        H2Util dbUtil = new H2Util(db);
        conn = dbUtil.getConnection();*/
    }

    /**
     * Closes the shared connection (if one was opened) and deletes the temporary
     * database directory (if one was created).
     *
     * <p>The directory is removed in a {@code finally} block so that a missing or
     * failing connection does not leave the temporary directory behind.
     *
     * @throws Exception if closing the connection or deleting the directory fails
     */
    @AfterClass
    public static void tearDown() throws Exception {
        try {
            // conn stays null while the connection setup in setUp() is commented out
            if (conn != null) {
                conn.close();
            }
        } finally {
            if (dbDir != null) {
                FileUtils.deleteDirectory(dbDir.toFile());
            }
        }
    }

    @Test
    public void testSimpleDBWriteAndRead() throws Exception {
        //filenames
        List<String> list = getColStrings(Cols.FILE_NAME.name(),
                ExtractComparer.PROFILES_A.getName(), "");
        assertEquals(7, list.size());
        assertTrue(list.contains("file1.pdf"));

        //container ids in comparisons table
        list = getColStrings(Cols.CONTAINER_ID.name(),
                ExtractComparer.COMPARISON_CONTAINERS.getName(), "");
        assertEquals(10, list.size());
        // every container id must be distinct
        Set<String> uniqueIds = new HashSet<>(list);
        assertEquals(10, uniqueIds.size());

        /*
        //ids in comparisons table
        list = getColStrings(AbstractProfiler.HEADERS.ID.name(),
        compTable,"");
        assertEquals(9, list.size());
        set.clear(); set.addAll(list);
        assertEquals(9, set.size());*/
    }

    /*
    @Test
    public void testFile1PDFRow() throws Exception {
    String where = fp+"='file1.pdf'";
    Map<String, String> data = getRow(compJoinCont, where);
    String result = data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_UNIQUE_TOKEN_DIFFS + "_A");
    assertTrue(result.startsWith("over: 1"));
    result = data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_UNIQUE_TOKEN_DIFFS + "_B");
    assertTrue(result.startsWith("aardvark: 3 | bear: 2"));
    assertEquals("aardvark: 3 | bear: 2",
    data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_B.toString()));
    assertEquals("fox: 2 | lazy: 1 | over: 1",
    data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_A.toString()));
    assertEquals("12", data.get(ExtractComparer.HEADERS.NUM_TOKENS+"_A"));
    assertEquals("13", data.get(ExtractComparer.HEADERS.NUM_TOKENS+"_B"));
    assertEquals("8", data.get(ExtractComparer.HEADERS.NUM_UNIQUE_TOKENS+"_A"));
    assertEquals("9", data.get(ExtractComparer.HEADERS.NUM_UNIQUE_TOKENS+"_B"));
    assertEquals(ExtractComparer.COMPARISON_HEADERS.OVERLAP.name(),
    0.64f, Float.parseFloat(data.get("OVERLAP")), 0.0001f);
    assertEquals(ExtractComparer.COMPARISON_HEADERS.DICE_COEFFICIENT.name(),
    0.8235294f, Float.parseFloat(data.get("DICE_COEFFICIENT")), 0.0001f);
    assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_MEAN+"_A", 3.83333d,
    Double.parseDouble(
    data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_MEAN+"_A")), 0.0001d);
    assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_MEAN+"_B", 4.923d,
    Double.parseDouble(
    data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_MEAN+"_B")), 0.0001d);
    assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_STD_DEV+"_A", 1.0298d,
    Double.parseDouble(
    data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_STD_DEV+"_A")), 0.0001d);
    assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_STD_DEV+"_B", 1.9774d,
    Double.parseDouble(data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_STD_DEV+"_B")), 0.0001d);
    assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_SUM+"_A", 46,
    Integer.parseInt(
    data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_SUM+"_A")));
    assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_SUM+"_B", 64,
    Integer.parseInt(data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_SUM+"_B")));
    assertEquals("TOKEN_ENTROPY_RATE_A", 0.237949,
    Double.parseDouble(data.get("TOKEN_ENTROPY_RATE_A")), 0.0001d);
    assertEquals("TOKEN_ENTROPY_RATE_B", 0.232845,
    Double.parseDouble(data.get("TOKEN_ENTROPY_RATE_B")), 0.0001d);
    }
    @Test
    public void testEmpty() throws Exception {
    String where = fp+"='file4_emptyB.pdf'";
    Map<String, String> data = getRow(contTable, where);
    assertNull(data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX +
    ExtractComparer.aExtension));
    assertTrue(data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX +
    ExtractComparer.bExtension).equals(AbstractProfiler.JSON_PARSE_EXCEPTION));
    where = fp+"='file5_emptyA.pdf'";
    data = getRow(contTable, where);
    assertNull(data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX +
    ExtractComparer.bExtension));
    assertTrue(data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX+
    ExtractComparer.aExtension).equals(AbstractProfiler.JSON_PARSE_EXCEPTION));
    }
    @Test
    public void testMissingAttachment() throws Exception {
    String where = fp+"='file2_attachANotB.doc' and "+AbstractProfiler.HEADERS.EMBEDDED_FILE_PATH+
    "='inner.txt'";
    Map<String, String> data = getRow(compJoinCont, where);
    assertContains("attachment: 1", data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_A.name()));
    assertNotContained("fox", data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_B.name()));
    assertNull(data.get(ExtractComparer.HEADERS.TOP_N_TOKENS +
    ExtractComparer.bExtension));
    assertNotContained("fox", data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_UNIQUE_TOKEN_DIFFS +
    ExtractComparer.bExtension));
    assertEquals("3", data.get("NUM_METADATA_VALUES_A"));
    assertNull(data.get("DIFF_NUM_ATTACHMENTS"));
    assertNull(data.get("NUM_METADATA_VALUES_B"));
    assertEquals("0", data.get("NUM_UNIQUE_TOKENS_B"));
    assertNull(data.get("TOKEN_ENTROPY_RATE_B"));
    assertNull(data.get("NUM_EN_STOPS_TOP_N_B"));
    where = fp+"='file3_attachBNotA.doc' and "+AbstractProfiler.HEADERS.EMBEDDED_FILE_PATH+
    "='inner.txt'";
    data = getRow(compJoinCont, where);
    assertContains("attachment: 1", data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_B.name()));
    assertNotContained("fox", data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_A.name()));
    assertNull(data.get(ExtractComparer.HEADERS.TOP_N_TOKENS +
    ExtractComparer.aExtension));
    assertNotContained("fox", data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_UNIQUE_TOKEN_DIFFS +
    ExtractComparer.aExtension));
    assertEquals("3", data.get("NUM_METADATA_VALUES_B"));
    assertNull(data.get("DIFF_NUM_ATTACHMENTS"));
    assertNull(data.get("NUM_METADATA_VALUES_A"));
    assertEquals("0", data.get("NUM_UNIQUE_TOKENS_A"));
    assertNull(data.get("TOKEN_ENTROPY_RATE_A"));
    assertNull(data.get("NUM_EN_STOPS_TOP_N_A"));
    }
    @Test
    public void testBothBadJson() throws Exception {
    debugDumpAll(contTable);
    String where = fp+"='file7_badJson.pdf'";
    Map<String, String> data = getRow(contTable, where);
    assertEquals(AbstractProfiler.JSON_PARSE_EXCEPTION,
    data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX+ ExtractComparer.aExtension));
    assertEquals(AbstractProfiler.JSON_PARSE_EXCEPTION,
    data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX+ ExtractComparer.bExtension));
    assertEquals("file7_badJson.pdf",
    data.get(AbstractProfiler.CONTAINER_HEADERS.FILE_PATH.name()));
    assertEquals("61", data.get("JSON_FILE_LENGTH_A"));
    assertEquals("0", data.get("JSON_FILE_LENGTH_B"));
    assertEquals("pdf", data.get(AbstractProfiler.CONTAINER_HEADERS.FILE_EXTENSION.name()));
    }
    @Test
    public void testAccessPermissionException() throws Exception {
    String sql = "select "+
    AbstractProfiler.EXCEPTION_HEADERS.ACCESS_PERMISSION_EXCEPTION.name() +
    " from " + AbstractProfiler.EXCEPTIONS_TABLE+"_A exA "+
    " join " + ExtractComparer.COMPARISONS_TABLE + " cmp on cmp.ID=exA.ID "+
    " join " + ExtractComparer.CONTAINERS_TABLE + " cont on cmp.CONTAINER_ID=cont.CONTAINER_ID "+
    " where "+fp+"='file6_accessEx.pdf'";
    Statement st = conn.createStatement();
    ResultSet rs = st.executeQuery(sql);
    List<String> results = new ArrayList<String>();
    while (rs.next()) {
    results.add(rs.getString(1));
    }
    assertEquals(1, results.size());
    assertEquals("TRUE", results.get(0));
    sql = "select "+
    AbstractProfiler.EXCEPTION_HEADERS.ACCESS_PERMISSION_EXCEPTION.name() +
    " from " + AbstractProfiler.EXCEPTIONS_TABLE+"_B exB "+
    " join " + ExtractComparer.COMPARISONS_TABLE + " cmp on cmp.ID=exB.ID "+
    " join " + ExtractComparer.CONTAINERS_TABLE + " cont on cmp.CONTAINER_ID=cont.CONTAINER_ID "+
    " where "+fp+"='file6_accessEx.pdf'";
    st = conn.createStatement();
    rs = st.executeQuery(sql);
    results = new ArrayList<String>();
    while (rs.next()) {
    results.add(rs.getString(1));
    }
    assertEquals(1, results.size());
    assertEquals("TRUE", results.get(0));
    }
    @Test
    public void testContainerException() throws Exception {
    String sql = "select * "+
    " from " + AbstractProfiler.EXCEPTIONS_TABLE+"_A exA "+
    " join " + ExtractComparer.COMPARISONS_TABLE + " cmp on cmp.ID=exA.ID "+
    " join " + ExtractComparer.CONTAINERS_TABLE + " cont on cmp.CONTAINER_ID=cont.CONTAINER_ID "+
    "where "+fp+"='file8_IOEx.pdf'";
    Statement st = conn.createStatement();
    ResultSet rs = st.executeQuery(sql);
    Map<String, String> data = new HashMap<String,String>();
    ResultSetMetaData rsM = rs.getMetaData();
    while (rs.next()) {
    for (int i = 1; i <= rsM.getColumnCount(); i++)
    data.put(rsM.getColumnName(i), rs.getString(i));
    }
    String sortStack = data.get(AbstractProfiler.EXCEPTION_HEADERS.SORT_STACK_TRACE.name());
    sortStack = sortStack.replaceAll("[\r\n]", "<N>");
    assertTrue(sortStack.startsWith("java.lang.RuntimeException<N>"));
    String fullStack = data.get(AbstractProfiler.EXCEPTION_HEADERS.ORIG_STACK_TRACE.name());
    assertTrue(
    fullStack.startsWith("java.lang.RuntimeException: java.io.IOException: Value is not an integer"));
    }
    private void debugDumpAll(String table) throws Exception {
    Statement st = conn.createStatement();
    String sql = "select * from "+table;
    ResultSet rs = st.executeQuery(sql);
    ResultSetMetaData m = rs.getMetaData();
    for (int i = 1; i <= m.getColumnCount(); i++) {
    System.out.print(m.getColumnName(i) + ", ");
    }
    System.out.println("\n");
    while (rs.next()) {
    for (int i = 1; i <= m.getColumnCount(); i++) {
    System.out.print(rs.getString(i)+", ");
    }
    System.out.println("\n");
    }
    st.close();
    }
    */

    /**
     * Debugging aid: prints the 1-based index and name of every column of {@code table}
     * to standard out.
     *
     * @param table name of the table to inspect
     * @throws Exception if the query fails
     */
    private void debugShowColumns(String table) throws Exception {
        String sql = "select * from " + table;
        // try-with-resources so the statement is released even if the query throws
        try (Statement st = conn.createStatement();
             ResultSet rs = st.executeQuery(sql)) {
            ResultSetMetaData m = rs.getMetaData();
            for (int i = 1; i <= m.getColumnCount(); i++) {
                System.out.println(i + " : " + m.getColumnName(i));
            }
        }
    }

    /**
     * Returns the string value for one cell.
     *
     * @param colName column to select
     * @param table table to select from
     * @param where optional where clause (without the {@code where} keyword)
     * @return the single matching value
     * @throws RuntimeException if the query yields no rows or more than one row
     * @throws Exception if the query fails
     */
    private String getString(String colName, String table, String where) throws Exception {
        List<String> results = getColStrings(colName, table, where);
        if (results.size() > 1) {
            throw new RuntimeException("more than one result");
        }
        if (results.isEmpty()) {
            throw new RuntimeException("no results");
        }
        return results.get(0);
    }

    /**
     * Selects all columns of the single row matching {@code where}.
     *
     * @param table table (or join expression) to select from
     * @param where optional where clause (without the {@code where} keyword)
     * @return column name to string value for the matching row; empty if no row matched
     * @throws RuntimeException if more than one row matched
     * @throws Exception if the query fails
     */
    private Map<String, String> getRow(String table, String where) throws Exception {
        String sql = getSql("*", table, where);
        Map<String, String> results = new HashMap<>();
        try (Statement st = conn.createStatement();
             ResultSet rs = st.executeQuery(sql)) {
            ResultSetMetaData m = rs.getMetaData();
            boolean rowSeen = false;
            while (rs.next()) {
                if (rowSeen) {
                    throw new RuntimeException("returned more than one row!");
                }
                for (int i = 1; i <= m.getColumnCount(); i++) {
                    results.put(m.getColumnName(i), rs.getString(i));
                }
                rowSeen = true;
            }
        }
        return results;
    }

    /**
     * Returns the string representations of the values of one column of the content
     * comparisons table, as a list of strings.
     *
     * @param colName column to select
     * @return one entry per row
     * @throws Exception if the query fails
     */
    private List<String> getColStrings(String colName) throws Exception {
        return getColStrings(colName, ExtractComparer.CONTENT_COMPARISONS.getName(), null);
    }

    /**
     * Returns the string representations of the values of one column, as a list of
     * strings. The generated SQL is echoed to standard out.
     *
     * @param colName column to select
     * @param table table to select from
     * @param where optional where clause (without the {@code where} keyword);
     *              {@code null} or empty selects every row
     * @return one entry per row
     * @throws Exception if the query fails
     */
    private List<String> getColStrings(String colName, String table, String where) throws Exception {
        String sql = getSql(colName, table, where);
        List<String> results = new ArrayList<>();
        try (Statement st = conn.createStatement()) {
            System.out.println("SQL: " + sql);
            try (ResultSet rs = st.executeQuery(sql)) {
                while (rs.next()) {
                    results.add(rs.getString(1));
                }
            }
        }
        return results;
    }

    /**
     * Builds {@code select <colName> from <table>}, appending {@code where <where>}
     * only when {@code where} is neither {@code null} nor empty.
     *
     * @param colName column expression to select
     * @param table table to select from
     * @param where optional where clause (without the {@code where} keyword)
     * @return the SQL string
     */
    private String getSql(String colName, String table, String where) {
        StringBuilder sb = new StringBuilder();
        sb.append("select ").append(colName).append(" from ").append(table);
        if (where != null && !where.isEmpty()) {
            sb.append(" where ").append(where);
        }
        return sb.toString();
    }
}