/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.eval;
import static org.apache.tika.eval.AbstractProfiler.EXCEPTION_TYPE;
import static org.apache.tika.eval.AbstractProfiler.getContent;
import static org.apache.tika.eval.io.ExtractReader.IGNORE_LENGTH;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.tika.MockDBWriter;
import org.apache.tika.TikaTest;
import org.apache.tika.eval.db.Cols;
import org.apache.tika.eval.db.TableInfo;
import org.apache.tika.eval.io.ExtractReader;
import org.apache.tika.eval.io.ExtractReaderException;
import org.apache.tika.eval.util.LanguageIDWrapper;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
//These tests ensure that the comparer is extracting the right information
//into a Map<Cols, String> row for each table. A full integration test
//should also ensure that those rows are properly being written to the db.
/**
 * Unit tests for {@link ExtractComparer}. Most tests compare a pair of extract
 * fixtures from {@code /test-dirs/extractsA} and {@code /test-dirs/extractsB}
 * and then inspect the rows captured in memory by a {@link MockDBWriter}.
 */
public class SimpleComparerTest extends TikaTest {

    /** Every table dumped by the (ignored) debugging tests. */
    private static final TableInfo[] ALL_TABLES = {
            ExtractComparer.COMPARISON_CONTAINERS,
            ExtractComparer.EXTRACT_EXCEPTION_TABLE_A,
            ExtractComparer.EXTRACT_EXCEPTION_TABLE_B,
            ExtractComparer.EXCEPTION_TABLE_A,
            ExtractComparer.EXCEPTION_TABLE_B,
            ExtractComparer.PROFILES_A,
            ExtractComparer.PROFILES_B,
            ExtractComparer.CONTENTS_TABLE_A,
            ExtractComparer.CONTENTS_TABLE_B,
            ExtractComparer.CONTENT_COMPARISONS
    };

    private ExtractComparer comparer = null;
    private MockDBWriter writer = null;

    /**
     * Creates a fresh writer and comparer for each test, then loads the
     * English common-tokens resource and the built-in language id models.
     */
    @Before
    public void setUp() throws Exception {
        writer = new MockDBWriter();
        comparer = new ExtractComparer(null, null,
                Paths.get("extractsA"), Paths.get("extractsB"),
                new ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS,
                        IGNORE_LENGTH, IGNORE_LENGTH),
                writer);
        AbstractProfiler.loadCommonTokens(this.getResourceAsFile("/common_tokens").toPath(), "en");
        LanguageIDWrapper.loadBuiltInModels();
    }

    /**
     * Compares the A and B extracts of {@code file1.pdf.json} and checks the
     * token statistics written to the comparison, contents and profile tables.
     */
    @Test
    public void testBasic() throws Exception {
        comparer.compareFiles(
                evalPaths("file1.pdf.json", "/test-dirs/extractsA/file1.pdf.json"),
                evalPaths("file1.pdf.json", "/test-dirs/extractsB/file1.pdf.json"));

        Map<Cols, String> comparison = firstRow(ExtractComparer.CONTENT_COMPARISONS);
        assertEquals("0", comparison.get(Cols.ID));
        String uniqueDiffsA = comparison.get(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A);
        //report the actual value on failure instead of a bare NPE/AssertionError
        assertTrue("unexpected TOP_10_UNIQUE_TOKEN_DIFFS_A: " + uniqueDiffsA,
                uniqueDiffsA != null
                        && uniqueDiffsA.startsWith("1,200: 1 | 120000: 1 | over: 1"));

        Map<Cols, String> contentsA = firstRow(ExtractComparer.CONTENTS_TABLE_A);
        assertEquals("0", contentsA.get(Cols.ID));
        assertEquals("70", contentsA.get(Cols.CONTENT_LENGTH));
        assertEquals("10", contentsA.get(Cols.NUM_UNIQUE_TOKENS));
        assertEquals("14", contentsA.get(Cols.NUM_TOKENS));
        assertEquals("12", contentsA.get(Cols.NUM_ALPHABETIC_TOKENS));
        assertEquals("6", contentsA.get(Cols.NUM_COMMON_TOKENS));
        assertEquals("57", contentsA.get(Cols.TOKEN_LENGTH_SUM));
        assertEquals("en", contentsA.get(Cols.COMMON_TOKENS_LANG));

        Map<Cols, String> contentsB = firstRow(ExtractComparer.CONTENTS_TABLE_B);
        assertEquals("0", contentsB.get(Cols.ID));
        assertEquals("76", contentsB.get(Cols.CONTENT_LENGTH));
        assertEquals("9", contentsB.get(Cols.NUM_UNIQUE_TOKENS));
        assertEquals("13", contentsB.get(Cols.NUM_TOKENS));
        assertEquals("4", contentsB.get(Cols.NUM_COMMON_TOKENS));
        assertEquals("64", contentsB.get(Cols.TOKEN_LENGTH_SUM));
        assertEquals("en", contentsB.get(Cols.COMMON_TOKENS_LANG));

        Map<Cols, String> profileA = firstRow(ExtractComparer.PROFILES_A);
        assertEquals("2", profileA.get(Cols.NUM_PAGES));
    }

    /**
     * Compares the A and B extracts of {@code file12_es.txt.json} and checks
     * that the A contents row reports "es" as the common-tokens language
     * along with the expected token counts.
     */
    @Test
    public void testBasicSpanish() throws Exception {
        //NOTE(review): the relative source path is "file1.pdf.json" on both sides
        //although the extracts are file12_es.txt.json; left untouched because the
        //label is identical for A and B and is not asserted on here.
        comparer.compareFiles(
                evalPaths("file1.pdf.json", "/test-dirs/extractsA/file12_es.txt.json"),
                evalPaths("file1.pdf.json", "/test-dirs/extractsB/file12_es.txt.json"));

        Map<Cols, String> contentsA = firstRow(ExtractComparer.CONTENTS_TABLE_A);
        assertEquals("133", contentsA.get(Cols.CONTENT_LENGTH));
        assertEquals("7", contentsA.get(Cols.NUM_UNIQUE_TOKENS));
        assertEquals("24", contentsA.get(Cols.NUM_TOKENS));
        assertEquals("3", contentsA.get(Cols.NUM_COMMON_TOKENS));
        assertEquals("108", contentsA.get(Cols.TOKEN_LENGTH_SUM));
        assertEquals("es", contentsA.get(Cols.COMMON_TOKENS_LANG));
        assertEquals("24", contentsA.get(Cols.NUM_ALPHABETIC_TOKENS));
    }

    /**
     * Makes sure that language id matches the common-words file names.
     * The test file contains MT'd Simplified Chinese with known
     * "common words" appended at the end.
     */
    @Test
    public void testChinese() throws Exception {
        comparer.compareFiles(
                evalPaths("file13_attachANotB.doc.json",
                        "/test-dirs/extractsA/file13_attachANotB.doc.json"),
                evalPaths("non-existent.json",
                        "/test-dirs/extractsB/non-existent.json"));

        Map<Cols, String> contentsA = firstRow(ExtractComparer.CONTENTS_TABLE_A);
        assertEquals("122", contentsA.get(Cols.TOKEN_LENGTH_SUM));
        assertEquals("3", contentsA.get(Cols.NUM_COMMON_TOKENS));
        assertEquals("zh-cn", contentsA.get(Cols.COMMON_TOKENS_LANG));
    }

    /**
     * Uses {@code file4_emptyB.pdf.json} as the B extract and expects a
     * {@code ZERO_BYTE_EXTRACT_FILE} entry in B's extract-exception table.
     */
    @Test
    public void testEmpty() throws Exception {
        comparer.compareFiles(
                evalPaths("file1.pdf", "/test-dirs/extractsA/file1.pdf.json"),
                evalPaths("file1.pdf", "/test-dirs/extractsB/file4_emptyB.pdf.json"));

        //call debugPrintRow(row) here when diagnosing a failure
        Map<Cols, String> row = firstRow(ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
        //the expected id is the ordinal of the exception type, rendered as a string
        String expectedId =
                Integer.toString(ExtractReaderException.TYPE.ZERO_BYTE_EXTRACT_FILE.ordinal());
        assertEquals(expectedId, row.get(Cols.EXTRACT_EXCEPTION_ID));
    }

    /**
     * Exercises {@code AbstractProfiler.getContent} with content that fits the
     * maximum length, content that exceeds it, a {@link Metadata} without
     * content and a {@code null} {@link Metadata}, checking both the returned
     * length and the {@code CONTENT_TRUNCATED_AT_MAX_LEN} flag.
     */
    @Test
    public void testGetContent() throws Exception {
        Metadata withContent = new Metadata();
        withContent.add(RecursiveParserWrapper.TIKA_CONTENT, "0123456789");
        Map<Cols, String> data = new HashMap<>();

        //content exactly at the limit is not truncated
        String content = getContent(withContent, 10, data);
        assertEquals(10, content.length());
        assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));

        //content over the limit is cut to the limit and flagged
        content = getContent(withContent, 4, data);
        assertEquals(4, content.length());
        assertEquals("TRUE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));

        //test Metadata with no content
        content = getContent(new Metadata(), 10, data);
        assertEquals(0, content.length());
        assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));

        //test null Metadata
        content = getContent(null, 10, data);
        assertEquals(0, content.length());
        assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
    }

    /**
     * Compares the A and B extracts of {@code file6_accessEx.pdf.json} and
     * expects, in both exception tables, an {@code ACCESS_PERMISSION} parse
     * exception id with no stack traces recorded.
     */
    @Test
    public void testAccessException() throws Exception {
        comparer.compareFiles(
                evalPaths("file6_accessEx.pdf.json",
                        "/test-dirs/extractsA/file6_accessEx.pdf.json"),
                evalPaths("file6_accessEx.pdf.json",
                        "/test-dirs/extractsB/file6_accessEx.pdf.json"));

        String expectedId = Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal());
        for (TableInfo exceptionTable : new TableInfo[]{
                ExtractComparer.EXCEPTION_TABLE_A, ExtractComparer.EXCEPTION_TABLE_B}) {
            //call debugPrintRow(row) here when diagnosing a failure
            Map<Cols, String> row = firstRow(exceptionTable);
            assertEquals(expectedId, row.get(Cols.PARSE_EXCEPTION_ID));
            assertNull(row.get(Cols.ORIG_STACK_TRACE));
            assertNull(row.get(Cols.SORT_STACK_TRACE));
        }
    }

    /**
     * Checks {@code AbstractProfiler.countAttachments} on a hand-built list of
     * embedded resource paths. Judging by the expected values, the first entry
     * receives the total number of other items (5), {@code /f1.docx/f2.zip}
     * the two files beneath it and {@code /f1.docx} the four paths beneath it.
     */
    @Test
    public void testAttachmentCounts() {
        List<Metadata> list = new ArrayList<>();
        //bad data should be ignored in the first metadata object
        list.add(embedded("dir1/dir2/file.zip"));
        list.add(embedded("/f1.docx/f2.zip/text1.txt"));
        list.add(embedded("/f1.docx/f2.zip/text2.txt"));
        list.add(embedded("/f1.docx/f2.zip"));
        list.add(embedded("/f1.docx"));
        list.add(embedded("/f1.docx/text3.txt"));

        List<Integer> expected = new ArrayList<>();
        for (int count : new int[]{5, 0, 0, 2, 4, 0}) {
            expected.add(count);
        }
        assertEquals(expected, AbstractProfiler.countAttachments(list));
    }

    /**
     * Compares the A and B extracts of {@code file14_diffAttachOrder.json} and
     * expects three content comparison rows, each with an overlap of "1.0".
     */
    @Test
    public void testDifferentlyOrderedAttachments() throws Exception {
        //both sides carry the same relative source path; the B side previously
        //used a label copied from the access-exception test
        comparer.compareFiles(
                evalPaths("file14_diffAttachOrder.json",
                        "/test-dirs/extractsA/file14_diffAttachOrder.json"),
                evalPaths("file14_diffAttachOrder.json",
                        "/test-dirs/extractsB/file14_diffAttachOrder.json"));

        List<Map<Cols, String>> comparisons =
                writer.getTable(ExtractComparer.CONTENT_COMPARISONS);
        assertEquals(3, comparisons.size());
        for (Map<Cols, String> comparison : comparisons) {
            assertEquals("1.0", comparison.get(Cols.OVERLAP));
        }
    }

    /**
     * Manual debugging aid: reloads the short English common-tokens list,
     * compares the {@code file1.pdf.json} extracts and prints every table.
     */
    @Test
    @Ignore("debugging aid: dumps every table to stdout and makes no assertions")
    public void testDebug() throws Exception {
        Path commonTokens = getResourceAsFile("/common_tokens_short.txt").toPath();
        AbstractProfiler.loadCommonTokens(commonTokens, "en");
        comparer.compareFiles(
                evalPaths("file1.pdf.json", "/test-dirs/extractsA/file1.pdf.json"),
                evalPaths("file1.pdf.json", "/test-dirs/extractsB/file1.pdf.json"));
        for (TableInfo table : ALL_TABLES) {
            debugPrintTable(table);
        }
    }

    /**
     * Manual debugging aid: point {@code p1} and {@code p2} at two extract
     * files that are not part of the test set to print every resulting table.
     */
    @Test
    @Ignore("useful for testing 2 files not in test set")
    public void oneOff() throws Exception {
        Path p1 = Paths.get("");
        Path p2 = Paths.get("");
        EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file1.pdf.json"), p1);
        EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file1.pdf.json"), p2);
        comparer.compareFiles(fpsA, fpsB);
        for (TableInfo table : ALL_TABLES) {
            debugPrintTable(table);
        }
    }

    /**
     * Builds the paths for one side of a comparison.
     *
     * @param relativeSourcePath relative path recorded for the source file
     * @param extractResource    classpath resource name of the extract file
     * @return the paths pairing the relative source path with the resolved extract
     * @throws Exception if the resource cannot be resolved to a file
     */
    private EvalFilePaths evalPaths(String relativeSourcePath, String extractResource)
            throws Exception {
        return new EvalFilePaths(
                Paths.get(relativeSourcePath),
                getResourceAsFile(extractResource).toPath());
    }

    /**
     * Returns the first row the mock writer captured for {@code table}.
     */
    private Map<Cols, String> firstRow(TableInfo table) {
        return writer.getTable(table).get(0);
    }

    /**
     * Creates a {@link Metadata} whose embedded resource path is {@code path}.
     */
    private static Metadata embedded(String path) {
        Metadata metadata = new Metadata();
        metadata.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, path);
        return metadata;
    }

    /**
     * Prints every row of {@code tableInfo} to stdout with its columns in
     * sorted order, each line prefixed by the row index. Prints nothing if
     * the writer holds no such table.
     */
    private void debugPrintTable(TableInfo tableInfo) {
        List<Map<Cols, String>> table = writer.getTable(tableInfo);
        if (table == null) {
            return;
        }
        System.out.println("TABLE: " + tableInfo.getName());
        int rowIndex = 0;
        for (Map<Cols, String> row : table) {
            printSortedRow(rowIndex + " :: ", row);
            rowIndex++;
        }
        System.out.println();
    }

    /**
     * Prints the columns of a single row to stdout in sorted order.
     */
    private void debugPrintRow(Map<Cols, String> row) {
        printSortedRow("", row);
    }

    /**
     * Prints one {@code key : value} line per column, sorted by column,
     * with {@code prefix} prepended to every line.
     */
    private void printSortedRow(String prefix, Map<Cols, String> row) {
        SortedSet<Cols> keys = new TreeSet<>(row.keySet());
        for (Cols key : keys) {
            System.out.println(prefix + key + " : " + row.get(key));
        }
    }
}