/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.tika.TikaTest;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.utils.CommonsDigester;
import org.junit.Test;
public class DigestingParserTest extends TikaTest {
/** Prefix of the metadata keys that hold digests; the tests append the algorithm name to it. */
private static final String P =
        TikaCoreProperties.TIKA_META_PREFIX + "digest" + Metadata.NAMESPACE_PREFIX_DELIMITER;

/** Stand-in for "no mark limit": not truly unbounded, just longer than the input file. */
private static final int UNLIMITED = 1000000;

/** Seed for {@link #random}; drawn fresh on every run and echoed in checkMulti's failure message. */
private static final long SEED = new Random().nextLong();

/** Supplies the random file bytes and the random length/mark-limit combinations. */
private final Random random = new Random(SEED);

/** Parser that the DigestingParser instances in these tests wrap. */
private final Parser p = new AutoDetectParser();
/**
 * Parses test_recursive_embedded.docx through a DigestingParser and compares the digests
 * found in the metadata with known values: first one algorithm at a time, then with a
 * comma-separated algorithm list, for which the algorithms that were not requested must
 * be absent from the metadata.
 */
@Test
public void testBasic() throws Exception {
    final String testFile = "test_recursive_embedded.docx";

    Map<CommonsDigester.DigestAlgorithm, String> expected = new HashMap<>();
    expected.put(CommonsDigester.DigestAlgorithm.MD2,
            "d768c8e27b0b52c6eaabfaa7122d1d4f");
    expected.put(CommonsDigester.DigestAlgorithm.MD5,
            "59f626e09a8c16ab6dbc2800c685f772");
    expected.put(CommonsDigester.DigestAlgorithm.SHA1,
            "7a1f001d163ac90d8ea54c050faf5a38079788a6");
    expected.put(CommonsDigester.DigestAlgorithm.SHA256,
            "c4b7fab030a8b6a9d6691f6699ac8e6f"
                    + "82bc53764a0f1430d134ae3b70c32654");
    expected.put(CommonsDigester.DigestAlgorithm.SHA384,
            "ebe368b9326fef44408290724d187553"
                    + "8b8a6923fdf251ddab72c6e4b5d54160"
                    + "9db917ba4260d1767995a844d8d654df");
    expected.put(CommonsDigester.DigestAlgorithm.SHA512,
            "ee46d973ee1852c018580c242955974d"
                    + "da4c21f36b54d7acd06fcf68e974663b"
                    + "fed1d256875be58d22beacf178154cc3"
                    + "a1178cb73443deaa53aa0840324708bb");

    // every algorithm on its own
    for (CommonsDigester.DigestAlgorithm single : CommonsDigester.DigestAlgorithm.values()) {
        Metadata metadata = new Metadata();
        getXML(testFile,
                new DigestingParser(p, new CommonsDigester(UNLIMITED, single)), metadata);
        assertEquals(single.toString(), expected.get(single),
                metadata.get(P + single.toString()));
    }

    // several algorithms given as a comma-separated string
    CommonsDigester.DigestAlgorithm[] parsed =
            CommonsDigester.parse("md5,sha256,sha384,sha512");
    Metadata combined = new Metadata();
    getXML(testFile,
            new DigestingParser(p, new CommonsDigester(UNLIMITED, parsed)), combined);

    CommonsDigester.DigestAlgorithm[] requested = {
            CommonsDigester.DigestAlgorithm.MD5,
            CommonsDigester.DigestAlgorithm.SHA256,
            CommonsDigester.DigestAlgorithm.SHA384,
            CommonsDigester.DigestAlgorithm.SHA512
    };
    for (CommonsDigester.DigestAlgorithm algo : requested) {
        assertEquals(algo.toString(), expected.get(algo), combined.get(P + algo.toString()));
    }

    // algorithms that were not in the list must not show up
    assertNull(combined.get(P + CommonsDigester.DigestAlgorithm.MD2.toString()));
    assertNull(combined.get(P + CommonsDigester.DigestAlgorithm.SHA1.toString()));
}
/**
 * Digests test_recursive_embedded.docx with a mark limit of only 100 and checks that the
 * MD5 stored in the metadata is still the expected one.
 * NOTE(review): presumably the limit is deliberately smaller than the file so the digester
 * cannot just mark/reset the stream -- confirm against CommonsDigester.
 */
@Test
public void testReset() throws Exception {
    Metadata metadata = new Metadata();
    CommonsDigester smallMarkDigester =
            new CommonsDigester(100, CommonsDigester.DigestAlgorithm.MD5);
    getXML("test_recursive_embedded.docx",
            new DigestingParser(p, smallMarkDigester), metadata);
    assertEquals("59f626e09a8c16ab6dbc2800c685f772", metadata.get(P + "MD5"));
}
/**
 * Expects an IllegalArgumentException when a CommonsDigester is created with a mark limit
 * of -1 and used to parse the test file; the test fails if no such exception surfaces.
 */
@Test
public void testNegativeMaxMarkLength() throws Exception {
    Metadata metadata = new Metadata();
    boolean rejected;
    try {
        getXML("test_recursive_embedded.docx",
                new DigestingParser(p,
                        new CommonsDigester(-1, CommonsDigester.DigestAlgorithm.MD5)),
                metadata);
        // reaching this line means the negative limit was silently accepted
        rejected = false;
    } catch (IllegalArgumentException expectedFailure) {
        rejected = true;
    }
    assertTrue("Exception not thrown", rejected);
}
/**
 * Runs testMulti over ten random file-length/mark-limit combinations and then over a fixed
 * set of boundary combinations, each of the latter with and without a TikaInputStream.
 * All runs share one temp file, which is deleted at the end.
 */
@Test
public void testMultipleCombinations() throws Exception {
    // {fileLength, markLimit} pairs: limit far above, just above, equal to, just below
    // and far below the file length, plus a zero limit and an empty file
    final int[][] fixedLengths = {
            {1000, 100000},
            {10000, 10001},
            {10000, 10000},
            {10000, 9999},
            {1000, 100},
            {1000, 10},
            {1000, 0},
            {0, 100},
    };
    final boolean[] streamKinds = {true, false};

    Path tmp = Files.createTempFile("tika-digesting-parser-test", "");
    try {
        // random combinations
        for (int round = 0; round < 10; round++) {
            int fileLength = random.nextInt(100000);
            int markLimit = random.nextInt(100000);
            boolean useTikaInputStream = random.nextBoolean();
            testMulti(tmp, fileLength, markLimit, useTikaInputStream);
        }
        // specific combinations
        for (int[] pair : fixedLengths) {
            for (boolean useTikaInputStream : streamKinds) {
                testMulti(tmp, pair[0], pair[1], useTikaInputStream);
            }
        }
    } finally {
        Files.delete(tmp);
    }
}
/**
 * Fills {@code tmp} with {@code fileLength} random bytes, computes reference digests for
 * MD5, SHA1 and SHA512 with addTruth, and then verifies through checkMulti that
 * CommonsDigester produces the same values for several orderings and subsets of those
 * algorithms.
 *
 * @param tmp                scratch file; its previous content is discarded
 * @param fileLength         number of random bytes to write
 * @param markLimit          mark limit handed to each CommonsDigester
 * @param useTikaInputStream whether checkMulti reads through a TikaInputStream or a
 *                           BufferedInputStream
 * @throws IOException if writing or reading the scratch file fails
 */
private void testMulti(Path tmp, int fileLength, int markLimit,
                       boolean useTikaInputStream) throws IOException {
    // TRUNCATE_EXISTING is essential: the caller reuses the same file for every
    // combination, and with CREATE alone a shorter file written after a longer one keeps
    // the old tail, so the file would not really be fileLength bytes long.
    // try-with-resources closes (and thereby flushes) the stream even if a write fails.
    try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(tmp,
            StandardOpenOption.CREATE,
            StandardOpenOption.TRUNCATE_EXISTING,
            StandardOpenOption.WRITE))) {
        for (int i = 0; i < fileLength; i++) {
            // OutputStream.write(int) keeps only the low-order byte of the random int
            os.write(random.nextInt());
        }
    }

    Metadata truth = new Metadata();
    addTruth(tmp, CommonsDigester.DigestAlgorithm.MD5, truth);
    addTruth(tmp, CommonsDigester.DigestAlgorithm.SHA1, truth);
    addTruth(tmp, CommonsDigester.DigestAlgorithm.SHA512, truth);

    // same algorithms in different orders, then smaller subsets
    checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
            CommonsDigester.DigestAlgorithm.SHA512,
            CommonsDigester.DigestAlgorithm.SHA1,
            CommonsDigester.DigestAlgorithm.MD5);

    checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
            CommonsDigester.DigestAlgorithm.MD5,
            CommonsDigester.DigestAlgorithm.SHA1);

    checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
            CommonsDigester.DigestAlgorithm.SHA1,
            CommonsDigester.DigestAlgorithm.SHA512,
            CommonsDigester.DigestAlgorithm.MD5);

    checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
            CommonsDigester.DigestAlgorithm.SHA1);

    checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
            CommonsDigester.DigestAlgorithm.MD5);
}
/**
 * Digests {@code tmp} with a CommonsDigester configured for {@code markLimit} and
 * {@code algos}, then asserts that for every requested algorithm the digest is present
 * and equal to the reference value stored in {@code truth}.
 *
 * @param truth              metadata holding the reference digests
 * @param tmp                file to digest
 * @param fileLength         only reported in the failure message
 * @param markLimit          mark limit for the digester; also reported on failure
 * @param useTikaInputStream read through TikaInputStream if true, otherwise through a
 *                           BufferedInputStream
 * @param algos              algorithms to compute and verify
 * @throws IOException if reading the file fails
 */
private void checkMulti(Metadata truth, Path tmp,
                        int fileLength, int markLimit,
                        boolean useTikaInputStream,
                        CommonsDigester.DigestAlgorithm... algos) throws IOException {
    Metadata result = new Metadata();
    CommonsDigester digester = new CommonsDigester(markLimit, algos);

    try (InputStream is = useTikaInputStream
            ? TikaInputStream.get(tmp)
            : new BufferedInputStream(Files.newInputStream(tmp))) {
        digester.digest(is, result, new ParseContext());
    }

    for (CommonsDigester.DigestAlgorithm algo : algos) {
        String key = P + algo.name();
        // the message carries everything needed to reproduce a failure, seed included
        String failureContext = "fileLength(" + fileLength + ") markLimit("
                + markLimit + ") useTikaInputStream(" + useTikaInputStream + ")"
                + "algorithm(" + algo.name() + ") seed(" + SEED + ")";

        String truthValue = truth.get(key);
        String resultValue = result.get(key);

        assertNotNull("truth", truthValue);
        assertNotNull("result", resultValue);
        assertEquals(failureContext, truthValue, resultValue);
    }
}
/**
 * Computes the hex digest of {@code tmp} for {@code algo} with commons-codec's DigestUtils
 * and stores it in {@code truth} under the key {@code P + algo.name()}.
 *
 * @param tmp   file to digest
 * @param algo  algorithm to use
 * @param truth metadata that receives the reference digest
 * @throws IOException              if the file cannot be read
 * @throws IllegalArgumentException if {@code algo} is not one of the handled algorithms
 */
private void addTruth(Path tmp, CommonsDigester.DigestAlgorithm algo, Metadata truth)
        throws IOException {
    String hex;
    try (InputStream in = Files.newInputStream(tmp)) {
        hex = hexDigest(algo, in);
    }
    truth.set(P + algo.name(), hex);
}

/** Maps {@code algo} to the matching DigestUtils call and returns the hex digest of {@code in}. */
private static String hexDigest(CommonsDigester.DigestAlgorithm algo, InputStream in)
        throws IOException {
    switch (algo) {
        case MD2:
            return DigestUtils.md2Hex(in);
        case MD5:
            return DigestUtils.md5Hex(in);
        case SHA1:
            return DigestUtils.sha1Hex(in);
        case SHA256:
            return DigestUtils.sha256Hex(in);
        case SHA384:
            return DigestUtils.sha384Hex(in);
        case SHA512:
            return DigestUtils.sha512Hex(in);
        default:
            throw new IllegalArgumentException(
                    "Sorry, not aware of algorithm: " + algo.toString());
    }
}
}