/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.chm;
import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static org.junit.Assert.assertTrue;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.regex.Pattern;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
import org.apache.tika.parser.chm.core.ChmExtractor;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.SAXException;

/**
 * Tests text extraction and per-entry extraction for CHM files, using the
 * sample documents under {@code /test-documents} on the class path.
 */
public class TestChmExtraction {

    /**
     * Matches an opening {@code <html} tag that is eventually followed by a
     * closing {@code </html>} tag, ignoring case and spanning line breaks.
     * Compiled once because the pattern never changes between calls.
     */
    private static final Pattern HTML_PAIR_PATTERN = Pattern.compile(
            "\\Q<html\\E.+\\Q</html>\\E",
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);

    /** Parser instance shared by all tests, including the multi-threaded one. */
    private final Parser parser = new ChmParser();

    /** Class-path locations of the CHM sample files exercised by the tests. */
    private final List<String> files = Arrays.asList(
            "/test-documents/testChm.chm",
            "/test-documents/testChm2.chm",
            "/test-documents/testChm3.chm");

    /**
     * Parses the in-memory sample from {@code TestParameters.chmData} and
     * checks that a known sentence shows up in the extracted text.
     */
    @Test
    public void testGetText() throws Exception {
        BodyContentHandler handler = new BodyContentHandler();
        new ChmParser().parse(
                new ByteArrayInputStream(TestParameters.chmData),
                handler, new Metadata(), new ParseContext());
        assertTrue(handler.toString().contains(
                "The TCard method accepts only numeric arguments"));
    }

    /** Parses every sample file and checks that each one yields some text. */
    @Test
    public void testChmParser() throws Exception {
        for (String fileName : files) {
            testingChm(openResource(fileName));
        }
    }

    /**
     * Opens a class-path resource and fails the test with a descriptive
     * message when it is missing, instead of failing later with a bare
     * {@link NullPointerException}.
     *
     * @param resourceName absolute class-path name of the resource
     * @return an open stream; the caller is responsible for closing it
     */
    private static InputStream openResource(String resourceName) {
        InputStream stream = TestChmExtraction.class.getResourceAsStream(resourceName);
        assertTrue("Test resource not found: " + resourceName, stream != null);
        return stream;
    }

    /**
     * Parses the given stream with the shared parser, asserts that the
     * extracted text is not empty, and closes the stream.
     *
     * @param stream CHM content to parse; closed by this method
     */
    private void testingChm(InputStream stream) throws IOException, SAXException, TikaException {
        // try-with-resources skips a null resource and keeps a failure from
        // close() from hiding the exception thrown by the body.
        try (InputStream in = stream) {
            BodyContentHandler handler = new BodyContentHandler(-1);
            parser.parse(in, handler, new Metadata(), new ParseContext());
            assertTrue(!handler.toString().isEmpty());
        }
    }

    /** Runs {@link #testExtractChmEntry(InputStream)} against every sample file. */
    @Test
    public void testExtractChmEntries() throws TikaException, IOException {
        for (String fileName : files) {
            try (InputStream stream = openResource(fileName)) {
                testExtractChmEntry(stream);
            }
        }
    }

    /**
     * Tells whether the array contains at least one zero byte.
     *
     * @param textData bytes to scan
     * @return {@code true} if any element equals {@code 0}
     */
    protected boolean findZero(byte[] textData) {
        for (byte b : textData) {
            if (b == 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether every character of the name lies in the range 32..126,
     * i.e. the name has no control characters and nothing at or above 127.
     *
     * @param name entry name to check
     * @return {@code true} if all characters are within that range
     */
    protected boolean niceAscFileName(String name) {
        for (char c : name.toCharArray()) {
            if (c >= 127 || c < 32) {
                //non-ascii char or control char
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether a lower-cased entry name has one of the extensions whose
     * content is validated as HTML-like text.
     */
    private static boolean isHtmlLikeEntry(String lowName) {
        return lowName.endsWith(".html")
                || lowName.endsWith(".htm")
                || lowName.endsWith(".hhk")
                || lowName.endsWith(".hhc");
    }

    /**
     * Extracts every directory entry of one CHM stream and validates it.
     * Entry names must pass {@link #niceAscFileName(String)} and must be
     * unique ignoring case. Entries ending in {@code .html}, {@code .htm},
     * {@code .hhk} or {@code .hhc} must contain no zero byte and must match
     * an {@code <html ... </html>} pair.
     *
     * @param stream CHM content; not closed by this method
     * @throws TikaException if any of the checks above fails
     * @throws IOException declared for callers; raised by the extractor calls, if at all
     */
    protected void testExtractChmEntry(InputStream stream) throws TikaException, IOException {
        ChmExtractor chmExtractor = new ChmExtractor(stream);
        ChmDirectoryListingSet entries = chmExtractor.getChmDirList();
        Set<String> names = new HashSet<String>();

        for (DirectoryListingEntry directoryListingEntry : entries.getDirectoryListingEntryList()) {
            // Extract first so that every entry is exercised, whatever its name.
            byte[] data = chmExtractor.extractChmEntry(directoryListingEntry);
            String entryName = directoryListingEntry.getName();

            //Entry names should be nice. Disable this if the test chm do have bad looking but valid entry names.
            if (!niceAscFileName(entryName)) {
                throw new TikaException("Warning: File name contains a non ascii char : " + entryName);
            }

            final String lowName = entryName.toLowerCase(Locale.ROOT);

            // Set.add returns false when the lower-cased name was seen before.
            if (!names.add(lowName)) {
                throw new TikaException("Duplicate File name detected : " + entryName);
            }

            if (!isHtmlLikeEntry(lowName)) {
                continue;
            }

            if (findZero(data)) {
                throw new TikaException("Xhtml/text file contains '\\0' : " + entryName);
            }

            //validate html
            String html = new String(data, ISO_8859_1);
            if (!HTML_PAIR_PATTERN.matcher(html).find()) {
                // Dump the offending content to ease diagnosis of the failure.
                System.err.println(lowName + " is invalid.");
                System.err.println(html);
                throw new TikaException("Invalid xhtml file : " + entryName);
            }
        }
    }

    /**
     * Parses all sample files concurrently from {@code TestParameters.NTHREADS}
     * worker threads that share one parser instance. Any exception or
     * assertion failure raised in a worker is rethrown here so that it fails
     * the test instead of only being printed.
     *
     * @throws InterruptedException if the test thread is interrupted while waiting
     */
    @Test
    public void testMultiThreadedChmExtraction() throws InterruptedException {
        ExecutorService executor = Executors.newFixedThreadPool(TestParameters.NTHREADS);
        try {
            List<Future<Void>> results = new ArrayList<Future<Void>>();
            for (int i = 0; i < TestParameters.NTHREADS; i++) {
                results.add(executor.submit(new Callable<Void>() {
                    @Override
                    public Void call() throws Exception {
                        for (String fileName : files) {
                            testingChm(openResource(fileName));
                        }
                        return null;
                    }
                }));
            }

            // Future.get blocks until the worker is done and surfaces its failure.
            for (Future<Void> result : results) {
                try {
                    result.get();
                } catch (ExecutionException e) {
                    throw new AssertionError("CHM parsing failed in a worker thread", e.getCause());
                }
            }
        } finally {
            // Stops any remaining workers, also when the test fails early.
            executor.shutdownNow();
        }
    }

    /**
     * Parses every file found in the {@code /test-documents/chm/} resource
     * directory and checks that each one yields some text.
     */
    @Test
    public void test_TIKA_1446() throws Exception {
        URL chmDir = TestChmExtraction.class.getResource("/test-documents/chm/");
        assertTrue("Test resource directory not found: /test-documents/chm/", chmDir != null);

        File chmFolder = new File(chmDir.toURI());
        // File.list returns null when the path is not a directory or cannot be read.
        String[] fileNames = chmFolder.list();
        assertTrue("Cannot list directory: " + chmFolder, fileNames != null);

        for (String fileName : fileNames) {
            // testingChm closes the stream it is given.
            testingChm(new FileInputStream(new File(chmFolder, fileName)));
        }
    }
}