/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.bundle;

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.ops4j.pax.exam.CoreOptions.bundle;
import static org.ops4j.pax.exam.CoreOptions.junitBundles;
import static org.ops4j.pax.exam.CoreOptions.mavenBundle;
import static org.ops4j.pax.exam.CoreOptions.options;
import static org.ops4j.pax.exam.CoreOptions.systemProperty;

import javax.inject.Inject;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.jar.Attributes;
import java.util.jar.JarInputStream;
import java.util.jar.Manifest;

import org.apache.tika.Tika;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.fork.ForkParser;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.internal.Activator;
import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.ops4j.pax.exam.Configuration;
import org.ops4j.pax.exam.Option;
import org.ops4j.pax.exam.junit.PaxExam;
import org.ops4j.pax.exam.spi.reactors.ExamReactorStrategy;
import org.ops4j.pax.exam.spi.reactors.PerMethod;
import org.ops4j.pax.exam.util.PathUtils;
import org.osgi.framework.Bundle;
import org.osgi.framework.BundleContext;
import org.osgi.framework.ServiceReference;
import org.xml.sax.ContentHandler;

/**
 * Pax Exam integration tests for the Tika OSGi bundle.
 *
 * <p>The container is provisioned with {@code tika-core.jar} and
 * {@code tika-bundle.jar} taken from {@code target/test-bundles/}, plus the
 * JUnit and SLF4J/log4j bundles listed in {@link #configuration()}. The tests
 * then exercise the injected {@link Parser}, {@link Detector} and
 * {@link BundleContext}.
 */
@RunWith(PaxExam.class)
@ExamReactorStrategy(PerMethod.class)
public class BundleIT {

    @Inject
    private Parser defaultParser;

    @Inject
    private Detector contentTypeDetector;

    @Inject
    private BundleContext bc;

    /** URL of the log4j configuration used by the container and by the forked parser JVM. */
    private final String log4jConfigPath =
            "file:" + PathUtils.getBaseDir() + "/src/test/resources/log4j.properties";

    /** URL prefix of the directory holding the bundles under test. */
    private final String testBundlesPath =
            "file:" + PathUtils.getBaseDir() + "/target/test-bundles/";

    /**
     * Builds the list of bundles and system properties the test container is started with.
     *
     * @return the Pax Exam provisioning options
     */
    @Configuration
    public Option[] configuration()
            throws IOException, URISyntaxException, ClassNotFoundException {
        return options(
                bundle(testBundlesPath + "tika-core.jar"),
                bundle(testBundlesPath + "tika-bundle.jar"),
                junitBundles(),
                mavenBundle("org.slf4j", "slf4j-api", "1.7.24"),
                mavenBundle("org.slf4j", "slf4j-log4j12", "1.7.24").noStart(),
                mavenBundle("org.slf4j", "jcl-over-slf4j", "1.7.24"),
                mavenBundle("org.slf4j", "jul-to-slf4j", "1.7.24"),
                mavenBundle("log4j", "log4j", "1.2.17"),
                systemProperty("log4j.configuration").value(log4jConfigPath)
        );
    }

    /**
     * Checks that both the core bundle and the Tika bundle are installed and
     * in the {@link Bundle#ACTIVE} state.
     */
    @Test
    public void testBundleLoaded() throws Exception {
        boolean hasCore = false;
        boolean hasBundle = false;
        for (Bundle b : bc.getBundles()) {
            if ("org.apache.tika.core".equals(b.getSymbolicName())) {
                hasCore = true;
                assertEquals("Core not activated", Bundle.ACTIVE, b.getState());
            }
            if ("org.apache.tika.bundle".equals(b.getSymbolicName())) {
                hasBundle = true;
                assertEquals("Bundle not activated", Bundle.ACTIVE, b.getState());
            }
        }
        assertTrue("Core bundle not found", hasCore);
        assertTrue("Bundle bundle not found", hasBundle);
    }

    /**
     * Reads the manifest of {@code target/test-bundles/tika-bundle.jar} and
     * checks that its {@code Import-Package} header does not mention junit.
     */
    @Test
    public void testManifestNoJUnit() throws Exception {
        File targetDir = new File("target");
        File base = new File(targetDir, "test-bundles");
        File tikaBundle = new File(base, "tika-bundle.jar");

        String importPackage;
        // Both streams are closed even if opening the jar or reading the manifest fails.
        try (InputStream in = new FileInputStream(tikaBundle);
             JarInputStream jarIs = new JarInputStream(in)) {
            Manifest mf = jarIs.getManifest();
            // Fail with a readable message rather than a NullPointerException.
            assertTrue("No manifest found in " + tikaBundle, mf != null);
            Attributes main = mf.getMainAttributes();
            importPackage = main.getValue("Import-Package");
        }

        assertTrue("No Import-Package header found in " + tikaBundle, importPackage != null);
        assertFalse("The bundle should not import junit", importPackage.contains("junit"));
    }

    /**
     * Checks name-based type detection through the injected detector, using
     * only the resource name (no stream) for a text file and a PDF file.
     */
    @Test
    public void testBundleDetection() throws Exception {
        Metadata metadataTXT = new Metadata();
        metadataTXT.set(Metadata.RESOURCE_NAME_KEY, "test.txt");
        Metadata metadataPDF = new Metadata();
        metadataPDF.set(Metadata.RESOURCE_NAME_KEY, "test.pdf");

        // Simple type detection
        assertEquals(MediaType.TEXT_PLAIN, contentTypeDetector.detect(null, metadataTXT));
        assertEquals(MediaType.application("pdf"), contentTypeDetector.detect(null, metadataPDF));
    }

    /**
     * Parses a small HTML document through a {@link ForkParser} wrapping the
     * injected parser and checks the extracted body text.
     */
    @Test
    public void testForkParser() throws Exception {
        ForkParser parser = new ForkParser(Activator.class.getClassLoader(), defaultParser);
        try {
            parser.setJavaCommand(Arrays.asList(
                    "java", "-Xmx32m", "-Dlog4j.configuration=" + log4jConfigPath));

            String data =
                    "<!DOCTYPE html>\n<html><body><p>test <span>content</span></p></body></html>";
            InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
            Writer writer = new StringWriter();
            ContentHandler contentHandler = new BodyContentHandler(writer);
            Metadata metadata = new Metadata();

            MediaType type = contentTypeDetector.detect(stream, metadata);
            // Expected value goes first so a failure message reads correctly.
            assertEquals("text/html", type.toString());
            metadata.add(Metadata.CONTENT_TYPE, type.toString());

            ParseContext parseCtx = new ParseContext();
            parser.parse(stream, contentHandler, metadata, parseCtx);
            writer.flush();

            String content = writer.toString();
            assertTrue(content.length() > 0);
            assertEquals("test content", content.trim());
        } finally {
            // Release the fork parser's resources whether or not the assertions passed.
            parser.close();
        }
    }

    /**
     * Extracts text from {@code pom.xml} with a default {@link Tika} facade
     * and checks that it mentions {@code tika-bundle}.
     */
    @Test
    public void testBundleSimpleText() throws Exception {
        Tika tika = new Tika();

        // Simple text extraction
        String xml = tika.parseToString(new File("pom.xml"));
        assertTrue(xml.contains("tika-bundle"));
    }

    /**
     * Compares the detector class names exposed by the OSGi {@link Detector}
     * service with those of a directly constructed {@link DefaultDetector}.
     */
    @Test
    public void testBundleDetectors() throws Exception {
        //For some reason, the detector created by OSGi has a flat
        //list of detectors, whereas the detector created by the traditional
        //service loading method has children: DefaultDetector, MimeTypes.
        //We have to flatten the service loaded DefaultDetector to get equivalence.
        //Detection behavior should all be the same.

        // Get the classes found within OSGi
        ServiceReference<Detector> detectorRef = bc.getServiceReference(Detector.class);
        DefaultDetector detectorService = (DefaultDetector) bc.getService(detectorRef);

        Set<String> osgiDetectors = new HashSet<>();
        for (Detector d : detectorService.getDetectors()) {
            osgiDetectors.add(d.getClass().getName());
        }

        // Check we did get a few, just in case...
        assertTrue("Should have several Detector names, found " + osgiDetectors.size(),
                osgiDetectors.size() > 3);

        // Get the raw detectors list from the traditional service loading mechanism,
        // flattening one level of nested DefaultDetector
        DefaultDetector detector = new DefaultDetector();
        Set<String> rawDetectors = new HashSet<>();
        for (Detector d : detector.getDetectors()) {
            if (d instanceof DefaultDetector) {
                for (Detector dChild : ((DefaultDetector) d).getDetectors()) {
                    rawDetectors.add(dChild.getClass().getName());
                }
            } else {
                rawDetectors.add(d.getClass().getName());
            }
        }
        assertEquals(osgiDetectors, rawDetectors);
    }

    /**
     * Compares the component parser class names exposed by the OSGi
     * {@link Parser} service with those of the injected parser.
     */
    @Test
    public void testBundleParsers() throws Exception {
        // Get the classes found within OSGi
        ServiceReference<Parser> parserRef = bc.getServiceReference(Parser.class);
        DefaultParser parserService = (DefaultParser) bc.getService(parserRef);

        Set<String> osgiParsers = new HashSet<>();
        for (Parser p : parserService.getAllComponentParsers()) {
            osgiParsers.add(p.getClass().getName());
        }

        // Check we did get a few, just in case...
        assertTrue("Should have lots Parser names, found " + osgiParsers.size(),
                osgiParsers.size() > 15);

        // Collect the component parsers of the injected parser, flattening one
        // level of nested DefaultParser.
        // NOTE(review): this was described as the list from the traditional service
        // loading mechanism, but it reads the injected defaultParser rather than a
        // freshly constructed parser - confirm that is the intended comparison.
        CompositeParser parser = (CompositeParser) defaultParser;
        Set<String> rawParsers = new HashSet<>();
        for (Parser p : parser.getAllComponentParsers()) {
            if (p instanceof DefaultParser) {
                for (Parser pChild : ((DefaultParser) p).getAllComponentParsers()) {
                    rawParsers.add(pChild.getClass().getName());
                }
            } else {
                rawParsers.add(p.getClass().getName());
            }
        }
        assertEquals(rawParsers, osgiParsers);
    }

    /**
     * Runs a {@link TesseractOCRParser} over a test image. There are no
     * assertions on the output; the test passes if parsing does not throw.
     */
    @Test
    public void testTesseractParser() throws Exception {
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();
        Parser tesseractParser = new TesseractOCRParser();
        try (InputStream stream = new FileInputStream("src/test/resources/testOCR.jpg")) {
            tesseractParser.parse(stream, handler, new Metadata(), context);
        }
    }

    /**
     * Parses {@code test-documents.zip} with the parser of a default
     * {@link Tika} facade and checks that the extracted text contains each
     * expected entry name and a text fragment from each entry.
     */
    @Test
    public void testTikaBundle() throws Exception {
        Tika tika = new Tika();

        // Package extraction
        ContentHandler handler = new BodyContentHandler();
        Parser parser = tika.getParser();
        ParseContext context = new ParseContext();
        // Register the parser in the context as well as using it for the top-level parse.
        context.set(Parser.class, parser);

        try (InputStream stream =
                     new FileInputStream("src/test/resources/test-documents.zip")) {
            parser.parse(stream, handler, new Metadata(), context);
        }

        String content = handler.toString();
        // Entry names alternate with a fragment expected from that entry's text.
        String[] expectedFragments = {
            "testEXCEL.xls", "Sample Excel Worksheet",
            "testHTML.html", "Test Indexation Html",
            "testOpenOffice2.odt", "This is a sample Open Office document",
            "testPDF.pdf", "Apache Tika",
            "testPPT.ppt", "Sample Powerpoint Slide",
            "testRTF.rtf", "indexation Word",
            "testTXT.txt", "Test d'indexation de Txt",
            "testWORD.doc", "This is a sample Microsoft Word Document",
            "testXML.xml", "Rida Benjelloun"
        };
        for (String expected : expectedFragments) {
            assertTrue("Extracted content is missing: " + expected,
                    content.contains(expected));
        }
    }
}