// BundleIT.java example
//
// Explorer
// tika-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.bundle;

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.ops4j.pax.exam.CoreOptions.bundle;
import static org.ops4j.pax.exam.CoreOptions.junitBundles;
import static org.ops4j.pax.exam.CoreOptions.mavenBundle;
import static org.ops4j.pax.exam.CoreOptions.options;
import static org.ops4j.pax.exam.CoreOptions.systemProperty;

import javax.inject.Inject;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.jar.Attributes;
import java.util.jar.JarInputStream;
import java.util.jar.Manifest;

import org.apache.tika.Tika;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.fork.ForkParser;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.internal.Activator;
import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.ops4j.pax.exam.Configuration;
import org.ops4j.pax.exam.Option;
import org.ops4j.pax.exam.junit.PaxExam;
import org.ops4j.pax.exam.spi.reactors.ExamReactorStrategy;
import org.ops4j.pax.exam.spi.reactors.PerMethod;
import org.ops4j.pax.exam.util.PathUtils;
import org.osgi.framework.Bundle;
import org.osgi.framework.BundleContext;
import org.osgi.framework.ServiceReference;
import org.xml.sax.ContentHandler;

/**
 * Integration tests that run inside an OSGi container provisioned by Pax Exam
 * (see {@link #configuration()}) with the tika-core and tika-bundle jars installed.
 * <p>
 * The container is restarted for every test method ({@link PerMethod}). The default
 * {@link Parser}, the {@link Detector} and the {@link BundleContext} are injected
 * by the test runner.
 */
@RunWith(PaxExam.class)
@ExamReactorStrategy(PerMethod.class)
public class BundleIT {
    @Inject
    private Parser defaultParser;

    @Inject
    private Detector contentTypeDetector;

    @Inject
    private BundleContext bc;

    /** URL of the log4j configuration handed to the container and to forked JVMs. */
    private final String log4jConfigPath =
            "file:" + PathUtils.getBaseDir() + "/src/test/resources/log4j.properties";

    /** URL prefix of the directory holding the bundles under test. */
    private final String testBundlesPath =
            "file:" + PathUtils.getBaseDir() + "/target/test-bundles/";

    /**
     * Builds the Pax Exam container configuration: the two Tika bundles, the JUnit
     * bundles, the SLF4J/log4j logging bundles and the log4j configuration location.
     *
     * @return the options used to provision the test container
     */
    @Configuration
    public Option[] configuration() throws IOException, URISyntaxException, ClassNotFoundException {
        return options(
                bundle(testBundlesPath + "tika-core.jar"),
                bundle(testBundlesPath + "tika-bundle.jar"),
                junitBundles(),
                mavenBundle("org.slf4j", "slf4j-api", "1.7.24"),
                // installed but deliberately not started
                mavenBundle("org.slf4j", "slf4j-log4j12", "1.7.24").noStart(),
                mavenBundle("org.slf4j", "jcl-over-slf4j", "1.7.24"),
                mavenBundle("org.slf4j", "jul-to-slf4j", "1.7.24"),
                mavenBundle("log4j", "log4j", "1.2.17"),
                systemProperty("log4j.configuration").value(log4jConfigPath)
        );
    }

    /**
     * Checks that both the core bundle and the parser bundle are installed in the
     * container and have reached the {@link Bundle#ACTIVE} state.
     */
    @Test
    public void testBundleLoaded() throws Exception {
        boolean hasCore = false, hasBundle = false;
        for (Bundle b : bc.getBundles()) {
            if ("org.apache.tika.core".equals(b.getSymbolicName())) {
                hasCore = true;
                assertEquals("Core not activated", Bundle.ACTIVE, b.getState());
            }
            if ("org.apache.tika.bundle".equals(b.getSymbolicName())) {
                hasBundle = true;
                assertEquals("Bundle not activated", Bundle.ACTIVE, b.getState());
            }
        }
        assertTrue("Core bundle not found", hasCore);
        assertTrue("Bundle bundle not found", hasBundle);
    }

    /**
     * Reads the manifest of {@code target/test-bundles/tika-bundle.jar} and checks
     * that its {@code Import-Package} header does not mention junit.
     */
    @Test
    public void testManifestNoJUnit() throws Exception {
        File targetDir = new File("target");
        File base = new File(targetDir, "test-bundles");
        File tikaBundle = new File(base, "tika-bundle.jar");

        String importPackage;
        // try-with-resources so the jar (and the underlying file stream) is always closed
        try (JarInputStream jarIs = new JarInputStream(new FileInputStream(tikaBundle))) {
            Manifest mf = jarIs.getManifest();
            // getManifest() returns null when the jar has no manifest
            assertTrue("No manifest found in " + tikaBundle, mf != null);

            Attributes main = mf.getMainAttributes();
            importPackage = main.getValue("Import-Package");
        }

        // Fail with a readable message rather than a NullPointerException
        assertTrue("No Import-Package header found in " + tikaBundle, importPackage != null);

        boolean containsJunit = importPackage.contains("junit");

        assertFalse("The bundle should not import junit", containsJunit);
    }

    /**
     * Checks detection driven only by the resource name in the metadata
     * (no input stream is supplied to the detector).
     */
    @Test
    public void testBundleDetection() throws Exception {
        Metadata metadataTXT = new Metadata();
        metadataTXT.set(Metadata.RESOURCE_NAME_KEY, "test.txt");

        Metadata metadataPDF = new Metadata();
        metadataPDF.set(Metadata.RESOURCE_NAME_KEY, "test.pdf");

        // Simple type detection
        assertEquals(MediaType.TEXT_PLAIN, contentTypeDetector.detect(null, metadataTXT));
        assertEquals(MediaType.application("pdf"), contentTypeDetector.detect(null, metadataPDF));
    }

    /**
     * Parses a small HTML document through a {@link ForkParser} wrapping the
     * injected default parser and checks the extracted body text.
     */
    @Test
    public void testForkParser() throws Exception {
        ForkParser parser = new ForkParser(Activator.class.getClassLoader(), defaultParser);
        try {
            parser.setJavaCommand(Arrays.asList("java", "-Xmx32m", "-Dlog4j.configuration=" + log4jConfigPath));
            String data = "<!DOCTYPE html>\n<html><body><p>test <span>content</span></p></body></html>";
            InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
            Writer writer = new StringWriter();
            ContentHandler contentHandler = new BodyContentHandler(writer);
            Metadata metadata = new Metadata();
            MediaType type = contentTypeDetector.detect(stream, metadata);
            // expected value first, so a failure message reads correctly
            assertEquals("text/html", type.toString());
            metadata.add(Metadata.CONTENT_TYPE, type.toString());
            ParseContext parseCtx = new ParseContext();
            parser.parse(stream, contentHandler, metadata, parseCtx);
            writer.flush();
            String content = writer.toString();
            assertTrue(content.length() > 0);
            assertEquals("test content", content.trim());
        } finally {
            // release the fork parser's resources even when an assertion fails
            parser.close();
        }
    }

    /**
     * Extracts text from {@code pom.xml} with a default {@link Tika} facade and
     * checks that the bundle's artifact name appears in it.
     */
    @Test
    public void testBundleSimpleText() throws Exception {
        Tika tika = new Tika();

        // Simple text extraction
        String xml = tika.parseToString(new File("pom.xml"));
        assertTrue(xml.contains("tika-bundle"));
    }

    /**
     * Compares the detector classes exposed by the OSGi {@link Detector} service
     * with those of a directly constructed {@link DefaultDetector}.
     */
    @Test
    public void testBundleDetectors() throws Exception {
        //For some reason, the detector created by OSGi has a flat
        //list of detectors, whereas the detector created by the traditional
        //service loading method has children: DefaultDetector, MimeTypes.
        //We have to flatten the service loaded DefaultDetector to get equivalence.
        //Detection behavior should all be the same.

        // Get the classes found within OSGi
        ServiceReference<Detector> detectorRef = bc.getServiceReference(Detector.class);
        assertTrue("No Detector service registered", detectorRef != null);
        DefaultDetector detectorService = (DefaultDetector) bc.getService(detectorRef);

        Set<String> osgiDetectors = new HashSet<>();
        for (Detector d : detectorService.getDetectors()) {
            osgiDetectors.add(d.getClass().getName());
        }

        // Check we did get a few, just in case...
        assertTrue("Should have several Detector names, found " + osgiDetectors.size(),
                osgiDetectors.size() > 3);

        // Get the raw detectors list from the traditional service loading mechanism,
        // flattening one level of nested DefaultDetector
        DefaultDetector detector = new DefaultDetector();
        Set<String> rawDetectors = new HashSet<>();
        for (Detector d : detector.getDetectors()) {
            if (d instanceof DefaultDetector) {
                for (Detector dChild : ((DefaultDetector) d).getDetectors()) {
                    rawDetectors.add(dChild.getClass().getName());
                }
            } else {
                rawDetectors.add(d.getClass().getName());
            }
        }
        // Same argument order as testBundleParsers: raw list expected, OSGi list actual
        assertEquals(rawDetectors, osgiDetectors);
    }

    /**
     * Compares the component parser classes exposed by the OSGi {@link Parser}
     * service with those of the injected default parser.
     */
    @Test
    public void testBundleParsers() throws Exception {
        // Get the classes found within OSGi
        ServiceReference<Parser> parserRef = bc.getServiceReference(Parser.class);
        assertTrue("No Parser service registered", parserRef != null);
        DefaultParser parserService = (DefaultParser) bc.getService(parserRef);

        Set<String> osgiParsers = new HashSet<>();
        for (Parser p : parserService.getAllComponentParsers()) {
            osgiParsers.add(p.getClass().getName());
        }

        // Check we did get a few, just in case...
        assertTrue("Should have lots Parser names, found " + osgiParsers.size(),
                osgiParsers.size() > 15);

        // Get the component parsers of the injected default parser,
        // flattening one level of nested DefaultParser
        CompositeParser parser = (CompositeParser) defaultParser;
        Set<String> rawParsers = new HashSet<>();
        for (Parser p : parser.getAllComponentParsers()) {
            if (p instanceof DefaultParser) {
                for (Parser pChild : ((DefaultParser) p).getAllComponentParsers()) {
                    rawParsers.add(pChild.getClass().getName());
                }
            } else {
                rawParsers.add(p.getClass().getName());
            }
        }
        assertEquals(rawParsers, osgiParsers);
    }

    /**
     * Smoke test: runs {@link TesseractOCRParser} over a sample image. There are no
     * assertions on the output; the test passes as long as parsing does not throw.
     */
    @Test
    public void testTesseractParser() throws Exception {
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();
        Parser tesseractParser = new TesseractOCRParser();
        try (InputStream stream = new FileInputStream("src/test/resources/testOCR.jpg")) {
            tesseractParser.parse(stream, handler, new Metadata(), context);
        }

    }

    /**
     * Parses a zip archive of sample documents with the {@link Tika} facade's parser
     * and checks that each entry name and a snippet of its text appear in the output.
     */
    @Test
    public void testTikaBundle() throws Exception {
        Tika tika = new Tika();

        // Package extraction
        ContentHandler handler = new BodyContentHandler();

        Parser parser = tika.getParser();
        ParseContext context = new ParseContext();
        // register the parser in the context so it is available while parsing the archive
        context.set(Parser.class, parser);

        try (InputStream stream =
                     new FileInputStream("src/test/resources/test-documents.zip")) {
            parser.parse(stream, handler, new Metadata(), context);
        }

        String content = handler.toString();
        assertTrue(content.contains("testEXCEL.xls"));
        assertTrue(content.contains("Sample Excel Worksheet"));
        assertTrue(content.contains("testHTML.html"));
        assertTrue(content.contains("Test Indexation Html"));
        assertTrue(content.contains("testOpenOffice2.odt"));
        assertTrue(content.contains("This is a sample Open Office document"));
        assertTrue(content.contains("testPDF.pdf"));
        assertTrue(content.contains("Apache Tika"));
        assertTrue(content.contains("testPPT.ppt"));
        assertTrue(content.contains("Sample Powerpoint Slide"));
        assertTrue(content.contains("testRTF.rtf"));
        assertTrue(content.contains("indexation Word"));
        assertTrue(content.contains("testTXT.txt"));
        assertTrue(content.contains("Test d'indexation de Txt"));
        assertTrue(content.contains("testWORD.doc"));
        assertTrue(content.contains("This is a sample Microsoft Word Document"));
        assertTrue(content.contains("testXML.xml"));
        assertTrue(content.contains("Rida Benjelloun"));
    }
}