/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.fork;

import static org.apache.tika.TikaTest.assertContains;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.fail;

import java.io.IOException;
import java.io.InputStream;
import java.io.NotSerializableException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.apache.tika.Tika;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.fork.ForkParser;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Test that the ForkParser correctly behaves when
 *  wired in to the regular Parsers and their test data
 */
public class ForkParserIntegrationTest {

    /** Classpath location of the plain text test document. */
    private static final String TEXT_DOCUMENT = "/test-documents/testTXT.txt";

    /** Classpath location of the PDF test document. */
    private static final String PDF_DOCUMENT = "/test-documents/testPDF.pdf";

    private final Tika tika = new Tika(); // TODO Use TikaConfig instead, when it works

    /**
     * Opens a test document from the classpath, failing the test with a
     *  clear message if the resource is missing rather than handing a
     *  null stream on to the parser.
     *
     * @param resourcePath absolute classpath path of the document
     * @return an open stream which the caller is responsible for closing
     */
    private static InputStream openTestDocument(String resourcePath) {
        InputStream stream =
                ForkParserIntegrationTest.class.getResourceAsStream(resourcePath);
        assertNotNull("Test document not found on classpath: " + resourcePath, stream);
        return stream;
    }

    /**
     * Simple text parsing
     */
    @Test
    public void testForkedTextParsing() throws Exception {
        ForkParser parser = new ForkParser(
                ForkParserIntegrationTest.class.getClassLoader(),
                tika.getParser());

        try {
            InputStream stream = openTestDocument(TEXT_DOCUMENT);
            try {
                ContentHandler output = new BodyContentHandler();
                ParseContext context = new ParseContext();
                parser.parse(stream, output, new Metadata(), context);

                String content = output.toString();
                assertContains("Test d'indexation", content);
                assertContains("http://www.apache.org", content);
            } finally {
                stream.close();
            }
        } finally {
            parser.close();
        }
    }

    /**
     * This error has a message and an equals() implementation as to be able
     * to match it against the serialized version of itself.
     */
    static class AnError extends Error {
        private static final long serialVersionUID = -6197267350768803348L;

        private final String message;

        AnError(String message) {
            super(message);
            this.message = message;
        }

        /**
         * Two instances are equal when they are of exactly the same class
         *  and carry the same message (a null message only equals null).
         */
        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (o == null || getClass() != o.getClass()) {
                return false;
            }

            AnError other = (AnError) o;
            if (message == null) {
                return other.message == null;
            }
            return message.equals(other.message);
        }

        /** Hash derived from the message only, consistent with equals(). */
        @Override
        public int hashCode() {
            return message == null ? 0 : message.hashCode();
        }
    }

    /**
     * This error isn't serializable on the server, so can't be sent back
     *  to the Fork Client once it has occurred
     */
    static class WontBeSerializedError extends RuntimeException {
        private static final long serialVersionUID = 1L;

        WontBeSerializedError(String message) {
            super(message);
        }

        /**
         * Serialization hook which throws a "Bang!" RuntimeException unless
         *  the ForkParser class appears somewhere on the current call stack.
         */
        private void writeObject(java.io.ObjectOutputStream out) {
            // The exception doubles as a cheap way to capture the call stack
            RuntimeException e = new RuntimeException("Bang!");
            boolean found = false;
            for (StackTraceElement ste : e.getStackTrace()) {
                if (ste.getClassName().equals(ForkParser.class.getName())) {
                    found = true;
                    break;
                }
            }
            if (!found) {
                throw e;
            }
        }
    }

    /**
     * Parser which always fails: it throws {@link #re} when that is set,
     *  otherwise it throws {@link #err}.
     */
    static class BrokenParser implements Parser {
        private static final long serialVersionUID = 995871497930817839L;

        /** Error thrown from parse() when no RuntimeException is configured */
        public Error err = new AnError("Simulated fail");
        /** When non-null, thrown from parse() in preference to the Error */
        public RuntimeException re = null;

        public Set<MediaType> getSupportedTypes(ParseContext context) {
            return new HashSet<MediaType>(Arrays.asList(MediaType.TEXT_PLAIN));
        }

        public void parse(InputStream stream, ContentHandler handler,
                Metadata metadata, ParseContext context)
                throws IOException, SAXException, TikaException {
            if (re != null) {
                throw re;
            }
            throw err;
        }
    }

    /**
     * TIKA-831 Parsers throwing errors should be caught and
     *  properly reported
     */
    @Test
    public void testParsingErrorInForkedParserShouldBeReported() throws Exception {
        InputStream stream = openTestDocument(TEXT_DOCUMENT);
        try {
            // With a serializable error, we'll get that back
            BrokenParser brokenParser = new BrokenParser();
            ForkParser parser = new ForkParser(
                    ForkParser.class.getClassLoader(), brokenParser);
            try {
                ContentHandler output = new BodyContentHandler();
                ParseContext context = new ParseContext();
                parser.parse(stream, output, new Metadata(), context);
                fail("Expected TikaException caused by Error");
            } catch (TikaException e) {
                assertEquals(brokenParser.err, e.getCause());
            } finally {
                parser.close();
            }

            // With a non serializable one, we'll get something else
            // TODO Fix this test
            brokenParser = new BrokenParser();
            brokenParser.re = new WontBeSerializedError("Can't Serialize");
            parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
            try {
                // Disabled until the TODO above is resolved:
//                ContentHandler output = new BodyContentHandler();
//                ParseContext context = new ParseContext();
//                parser.parse(stream, output, new Metadata(), context);
//                fail("Expected TikaException caused by Error");
//            } catch (TikaException e) {
//                assertEquals(TikaException.class, e.getCause().getClass());
//                assertEquals("Bang!", e.getCause().getMessage());
            } finally {
                // Release the second parser even though its assertions are disabled
                parser.close();
            }
        } finally {
            stream.close();
        }
    }

    /**
     * If we supply a non serializable object on the ParseContext,
     *  check we get a helpful exception back
     */
    @Test
    public void testParserHandlingOfNonSerializable() throws Exception {
        ForkParser parser = new ForkParser(
                ForkParserIntegrationTest.class.getClassLoader(),
                tika.getParser());

        // The anonymous Detector is what the test relies on being non serializable
        ParseContext context = new ParseContext();
        context.set(Detector.class, new Detector() {
            public MediaType detect(InputStream input, Metadata metadata) {
                return MediaType.OCTET_STREAM;
            }
        });

        try {
            InputStream stream = openTestDocument(TEXT_DOCUMENT);
            try {
                ContentHandler output = new BodyContentHandler();
                parser.parse(stream, output, new Metadata(), context);
                fail("Should have blown up with a non serializable ParseContext");
            } finally {
                stream.close();
            }
        } catch (TikaException e) {
            // Check the right details
            assertNotNull(e.getCause());
            assertEquals(NotSerializableException.class, e.getCause().getClass());
            assertEquals("Unable to serialize ParseContext to pass to the Forked Parser",
                    e.getMessage());
        } finally {
            parser.close();
        }
    }

    /**
     * TIKA-832
     */
    @Test
    public void testAttachingADebuggerOnTheForkedParserShouldWork() throws Exception {
        ParseContext context = new ParseContext();
        context.set(Parser.class, tika.getParser());

        ForkParser parser = new ForkParser(
                ForkParserIntegrationTest.class.getClassLoader(),
                tika.getParser());
        // Note: the JDWP agent is asked to listen on the fixed port 54321
        parser.setJavaCommand(Arrays.asList("java", "-Xmx32m", "-Xdebug",
                "-Xrunjdwp:transport=dt_socket,address=54321,server=y,suspend=n"));

        try {
            InputStream stream = openTestDocument(TEXT_DOCUMENT);
            try {
                ContentHandler body = new BodyContentHandler();
                parser.parse(stream, body, new Metadata(), context);

                String content = body.toString();
                assertContains("Test d'indexation", content);
                assertContains("http://www.apache.org", content);
            } finally {
                stream.close();
            }
        } finally {
            parser.close();
        }
    }

    /**
     * TIKA-808 - Ensure that parsing of our test PDFs work under
     * the Fork Parser, to ensure that complex parsing behaves
     */
    @Test
    public void testForkedPDFParsing() throws Exception {
        ForkParser parser = new ForkParser(
                ForkParserIntegrationTest.class.getClassLoader(),
                tika.getParser());

        try {
            InputStream stream = openTestDocument(PDF_DOCUMENT);
            try {
                ContentHandler output = new BodyContentHandler();
                ParseContext context = new ParseContext();
                context.set(Parser.class, new EmptyParser());
                parser.parse(stream, output, new Metadata(), context);

                String content = output.toString();
                assertContains("Apache Tika", content);
                assertContains("Tika - Content Analysis Toolkit", content);
                assertContains("incubator", content);
                assertContains("Apache Software Foundation", content);
            } finally {
                stream.close();
            }
        } finally {
            parser.close();
        }
    }
}