WordParserTest.java example

Explorer
tika-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Ignore;
import org.junit.Test;
import org.xml.sax.ContentHandler;

public class WordParserTest extends TikaTest {

    @Test
    public void testWordParser() throws Exception {
        try (InputStream input = WordParserTest.class.getResourceAsStream(
                "/test-documents/testWORD.doc")) {
            ContentHandler handler = new BodyContentHandler();
            Metadata metadata = new Metadata();
            new OfficeParser().parse(input, handler, metadata, new ParseContext());

            assertEquals(
                    "application/msword",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
            assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
            assertContains("Sample Word Document", handler.toString());
        }
    }

    @Test
    public void testWordWithWAV() throws Exception {
        try (InputStream input = WordParserTest.class.getResourceAsStream(
                "/test-documents/Doc1_ole.doc")) {
            ContentHandler handler = new BodyContentHandler();
            Metadata metadata = new Metadata();
            new OfficeParser().parse(input, handler, metadata, new ParseContext());

            assertContains("MSj00974840000[1].wav", handler.toString());
        }
    }

    /**
     * Test that the word converter is able to generate the
     *  correct HTML for the document
     */
    @Test
    public void testWordHTML() throws Exception {

        // Try with a document containing various tables and
        // formattings
        XMLResult result = getXML("testWORD.doc");
        String xml = result.xml;
        Metadata metadata = result.metadata;

        assertEquals(
                "application/msword",
                metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
        assertTrue(xml.contains("Sample Word Document"));

        // Check that custom headings came through
        assertTrue(xml.contains("<h1 class=\"title\">"));
        // Regular headings
        assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
        assertTrue(xml.contains("<h3>Heading Level 3</h3>"));
        // Bold and italic
        assertTrue(xml.contains("<b>BOLD</b>"));
        assertTrue(xml.contains("<i>ITALIC</i>"));
        // Table
        assertTrue(xml.contains("<table>"));
        assertTrue(xml.contains("<td>"));
        // TODO - Check for the nested table
        // Links
        assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
        // Paragraphs with other styles
        assertTrue(xml.contains("<p class=\"signature\">This one"));

        // Try with a document that contains images
        xml = getXML("testWORD_3imgs.doc").xml;

        // Images 1-3
        assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image1.png\""));
        assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image2.jpg\""));
        assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image3.png\""));

        // Text too
        assertTrue(xml.contains("<p>The end!"));

        // TIKA-692: test document containing multiple
        // character runs within a bold tag:
        xml = getXML("testWORD_bold_character_runs.doc").xml;

        // Make sure bold text arrived as single
        // contiguous string even though Word parser
        // handled this as 3 character runs
        assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));

        // TIKA-692: test document containing multiple
        // character runs within a bold tag:
        xml = getXML("testWORD_bold_character_runs2.doc").xml;

        // Make sure bold text arrived as single
        // contiguous string even though Word parser
        // handled this as 3 character runs
        assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>"));
    }

    @Test
    public void testEmbeddedNames() throws Exception {
        String result = getXML("testWORD_embedded_pdf.doc").xml;

        // Make sure the embedded div comes out after "Here
        // is the pdf file" and before "Bye Bye":
        int i = result.indexOf("Here is the pdf file:");
        assertTrue(i != -1);
        int j = result.indexOf("<div class=\"embedded\" id=\"_1402837031\" />");
        assertTrue(j != -1);
        int k = result.indexOf("Bye Bye");
        assertTrue(k != -1);

        assertTrue(i < j);
        assertTrue(j < k);
    }

    // TIKA-982
    @Test
    public void testEmbeddedRTF() throws Exception {
        String result = getXML("testWORD_embedded_rtf.doc").xml;
        assertTrue(result.contains("<div class=\"embedded\" id=\"_1404039792\" />"));
        assertTrue(result.contains("_1404039792.rtf"));
    }

    // TIKA-1019
    @Test
    public void testDocumentLink() throws Exception {
        String result = getXML("testDocumentLink.doc").xml;
        assertTrue(result.contains("<div class=\"embedded\" id=\"_1327495610\" />"));
        assertTrue(result.contains("_1327495610.unknown"));
    }

    @Test
    public void testWord6Parser() throws Exception {
        try (InputStream input = WordParserTest.class.getResourceAsStream(
                "/test-documents/testWORD6.doc")) {
            ContentHandler handler = new BodyContentHandler();
            Metadata metadata = new Metadata();
            new OfficeParser().parse(input, handler, metadata, new ParseContext());

            assertEquals(
                    "application/msword",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(TikaCoreProperties.TITLE));
            assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(OfficeOpenXMLCore.SUBJECT));
            assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT));
            assertEquals("Nevin Nollop", metadata.get(TikaCoreProperties.CREATOR));
            assertEquals("Nevin Nollop", metadata.get(Metadata.AUTHOR));
            assertContains("The quick brown fox jumps over the lazy dog", handler.toString());
        }
    }

    //TIKA-2346
    @Test
    public void testTurningOffTextBox() throws Exception {
        ParseContext pc = new ParseContext();
        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
        officeParserConfig.setIncludeShapeBasedContent(false);
        pc.set(OfficeParserConfig.class, officeParserConfig);
        String xml = getXML("testWORD_various.doc", pc).xml;
        assertNotContained("text box", xml);
    }

    @Test
    public void testVarious() throws Exception {
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        try (InputStream stream = WordParserTest.class.getResourceAsStream(
                "/test-documents/testWORD_various.doc")) {
            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
        }

        String content = handler.toString();
        //content = content.replaceAll("\\s+"," ");
        assertContains("Footnote appears here", content);
        assertContains("This is a footnote.", content);
        assertContains("This is the header text.", content);
        assertContains("This is the footer text.", content);
        assertContains("Here is a text box", content);
        assertContains("Bold", content);
        assertContains("italic", content);
        assertContains("underline", content);
        assertContains("superscript", content);
        assertContains("subscript", content);
        assertContains("Here is a citation:", content);
        assertContains("Figure 1 This is a caption for Figure 1", content);
        assertContains("(Kramer)", content);
        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
        assertContains("This is a hyperlink", content);
        assertContains("Here is a list:", content);
        for(int row=1;row<=3;row++) {
            //assertContains("·\tBullet " + row, content);
            //assertContains("\u00b7\tBullet " + row, content);
            assertContains("Bullet " + row, content);
        }
        assertContains("Here is a numbered list:", content);
        for(int row=1;row<=3;row++) {
            //assertContains(row + ")\tNumber bullet " + row, content);
            //assertContains(row + ") Number bullet " + row, content);
            // TODO: WordExtractor fails to number the bullets:
            assertContains("Number bullet " + row, content);
        }

        for(int row=1;row<=2;row++) {
            for(int col=1;col<=3;col++) {
                assertContains("Row " + row + " Col " + col, content);
            }
        }

        assertContains("Keyword1 Keyword2", content);
        assertEquals("Keyword1 Keyword2",
                     metadata.get(TikaCoreProperties.KEYWORDS));

        assertContains("Subject is here", content);
        // TODO: Move to OO subject in Tika 2.0
        assertEquals("Subject is here",
                     metadata.get(Metadata.SUBJECT));
        assertEquals("Subject is here",
                     metadata.get(OfficeOpenXMLCore.SUBJECT));

        assertContains("Suddenly some Japanese text:", content);
        // Special version of (GHQ)
        assertContains("\uff08\uff27\uff28\uff31\uff09", content);
        // 6 other characters
        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);

        assertContains("And then some Gothic text:", content);
        assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
    }

    /**
     * TIKA-1044 - Handle documents where parts of the
     *  text have no formatting or styles applied to them
     */
    @Test
    public void testNoFormat() throws Exception {
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        try (InputStream stream = WordParserTest.class.getResourceAsStream(
                "/test-documents/testWORD_no_format.doc")) {
            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
        }

        String content = handler.toString();
        assertContains("Will generate an exception", content);
    }

    /**
     * Ensures that custom OLE2 (HPSF) properties are extracted
     */
    @Test
    public void testCustomProperties() throws Exception {
        Metadata metadata = new Metadata();

        try (InputStream input = WordParserTest.class.getResourceAsStream(
                "/test-documents/testWORD_custom_props.doc")) {
            ContentHandler handler = new BodyContentHandler(-1);
            ParseContext context = new ParseContext();
            context.set(Locale.class, Locale.US);
            new OfficeParser().parse(input, handler, metadata, context);
        }

        assertEquals("application/msword", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("EJ04325S", metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("Etienne Jouvin", metadata.get(TikaCoreProperties.MODIFIER));
        assertEquals("Etienne Jouvin", metadata.get(Metadata.LAST_AUTHOR));
        assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED));
        assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE));
        assertEquals("2010-10-05T09:03:00Z", metadata.get(TikaCoreProperties.CREATED));
        assertEquals("2010-10-05T09:03:00Z", metadata.get(Metadata.CREATION_DATE));
        assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION));
        assertEquals("1", metadata.get(Office.PAGE_COUNT));
        assertEquals("2", metadata.get(Office.WORD_COUNT));
        assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS));
        assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE));
        assertEquals("My Comments", metadata.get(TikaCoreProperties.COMMENTS));
        // TODO: Move to OO subject in Tika 2.0
        assertEquals("My subject", metadata.get(Metadata.SUBJECT));
        assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT));
        assertEquals("EDF-DIT", metadata.get(OfficeOpenXMLExtended.COMPANY));
        assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
        assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate"));
    }

    @Test
    public void testExceptions1() throws Exception {
        XMLResult xml;
        Level logLevelStart = Logger.getRootLogger().getLevel();
        Logger.getRootLogger().setLevel(Level.ERROR);
        try {
            xml = getXML("testException1.doc");
            assertContains("total population", xml.xml);
            xml = getXML("testException2.doc");
            assertContains("electric charge", xml.xml);
        } finally {
            Logger.getRootLogger().setLevel(logLevelStart);
        }
    }

    @Test
    public void testTabularSymbol() throws Exception {
        assertContains("one two", getXML("testWORD_tabular_symbol.doc").xml.replaceAll("\\s+", " "));
    }

    /**
     * TIKA-1229 Hyperlinks in Headers should be output as such,
     *  not plain text with control characters
     */
    @Test
    public void testHeaderHyperlinks() throws Exception {
        XMLResult result = getXML("testWORD_header_hyperlink.doc");
        String xml = result.xml;
        Metadata metadata = result.metadata;

        assertEquals(
                "application/msword",
                metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("Lutz Theurer", metadata.get(TikaCoreProperties.CREATOR));
        assertContains("example.com", xml);

        // Check we don't have the special text HYPERLINK
        assertFalse(xml.contains("HYPERLINK"));

        // Check we do have the link
        assertContains("<a href=\"http://tw-systemhaus.de\">http:", xml);

        // Check we do have the email
        assertContains("<a href=\"mailto:ab@example.com\">ab@", xml);
    }

    @Test
    public void testControlCharacter() throws Exception {
        assertContains("1. Introduzione<b> </b></a> </p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " "));
    }

    @Test
    public void testParagraphsAfterTables() throws Exception {
        XMLResult result = getXML("test_TIKA-1251.doc");

        String xml = result.xml;
        Metadata metadata = result.metadata;

        assertEquals(
                "application/msword",
                metadata.get(Metadata.CONTENT_TYPE));

        assertContains("<p>1. Organisering av vakten:</p>", xml);

    }

    @Test
    public void testHyperlinkStringIOOBESmartQuote() throws Exception {
        //TIKA-1512, one cause: closing double quote is a smart quote
        //test file contributed by user
        XMLResult result = getXML("testWORD_closingSmartQInHyperLink.doc");
        assertContains("href=\"https://issues.apache.org/jira/browse/TIKA-1512", result.xml);
    }

    @Test
    @Ignore //until we determine whether we can include test docs or not
    public void testHyperlinkStringLongNoCloseQuote() throws Exception {
        //TIKA-1512, one cause: no closing quote on really long string
        //test file derived from govdocs1 012152.doc
        XMLResult result = getXML("testWORD_longHyperLinkNoCloseQuote.doc");
        assertContains("href=\"http://www.lexis.com", result.xml);
    }

    @Test
    @Ignore //until we determine whether we can include test docs or not
    public void testHyperlinkStringLongCarriageReturn() throws Exception {
        //TIKA-1512, one cause: no closing quote, but carriage return
        //test file derived from govdocs1 040044.doc
        XMLResult result = getXML("testWORD_hyperLinkCarriageReturn.doc");
        assertContains("href=\"http://www.nib.org", result.xml);
    }

    @Test
    public void testDOCParagraphNumbering() throws Exception {
        String xml = getXML("testWORD_numbered_list.doc").xml;
        assertContains("1) This", xml);
        assertContains("a) Is", xml);
        assertContains("i) A multi", xml);
        assertContains("ii) Level", xml);
        assertContains("1. Within cell 1", xml);
        assertContains("b. Cell b", xml);
        assertContains("iii) List", xml);
        assertContains("2) foo", xml);
        assertContains("ii) baz", xml);
        assertContains("ii) foo", xml);
        assertContains("II. bar", xml);
        assertContains("6. six", xml);
        assertContains("7. seven", xml);
        assertContains("a. seven a", xml);
        assertContains("e. seven e", xml);
        assertContains("2. A ii 2", xml);
        assertContains("3. page break list 3", xml);
        assertContains("Some-1-CrazyFormat Greek numbering with crazy format - alpha", xml);
        assertContains("1.1.1. 1.1.1", xml);
        assertContains("1.1. 1.2->1.1  //set the value", xml);

        assertContains("add a list here", xml);
        //TODO: not currently pulling numbers out of comments
        assertContains(">comment list 1", xml);

    }

    @Test
    public void testDOCOverrideParagraphNumbering() throws Exception {
        String xml = getXML("testWORD_override_list_numbering.doc").xml;

        //Test 1
        assertContains("1.1.1.1...1 1.1.1.1...1", xml);
        assertContains("1st.2.3someText 1st.2.3someText", xml);
        assertContains("1st.2.2someOtherText.1 1st.2.2someOtherText.1", xml);
        assertContains("5th 5th", xml);


        //Test 2
        assertContains("1.a.I 1.a.I", xml);
        //test no reset because level 2 is not sufficient to reset
        assertContains("1.b.III 1.b.III", xml);
        //test restarted because of level 0's increment to 2
        assertContains("2.a.I 2.a.I", xml);
        //test handling of skipped level
        assertContains("2.b 2.b", xml);

        //Test 3
        assertContains("(1)) (1))", xml);
        //tests start level 1 at 17 and
        assertContains("2.17 2.17", xml);
        //tests that isLegal turns everything into decimal
        assertContains("2.18.2.1 2.18.2.1", xml);
        assertContains(">2 2", xml);

        //Test4
        assertContains(">1 1", xml);
        assertContains(">A A", xml);
        assertContains(">B B", xml);
        assertContains(">C C", xml);
        assertContains(">4 4", xml);

        //Test5
        assertContains(">00 00", xml);
        assertContains(">01 01", xml);
        assertContains(">01. 01.", xml);
        assertContains(">01..1 01..1", xml);
        assertContains(">02 02", xml);
    }

    @Test
    public void testMultiAuthorsManagers() throws Exception {
        XMLResult r = getXML("testWORD_multi_authors.doc");
        String[] authors = r.metadata.getValues(TikaCoreProperties.CREATOR);
        assertEquals(3, authors.length);
        assertEquals("author2", authors[1]);

        String[] managers = r.metadata.getValues(OfficeOpenXMLExtended.MANAGER);
        assertEquals(2, managers.length);
        assertEquals("manager1", managers[0]);
        assertEquals("manager2", managers[1]);
    }

    @Test
    public void testOrigLocation() throws Exception {
        Metadata metadata = getXML("testException2.doc").metadata;
        List<String> values = Arrays.asList(metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
        assertContains("C:\\Lab Documents\\Lab Manuals\\Physics 275-6\\276-s00\\07-Force-on-a-current-S00.doc", values);
        assertContains("Hard Drive:Course Folders:276:276-s00:07-Force-on-a-current-S00", values);
    }

    @Test
    public void testOrigSourcePath() throws Exception {
        Metadata embed1_zip_metadata = getRecursiveMetadata("test_recursive_embedded.doc").get(11);
        assertContains("C:\\Users\\tallison\\AppData\\Local\\Temp\\embed1.zip",
                Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
        assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder (2)\\embed1.zip",
                Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
    }

    @Test
    public void testBoldHyperlink() throws Exception {
        //TIKA-1255
        String xml = getXML("testWORD_boldHyperlink.doc").xml;
        xml = xml.replaceAll("\\s+", " ");
        assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", xml);
        assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold" , xml);
    }

    @Test
    public void testMacros() throws  Exception {

        //test default is "don't extract macros"
        for (Metadata metadata : getRecursiveMetadata("testWORD_macros.doc")) {
            if (metadata.get(Metadata.CONTENT_TYPE).equals("text/x-vbasic")) {
                fail("Shouldn't have extracted macros as default");
            }
        }

        //now test that they were extracted
        ParseContext context = new ParseContext();
        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
        officeParserConfig.setExtractMacros(true);
        context.set(OfficeParserConfig.class, officeParserConfig);


        Metadata minExpected = new Metadata();
        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
        minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
        minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
        minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                TikaCoreProperties.EmbeddedResourceType.MACRO.toString());

        List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.doc", context);
        assertContainsAtLeast(minExpected, metadataList);

        //test configuring via config file
        TikaConfig tikaConfig = new TikaConfig(this.getClass().getResourceAsStream("tika-config-macros.xml"));
        AutoDetectParser parser = new AutoDetectParser(tikaConfig);

        metadataList = getRecursiveMetadata("testWORD_macros.doc", parser);
        assertContainsAtLeast(minExpected, metadataList);
    }

    @Test
    public void testDeleted() throws Exception {
        //test classic behavior
        String xml = getXML("testWORD_2006ml.doc").xml;
        assertNotContained("frog", xml);

        //moveFrom is deleted in .doc files
        assertContainsCount("Second paragraph", xml, 1);
        //now test inclusion of deleted text
        ParseContext context = new ParseContext();
        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
        officeParserConfig.setIncludeDeletedContent(true);
        context.set(OfficeParserConfig.class, officeParserConfig);
        XMLResult r = getXML("testWORD_2006ml.doc", context);
        assertContains("frog", r.xml);

        //moveFrom is deleted in .doc files
        assertContainsCount("Second paragraph", r.xml, 2);
    }

    @Test
    public void testProtected() throws Exception {
        try {
            getXML("testWORD_protected_passtika.doc");
            fail("should have thrown encrypted document exception");
        } catch (org.apache.tika.exception.EncryptedDocumentException e) {

        }
    }
}