/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.ooxml;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Before;
import org.junit.Test;
import org.xml.sax.ContentHandler;
public class SXWPFExtractorTest extends TikaTest {
private ParseContext parseContext;
/**
 * Creates a fresh {@link ParseContext} before every test and registers an
 * {@link OfficeParserConfig} on it with the SAX docx extractor switched on.
 */
@Before
public void setUp() {
    OfficeParserConfig saxConfig = new OfficeParserConfig();
    saxConfig.setUseSAXDocxExtractor(true);

    ParseContext freshContext = new ParseContext();
    freshContext.set(OfficeParserConfig.class, saxConfig);
    parseContext = freshContext;
}
/**
 * Broad smoke test over testWORD_2006ml.docx: checks the number of metadata
 * objects returned by the recursive parse, a set of metadata values on the
 * first one, and a wide range of markup/text fragments in its content.
 */
@Test
public void basicTest() throws Exception {
    List<Metadata> parsed = getRecursiveMetadata("testWORD_2006ml.docx", parseContext);
    assertEquals(8, parsed.size());

    // ---- metadata of the first entry ----
    Metadata first = parsed.get(0);
    assertEquals("2016-11-29T00:58:00Z", first.get(TikaCoreProperties.CREATED));
    assertEquals("2016-11-29T17:54:00Z", first.get(TikaCoreProperties.MODIFIED));
    assertEquals("My Document Title", first.get(TikaCoreProperties.TITLE));
    assertEquals("This is the Author", first.get(TikaCoreProperties.CREATOR));
    assertEquals("3", first.get(OfficeOpenXMLCore.REVISION));
    assertEquals("Allison, Timothy B.", first.get(TikaCoreProperties.MODIFIER));
    // OfficeOpenXMLExtended.DOC_SECURITY ("0") is deliberately not asserted here
    assertEquals("260", first.get(Office.WORD_COUNT));
    assertEquals("3", first.get(Office.PARAGRAPH_COUNT));
    assertEquals("1742", first.get(Office.CHARACTER_COUNT_WITH_SPACES));
    assertEquals("12", first.get(Office.LINE_COUNT));
    assertEquals("16.0000", first.get(OfficeOpenXMLExtended.APP_VERSION));

    // ---- extracted content of the first entry ----
    String xhtml = first.get(RecursiveParserWrapper.TIKA_CONTENT);
    assertContainsCount("engaging title page", xhtml, 1);
    // the trailing \n distinguishes the body text from the metadata value
    assertContainsCount("This is the Author\n", xhtml, 1);
    assertContainsCount("This is an engaging title page", xhtml, 1);
    assertContains("My Document Title", xhtml);
    assertContains("My Document Subtitle", xhtml);

    assertContains("<p class=\"toc_1\">\t<a href=\"#_Toc467647605\">Heading1\t3</a></p>", xhtml);
    assertContains("2. Really basic 2.", xhtml);
    assertContainsCount("This is a text box", xhtml, 1);

    assertContains("<p>This is a hyperlink: <a href=\"http://tika.apache.org\">tika</a></p>", xhtml);
    assertContains("<p>This is a link to a local file: <a href=\"file:///C:/data/test.png\">test.png</a></p>", xhtml);
    assertContains("<p>This is 10 spaces</p>", xhtml);

    // table caption
    assertContains("<p class=\"table_of_figures\">\t<a href=\"#_Toc467647797\">Table 1: Table1 Caption\t2</a></p>", xhtml);

    // embedded table
    // TODO: figure out how to handle embedded tables in html
    assertContains("<td>Embedded table r1c1", xhtml);

    // text inside a shape
    assertContainsCount("<p>This is text within a shape", xhtml, 1);

    // structured document tags: rich text, simple text, repeating, dropdown, date
    assertContains("<p>Rich text content control", xhtml);
    assertContains("<p>Simple text content control", xhtml);
    assertContains("Repeating content", xhtml);
    // TODO: get options for dropdown
    assertContains("Drop down1", xhtml);
    assertContains("<p>11/16/2016</p>", xhtml);

    // <tab/> must come through as a tab character
    assertContains("tab\ttab", xhtml);

    assertContainsCount("serious word art", xhtml, 1);
    assertContainsCount("Wordartr1c1", xhtml, 1);

    // contents of the glossary document
    assertContains("Click or tap to enter a date", xhtml);

    // bold/italic tags must nest properly and never overlap
    assertContains(
            "<p>The <i>quick</i> brown <b>fox </b>j<i>um</i><b><i>ped</i></b> over",
            xhtml);

    // comments, notes, headers and footers
    assertContains("This is a comment", xhtml);
    assertContains("This is an endnote", xhtml);
    assertContains("this is the footnote", xhtml);
    assertContains("First page header", xhtml);
    assertContains("Even page header", xhtml);
    assertContains("Odd page header", xhtml);
    assertContains("First page footer", xhtml);
    assertContains("Even page footer", xhtml);
    assertContains("Odd page footer", xhtml);

    // with the default configuration, deleted text is not included
    assertNotContained("frog", xhtml);

    assertContains("Mattmann", xhtml);

    // TODO: extract chart text ("This is the chart title")
    // TODO: add chart parsing ("This is the chart")
}
/**
 * Parses testWORD.docx and checks the content type, title and author
 * metadata, and that the title text appears in the XML output.
 *
 * @throws Exception if parsing fails
 */
@Test
public void testWord() throws Exception {
    XMLResult parsed = getXML("testWORD.docx", parseContext);
    Metadata md = parsed.metadata;

    assertEquals(
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            md.get(Metadata.CONTENT_TYPE));
    assertEquals("Sample Word Document", md.get(TikaCoreProperties.TITLE));
    assertEquals("Keith Bennett", md.get(TikaCoreProperties.CREATOR));
    assertEquals("Keith Bennett", md.get(Metadata.AUTHOR));

    assertTrue(parsed.xml.contains("Sample Word Document"));
}
/**
 * Parses footnotes.docx and checks the content type and that the
 * string "snoska" shows up in the XML output.
 *
 * @throws Exception if parsing fails
 */
@Test
public void testWordFootnote() throws Exception {
    XMLResult parsed = getXML("footnotes.docx", parseContext);

    String contentType = parsed.metadata.get(Metadata.CONTENT_TYPE);
    assertEquals(
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            contentType);

    assertTrue(parsed.xml.contains("snoska"));
}
/**
 * Checks the markup generated for testWORD.docx (headings, bold/italic,
 * table, links, styled paragraphs) and the image tags generated for
 * testWORD_3imgs.docx.
 */
@Test
public void testWordHTML() throws Exception {
    XMLResult parsed = getXML("testWORD.docx", parseContext);
    String xhtml = parsed.xml;
    Metadata md = parsed.metadata;

    assertEquals(
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            md.get(Metadata.CONTENT_TYPE));
    assertEquals("Sample Word Document", md.get(TikaCoreProperties.TITLE));
    assertEquals("Keith Bennett", md.get(TikaCoreProperties.CREATOR));
    assertEquals("Keith Bennett", md.get(Metadata.AUTHOR));
    assertTrue(xhtml.contains("Sample Word Document"));

    // custom heading style
    assertTrue(xhtml.contains("<h1 class=\"title\">"));

    // plain headings
    assertContains("<h1>Heading Level 1</h1>", xhtml);
    assertTrue(xhtml.contains("<h2>Heading Level 2</h2>"));

    // heading that carries a bookmark anchor
    // NOTE(review): an older TODO here claimed bookmarks were not yet extracted,
    // but this assertion requires the anchor to be present - confirm and drop the TODO
    assertTrue(xhtml.contains("<h3>Heading Level 3<a name=\"OnLevel3\" /></h3>"));

    // bold and italic runs
    assertTrue(xhtml.contains("<b>BOLD</b>"));
    assertTrue(xhtml.contains("<i>ITALIC</i>"));

    // table markup
    assertTrue(xhtml.contains("<table>"));
    assertTrue(xhtml.contains("<td>"));

    // external link
    assertTrue(xhtml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));

    // link to an anchor inside the document
    assertContains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>", xhtml);

    // paragraph with a non-default style
    assertTrue(xhtml.contains("<p class=\"signature\">This one"));

    // second document: embedded images
    parsed = getXML("testWORD_3imgs.docx", parseContext);
    xhtml = parsed.xml;

    // images are numbered 2-4 (there is no image 1)
    assertTrue("Image not found in:\n" + xhtml,
            xhtml.contains("<img src=\"embedded:image2.png\" alt=\"A description...\" />"));
    assertTrue("Image not found in:\n" + xhtml,
            xhtml.contains("<img src=\"embedded:image3.jpeg\" alt=\"A description...\" />"));
    assertTrue("Image not found in:\n" + xhtml,
            xhtml.contains("<img src=\"embedded:image4.png\" alt=\"A description...\" />"));

    // and the trailing text
    assertTrue(xhtml.contains("<p>The end!</p>"));
}
/**
 * TIKA-692: both test documents contain several character runs inside
 * bold text. The same bold markup is expected for each of them, i.e. the
 * separate runs must not show up as separate bold tags.
 */
@Test
public void testContiguousHTMLFormatting() throws Exception {
    String[] docs = {
            "testWORD_bold_character_runs.docx",
            "testWORD_bold_character_runs2.docx"
    };
    for (String doc : docs) {
        String xhtml = getXML(doc, parseContext).xml;
        boolean contiguous = xhtml.contains("F<b>oob</b>a<b>r</b>");
        assertTrue("Bold text wasn't contiguous: " + xhtml, contiguous);
    }
}
/**
 * Verifies that a picture placed in a docx header is picked up:
 * headerPic.docx must yield two metadata objects and the first one's
 * content must contain an image tag.
 */
@Test
public void testWordPicturesInHeader() throws Exception {
    List<Metadata> parsed = getRecursiveMetadata("headerPic.docx", parseContext);
    assertEquals(2, parsed.size());

    Metadata first = parsed.get(0);
    String xhtml = first.get(RecursiveParserWrapper.TIKA_CONTENT);
    assertEquals(
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            first.get(Metadata.CONTENT_TYPE));

    // the header picture must be referenced from the main content
    assertTrue(xhtml.contains("<img"));
}
/**
 * Checks that pictures referenced from headers, footers, comments,
 * endnotes and footnotes (as well as the body) are reported in the
 * content of testWORD_embedded_pics.docx.
 */
@Test
public void testPicturesInVariousPlaces() throws Exception {
    List<Metadata> parsed = getRecursiveMetadata("testWORD_embedded_pics.docx", parseContext);

    // each embedded resource should be processed only once
    assertEquals(3, parsed.size());

    String xhtml = parsed.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);

    // header1..3 and footer1..3
    for (int n = 1; n <= 3; n++) {
        assertContains("header" + n + "_pic", xhtml);
        assertContains("footer" + n + "_pic", xhtml);
    }

    assertContains("body_pic.jpg", xhtml);
    assertContains("sdt_pic.jpg", xhtml);
    assertContains("deeply_embedded_pic", xhtml);
    // TODO: don't extract this
    assertContains("deleted_pic", xhtml);
    assertContains("footnotes_pic", xhtml);
    assertContains("comments_pic", xhtml);
    assertContains("endnotes_pic", xhtml);
    // "sdt2_pic.jpg" is not asserted: the file name is not stored in image-sdt

    assertContainsCount("<img src=", xhtml, 14);
}
/**
 * TIKA-633: a docx without headers must still produce non-empty output.
 */
@Test
public void testNullHeaders() throws Exception {
    String xhtml = getXML("NullHeader.docx", parseContext).xml;
    boolean nothingExtracted = xhtml.isEmpty();
    assertEquals("Should have found some text", false, nothingExtracted);
}
/**
 * Extracts the text of testWORD_various.docx through an
 * {@link AutoDetectParser} and checks a wide assortment of features:
 * notes, header/footer, text box, formatting words, tables, lists,
 * keyword/subject metadata and non-Latin text.
 */
@Test
public void testVarious() throws Exception {
    Metadata md = new Metadata();
    String text = getText(getResourceAsStream("/test-documents/testWORD_various.docx"),
            new AutoDetectParser(), parseContext, md);

    assertContains("Footnote appears here", text);
    assertContains("This is a footnote.", text);
    assertContains("This is the header text.", text);
    assertContains("This is the footer text.", text);
    assertContains("Here is a text box", text);
    assertContains("Bold", text);
    assertContains("italic", text);
    assertContains("underline", text);
    assertContains("superscript", text);
    assertContains("subscript", text);
    assertContains("Here is a citation:", text);
    assertContains("Figure 1 This is a caption for Figure 1", text);
    assertContains("(Kramer)", text);

    // table cells are compared with every whitespace run collapsed to one space
    String collapsed = text.replaceAll("\\s+", " ");
    assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", collapsed);
    assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", collapsed);

    assertContains("This is a hyperlink", text);

    assertContains("Here is a list:", text);
    for (int n = 1; n <= 3; n++) {
        // only the item text is checked, not the bullet glyph (\u00b7) in front of it
        assertContains("Bullet " + n, text);
    }

    assertContains("Here is a numbered list:", text);
    for (int n = 1; n <= 3; n++) {
        // TODO: OOXMLExtractor fails to number the bullets, so the
        // "n)" prefix is not part of the expectation
        assertContains("Number bullet " + n, text);
    }

    for (int r = 1; r <= 2; r++) {
        for (int c = 1; c <= 3; c++) {
            assertContains("Row " + r + " Col " + c, text);
        }
    }

    assertContains("Keyword1 Keyword2", text);
    assertEquals("Keyword1 Keyword2", md.get(Metadata.KEYWORDS));

    assertContains("Subject is here", text);
    // TODO: Remove subject in Tika 2.0
    assertEquals("Subject is here", md.get(Metadata.SUBJECT));
    assertEquals("Subject is here", md.get(OfficeOpenXMLCore.SUBJECT));

    assertContains("Suddenly some Japanese text:", text);
    // full-width form of "(GHQ)"
    assertContains("\uff08\uff27\uff28\uff31\uff09", text);
    // further Japanese characters
    assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", text);

    assertContains("And then some Gothic text:", text);
    // Gothic letters, written as surrogate pairs
    assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", text);
}
/**
 * Parses testWORD_custom_props.docx directly with an {@link OOXMLParser}
 * (US locale) and checks standard, extended and custom document
 * properties in the resulting metadata.
 *
 * NOTE(review): the context built here carries only a Locale and no
 * OfficeParserConfig, and the resource is loaded via OOXMLParserTest -
 * presumably copied from that test; confirm whether the SAX extractor
 * was meant to be enabled for this case.
 */
@Test
public void testWordCustomProperties() throws Exception {
    Metadata md = new Metadata();

    try (InputStream docStream = OOXMLParserTest.class.getResourceAsStream(
            "/test-documents/testWORD_custom_props.docx")) {
        ParseContext localeContext = new ParseContext();
        localeContext.set(Locale.class, Locale.US);
        // -1: no write limit on the body handler
        ContentHandler bodyHandler = new BodyContentHandler(-1);
        new OOXMLParser().parse(docStream, bodyHandler, md, localeContext);
    }

    assertEquals(
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            md.get(Metadata.CONTENT_TYPE));

    // authorship
    assertEquals("EJ04325S", md.get(TikaCoreProperties.CREATOR));
    assertEquals("Etienne Jouvin", md.get(TikaCoreProperties.MODIFIER));
    assertEquals("Etienne Jouvin", md.get(Metadata.LAST_AUTHOR));

    // dates
    assertEquals("2011-07-29T16:52:00Z", md.get(TikaCoreProperties.CREATED));
    assertEquals("2011-07-29T16:52:00Z", md.get(Metadata.CREATION_DATE));
    assertEquals("2012-01-03T22:14:00Z", md.get(TikaCoreProperties.MODIFIED));
    assertEquals("2012-01-03T22:14:00Z", md.get(Metadata.DATE));

    // application and statistics
    assertEquals("Microsoft Office Word", md.get(Metadata.APPLICATION_NAME));
    assertEquals("Microsoft Office Word", md.get(OfficeOpenXMLExtended.APPLICATION));
    assertEquals("1", md.get(Office.PAGE_COUNT));
    assertEquals("2", md.get(Office.WORD_COUNT));

    // descriptive properties
    assertEquals("My Title", md.get(TikaCoreProperties.TITLE));
    assertEquals("My Keyword", md.get(TikaCoreProperties.KEYWORDS));
    assertEquals("Normal.dotm", md.get(Metadata.TEMPLATE));
    assertEquals("Normal.dotm", md.get(OfficeOpenXMLExtended.TEMPLATE));
    // TODO: Remove subject in Tika 2.0
    assertEquals("My subject", md.get(Metadata.SUBJECT));
    assertEquals("My subject", md.get(OfficeOpenXMLCore.SUBJECT));
    assertEquals("EDF-DIT", md.get(TikaCoreProperties.PUBLISHER));

    // custom properties
    assertEquals("true", md.get("custom:myCustomBoolean"));
    assertEquals("3", md.get("custom:myCustomNumber"));
    assertEquals("MyStringValue", md.get("custom:MyCustomString"));
    assertEquals("2010-12-30T23:00:00Z", md.get("custom:MyCustomDate"));
    assertEquals("2010-12-29T22:00:00Z", md.get("custom:myCustomSecondDate"));
}
/**
 * TIKA-989: the placeholders for the two embedded objects must be
 * present and interleaved with the surrounding text in document order.
 */
@Test
public void testEmbeddedPDF() throws Exception {
    String xhtml = getXML("testWORD_embedded_pdf.docx", parseContext).xml;

    // fragments in the order they are expected to occur
    String[] fragments = {
            "Here is the pdf file:",
            "<div class=\"embedded\" id=\"rId5\" />",
            "Bye Bye",
            "<div class=\"embedded\" id=\"rId6\" />",
            "Bye for real."
    };

    int[] offsets = new int[fragments.length];
    for (int n = 0; n < fragments.length; n++) {
        offsets[n] = xhtml.indexOf(fragments[n]);
    }

    // every fragment must be found ...
    for (int offset : offsets) {
        assertTrue(offset != -1);
    }
    // ... and each one must start before its successor
    for (int n = 1; n < offsets.length; n++) {
        assertTrue(offsets[n - 1] < offsets[n]);
    }
}
/**
 * TIKA-1006: a document with a null style must still yield its text.
 * The shared {@code parseContext} is passed so that the configuration
 * built in {@code setUp()} (SAX docx extractor enabled) is actually used;
 * previously no context was supplied to {@code getXML}.
 */
@Test
public void testWordNullStyle() throws Exception {
    String xml = getXML("testWORD_null_style.docx", parseContext).xml;
    assertContains("Test av styrt dokument", xml);
}
/**
 * TIKA-1044: text that has no formatting or style applied to it
 * must be extracted.
 */
@Test
public void testNoFormat() throws Exception {
    String xhtml = getXML("testWORD_no_format.docx", parseContext).xml;
    assertContains("This is a piece of text that causes an exception", xhtml);
}
/**
 * Counterpart of the default behaviour checked in {@code basicTest}: with
 * deleted content and move-from content both switched on, the deleted
 * word "frog" is present and "Second paragraph" is found twice.
 */
@Test
public void testSkipDeleted() throws Exception {
    OfficeParserConfig withDeleted = new OfficeParserConfig();
    withDeleted.setUseSAXDocxExtractor(true);
    withDeleted.setIncludeDeletedContent(true);
    withDeleted.setIncludeMoveFromContent(true);

    ParseContext deletedContext = new ParseContext();
    deletedContext.set(OfficeParserConfig.class, withDeleted);

    String xhtml = getXML("testWORD_2006ml.docx", deletedContext).xml;
    assertContains("frog", xhtml);
    assertContainsCount("Second paragraph", xhtml, 2);
}
/**
 * TIKA-1005: text placed in text boxes (body, header and footer) must be
 * extracted alongside the ordinary body text.
 */
@Test
public void testTextInsideTextBox() throws Exception {
    String xhtml = getXML("testWORD_text_box.docx", parseContext).xml;

    String[] expected = {
            "This text is directly in the body of the document.",
            "This text is inside of a text box in the body of the document.",
            "This text is inside of a text box in the header of the document.",
            "This text is inside of a text box in the footer of the document."
    };
    for (String sentence : expected) {
        assertContains(sentence, xhtml);
    }
}
/**
 * TIKA-2346: with shape-based content switched off, the ordinary body
 * text is still extracted but none of the text-box sentences are.
 */
@Test
public void testTurningOffTextBoxExtraction() throws Exception {
    OfficeParserConfig noShapes = new OfficeParserConfig();
    noShapes.setUseSAXDocxExtractor(true);
    noShapes.setIncludeShapeBasedContent(false);

    ParseContext noShapesContext = new ParseContext();
    noShapesContext.set(OfficeParserConfig.class, noShapes);

    String xhtml = getXML("testWORD_text_box.docx", noShapesContext).xml;
    assertContains("This text is directly in the body of the document.", xhtml);

    String[] textBoxSentences = {
            "This text is inside of a text box in the body of the document.",
            "This text is inside of a text box in the header of the document.",
            "This text is inside of a text box in the footer of the document."
    };
    for (String sentence : textBoxSentences) {
        assertNotContained(sentence, xhtml);
    }
}
/**
 * Regression test for the missing text reported in
 * <a href="https://issues.apache.org/jira/browse/TIKA-1130">TIKA-1130</a>
 * and TIKA-1317.
 */
@Test
public void testMissingText() throws Exception {
    XMLResult parsed = getXML("testWORD_missing_text.docx", parseContext);

    assertEquals(
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            parsed.metadata.get(Metadata.CONTENT_TYPE));

    String xhtml = parsed.xml;
    assertContains("BigCompany", xhtml);
    assertContains("Seasoned", xhtml);
    assertContains("Rich_text_in_cell", xhtml);
}
/**
 * TIKA-792 (with room for future missing-bean tests): parsing the listed
 * files must not write anything to stderr. Per the original note, POI
 * prints a stack trace to stderr when an OOXML bean is missing, so stderr
 * is captured around each parse and expected to stay empty.
 */
@Test
public void testWordMissingOOXMLBeans() throws Exception {
    String[] fileNames = new String[]{
            "testWORD_missing_ooxml_bean1.docx",//TIKA-792
    };
    PrintStream origErr = System.err;
    for (String fileName : fileNames) {
        // capture stderr for the duration of this parse only
        ByteArrayOutputStream errContent = new ByteArrayOutputStream();
        PrintStream capture = new PrintStream(errContent, true, UTF_8.name());
        System.setErr(capture);
        try {
            getXML(fileName, parseContext);
        } finally {
            // always hand stderr back, even if parsing throws; otherwise
            // every later test in the JVM would run with stderr redirected
            System.setErr(origErr);
            capture.flush();
        }
        String err = errContent.toString(UTF_8.name());
        // assertEquals shows the captured output when the check fails
        assertEquals("Unexpected stderr output while parsing " + fileName, "", err);
    }
}
/**
 * The thumbnail of testDOCX_Thumbnail.docx must be reported as an
 * embedded placeholder that comes after the document text.
 */
@Test
public void testDOCXThumbnail() throws Exception {
    String xhtml = getXML("testDOCX_Thumbnail.docx", parseContext).xml;

    int textAt = xhtml.indexOf("This file contains a thumbnail");
    int thumbnailAt = xhtml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.emf\" />");

    assertTrue(textAt != -1);
    assertTrue(thumbnailAt != -1);
    assertTrue(textAt < thumbnailAt);
}
/**
 * Password-protected documents: with a {@link PasswordProvider} that
 * returns "tika" the expected text is extracted; with the shared
 * {@code parseContext} (no password provider registered) parsing must
 * fail with an {@link EncryptedDocumentException}.
 */
@Test
public void testEncrypted() throws Exception {
    // file name -> text expected once the document is decrypted
    Map<String, String> tests = new HashMap<>();
    tests.put("testWORD_protected_passtika.docx",
            "This is an encrypted Word 2007 File");

    PasswordProvider passwordProvider = new PasswordProvider() {
        @Override
        public String getPassword(Metadata metadata) {
            return "tika";
        }
    };

    OfficeParserConfig opc = new OfficeParserConfig();
    opc.setUseSAXDocxExtractor(true);
    ParseContext passwordContext = new ParseContext();
    passwordContext.set(PasswordProvider.class, passwordProvider);
    passwordContext.set(OfficeParserConfig.class, opc);

    for (Map.Entry<String, String> e : tests.entrySet()) {
        assertContains(e.getValue(), getXML(e.getKey(), passwordContext).xml);
    }

    //now try with no password
    for (Map.Entry<String, String> e : tests.entrySet()) {
        try {
            getXML(e.getKey(), parseContext);
            fail("Expected an EncryptedDocumentException for " + e.getKey()
                    + " when no password is supplied");
        } catch (EncryptedDocumentException expected) {
            // this is the required outcome without a password
        }
    }
}
/**
 * Checks that list numbering labels (decimal, alphabetic, roman,
 * multi-level and custom formats) are rendered in front of the paragraph
 * text of testWORD_numbered_list.docx.
 */
@Test
public void testDOCXParagraphNumbering() throws Exception {
    String xhtml = getXML("testWORD_numbered_list.docx", parseContext).xml;

    // SAX parser is getting this. DOM parser is not!
    assertContains("add a list here", xhtml);

    String[] numberedFragments = {
            "1) This",
            "a) Is",
            "i) A multi",
            "ii) Level",
            "1. Within cell 1",
            "b. Cell b",
            "iii) List",
            "2) foo",
            "ii) baz",
            "ii) foo",
            "II. bar",
            "6. six",
            "7. seven",
            "a. seven a",
            "e. seven e",
            "2. A ii 2",
            "3. page break list 3",
            "Some-1-CrazyFormat Greek numbering with crazy format - alpha",
            "1.1.1. 1.1.1",
            "1.1. 1.2->1.1 //set the value"
    };
    for (String fragment : numberedFragments) {
        assertContains(fragment, xhtml);
    }
}
/**
 * Checks the numbering labels produced for
 * testWORD_override_list_numbering.docx, including level overrides,
 * restarts and skipped levels.
 *
 * NOTE(review): unlike most tests in this class, getXML is called without
 * {@code parseContext} here - confirm whether the SAX extractor
 * configuration was meant to be applied.
 */
@Test
public void testDOCXOverrideParagraphNumbering() throws Exception {
    String xhtml = getXML("testWORD_override_list_numbering.docx").xml;

    String[] expected = {
            // Test 1
            "<p>1.1.1.1...1 1.1.1.1...1</p>",
            "1st.2.3someText 1st.2.3someText",
            "1st.2.2someOtherText.1 1st.2.2someOtherText.1",
            "5th 5th",

            // Test 2
            "1.a.I 1.a.I",
            // no reset, because level 2 is not sufficient to reset
            "<p>1.b.III 1.b.III</p>",
            // restarted because level 0 was incremented to 2
            "2.a.I 2.a.I",
            // handling of a skipped level
            "<p>2.b 2.b</p>",

            // Test 3
            "(1)) (1))",
            // level 1 starts at 17
            "2.17 2.17",
            // isLegal turns everything into decimal
            "2.18.2.1 2.18.2.1",
            "<p>2 2</p>",

            // Test 4
            "<p>1 1</p>",
            "<p>A A</p>",
            "<p>B B</p>",
            // this one exercises overrides
            "<p>C C</p>",
            "<p>4 4</p>",

            // Test 5
            ">00 00",
            ">01 01",
            ">01. 01.",
            ">01..1 01..1",
            ">02 02"
    };
    for (String fragment : expected) {
        assertContains(fragment, xhtml);
    }
}
/**
 * A document with several authors and managers must expose them as
 * multi-valued metadata: three creators (the second being "author2")
 * and two managers.
 */
@Test
public void testMultiAuthorsManagers() throws Exception {
    Metadata md = getXML("testWORD_multi_authors.docx", parseContext).metadata;

    String[] creators = md.getValues(TikaCoreProperties.CREATOR);
    assertEquals(3, creators.length);
    assertEquals("author2", creators[1]);

    String[] mgrs = md.getValues(OfficeOpenXMLExtended.MANAGER);
    assertEquals(2, mgrs.length);
    assertEquals("manager1", mgrs[0]);
    assertEquals("manager2", mgrs[1]);
}
/**
 * The metadata object at index 11 of test_recursive_embedded.docx must
 * list both original Windows paths of embed1.zip among its
 * ORIGINAL_RESOURCE_NAME values.
 */
@Test
public void testOrigSourcePath() throws Exception {
    List<Metadata> parsed = getRecursiveMetadata("test_recursive_embedded.docx", parseContext);
    Metadata embeddedZip = parsed.get(11);

    assertContains("C:\\Users\\tallison\\AppData\\Local\\Temp\\embed1.zip",
            Arrays.asList(embeddedZip.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
    assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder (2)\\embed1.zip",
            Arrays.asList(embeddedZip.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
}
/**
 * TIKA-1255: bold runs inside a hyperlink must be nested within the
 * anchor element. Whitespace runs are collapsed before comparing.
 */
@Test
public void testBoldHyperlink() throws Exception {
    String collapsed = getXML("testWORD_boldHyperlink.docx", parseContext).xml
            .replaceAll("\\s+", " ");

    assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", collapsed);
    assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold", collapsed);
}
/**
 * TIKA-2055: testWORD_totalTimeOutOfRange.docx must still parse and
 * yield its text.
 */
@Test
public void testLongForIntExceptionInSummaryDetails() throws Exception {
    String xhtml = getXML("testWORD_totalTimeOutOfRange.docx", parseContext).xml;
    assertContains("bold", xhtml);
}
/**
 * Macro extraction from a .docm file, checked in three configurations:
 * <ol>
 * <li>default settings of the shared {@code parseContext}: no entry of
 * type text/x-vbasic may be produced;</li>
 * <li>macro extraction enabled programmatically via
 * {@link OfficeParserConfig};</li>
 * <li>a parser built from tika-config-sax-macros.xml.</li>
 * </ol>
 * In every configuration the event-based word extractor must be listed
 * in X-Parsed-By.
 */
@Test
public void testMacrosInDocm() throws Exception {
    Metadata parsedBy = new Metadata();
    parsedBy.add("X-Parsed-By",
            "org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor");

    //test default is "don't extract macros"
    List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.docm", parseContext);
    for (Metadata metadata : metadataList) {
        // constant-first equals: an entry without a content type must not
        // abort the test with a NullPointerException
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }
    assertContainsAtLeast(parsedBy, metadataList);

    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    officeParserConfig.setUseSAXDocxExtractor(true);
    context.set(OfficeParserConfig.class, officeParserConfig);

    metadataList = getRecursiveMetadata("testWORD_macros.docm", context);

    //check that content came out of the .docm file
    assertContains("quick", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
    assertContainsAtLeast(parsedBy, metadataList);

    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
            TikaCoreProperties.EmbeddedResourceType.MACRO.toString());

    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);

    //test configuring via config file
    // try-with-resources so the config stream is closed once it has been read
    try (InputStream configStream =
                 this.getClass().getResourceAsStream("tika-config-sax-macros.xml")) {
        if (configStream == null) {
            fail("Couldn't find tika-config-sax-macros.xml on the classpath");
        }
        TikaConfig tikaConfig = new TikaConfig(configStream);
        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
        metadataList = getRecursiveMetadata("testWORD_macros.docm", parser);
    }
    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);
}
/**
 * testWORD_embeded.docx: the first entry's content must carry the image
 * and embedded-object markup, and the recursive parse must return 16
 * metadata objects in total.
 */
@Test
public void testEmbedded() throws Exception {
    List<Metadata> parsed = getRecursiveMetadata("testWORD_embeded.docx", parseContext);
    String xhtml = parsed.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);

    // the markup for the picture and the embedded object must be there
    assertContains("<img src=\"embedded:image2.jpeg\" alt=\"A description...\" />", xhtml);
    assertContains("<div class=\"embedded\" id=\"rId8\" />", xhtml);

    assertEquals(16, parsed.size());
}
/**
 * Checks text extraction from testWORD_template.docx and from the
 * template file testWORD_template.dotx, including text that comes from
 * the glossary document.
 */
@Test
public void testDotx() throws Exception {
    List<Metadata> parsed = getRecursiveMetadata("testWORD_template.docx", parseContext);
    String xhtml = parsed.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);

    assertContains("Metallica", xhtml);
    assertContains("Hetfield", xhtml);
    assertContains("one eye open", xhtml);
    assertContains("Getting the perfect", xhtml);
    // this one comes from the glossary document
    assertContains("table rows", xhtml);

    parsed = getRecursiveMetadata("testWORD_template.dotx", parseContext);
    xhtml = parsed.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
    // also from the glossary document
    assertContainsCount("ready to write", xhtml, 2);
}
}