/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.iwork;
import static org.apache.tika.TikaTest.assertContains;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Before;
import org.junit.Test;
import org.xml.sax.ContentHandler;
/**
* Tests if the IWork parser parses the content and metadata properly of the supported formats.
*/
public class IWorkParserTest {
private IWorkPackageParser iWorkParser;
private ParseContext parseContext;
@Before
public void setUp() {
iWorkParser = new IWorkPackageParser();
parseContext = new ParseContext();
parseContext.set(Parser.class, new AutoDetectParser());
}
/**
* Check the given InputStream is not closed by the Parser (TIKA-1117).
*
* @throws Exception
*/
@Test
public void testStreamNotClosed() throws Exception {
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testKeynote.key");
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, metadata, parseContext);
input.read(); // Will throw an Exception if the stream was already closed.
}
@Test
public void testParseKeynote() throws Exception {
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testKeynote.key");
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, metadata, parseContext);
// Make sure enough keys came through
// (Exact numbers will vary based on composites)
assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 6);
List<String> metadataKeys = Arrays.asList(metadata.names());
assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.SLIDE_COUNT.getName()));
// assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Office.SLIDE_COUNT.getName()));
assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
// Check the metadata values
assertEquals("application/vnd.apple.keynote", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("3", metadata.get(Metadata.SLIDE_COUNT));
assertEquals("1024", metadata.get(KeynoteContentHandler.PRESENTATION_WIDTH));
assertEquals("768", metadata.get(KeynoteContentHandler.PRESENTATION_HEIGHT));
assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));
String content = handler.toString();
assertContains("A sample presentation", content);
assertContains("For the Apache Tika project", content);
assertContains("Slide 1", content);
assertContains("Some random text for the sake of testability.", content);
assertContains("A nice comment", content);
assertContains("A nice note", content);
// test table data
assertContains("Cell one", content);
assertContains("Cell two", content);
assertContains("Cell three", content);
assertContains("Cell four", content);
assertContains("Cell 5", content);
assertContains("Cell six", content);
assertContains("7", content);
assertContains("Cell eight", content);
assertContains("5/5/1985", content);
}
// TIKA-910
@Test
public void testKeynoteTextBoxes() throws Exception {
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testTextBoxes.key");
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, metadata, parseContext);
String content = handler.toString();
assertTrue(content.replaceAll("\\s+", " ").contains("text1 text2 text3"));
}
// TIKA-910
@Test
public void testKeynoteBulletPoints() throws Exception {
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testBulletPoints.key");
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, metadata, parseContext);
String content = handler.toString();
assertTrue(content.replaceAll("\\s+", " ").contains("bullet point 1 bullet point 2 bullet point 3"));
}
// TIKA-923
@Test
public void testKeynoteTables() throws Exception {
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testTables.key");
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, metadata, parseContext);
String content = handler.toString();
content = content.replaceAll("\\s+", " ");
assertContains("row 1 row 2 row 3", content);
}
// TIKA-923
@Test
public void testKeynoteMasterSlideTable() throws Exception {
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testMasterSlideTable.key");
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, metadata, parseContext);
String content = handler.toString();
content = content.replaceAll("\\s+", " ");
assertContains("master row 1", content);
assertContains("master row 2", content);
assertContains("master row 3", content);
}
@Test
public void testParsePages() throws Exception {
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPages.pages");
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, metadata, parseContext);
// Make sure enough keys came through
// (Exact numbers will vary based on composites)
assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 50);
List<String> metadataKeys = Arrays.asList(metadata.names());
assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.PAGE_COUNT.getName()));
assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.LAST_MODIFIED.getName()));
assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.LANGUAGE));
// Check the metadata values
assertEquals("application/vnd.apple.pages", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));
assertEquals("2010-05-09T21:34:38+0200", metadata.get(Metadata.CREATION_DATE));
assertEquals("2010-05-09T23:50:36+0200", metadata.get(Metadata.LAST_MODIFIED));
assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
String content = handler.toString();
// text on page 1
assertContains("Sample pages document", content);
assertContains("Some plain text to parse.", content);
assertContains("Cell one", content);
assertContains("Cell two", content);
assertContains("Cell three", content);
assertContains("Cell four", content);
assertContains("Cell five", content);
assertContains("Cell six", content);
assertContains("Cell seven", content);
assertContains("Cell eight", content);
assertContains("Cell nine", content);
assertContains("Both Pages 1.x and Keynote 2.x", content); // ...
// text on page 2
assertContains("A second page....", content);
assertContains("Extensible Markup Language", content); // ...
}
// TIKA-904
@Test
public void testPagesLayoutMode() throws Exception {
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesLayout.pages");
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, metadata, parseContext);
String content = handler.toString();
assertContains("text box 1 - here is some text", content);
assertContains("created in a text box in layout mode", content);
assertContains("text box 2 - more text!@!$@#", content);
assertContains("this is text inside of a green box", content);
assertContains("text inside of a green circle", content);
}
@Test
public void testParseNumbers() throws Exception {
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testNumbers.numbers");
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, metadata, parseContext);
// Make sure enough keys came through
// (Exact numbers will vary based on composites)
assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 8);
List<String> metadataKeys = Arrays.asList(metadata.names());
assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.PAGE_COUNT.getName()));
assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.COMMENTS.getName()));
assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.TITLE));
assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
// Check the metadata values
assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
assertEquals("Tika User", metadata.get(TikaCoreProperties.CREATOR));
assertEquals("Account checking", metadata.get(TikaCoreProperties.TITLE));
assertEquals("a comment", metadata.get(TikaCoreProperties.COMMENTS));
String content = handler.toString();
assertContains("Category", content);
assertContains("Home", content);
assertContains("-226", content);
assertContains("-137.5", content);
assertContains("Checking Account: 300545668", content);
assertContains("4650", content);
assertContains("Credit Card", content);
assertContains("Groceries", content);
assertContains("-210", content);
assertContains("Food", content);
assertContains("Try adding your own account transactions to this table.", content);
}
// TIKA- 924
@Test
public void testParseNumbersTableNames() throws Exception {
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/tableNames.numbers");
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, metadata, parseContext);
String content = handler.toString();
assertContains("This is the main table", content);
}
@Test
public void testParseNumbersTableHeaders() throws Exception {
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/tableHeaders.numbers");
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, metadata, parseContext);
String content = handler.toString();
for(int header=1;header<=5;header++) {
assertContains("header" + header, content);
}
for(int row=1;row<=3;row++) {
assertContains("row" + row, content);
}
}
/**
* We don't currently support password protected Pages files, as
* we don't know how the encryption works (it's not regular Zip
* Encryption). See TIKA-903 for details
*/
@Test
public void testParsePagesPasswordProtected() throws Exception {
// Document password is "tika", but we can't use that yet...
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesPwdProtected.pages");
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, metadata, parseContext);
// Content will be empty
String content = handler.toString();
assertEquals("", content);
// Will have been identified as encrypted
assertEquals("application/x-tika-iworks-protected", metadata.get(Metadata.CONTENT_TYPE));
}
/**
* Check we get headers, footers and footnotes from Pages
*/
@Test
public void testParsePagesHeadersFootersFootnotes() throws Exception {
String footnote = "Footnote: Do a lot of people really use iWork?!?!";
String header = "THIS IS SOME HEADER TEXT";
String footer = "THIS IS SOME FOOTER TEXT\t1";
String footer2 = "THIS IS SOME FOOTER TEXT\t2";
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersFootnotes.pages");
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, metadata, parseContext);
String contents = handler.toString();
// Check regular text
assertContains("Both Pages 1.x", contents); // P1
assertContains("understanding the Pages document", contents); // P1
assertContains("should be page 2", contents); // P2
// Check for headers, footers and footnotes
assertContains(header, contents);
assertContains(footer, contents);
assertContains(footer2, contents);
assertContains(footnote, contents);
}
/**
* Check we get upper-case Roman numerals within the footer for AutoPageNumber.
*/
@Test
public void testParsePagesHeadersFootersRomanUpper() throws Exception {
String header = "THIS IS SOME HEADER TEXT";
String footer = "THIS IS SOME FOOTER TEXT\tI";
String footer2 = "THIS IS SOME FOOTER TEXT\tII";
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersRomanUpper.pages");
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, new Metadata(), parseContext);
String contents = handler.toString();
// Check for headers, footers and footnotes
assertContains(header, contents);
assertContains(footer, contents);
assertContains(footer2, contents);
}
/**
* Check we get lower-case Roman numerals within the footer for AutoPageNumber.
*/
@Test
public void testParsePagesHeadersFootersRomanLower() throws Exception {
String header = "THIS IS SOME HEADER TEXT";
String footer = "THIS IS SOME FOOTER TEXT\ti";
String footer2 = "THIS IS SOME FOOTER TEXT\tii";
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersRomanLower.pages");
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, new Metadata(), parseContext);
String contents = handler.toString();
// Check for headers, footers and footnotes
assertContains(header, contents);
assertContains(footer, contents);
assertContains(footer2, contents);
}
/**
* Check we get upper-case alpha-numeric letters within the footer for AutoPageNumber.
*/
@Test
public void testParsePagesHeadersAlphaUpper() throws Exception {
String header = "THIS IS SOME HEADER TEXT\tA";
String footer = "THIS IS SOME FOOTER TEXT\tA";
String footer2 = "THIS IS SOME FOOTER TEXT\tB";
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersAlphaUpper.pages");
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, new Metadata(), parseContext);
String contents = handler.toString();
// Check for headers, footers and footnotes
assertContains(header, contents);
assertContains(footer, contents);
assertContains(footer2, contents);
}
/**
* Check we get lower-case alpha-numeric letters within the footer for AutoPageNumber.
*/
@Test
public void testParsePagesHeadersAlphaLower() throws Exception {
String header = "THIS IS SOME HEADER TEXT";
String footer = "THIS IS SOME FOOTER TEXT\ta";
String footer2 = "THIS IS SOME FOOTER TEXT\tb";
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersAlphaLower.pages");
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, new Metadata(), parseContext);
String contents = handler.toString();
// Check for headers, footers and footnotes
assertContains(header, contents);
assertContains(footer, contents);
assertContains(footer2, contents);
}
/**
* Check we get annotations (eg comments) from Pages
*/
@Test
public void testParsePagesAnnotations() throws Exception {
String commentA = "comment about the APXL file";
String commentB = "comment about UIMA";
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesComments.pages");
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, metadata, parseContext);
String contents = handler.toString();
// Check regular text
assertContains("Both Pages 1.x", contents); // P1
assertContains("understanding the Pages document", contents); // P1
assertContains("should be page 2", contents); // P2
// Check for comments
assertContains(commentA, contents);
assertContains(commentB, contents);
}
// TIKA-918
@Test
public void testNumbersExtractChartNames() throws Exception {
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testNumbersCharts.numbers");
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
iWorkParser.parse(input, handler, metadata, parseContext);
String contents = handler.toString();
assertContains("Expenditure by Category", contents);
assertContains("Currency Chart name", contents);
assertContains("Chart 2", contents);
}
}