/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;

import java.io.InputStream;
import java.util.List;

import org.apache.tika.TikaTest;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.opendocument.OpenOfficeParser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
public class ODFParserTest extends TikaTest {
/**
 * Supplies the parser implementations that the shared tests loop over,
 * so that, for now, both the old and the new parser get exercised.
 *
 * @return a newly created array holding one instance of each parser
 */
private Parser[] getParsers() {
    Parser openDocument = new OpenDocumentParser();
    Parser openOffice = new OpenOfficeParser();
    return new Parser[] {openDocument, openOffice};
}
/**
 * Runs every parser from {@link #getParsers()} over the
 * {@code testODFwithOOo3.odt} sample and checks the reported content
 * type plus several fragments of the extracted body text.
 */
@Test
public void testOO3() throws Exception {
    final String resource = "/test-documents/testODFwithOOo3.odt";
    // Fragments that must all appear in the extracted text, checked in this order
    final String[] expectedFragments = {
        "Tika is part of the Lucene project.",
        "Solr",
        "one embedded",
        "Rectangle Title",
        "a blue background and dark border"
    };
    for (Parser candidate : getParsers()) {
        try (InputStream stream = ODFParserTest.class.getResourceAsStream(resource)) {
            Metadata meta = new Metadata();
            ContentHandler body = new BodyContentHandler();
            candidate.parse(stream, body, meta, new ParseContext());

            String detectedType = meta.get(Metadata.CONTENT_TYPE);
            assertEquals("application/vnd.oasis.opendocument.text", detectedType);

            String text = body.toString();
            for (String fragment : expectedFragments) {
                assertContains(fragment, text);
            }
        }
    }
}
/**
 * Runs every parser from {@link #getParsers()} over the
 * {@code testOpenOffice2.odt} sample and verifies the content type,
 * language, edit time, generator, date metadata, the document
 * statistics under each of the three key-naming schemes, the absence of
 * values for the custom metadata tags, and the extracted text.
 */
@Test
public void testOO2() throws Exception {
    for (Parser parser : getParsers()) {
        try (InputStream input = ODFParserTest.class.getResourceAsStream(
                "/test-documents/testOpenOffice2.odt")) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            parser.parse(input, handler, metadata, new ParseContext());

            assertEquals(
                    "application/vnd.oasis.opendocument.text",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("en-US", metadata.get(Metadata.LANGUAGE));
            assertEquals("PT1M7S", metadata.get(Metadata.EDIT_TIME));
            assertEquals(
                    "NeoOffice/2.2$Unix OpenOffice.org_project/680m18$Build-9161",
                    metadata.get("generator"));

            // Check date metadata, both old-style and new-style
            assertEquals("2007-09-14T11:07:10", metadata.get(TikaCoreProperties.MODIFIED));
            assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.MODIFIED));
            assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.DATE));
            assertEquals("2007-09-14T11:06:08", metadata.get(TikaCoreProperties.CREATED));
            assertEquals("2007-09-14T11:06:08", metadata.get(Metadata.CREATION_DATE));

            // Check the document statistics
            assertEquals("1", metadata.get(Office.PAGE_COUNT));
            assertEquals("1", metadata.get(Office.PARAGRAPH_COUNT));
            assertEquals("14", metadata.get(Office.WORD_COUNT));
            assertEquals("78", metadata.get(Office.CHARACTER_COUNT));
            assertEquals("0", metadata.get(Office.TABLE_COUNT));
            assertEquals("0", metadata.get(Office.OBJECT_COUNT));
            assertEquals("0", metadata.get(Office.IMAGE_COUNT));

            // Check the Tika-1.0 style document statistics
            assertEquals("1", metadata.get(Metadata.PAGE_COUNT));
            assertEquals("1", metadata.get(Metadata.PARAGRAPH_COUNT));
            assertEquals("14", metadata.get(Metadata.WORD_COUNT));
            assertEquals("78", metadata.get(Metadata.CHARACTER_COUNT));
            assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
            assertEquals("0", metadata.get(Metadata.OBJECT_COUNT));
            assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));

            // Check the very old style statistics (these will be removed shortly)
            assertEquals("0", metadata.get("nbTab"));
            assertEquals("0", metadata.get("nbObject"));
            assertEquals("0", metadata.get("nbImg"));
            assertEquals("1", metadata.get("nbPage"));
            assertEquals("1", metadata.get("nbPara"));
            assertEquals("14", metadata.get("nbWord"));
            assertEquals("78", metadata.get("nbCharacter"));

            // Custom metadata tags present but without values
            assertNull(metadata.get("custom:Info 1"));
            assertNull(metadata.get("custom:Info 2"));
            assertNull(metadata.get("custom:Info 3"));
            assertNull(metadata.get("custom:Info 4"));

            // assertContains (rather than assertTrue on String.contains) matches the
            // other tests in this class
            String content = handler.toString();
            assertContains(
                    "This is a sample Open Office document,"
                    + " written in NeoOffice 2.2.1 for the Mac.",
                    content);
        }
    }
}
/**
 * Similar to {@link #testOO2()}, but using a different
 * OO2 file with different metadata in it.
 *
 * <p>Parses the {@code testOpenOffice2.odf} formula sample with
 * {@code OpenDocumentParser} and checks the descriptive metadata, the
 * user-defined metadata, that no statistics are reported, and that no
 * body text is produced.
 */
@Test
public void testOO2Metadata() throws Exception {
    try (InputStream input = ODFParserTest.class.getResourceAsStream(
            "/test-documents/testOpenOffice2.odf")) {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        // Pass an explicit ParseContext, as the other OpenDocumentParser tests
        // in this class do, instead of relying on the three-argument overload
        new OpenDocumentParser().parse(input, handler, metadata, new ParseContext());

        assertEquals(
                "application/vnd.oasis.opendocument.formula",
                metadata.get(Metadata.CONTENT_TYPE));
        assertNull(metadata.get(TikaCoreProperties.MODIFIED));
        assertEquals("2006-01-27T11:55:22", metadata.get(Metadata.CREATION_DATE));
        assertEquals("The quick brown fox jumps over the lazy dog",
                metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Gym class featuring a brown fox and lazy dog",
                metadata.get(TikaCoreProperties.DESCRIPTION));
        assertEquals("Gym class featuring a brown fox and lazy dog",
                metadata.get(OfficeOpenXMLCore.SUBJECT));
        assertEquals("Gym class featuring a brown fox and lazy dog",
                metadata.get(Metadata.SUBJECT));
        assertEquals("PT0S", metadata.get(Metadata.EDIT_TIME));
        assertEquals("1", metadata.get("editing-cycles"));
        assertEquals(
                "OpenOffice.org/2.2$Win32 OpenOffice.org_project/680m14$Build-9134",
                metadata.get("generator"));
        assertEquals("Pangram, fox, dog", metadata.get(Metadata.KEYWORDS));

        // User defined metadata
        assertEquals("Text 1", metadata.get("custom:Info 1"));
        assertEquals("2", metadata.get("custom:Info 2"));
        assertEquals("false", metadata.get("custom:Info 3"));
        assertEquals("true", metadata.get("custom:Info 4"));

        // No statistics present
        assertNull(metadata.get(Metadata.PAGE_COUNT));
        assertNull(metadata.get(Metadata.PARAGRAPH_COUNT));
        assertNull(metadata.get(Metadata.WORD_COUNT));
        assertNull(metadata.get(Metadata.CHARACTER_COUNT));
        assertNull(metadata.get(Metadata.TABLE_COUNT));
        assertNull(metadata.get(Metadata.OBJECT_COUNT));
        assertNull(metadata.get(Metadata.IMAGE_COUNT));
        assertNull(metadata.get("nbTab"));
        assertNull(metadata.get("nbObject"));
        assertNull(metadata.get("nbImg"));
        assertNull(metadata.get("nbPage"));
        assertNull(metadata.get("nbPara"));
        assertNull(metadata.get("nbWord"));
        assertNull(metadata.get("nbCharacter"));

        // Note - contents of maths files not currently supported
        String content = handler.toString().trim();
        assertEquals("", content);
    }
}
/**
 * Similar to {@link #testOO2()}, but using an OO3 file.
 *
 * <p>Parses the {@code testODFwithOOo3.odt} sample with
 * {@code OpenDocumentParser} and checks the descriptive metadata, the
 * user-defined metadata, the document statistics under each of the
 * three key-naming schemes, and the start of the extracted text.
 */
@Test
public void testOO3Metadata() throws Exception {
    try (InputStream input = ODFParserTest.class.getResourceAsStream(
            "/test-documents/testODFwithOOo3.odt")) {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        // Pass an explicit ParseContext, as the other OpenDocumentParser tests
        // in this class do, instead of relying on the three-argument overload
        new OpenDocumentParser().parse(input, handler, metadata, new ParseContext());

        assertEquals(
                "application/vnd.oasis.opendocument.text",
                metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("2009-10-05T21:22:38", metadata.get(TikaCoreProperties.MODIFIED));
        assertEquals("2009-10-05T19:04:01", metadata.get(TikaCoreProperties.CREATED));
        assertEquals("2009-10-05T19:04:01", metadata.get(Metadata.CREATION_DATE));
        assertEquals("Apache Tika", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Test document", metadata.get(OfficeOpenXMLCore.SUBJECT));
        assertEquals("Test document", metadata.get(Metadata.SUBJECT));
        assertEquals("A rather complex document", metadata.get(TikaCoreProperties.DESCRIPTION));
        assertEquals("Bart Hanssens", metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("Bart Hanssens", metadata.get("initial-creator"));
        assertEquals("2", metadata.get("editing-cycles"));
        assertEquals("PT02H03M24S", metadata.get(Metadata.EDIT_TIME));
        assertEquals(
                "OpenOffice.org/3.1$Unix OpenOffice.org_project/310m19$Build-9420",
                metadata.get("generator"));
        assertEquals("Apache, Lucene, Tika", metadata.get(Metadata.KEYWORDS));

        // User defined metadata
        assertEquals("Bart Hanssens", metadata.get("custom:Editor"));
        assertNull(metadata.get("custom:Info 2"));
        assertNull(metadata.get("custom:Info 3"));
        assertNull(metadata.get("custom:Info 4"));

        // Check the document statistics
        assertEquals("2", metadata.get(Office.PAGE_COUNT));
        assertEquals("13", metadata.get(Office.PARAGRAPH_COUNT));
        assertEquals("54", metadata.get(Office.WORD_COUNT));
        assertEquals("351", metadata.get(Office.CHARACTER_COUNT));
        assertEquals("0", metadata.get(Office.TABLE_COUNT));
        assertEquals("2", metadata.get(Office.OBJECT_COUNT));
        assertEquals("0", metadata.get(Office.IMAGE_COUNT));

        // Check the Tika-1.0 style document statistics
        assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
        assertEquals("13", metadata.get(Metadata.PARAGRAPH_COUNT));
        assertEquals("54", metadata.get(Metadata.WORD_COUNT));
        assertEquals("351", metadata.get(Metadata.CHARACTER_COUNT));
        assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
        assertEquals("2", metadata.get(Metadata.OBJECT_COUNT));
        assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));

        // Check the old style statistics (these will be removed shortly)
        assertEquals("0", metadata.get("nbTab"));
        assertEquals("2", metadata.get("nbObject"));
        assertEquals("0", metadata.get("nbImg"));
        assertEquals("2", metadata.get("nbPage"));
        assertEquals("13", metadata.get("nbPara"));
        assertEquals("54", metadata.get("nbWord"));
        assertEquals("351", metadata.get("nbCharacter"));

        // assertContains (rather than assertTrue on String.contains) matches the
        // other tests in this class
        String content = handler.toString();
        assertContains("Apache Tika Tika is part of the Lucene project.", content);
    }
}
/**
 * Parses the {@code testMasterFooter.odp} sample through an
 * {@link AutoDetectParser} and checks that the master footer text
 * appears in the extracted content.
 */
@Test
public void testODPMasterFooter() throws Exception {
    final String resource = "/test-documents/testMasterFooter.odp";
    try (InputStream stream = ODFParserTest.class.getResourceAsStream(resource)) {
        Metadata meta = new Metadata();
        ContentHandler body = new BodyContentHandler();
        AutoDetectParser autoDetect = new AutoDetectParser();
        autoDetect.parse(stream, body, meta);
        assertContains("Master footer is here", body.toString());
    }
}
/**
 * Parses the {@code testFooter.odt} sample through an
 * {@link AutoDetectParser} and checks that the text of both pages and
 * the footer text appear in the extracted content.
 */
@Test
public void testODTFooter() throws Exception {
    final String resource = "/test-documents/testFooter.odt";
    try (InputStream stream = ODFParserTest.class.getResourceAsStream(resource)) {
        Metadata meta = new Metadata();
        ContentHandler body = new BodyContentHandler();
        AutoDetectParser autoDetect = new AutoDetectParser();
        autoDetect.parse(stream, body, meta);

        String text = body.toString();
        for (String fragment : new String[] {
                "Here is some text...",
                "Here is some text on page 2",
                "Here is footer text"}) {
            assertContains(fragment, text);
        }
    }
}
/**
 * Parses the {@code testFooter.ods} sample through an
 * {@link AutoDetectParser} and checks that the centre-area footer text
 * appears in the extracted content.
 */
@Test
public void testODSFooter() throws Exception {
    final String resource = "/test-documents/testFooter.ods";
    try (InputStream stream = ODFParserTest.class.getResourceAsStream(resource)) {
        Metadata meta = new Metadata();
        ContentHandler body = new BodyContentHandler();
        AutoDetectParser autoDetect = new AutoDetectParser();
        autoDetect.parse(stream, body, meta);
        assertContains("Here is a footer in the center area", body.toString());
    }
}
/**
 * Parses the {@code testODFwithOOo3.odt} sample from a
 * {@link TikaInputStream} obtained from the resource URL, after
 * confirming that the stream reports having a file, and checks the
 * content type and a fragment of the extracted text.
 */
@Test
public void testFromFile() throws Exception {
    try (TikaInputStream tis = TikaInputStream.get(this.getClass().getResource(
            "/test-documents/testODFwithOOo3.odt"))) {
        // The rest of the test is only meaningful for a stream that has a file
        assertTrue(tis.hasFile());

        OpenDocumentParser parser = new OpenDocumentParser();
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        parser.parse(tis, handler, metadata, new ParseContext());

        assertEquals(
                "application/vnd.oasis.opendocument.text",
                metadata.get(Metadata.CONTENT_TYPE));
        String content = handler.toString();
        assertContains("Tika is part of the Lucene project.", content);
    }
}
/**
 * Parses the {@code testNPEOpenDocument.odt} sample from a
 * {@link TikaInputStream} obtained from the resource URL and checks the
 * content type and a fragment of the extracted text.
 */
@Test
public void testNPEFromFile() throws Exception {
    OpenDocumentParser odfParser = new OpenDocumentParser();
    final String resource = "/test-documents/testNPEOpenDocument.odt";
    try (TikaInputStream stream = TikaInputStream.get(getClass().getResource(resource))) {
        Metadata meta = new Metadata();
        ContentHandler body = new BodyContentHandler();
        odfParser.parse(stream, body, meta, new ParseContext());

        String detectedType = meta.get(Metadata.CONTENT_TYPE);
        assertEquals("application/vnd.oasis.opendocument.text", detectedType);
        assertContains("primero hay que generar un par de claves", body.toString());
    }
}
/**
 * TIKA-1063: basic style support. Checks that the XML produced for
 * {@code testStyles.odt} contains the expected italic, bold and
 * underline markup as well as ordered and unordered list markup.
 */
@Test
public void testODTStyles() throws Exception {
    final String xhtml = getXML("testStyles.odt").xml;
    final String[] expectedMarkup = {
        "This <i>is</i> <b>just</b> a <u>test</u>",
        "<p>And <b>another <i>test</i> is</b> here.</p>",
        "<ol>\t<li><p>One</p>",
        "</ol>",
        "<ul>\t<li><p>First</p>",
        "</ul>"
    };
    for (String markup : expectedMarkup) {
        assertContains(markup, xhtml);
    }
}
/**
 * TIKA-1600: a null pointer must not break parsing. Parses
 * {@code testODT-TIKA-6000.odt} with the context from
 * {@link #getNonRecursingParseContext()} and checks the content type
 * and several fragments of the extracted text.
 */
@Test
public void testNullStylesInODTFooter() throws Exception {
    Parser odfParser = new OpenDocumentParser();
    final String resource = "/test-documents/testODT-TIKA-6000.odt";
    try (InputStream stream = ODFParserTest.class.getResourceAsStream(resource)) {
        Metadata meta = new Metadata();
        ContentHandler body = new BodyContentHandler();
        odfParser.parse(stream, body, meta, getNonRecursingParseContext());

        String detectedType = meta.get(Metadata.CONTENT_TYPE);
        assertEquals("application/vnd.oasis.opendocument.text", detectedType);

        String text = body.toString();
        for (String fragment : new String[] {
                "Utilisation de ce document",
                "Copyright and License",
                "Changer la langue",
                "La page d’accueil permet de faire une recherche simple"}) {
            assertContains(fragment, text);
        }
    }
}
/**
 * TIKA-1916: checks that the XML produced for {@code testODTNoMeta.odt}
 * still contains the document text.
 */
@Test
public void testMissingMeta() throws Exception {
    assertContains("Test text", getXML("testODTNoMeta.odt").xml);
}
/**
 * TIKA-2242: checks that bold markup is emitted both for font-style
 * properties set on a text span and for those set on a paragraph, using
 * {@code testODTStyles2.odt} and the context from
 * {@link #getNonRecursingParseContext()}.
 */
@Test
public void testParagraphLevelFontStyles() throws Exception {
    ParseContext context = getNonRecursingParseContext();
    final String xhtml = getXML("testODTStyles2.odt", context).xml;

    // font-style properties declared on a text span
    assertContains("<p><b>name</b>, advocaat", xhtml);

    // font-style properties declared on the paragraph itself
    assertContains("<p><b>Publicatie Onbekwaamverklaring", xhtml);
}
/**
 * TIKA-2242: checks the markup produced for an annotation nested inside
 * a paragraph in {@code testODTStyles3.odt}.
 *
 * <p>The nesting {@code <p> <annotation> <p> this is an annotation </p>
 * </annotation> </p>} is not allowed in HTML.
 */
@Test
public void testAnnotationsAndPDepthGt1() throws Exception {
    // NOTE(review): the expected fragment opens a <span> but ends with
    // </annotation>; confirm this mirrors the parser's actual output.
    final String expectedMarkup =
            "<p><b>WOUTERS Rolf</b><span class=\"annotation\"> Beschermde persoon is overleden </annotation>";
    assertContains(expectedMarkup, getXML("testODTStyles3.odt").xml);
}
/**
 * Checks that recursive metadata extraction over
 * {@code testODTEmbedded.odt} yields exactly three metadata entries.
 */
@Test
public void testEmbedded() throws Exception {
    final int entryCount = getRecursiveMetadata("testODTEmbedded.odt").size();
    assertEquals(3, entryCount);
}
/**
 * Builds a parse context in which an {@link EmptyParser} is registered
 * under {@code Parser.class}.
 * NOTE(review): presumably this stops embedded documents from being
 * parsed, as the method name suggests; confirm against the parser.
 *
 * @return a new context carrying the {@link EmptyParser} registration
 */
private ParseContext getNonRecursingParseContext() {
    final ParseContext nonRecursing = new ParseContext();
    final Parser noOp = new EmptyParser();
    nonRecursing.set(Parser.class, noOp);
    return nonRecursing;
}
}