/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */
package org.apache.poi.xslf.extractor;

import static org.apache.poi.POITestCase.assertContains;
import static org.apache.poi.POITestCase.assertNotContained;
import static org.apache.poi.POITestCase.assertStartsWith;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.io.InputStream;

import org.apache.poi.POIDataSamples;
import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.xmlbeans.XmlException;
import org.junit.Test;

/**
 * Tests for XSLFPowerPointExtractor
 */
public class TestXSLFPowerPointExtractor {
    /** Source of the sample slide show files used by every test in this class. */
    private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance();

    /**
     * Get text out of the simple file, exercising every combination of the
     * slide / notes / master flags as well as the "by default" setters.
     */
    @Test
    public void testGetSimpleText() throws IOException, XmlException, OpenXML4JException {
        XMLSlideShow xmlA = openPPTX("sample.pptx");
        @SuppressWarnings("resource")
        OPCPackage pkg = xmlA.getPackage();

        // Both constructors must be usable (and closable) on the same document
        new XSLFPowerPointExtractor(xmlA).close();
        new XSLFPowerPointExtractor(pkg).close();

        XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xmlA);
        // getText() is deliberately called twice; the second result is the one checked
        extractor.getText();

        String text = extractor.getText();
        assertTrue(text.length() > 0);

        // Check Basics
        assertStartsWith(text, "Lorem ipsum dolor sit amet\n");
        assertContains(text, "amet\n\n");

        // Our placeholder master text
        // This shouldn't show up in the output
        // String masterText =
        //     "Click to edit Master title style\n" +
        //     "Click to edit Master subtitle style\n" +
        //     "\n\n\n\n\n\n" +
        //     "Click to edit Master title style\n" +
        //     "Click to edit Master text styles\n" +
        //     "Second level\n" +
        //     "Third level\n" +
        //     "Fourth level\n" +
        //     "Fifth level\n";

        // Just slides, no notes
        text = extractor.getText(true, false, false);
        String slideText =
            "Lorem ipsum dolor sit amet\n" +
            "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
            "\n" +
            "Lorem ipsum dolor sit amet\n" +
            "Lorem\n" +
            "ipsum\n" +
            "dolor\n" +
            "sit\n" +
            "amet\n" +
            "\n";
        assertEquals(slideText, text);

        // Just notes, no slides
        text = extractor.getText(false, true);
        assertEquals("\n\n1\n\n\n2\n", text);

        // Both
        text = extractor.getText(true, true, false);
        String bothText =
            "Lorem ipsum dolor sit amet\n" +
            "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
            "\n\n\n1\n" +
            "Lorem ipsum dolor sit amet\n" +
            "Lorem\n" +
            "ipsum\n" +
            "dolor\n" +
            "sit\n" +
            "amet\n" +
            "\n\n\n2\n";
        assertEquals(bothText, text);

        // With Slides and Master Text
        text = extractor.getText(true, false, true);
        String smText =
            "Lorem ipsum dolor sit amet\n" +
            "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
            "\n" +
            "Lorem ipsum dolor sit amet\n" +
            "Lorem\n" +
            "ipsum\n" +
            "dolor\n" +
            "sit\n" +
            "amet\n" +
            "\n";
        assertEquals(smText, text);

        // With Slides, Notes and Master Text
        text = extractor.getText(true, true, true);
        String snmText =
            "Lorem ipsum dolor sit amet\n" +
            "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
            "\n\n\n1\n" +
            "Lorem ipsum dolor sit amet\n" +
            "Lorem\n" +
            "ipsum\n" +
            "dolor\n" +
            "sit\n" +
            "amet\n" +
            "\n\n\n2\n";
        assertEquals(snmText, text);

        // Via set defaults
        extractor.setSlidesByDefault(false);
        extractor.setNotesByDefault(true);
        text = extractor.getText();
        assertEquals("\n\n1\n\n\n2\n", text);

        extractor.close();
        xmlA.close();
    }

    /**
     * Checks that comment text and comment authors are part of the
     * default extracted text.
     */
    @Test
    public void testGetComments() throws IOException {
        XMLSlideShow xml = openPPTX("45545_Comment.pptx");
        XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);

        String text = extractor.getText();
        assertTrue(text.length() > 0);

        // Check comments are there
        assertContains(text, "testdoc");
        assertContains(text, "test phrase");

        // Check the authors came through too
        assertContains(text, "XPVMWARE01");

        extractor.close();
        xml.close();
    }

    /**
     * Checks the extracted text when slide and master text are enabled
     * by default and notes are disabled.
     */
    @Test
    public void testGetMasterText() throws Exception {
        XMLSlideShow xml = openPPTX("WithMaster.pptx");
        XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
        extractor.setSlidesByDefault(true);
        extractor.setNotesByDefault(false);
        extractor.setMasterByDefault(true);

        String text = extractor.getText();
        assertTrue(text.length() > 0);

        // Check master text is there
        assertContains(text, "Footer from the master slide");

        // Theme text shouldn't show up
        // String themeText =
        //     "Theme Master Title\n" +
        //     "Theme Master first level\n" +
        //     "And the 2nd level\n" +
        //     "Our 3rd level goes here\n" +
        //     "And onto the 4th, such fun....\n" +
        //     "Finally is the Fifth level\n";

        // Check the whole text
        String wholeText =
            "First page title\n" +
            "First page subtitle\n" +
            "This is the Master Title\n" +
            "This text comes from the Master Slide\n" +
            "\n" +
            // TODO Detect we didn't have a title, and include the master one
            "2nd page subtitle\n" +
            "Footer from the master slide\n" +
            "This is the Master Title\n" +
            "This text comes from the Master Slide\n";
        assertEquals(wholeText, text);

        extractor.close();
        xml.close();
    }

    /**
     * Checks that text from the sample presentation "present1.pptx"
     * shows up in the default extracted text.
     */
    @Test
    public void testTable() throws Exception {
        XMLSlideShow xml = openPPTX("present1.pptx");
        XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);

        String text = extractor.getText();
        assertTrue(text.length() > 0);

        // Check the expected text is there
        assertContains(text, "TEST");

        extractor.close();
        xml.close();
    }

    /**
     * Test that we can get the text from macro enabled,
     *  template, theme, slide enabled etc formats, as
     *  well as from the normal file
     */
    @Test
    public void testDifferentSubformats() throws Exception {
        String[] extensions = new String[] {
            "pptx", "pptm", "ppsm", "ppsx", "thmx",
            // "xps" - Doesn't have a core document
        };
        for (String extension : extensions) {
            String filename = "testPPT." + extension;
            XMLSlideShow xml = openPPTX(filename);
            try {
                XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
                try {
                    String text = extractor.getText();
                    if (extension.equals("thmx")) {
                        // Theme file doesn't have any textual content
                        assertEquals(filename, 0, text.length());
                        // The finally blocks below still run, so nothing is leaked here
                        continue;
                    }

                    assertTrue(filename, text.length() > 0);
                    assertContains(filename, text, "Attachment Test");
                    assertContains(filename, text, "This is a test file data with the same content");
                    assertContains(filename, text, "content parsing");
                    assertContains(filename, text, "Different words to test against");
                    assertContains(filename, text, "Mystery");
                } finally {
                    extractor.close();
                }
            } finally {
                xml.close();
            }
        }
    }

    /**
     * Checks where the "testdoc" header / footer text of the 45541 sample
     * files does and does not appear in the extracted text.
     */
    @Test
    public void test45541() throws Exception {
        // extract text from a powerpoint that has a header in the notes-element
        POITextExtractor extr = ExtractorFactory.createExtractor(
                slTests.getFile("45541_Header.pptx"));
        String text = extr.getText();
        assertNotNull(text);
        assertFalse("Had: " + text, text.contains("testdoc"));

        text = ((XSLFPowerPointExtractor)extr).getText(false, true);
        // Null-check before the content check so a null result is reported as such
        assertNotNull(text);
        assertContains(text, "testdoc");
        extr.close();

        // extract text from a powerpoint that has a footer in the master-slide
        extr = ExtractorFactory.createExtractor(
                slTests.getFile("45541_Footer.pptx"));
        text = extr.getText();
        assertNotContained(text, "testdoc");

        text = ((XSLFPowerPointExtractor)extr).getText(false, true);
        assertNotContained(text, "testdoc");

        text = ((XSLFPowerPointExtractor)extr).getText(false, false, true);
        assertNotContained(text, "testdoc");
        extr.close();
    }

    /**
     * Checks that extracting text from "bug54570.pptx" completes and
     * yields a non-null result.
     */
    @Test
    public void bug54570() throws IOException {
        XMLSlideShow xml = openPPTX("bug54570.pptx");
        XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
        String text = extractor.getText();
        assertNotNull(text);
        extractor.close();
        xml.close();
    }

    /**
     * Opens the named sample file as a slide show.
     *
     * @param file name of the sample resource to open
     * @return the slide show built from the resource stream; the caller closes it
     * @throws IOException if the resource cannot be read
     */
    private XMLSlideShow openPPTX(String file) throws IOException {
        InputStream is = slTests.openResourceAsStream(file);
        try {
            return new XMLSlideShow(is);
        } finally {
            // The input stream is always closed here, whether or not construction succeeds
            is.close();
        }
    }
}