/*
* (C) Copyright 2006-2012 Nuxeo SA (http://nuxeo.com/) and others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Contributors:
* Nuxeo
* Antoine Taillefer
*
*/
package org.nuxeo.ecm.core.convert.plugins.text.extractors;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.nuxeo.ecm.core.convert.plugins.text.extractors.presentation.PresentationSlide;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
/**
 * Pptx to text converter: parses the Open XML presentation document to read its content.
 * <p>
 * Every zip entry whose name starts with {@code ppt/slides/slide} followed by a number is parsed as a slide. The text
 * of the slides is then appended to the output in the order given by {@link PresentationSlide}'s natural ordering
 * (presumably the slide number; see that class), one slide per line.
 */
public class PPTX2TextConverter extends XmlZip2TextConverter {

    protected static final Log log = LogFactory.getLog(PPTX2TextConverter.class);

    private static final String PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX = "ppt/slides/slide";

    /**
     * Reads all the slide entries of the given zip stream and appends their text content to the given builder.
     *
     * @param zis the zip stream of the pptx file, read entry by entry until exhausted
     * @param reader the XML reader used to parse each slide entry; its content handler is replaced for every slide
     * @param sb the builder receiving the content of each slide followed by a line feed
     * @throws IOException if a zip entry cannot be read
     * @throws SAXException if a slide entry cannot be parsed
     */
    protected void readXmlZipContent(ZipInputStream zis, XMLReader reader, StringBuilder sb) throws IOException,
            SAXException {

        // Zip entries come in no guaranteed order, so slides are collected in a sorted set before being written.
        Set<PresentationSlide> slides = new TreeSet<PresentationSlide>();

        ZipEntry zipEntry = zis.getNextEntry();
        while (zipEntry != null) {
            String zipEntryName = zipEntry.getName();
            if (zipEntryName.startsWith(PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX)
                    && zipEntryName.length() > PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX.length()) {
                int slideNumber = parseSlideNumber(zipEntryName);
                if (slideNumber < 0) {
                    log.warn("Slide number is not an integer in zip entry '" + zipEntryName
                            + "', won't take this slide into account.");
                } else {
                    OpenXmlContentHandler contentHandler = new OpenXmlContentHandler();
                    reader.setContentHandler(contentHandler);
                    // The entry is fully buffered before parsing, presumably so that the parser works on its own
                    // stream and cannot close or over-read the enclosing zip stream.
                    reader.parse(new InputSource(new ByteArrayInputStream(IOUtils.toByteArray(zis))));
                    slides.add(new PresentationSlide(contentHandler.getContent(), slideNumber));
                }
            }
            zipEntry = zis.getNextEntry();
        }

        for (PresentationSlide slide : slides) {
            sb.append(slide.getContent()).append("\n");
        }
    }

    /**
     * Extracts the slide number from a slide zip entry name, e.g. {@code 12} for {@code ppt/slides/slide12.xml}.
     * <p>
     * The whole run of digits following the slide prefix is used, not only the first one, so that slides 10 and
     * above get their real number instead of being mistaken for slides 1 to 9.
     *
     * @param zipEntryName a zip entry name starting with the slide prefix
     * @return the slide number, or {@code -1} if the prefix is not followed by a number that fits in an {@code int}
     */
    private static int parseSlideNumber(String zipEntryName) {
        int start = PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX.length();
        int end = start;
        while (end < zipEntryName.length() && Character.isDigit(zipEntryName.charAt(end))) {
            end++;
        }
        if (end == start) {
            // No digit right after the prefix.
            return -1;
        }
        try {
            return Integer.parseInt(zipEntryName.substring(start, end));
        } catch (NumberFormatException nfe) {
            // Digit run too long to fit in an int.
            return -1;
        }
    }
}