/**
* OpenKM, Open Document Management System (http://www.openkm.com)
* Copyright (c) 2006-2011 Paco Avila & Josep Llort
*
* No bytes were intentionally harmed during the development of this application.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package com.openkm.extractor;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.jackrabbit.extractor.AbstractTextExtractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
/**
* Text extractor for MS Office 2007 documents.
*/
public class MsOffice2007TextExtractor extends AbstractTextExtractor {
    /**
     * Logger instance.
     */
    private static final Logger log = LoggerFactory.getLogger(MsOffice2007TextExtractor.class);

    /**
     * Creates a new <code>MsOffice2007TextExtractor</code> instance and
     * registers the OOXML word processing, presentation and spreadsheet
     * content types with the base extractor.
     */
    public MsOffice2007TextExtractor() {
        super(new String[]{"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
                "application/vnd.openxmlformats-officedocument.presentationml.template",
                "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
                "application/vnd.openxmlformats-officedocument.presentationml.presentation",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.template"});
    }

    //-------------------------------------------------------< TextExtractor >

    /**
     * Extracts the text of an MS Office 2007 document.
     * <p>
     * The stream is read as a ZIP archive. Every entry whose name starts with
     * the file pattern of the content handler selected for <code>type</code>
     * is parsed with SAX, and the content reported by the handler after each
     * entry is appended to the result.
     *
     * @param stream   the document; it is always closed before this method returns
     * @param type     the content type, used to select the content handler
     * @param encoding not used by this implementation
     * @return a reader over the extracted text, or an empty reader when the
     *         type is not supported or the XML parser cannot be configured
     *         or fails while parsing
     * @throws IOException if reading the archive or closing the stream fails
     */
    public Reader extractText(InputStream stream, String type, String encoding) throws IOException {
        ZipInputStream zis = null;

        try {
            MsOffice2007ContentHandler contentHandler = createContentHandler(type);

            if (contentHandler == null) {
                // Previously this fell through and failed with a NullPointerException
                // on the first archive entry.
                log.warn("Unsupported Microsoft Office 2007 content type: {}", type);
                return new StringReader("");
            }

            XMLReader xmlReader = createXmlReader();
            xmlReader.setContentHandler(contentHandler);

            zis = new ZipInputStream(stream);
            StringBuilder sb = new StringBuilder();
            String filePattern = contentHandler.getFilePattern();
            ZipEntry ze;

            while ((ze = zis.getNextEntry()) != null) {
                if (ze.getName().startsWith(filePattern)) {
                    // It is unspecified whether the XML parser closes the stream when
                    // done parsing. The remaining archive entries still have to be
                    // read, so the parser is handed a wrapper whose close() is a
                    // no-op; the real stream is closed in the finally block.
                    InputSource is = new InputSource(new FilterInputStream(zis) {
                        @Override
                        public void close() {
                        }
                    });

                    log.debug("Parsing {}", ze);
                    xmlReader.parse(is);
                    sb.append(contentHandler.getContent());
                } else {
                    log.debug("- {}", ze);
                }
            }

            String text = sb.toString();
            log.debug("TEXT: {}", text);
            return new StringReader(text);
        } catch (ParserConfigurationException e) {
            log.warn("Failed to extract Microsoft Office 2007 text content", e);
            return new StringReader("");
        } catch (SAXException e) {
            log.warn("Failed to extract Microsoft Office 2007 text content", e);
            return new StringReader("");
        } finally {
            try {
                // zis is still null when the failure happened before the
                // archive was opened (e.g. while configuring the parser).
                if (zis != null) {
                    zis.close();
                }
            } finally {
                // Close the caller's stream even if closing the ZIP wrapper failed.
                stream.close();
            }
        }
    }

    /**
     * Selects the content handler matching the given content type.
     *
     * @param type the content type, may be <code>null</code>
     * @return a new content handler, or <code>null</code> if the type is not
     *         one of the supported OOXML types
     */
    private static MsOffice2007ContentHandler createContentHandler(String type) {
        if ("application/vnd.openxmlformats-officedocument.wordprocessingml.document".equals(type) ||
                "application/vnd.openxmlformats-officedocument.wordprocessingml.template".equals(type)) {
            return new WordprocessingMLContentHandler();
        } else if ("application/vnd.openxmlformats-officedocument.presentationml.template".equals(type) ||
                "application/vnd.openxmlformats-officedocument.presentationml.slideshow".equals(type) ||
                "application/vnd.openxmlformats-officedocument.presentationml.presentation".equals(type)) {
            return new PresentationMLContentHandler();
        } else if ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".equals(type) ||
                "application/vnd.openxmlformats-officedocument.spreadsheetml.template".equals(type)) {
            return new SpreadsheetMLContentHandler();
        }

        return null;
    }

    /**
     * Creates a non-validating XML reader that does not load external DTDs
     * and has external entity resolution switched off.
     *
     * @return the configured XML reader
     * @throws ParserConfigurationException if no SAX parser can be created
     * @throws SAXException if the parser rejects one of the feature settings
     */
    private static XMLReader createXmlReader() throws ParserConfigurationException, SAXException {
        SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
        saxParserFactory.setValidating(false);
        SAXParser saxParser = saxParserFactory.newSAXParser();
        XMLReader xmlReader = saxParser.getXMLReader();
        xmlReader.setFeature("http://xml.org/sax/features/validation", false);
        xmlReader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        // Documents come from users: do not let a crafted part pull in external
        // entities (XXE). These are standard SAX2 feature names.
        xmlReader.setFeature("http://xml.org/sax/features/external-general-entities", false);
        xmlReader.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        return xmlReader;
    }
}