/** * The contents of this file are subject to the license and copyright * detailed in the LICENSE and NOTICE files at the root of the source * tree and available online at * * http://www.dspace.org/license/ */ package org.dspace.app.mediafilter; import java.io.ByteArrayInputStream; import java.io.InputStream; import java.io.IOException; import org.apache.poi.POITextExtractor; import org.apache.poi.extractor.ExtractorFactory; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.xmlbeans.XmlException; import org.dspace.content.Item; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Extract flat text from Microsoft Word documents (.doc, .docx). */ public class PoiWordFilter extends MediaFilter { private static final Logger LOG = LoggerFactory.getLogger(PoiWordFilter.class); @Override public String getFilteredName(String oldFilename) { return oldFilename + ".txt"; } @Override public String getBundleName() { return "TEXT"; } @Override public String getFormatString() { return "Text"; } @Override public String getDescription() { return "Extracted text"; } @Override public InputStream getDestinationStream(Item currentItem, InputStream source, boolean verbose) throws Exception { String text; try { // get input stream from bitstream, pass to filter, get string back POITextExtractor extractor = ExtractorFactory.createExtractor(source); text = extractor.getText(); } catch (IOException | OpenXML4JException | XmlException e) { System.err.format("Invalid File Format: %s%n", e.getMessage()); LOG.error("Unable to parse the bitstream: ", e); throw e; } // if verbose flag is set, print out extracted text to STDOUT if (verbose) { System.out.println(text); } // return the extracted text as a stream. return new ByteArrayInputStream(text.getBytes()); } }