/** * The contents of this file are subject to the license and copyright * detailed in the LICENSE and NOTICE files at the root of the source * tree and available online at * * http://www.dspace.org/license/ */ package org.dspace.app.mediafilter; import java.io.ByteArrayInputStream; import java.io.InputStream; import java.io.IOException; import org.apache.log4j.Logger; import org.textmining.extraction.TextExtractor; import org.textmining.extraction.word.WordTextExtractorFactory; /* * * to do: helpful error messages - can't find mediafilter.cfg - can't * instantiate filter - bitstream format doesn't exist. * */ public class WordFilter extends MediaFilter { private static Logger log = Logger.getLogger(WordFilter.class); public String getFilteredName(String oldFilename) { return oldFilename + ".txt"; } /** * @return String bundle name * */ public String getBundleName() { return "TEXT"; } /** * @return String bitstreamformat */ public String getFormatString() { return "Text"; } /** * @return String description */ public String getDescription() { return "Extracted text"; } /** * @param source * source input stream * * @return InputStream the resulting input stream */ public InputStream getDestinationStream(InputStream source) throws Exception { // get input stream from bitstream // pass to filter, get string back try { WordTextExtractorFactory factory = new WordTextExtractorFactory(); TextExtractor e = factory.textExtractor(source); String extractedText = e.getText(); // if verbose flag is set, print out extracted text // to STDOUT if (MediaFilterManager.isVerbose) { System.out.println(extractedText); } // generate an input stream with the extracted text byte[] textBytes = extractedText.getBytes(); ByteArrayInputStream bais = new ByteArrayInputStream(textBytes); return bais; // will this work? or will the byte array be out of scope? } catch (IOException ioe) { System.out.println("Invalid Word Format"); log.error("Error detected - Word File format not recognized: " + ioe.getMessage(), ioe); throw ioe; } } }