/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.app.mediafilter;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import javax.swing.text.Document;
import javax.swing.text.html.HTMLEditorKit;
/*
*
* to do: helpful error messages - can't find mediafilter.cfg - can't
* instantiate filter - bitstream format doesn't exist
*
*/
public class HTMLFilter extends MediaFilter
{
public String getFilteredName(String oldFilename)
{
return oldFilename + ".txt";
}
/**
* @return String bundle name
*
*/
public String getBundleName()
{
return "TEXT";
}
/**
* @return String bitstreamformat
*/
public String getFormatString()
{
return "Text";
}
/**
* @return String description
*/
public String getDescription()
{
return "Extracted text";
}
/**
* @param source
* source input stream
*
* @return InputStream the resulting input stream
*/
public InputStream getDestinationStream(InputStream source)
throws Exception
{
// try and read the document - set to ignore character set directive,
// assuming that the input stream is already set properly (I hope)
HTMLEditorKit kit = new HTMLEditorKit();
Document doc = kit.createDefaultDocument();
doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);
kit.read(source, doc, 0);
String extractedText = doc.getText(0, doc.getLength());
// generate an input stream with the extracted text
byte[] textBytes = extractedText.getBytes();
ByteArrayInputStream bais = new ByteArrayInputStream(textBytes);
return bais; // will this work? or will the byte array be out of scope?
}
}