package com.knowledgetree.textextraction; import java.io.OutputStreamWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.UnsupportedEncodingException; import java.io.FileNotFoundException; import java.io.IOException; import java.io.StringReader; import java.io.ByteArrayInputStream; import java.util.Hashtable; import org.apache.log4j.Logger; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.exception.TikaException; import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; public class KTTextExtractor { private static KTTextExtractor textExtractor = null; private Logger logger; /** * Returns the existing KTTextExtractor object * @return KTTextExtractor */ public static KTTextExtractor get() { if(KTTextExtractor.textExtractor == null) { KTTextExtractor.textExtractor = new KTTextExtractor(); } return KTTextExtractor.textExtractor; } /** * Constructor for KTTextExtractor */ private KTTextExtractor() { this.logger = Logger.getLogger("com.knowledgetree.textextraction"); this.logger.info("Text Extraction starting..."); } /** * Returns the log object */ public Logger getLogger() { return logger; } /** * Extracts the content from a given file and writes the plain text output to a file * * @param String contentFilename The source file containing the content to be extracted * @param String outputFilename The target file for saving the extracted text * @return Integer 0 on success | -1 on failure */ public int ExtractTextFromFile(String contentFilename, String outputFilename) { this.logger.debug("Text Extractor: file in: " + contentFilename + "; file out: " + outputFilename); try { // Open streams to the source file and target/output file FileInputStream inStream = new FileInputStream(contentFilename); FileOutputStream outStream = new FileOutputStream(outputFilename); // Use a writer to handle the output from the tika extractor OutputStreamWriter outFile = new OutputStreamWriter(outStream, "UTF8"); ContentHandler textHandler = new BodyContentHandler(outFile); // Instantiate the Tika 'AutoDetect' Parser AutoDetectParser parser = new AutoDetectParser(); Metadata metadata = new Metadata(); try { // Parse the file, the output is automatically written to the target file parser.parse(inStream, textHandler, metadata); outFile.close(); inStream.close(); } catch (Exception ex) { this.logger.error("Text Extractor: Failed with message - " + ex.getMessage()); return -1; } } catch (Exception ex) { this.logger.error("Text Extractor: File could not be found - " + ex.getMessage()); return -1; } return 0; } /** * Extracts the content from a data stream and returns the plain text in an SAX XML object * @param data The data to be extracted. * @return SAX XML */ public java.util.Map<String, String> ExtractText(byte[] data) { Hashtable<String,String> result = new Hashtable<String,String>(); /* We use no files so we use the ByteArrayInputStream to simulate an input stream */ ByteArrayInputStream stream = new ByteArrayInputStream(data); /* Instantiate the Tika 'AutoDetect' Parser */ AutoDetectParser p = new AutoDetectParser(); /* Create a new ContentHandler interface to store our content */ StringHandler sh = new StringHandler(); try { p.parse(stream, sh, new Metadata()); } catch(Exception ex) { result.put("status","1"); result.put("message", ex.getMessage()); return result; } result.put("status", "0"); result.put("text", sh.getString()); return result; } }