package org.myrobotlab.document.transformer;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.StringWriter;
import java.util.List;
import java.util.Locale;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.myrobotlab.document.Document;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
 * This stage uses Apache Tika to perform text and metadata extraction on many
 * different types of documents including, but not limited to, PDF, office
 * documents, HTML, etc.
 * <p>
 * For every value of the configured file path field the referenced file is
 * parsed. The extracted body text is added to the configured text field, and
 * every Tika metadata value is added to a field named after the cleaned
 * metadata name.
 *
 * @author kwatters
 *
 */
public class TextExtractor extends AbstractStage {

  /** Name of the document field that receives the extracted body text. */
  private String textField = "text";

  /** Name of the document field holding the path(s) of the file(s) to parse. */
  private String filePathField = "filepath";

  /**
   * Reads the stage settings.
   *
   * @param config
   *          stage configuration; when {@code null} the current field names
   *          are kept. Recognized properties are {@code textField} (default
   *          {@code "text"}) and {@code filePathField} (default
   *          {@code "filepath"}).
   */
  @Override
  public void startStage(StageConfiguration config) {
    // TODO: support processing a byte array on a document.
    // rather than just a reference for on disk
    if (config != null) {
      textField = config.getProperty("textField", "text");
      filePathField = config.getProperty("filePathField", "filepath");
    }
  }

  /**
   * Extracts text and metadata from every file referenced by the document's
   * file path field and adds the results to the same document.
   * <p>
   * Values that are not strings, and paths that do not exist on disk, are
   * reported and skipped. A failure while parsing one file is reported and does
   * not stop processing of the remaining paths.
   *
   * @param doc
   *          the document to enrich in place
   * @return always {@code null}; this stage only modifies {@code doc}
   */
  @Override
  public List<Document> processDocument(Document doc) {
    // TODO how does the doc model support this?
    if (!doc.hasField(filePathField)) {
      return null;
    }
    // not sure if the parser is thread safe, so we create a new one here
    // each time. probably not efficient to do this.
    Parser parser = new AutoDetectParser();
    ParseContext parseCtx = new ParseContext();
    parseCtx.set(Parser.class, parser);
    for (Object pathObj : doc.getField(filePathField)) {
      if (!(pathObj instanceof String)) {
        // Guard the cast: a non-string value would otherwise abort the whole
        // document with a ClassCastException.
        System.out.println("File path is not a string " + pathObj);
        continue;
      }
      String path = (String) pathObj;
      File f = new File(path);
      if (!f.exists()) {
        // TODO: log that the file path was not found
        System.out.println("File path not found " + path);
        continue;
      }
      extractFile(doc, f, parser, parseCtx);
    }
    return null;
  }

  /**
   * Parses a single file and adds its body text and metadata to the document.
   * The input stream is always closed, whether or not parsing succeeds.
   */
  private void extractFile(Document doc, File f, Parser parser, ParseContext parseCtx) {
    FileInputStream opened;
    try {
      opened = new FileInputStream(f);
    } catch (FileNotFoundException e) {
      // The caller already checked exists(), so this is only reachable if the
      // file vanished or cannot be opened for reading.
      System.out.println("Unable to open file " + f.getPath());
      e.printStackTrace();
      return;
    }
    Metadata metadata = new Metadata();
    StringWriter textData = new StringWriter();
    ContentHandler bch = new BodyContentHandler(textData);
    // try-with-resources closes the stream on every exit path
    try (FileInputStream binaryData = opened) {
      parser.parse(binaryData, bch, metadata, parseCtx);
    } catch (IOException | SAXException | TikaException e) {
      // Best effort: whatever was extracted before the failure is still added
      // to the document below.
      System.out.println("Failed to extract content from " + f.getPath());
      e.printStackTrace();
    }
    doc.addToField(textField, textData.toString());
    for (String name : metadata.names()) {
      // clean the field name first.
      String cleanName = cleanFieldName(name);
      for (String value : metadata.getValues(name)) {
        doc.addToField(cleanName, value);
      }
    }
  }

  // TODO: this should go on a common utility interface or something.
  /**
   * Normalizes a metadata name into a field name: trims it, lower-cases it
   * independently of the default locale, and replaces spaces, hyphens and
   * colons with underscores.
   */
  private static String cleanFieldName(String name) {
    String cleanName = name.trim().toLowerCase(Locale.ROOT);
    return cleanName.replace(' ', '_').replace('-', '_').replace(':', '_');
  }

  /** No resources are held between documents, so there is nothing to stop. */
  @Override
  public void stopStage() {
    // nothing to do
  }

  /** Nothing is buffered by this stage, so there is nothing to flush. */
  @Override
  public void flush() {
    // nothing to do
  }
}