TextExtractor.java example

Explorer
myrobotlab-master
- src
- test
  - ArduinoChaosTest.java
  - ArduinoMotorPotTest.java
  - org
    - myrobotlab
package org.myrobotlab.document.transformer;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.StringWriter;
import java.util.List;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.myrobotlab.document.Document;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * This stage will use Apache Tika to perform text and metadata extraction on
 * many different types of documents including, but not limited to, pdf, office
 * documents, html, etc..
 * 
 * @author kwatters
 *
 */
public class TextExtractor extends AbstractStage {

  private String textField = "text";
  private String filePathField = "filepath";

  @Override
  public void startStage(StageConfiguration config) {
    // TODO: support processing a byte array on a document.
    // rather than just a reference for on disk
    if (config != null) {
      textField = config.getProperty("textField", "text");
      filePathField = config.getProperty("filePathField", "filepath");
    }

  }

  @Override
  public List<Document> processDocument(Document doc) {
    // Create the parser..
    // not sure if the parser is thread safe, so we create a new one here
    // each time. probably not effecient to do this.
    Parser parser = new AutoDetectParser();
    ParseContext parseCtx = new ParseContext();
    parseCtx.set(Parser.class, parser);

    // TODO how does the doc model support this?
    if (!doc.hasField(filePathField)) {
      return null;
    }

    // we have the field populated
    for (Object pathObj : doc.getField(filePathField)) {

      // TODO: test the object type here.
      String path = (String) pathObj;

      File f = new File(path);
      if (!f.exists()) {
        // TODO: log that the file path was not found
        System.out.println("File path not found " + path);
        continue;
      }

      FileInputStream binaryData = null;
      try {
        binaryData = new FileInputStream(f);
      } catch (FileNotFoundException e1) {
        // TODO Auto-generated catch block
        e1.printStackTrace();
        // This should never happen.
        continue;
      }
      // InputStream binaryData = null;

      Metadata metadata = new Metadata();
      StringWriter textData = new StringWriter();
      ContentHandler bch = new BodyContentHandler(textData);
      try {
        parser.parse(binaryData, bch, metadata, parseCtx);
      } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      } catch (SAXException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      } catch (TikaException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      }

      doc.addToField(textField, textData.toString());
      for (String name : metadata.names()) {
        // clean the field name first.
        String cleanName = cleanFieldName(name);
        for (String value : metadata.getValues(name)) {
          doc.addToField(cleanName, value);
        }
      }
    }

    return null;
  }

  // TODO: this should go on a common utility interface or something.
  private static String cleanFieldName(String name) {
    String cleanName = name.trim().toLowerCase();
    cleanName = cleanName.replaceAll(" ", "_");
    cleanName = cleanName.replaceAll("-", "_");
    cleanName = cleanName.replaceAll(":", "_");
    return cleanName;
  }

  @Override
  public void stopStage() {
    // TODO Auto-generated method stub

  }

  @Override
  public void flush() {
    // TODO Auto-generated method stub

  }

}