CSVConnector.java example

Explorer
myrobotlab-master
- src
- test
  - ArduinoChaosTest.java
  - ArduinoMotorPotTest.java
  - org
    - myrobotlab
package org.myrobotlab.service;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;

import org.myrobotlab.document.Document;
import org.myrobotlab.document.connector.AbstractConnector;
import org.myrobotlab.document.connector.ConnectorState;
import org.myrobotlab.document.transformer.ConnectorConfig;
import org.myrobotlab.framework.ServiceType;
import org.myrobotlab.string.StringUtil;

import au.com.bytecode.opencsv.CSVReader;

public class CSVConnector extends AbstractConnector {

  private static final long serialVersionUID = 1L;
  private String filename;
  private String[] columns;
  private String idField;
  private String separator = ",";
  private int numFields;
  private int idColumn = -1;
  private boolean useRowAsId = true;
  private int skipRows = 1;
  private boolean firstRowAsColumns = false;

  public CSVConnector(String name) {
    super(name);
  }

  @Override
  public void setConfig(ConnectorConfig config) {
    // TODO: remove side effects of a "setter"
    // the parsing of the config should be handled elsewhere? maybe initialize?
    // TODO: validate the config options are valid.

    setDocIdPrefix(config.getStringParam("docIdPrefix", ""));
    filename = config.getProperty("filename");
    columns = config.getStringArray("columns");
    idField = config.getProperty("idField");

    separator = config.getProperty("separator");
    numFields = config.getIntegerParam("numFields", numFields);
    // this is computed in initialize.
    // idColumn = config.getProperty("idColumn");
    useRowAsId = config.getBoolParam("useRowAsId", useRowAsId);
    skipRows = config.getIntegerParam("skipRows", skipRows);
    firstRowAsColumns = config.getBoolParam("firstRowAsColumns", firstRowAsColumns);
  }

  public void initialize() {
    // filename = config.getProperty("filename", "data/myfile.csv");
    // columns = config.getProperty("columnnames",
    // "id,column1,column2").split(",");
    // idField = config.getProperty("idcolumn", "id");
    // idPrefix = config.getProperty("idprefix", "doc_");
    // separator = config.getProperty("separator", ",");
    // if (separator.length() > 1) {
    // // This is an error condition we can only have a character as a
    // separator.
    //
    // }

    numFields = columns.length;
    for (int i = 0; i < numFields; i++) {
      if (columns[i].equals(idField)) {
        idColumn = i;
        break;
      }
    }
  }

  @Override
  public void startCrawling() {

    state = ConnectorState.RUNNING;
    // compile the map to for header to column number.
    // TODO: add a directory traversal ..
    // log.info("Starting CSV Connector");
    File fileToCrawl = new File(filename);
    if (!fileToCrawl.exists()) {
      // error. file not found.
      System.out.println("File not found..." + filename);
      return;
    }

    FileReader reader = null;
    try {
      reader = new FileReader(fileToCrawl);
    } catch (FileNotFoundException e) {
      // This should not happen
      e.printStackTrace();
    }
    CSVReader csvReader = new CSVReader(reader, separator.charAt(0));
    if (firstRowAsColumns) {
      // we should read the first row as the column header
      try {
        columns = csvReader.readNext();
      } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      }
    }
    // pick out which column has the primary key / id field.
    initialize();

    int rowNum = 0;
    String[] nextLine;

    try {

      while ((nextLine = csvReader.readNext()) != null) {
        // TODO: replace this with connector state, and make private isRunning
        // again.
        if (!state.equals(ConnectorState.RUNNING)) {
          // we've been interrupted.
          log.info("Crawl interrupted, stopping crawl.");
          state = ConnectorState.INTERRUPTED;
          break;
        }
        rowNum++;
        if (rowNum <= skipRows) {
          continue;
        }
        String id;
        if (useRowAsId) {
          id = getDocIdPrefix() + rowNum;
        } else {
          id = getDocIdPrefix() + nextLine[idColumn];
        }
        Document docToSend = new Document(id);
        for (int i = 0; i < numFields; i++) {
          String v = nextLine[i];
          if (!StringUtil.isEmpty(v)) {
            docToSend.addToField(columns[i], v);
          }
        }
        feed(docToSend);
      }
    } catch (IOException e) {
      // TODO Auto-generated catch block
      // shouldn't see this.. but who knows.
      e.printStackTrace();
      log.error("IO Exception during crawl. {}", e.getMessage());
      // TODO: re-throw something else?
    }

    // Lets poll until our outbox has been completely picked up.
    while (outbox.size() > 0) {
      // wait until our outbox has drained before going to stopped?
      try {
        log.info("Waiting for outbox to drain. Size: {}", outbox.size());
        Thread.sleep(10);
      } catch (InterruptedException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      }
    }

    // TODO: why the heck does this not block until we're done as we expect?!?!
    state = ConnectorState.STOPPED;
    flush();
    // TODO: push this state management to the base class?

  }

  @Override
  public void stopCrawling() {
    // TODO Auto-generated method stub
  }

  public String getFilename() {
    return filename;
  }

  public void setFilename(String filename) {
    this.filename = filename;
  }

  public String[] getColumns() {
    return columns;
  }

  public void setColumns(String[] columns) {
    this.columns = columns;
  }

  public String getIdField() {
    return idField;
  }

  public void setIdField(String idField) {
    this.idField = idField;
  }

  public String getSeparator() {
    return separator;
  }

  public void setSeparator(String separator) {
    this.separator = separator;
  }

  public int getNumFields() {
    return numFields;
  }

  public void setNumFields(int numFields) {
    this.numFields = numFields;
  }

  public int getIdColumn() {
    return idColumn;
  }

  public void setIdColumn(int idColumn) {
    this.idColumn = idColumn;
  }

  /**
   * This static method returns all the details of the class without it having
   * to be constructed. It has description, categories, dependencies, and peer
   * definitions.
   * 
   * @return ServiceType - returns all the data
   * 
   */
  static public ServiceType getMetaData() {

    ServiceType meta = new ServiceType(CSVConnector.class.getCanonicalName());
    meta.addDescription("This service crawls a csv file and publishes each row as a document");
    meta.addCategory("ingest");
    return meta;
  }

}