package org.myrobotlab.service;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import org.myrobotlab.document.Document;
import org.myrobotlab.document.connector.AbstractConnector;
import org.myrobotlab.document.connector.ConnectorState;
import org.myrobotlab.document.transformer.ConnectorConfig;
import org.myrobotlab.framework.ServiceType;
import org.myrobotlab.string.StringUtil;
import au.com.bytecode.opencsv.CSVReader;
public class CSVConnector extends AbstractConnector {
private static final long serialVersionUID = 1L;
private String filename;
private String[] columns;
private String idField;
private String separator = ",";
private int numFields;
private int idColumn = -1;
private boolean useRowAsId = true;
private int skipRows = 1;
private boolean firstRowAsColumns = false;
public CSVConnector(String name) {
super(name);
}
@Override
public void setConfig(ConnectorConfig config) {
// TODO: remove side effects of a "setter"
// the parsing of the config should be handled elsewhere? maybe initialize?
// TODO: validate the config options are valid.
setDocIdPrefix(config.getStringParam("docIdPrefix", ""));
filename = config.getProperty("filename");
columns = config.getStringArray("columns");
idField = config.getProperty("idField");
separator = config.getProperty("separator");
numFields = config.getIntegerParam("numFields", numFields);
// this is computed in initialize.
// idColumn = config.getProperty("idColumn");
useRowAsId = config.getBoolParam("useRowAsId", useRowAsId);
skipRows = config.getIntegerParam("skipRows", skipRows);
firstRowAsColumns = config.getBoolParam("firstRowAsColumns", firstRowAsColumns);
}
public void initialize() {
// filename = config.getProperty("filename", "data/myfile.csv");
// columns = config.getProperty("columnnames",
// "id,column1,column2").split(",");
// idField = config.getProperty("idcolumn", "id");
// idPrefix = config.getProperty("idprefix", "doc_");
// separator = config.getProperty("separator", ",");
// if (separator.length() > 1) {
// // This is an error condition we can only have a character as a
// separator.
//
// }
numFields = columns.length;
for (int i = 0; i < numFields; i++) {
if (columns[i].equals(idField)) {
idColumn = i;
break;
}
}
}
@Override
public void startCrawling() {
state = ConnectorState.RUNNING;
// compile the map to for header to column number.
// TODO: add a directory traversal ..
// log.info("Starting CSV Connector");
File fileToCrawl = new File(filename);
if (!fileToCrawl.exists()) {
// error. file not found.
System.out.println("File not found..." + filename);
return;
}
FileReader reader = null;
try {
reader = new FileReader(fileToCrawl);
} catch (FileNotFoundException e) {
// This should not happen
e.printStackTrace();
}
CSVReader csvReader = new CSVReader(reader, separator.charAt(0));
if (firstRowAsColumns) {
// we should read the first row as the column header
try {
columns = csvReader.readNext();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
// pick out which column has the primary key / id field.
initialize();
int rowNum = 0;
String[] nextLine;
try {
while ((nextLine = csvReader.readNext()) != null) {
// TODO: replace this with connector state, and make private isRunning
// again.
if (!state.equals(ConnectorState.RUNNING)) {
// we've been interrupted.
log.info("Crawl interrupted, stopping crawl.");
state = ConnectorState.INTERRUPTED;
break;
}
rowNum++;
if (rowNum <= skipRows) {
continue;
}
String id;
if (useRowAsId) {
id = getDocIdPrefix() + rowNum;
} else {
id = getDocIdPrefix() + nextLine[idColumn];
}
Document docToSend = new Document(id);
for (int i = 0; i < numFields; i++) {
String v = nextLine[i];
if (!StringUtil.isEmpty(v)) {
docToSend.addToField(columns[i], v);
}
}
feed(docToSend);
}
} catch (IOException e) {
// TODO Auto-generated catch block
// shouldn't see this.. but who knows.
e.printStackTrace();
log.error("IO Exception during crawl. {}", e.getMessage());
// TODO: re-throw something else?
}
// Lets poll until our outbox has been completely picked up.
while (outbox.size() > 0) {
// wait until our outbox has drained before going to stopped?
try {
log.info("Waiting for outbox to drain. Size: {}", outbox.size());
Thread.sleep(10);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
// TODO: why the heck does this not block until we're done as we expect?!?!
state = ConnectorState.STOPPED;
flush();
// TODO: push this state management to the base class?
}
@Override
public void stopCrawling() {
// TODO Auto-generated method stub
}
public String getFilename() {
return filename;
}
public void setFilename(String filename) {
this.filename = filename;
}
public String[] getColumns() {
return columns;
}
public void setColumns(String[] columns) {
this.columns = columns;
}
public String getIdField() {
return idField;
}
public void setIdField(String idField) {
this.idField = idField;
}
public String getSeparator() {
return separator;
}
public void setSeparator(String separator) {
this.separator = separator;
}
public int getNumFields() {
return numFields;
}
public void setNumFields(int numFields) {
this.numFields = numFields;
}
public int getIdColumn() {
return idColumn;
}
public void setIdColumn(int idColumn) {
this.idColumn = idColumn;
}
/**
* This static method returns all the details of the class without it having
* to be constructed. It has description, categories, dependencies, and peer
* definitions.
*
* @return ServiceType - returns all the data
*
*/
static public ServiceType getMetaData() {
ServiceType meta = new ServiceType(CSVConnector.class.getCanonicalName());
meta.addDescription("This service crawls a csv file and publishes each row as a document");
meta.addCategory("ingest");
return meta;
}
}