//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.contentextractors;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.List;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import com.opencsv.CSVReader;
import uk.gov.dstl.baleen.contentextractors.helpers.AbstractContentExtractor;
import uk.gov.dstl.baleen.types.metadata.Metadata;
/**
* Takes a single line of CSV data, and splits it into 'columns' based on the specified separator character.
* The column designated as the main content is set as the JCas body, and other columns are added as Metadata annotations.
*
* @baleen.javadoc
*/
public class CsvContentExtractor extends AbstractContentExtractor {
/**
* Separator to split columns by.
* Can only be a single character, and if more than one character is provided only the first will be used.
*
* @baleen.config ,
*/
public static final String PARAM_SEPARATOR = "separator";
@ConfigurationParameter(name = PARAM_SEPARATOR, defaultValue = ",")
String separator;
/**
* The column number containing the content
*
* @baleen.config 1
*/
public static final String PARAM_CONTENT_COLUMN = "contentColumn";
@ConfigurationParameter(name = PARAM_CONTENT_COLUMN, defaultValue = "1")
Integer contentColumn;
/**
* A list of column headings (in order, skipping the content column) to use as metadata keys.
* If not provided, each column will be called column# where the # represents the column number.
*
* @baleen.config
*/
public static final String PARAM_COLUMNS = "columns";
@ConfigurationParameter(name = PARAM_COLUMNS, defaultValue = {})
List<String> columns;
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
super.doProcessStream(stream, source, jCas);
try(
CSVReader reader = new CSVReader(new InputStreamReader(stream, StandardCharsets.UTF_8), separator.charAt(0))
){
String[] cols = reader.readNext();
if(cols == null || cols.length < contentColumn){
throw new IOException("Not enough columns");
}
for(int i = 0; i < cols.length; i++){
if(i == (contentColumn - 1)){
jCas.setDocumentText(cols[i]);
}else{
addMetadata(jCas, i, cols[i]);
}
}
}
}
private void addMetadata(JCas jCas, Integer index, String value){
Metadata md = new Metadata(jCas);
Integer colNameIndex = index;
if(index >= contentColumn)
colNameIndex--;
if(colNameIndex >= columns.size() || columns.get(colNameIndex).trim().isEmpty()){
md.setKey("column" + (index + 1));
}else{
md.setKey(columns.get(colNameIndex).trim());
}
md.setValue(value);
getSupport().add(md);
}
}