package org.hackreduce.storm;

import au.com.bytecode.opencsv.CSVReader;
import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichSpout;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;

import java.io.FileReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;

/**
 * This spout reads data from a CSV file. It is only suitable for testing in local mode.
 *
 * <p>Each CSV row is emitted as one tuple whose values are the row's columns. The message id
 * attached to every tuple is the 1-based count of data rows emitted so far. Output field names
 * are taken from the header row when one is present; otherwise they are generated as
 * {@code field_0}, {@code field_1}, ...
 *
 * <p>Failed tuples are only reported on standard error; they are not replayed.
 */
public class CsvSpout extends BaseRichSpout {
    private final String fileName;
    private final char separator;
    private final boolean includesHeaderRow;
    private final AtomicLong linesRead;

    private SpoutOutputCollector _collector;
    private CSVReader reader;
    /** Set once the end of the file has been reached, so the end-of-file message is printed only once. */
    private boolean finished;

    /**
     * @param filename          path of the CSV file to read
     * @param separator         column separator character handed to the CSV reader
     * @param includesHeaderRow {@code true} if the first row holds column names rather than data
     */
    public CsvSpout(String filename, char separator, boolean includesHeaderRow) {
        this.fileName = filename;
        this.separator = separator;
        this.includesHeaderRow = includesHeaderRow;
        this.linesRead = new AtomicLong(0);
    }

    /**
     * Stores the collector and opens the CSV file, skipping the header row if one was declared.
     *
     * @throws RuntimeException wrapping any failure to open or read the file
     */
    @Override
    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        _collector = collector;
        finished = false;
        try {
            reader = new CSVReader(new FileReader(fileName), separator);
            // read and ignore the header if one exists
            if (includesHeaderRow) {
                reader.readNext();
            }
        } catch (Exception e) {
            // do not leave a half-initialised reader (and its file handle) behind
            closeReader();
            throw new RuntimeException(e);
        }
    }

    /**
     * Emits the next CSV row, if any. When the end of the file is reached a summary line is
     * printed once and the reader is closed; later calls return without doing anything.
     * Read errors are printed and otherwise ignored so that a bad row does not kill the spout.
     */
    @Override
    public void nextTuple() {
        if (finished) {
            // NOTE(review): Storm is expected to keep calling nextTuple after the file is
            // exhausted; returning here avoids re-printing the end-of-file message each time.
            return;
        }
        try {
            String[] line = reader.readNext();
            if (line != null) {
                long id = linesRead.incrementAndGet();
                // the String[] is passed as the varargs array, so each column becomes one tuple value
                _collector.emit(new Values(line), id);
            } else {
                finished = true;
                System.out.println("Finished reading file, "+linesRead.get()+" lines read");
                closeReader();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Releases the underlying file reader if it is still open. */
    @Override
    public void close() {
        closeReader();
    }

    /** Acknowledgements are ignored; nothing is tracked per tuple. */
    @Override
    public void ack(Object id) {
    }

    /** Reports the failed tuple id on standard error. The tuple is not re-emitted. */
    @Override
    public void fail(Object id) {
        System.err.println("Failed tuple with id "+id);
    }

    /**
     * Declares the output fields by reading the first row of the CSV file with a temporary
     * reader, which is always closed before returning.
     *
     * @throws RuntimeException if the file cannot be read, or (as an
     *                          {@link IllegalStateException}) if it contains no rows
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        CSVReader headerReader = null;
        try {
            headerReader = new CSVReader(new FileReader(fileName), separator);
            // read csv header to get field info
            String[] fields = headerReader.readNext();
            if (fields == null) {
                throw new IllegalStateException(
                        "Cannot declare output fields, CSV file has no rows: " + fileName);
            }
            if (includesHeaderRow) {
                System.out.println("DECLARING OUTPUT FIELDS");
                for (String a : fields) {
                    System.out.println(a);
                }
                declarer.declare(new Fields(Arrays.asList(fields)));
            } else {
                // if there are no headers, just use field_index naming convention
                ArrayList<String> f = new ArrayList<String>(fields.length);
                for (int i = 0; i < fields.length; i++) {
                    f.add("field_"+i);
                }
                declarer.declare(new Fields(f));
            }
        } catch (RuntimeException e) {
            // already unchecked; rethrow as-is instead of wrapping a second time
            throw e;
        } catch (Exception e) {
            throw new RuntimeException(e);
        } finally {
            if (headerReader != null) {
                try {
                    headerReader.close();
                } catch (Exception ignored) {
                    // the fields have already been declared (or a more useful exception is in
                    // flight); a failure to close the temporary reader must not mask that
                }
            }
        }
    }

    /** Closes the main reader, if open, and clears the reference. Close failures are only printed. */
    private void closeReader() {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            reader = null;
        }
    }
}