package hadoop.webtables.wordcount;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Maps (Text, BytesWritable) records, whose values hold a CSV table, to
 * (word, 1) pairs: one pair per CSV field, counted via a custom counter.
 */
public class WordcountMapper extends Mapper<Text, BytesWritable, Text, IntWritable> {

    enum CountersEnum { INPUT_WORDS }

    private static final IntWritable one = new IntWritable(1);
    private final Text word = new Text();

    @Override
    public void map(Text key, BytesWritable value, Context context)
            throws IOException, InterruptedException {
        // BytesWritable's backing array may be longer than the logical
        // payload, so decode only the first getLength() bytes. This replaces
        // the original ByteArrayInputStream round-trip, which also ignored
        // the return value of read() and relied on the platform charset.
        String csv = new String(value.getBytes(), 0, value.getLength(),
                StandardCharsets.UTF_8);

        // try-with-resources closes the parser even if a write fails. The
        // original empty catch (Exception e) {} is dropped: silently
        // swallowing exceptions hides both parse errors and failed writes.
        try (CSVParser parser = CSVParser.parse(csv, CSVFormat.DEFAULT)) {
            for (CSVRecord csvRecord : parser) {
                for (String field : csvRecord) {
                    // Emit (field, 1) and bump the word counter.
                    word.set(field);
                    context.write(word, one);
                    Counter counter = context.getCounter(
                            CountersEnum.class.getName(),
                            CountersEnum.INPUT_WORDS.toString());
                    counter.increment(1);
                }
            }
        }
    }
}
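
// A minimal driver sketch, not part of the original file, showing how this
// mapper might be wired into a Job. It assumes the input is a SequenceFile
// of (Text, BytesWritable) pairs with one CSV table per record, and reuses
// Hadoop's stock IntSumReducer to sum the (word, 1) pairs. The class name
// WordcountDriver and the two-argument in/out CLI are illustrative
// assumptions. Kept package-private so the file still compiles with a
// single public class; fully qualified names avoid touching the imports
// above.
class WordcountDriver {
    public static void main(String[] args) throws Exception {
        org.apache.hadoop.conf.Configuration conf =
                new org.apache.hadoop.conf.Configuration();
        org.apache.hadoop.mapreduce.Job job =
                org.apache.hadoop.mapreduce.Job.getInstance(conf, "csv word count");
        job.setJarByClass(WordcountDriver.class);
        job.setMapperClass(WordcountMapper.class);
        // IntSumReducer is associative and commutative, so it doubles as a
        // combiner to shrink shuffle traffic.
        job.setCombinerClass(org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer.class);
        job.setReducerClass(org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Assumption: CSV blobs arrive as SequenceFile records, matching the
        // mapper's (Text, BytesWritable) input signature.
        job.setInputFormatClass(
                org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat.class);
        org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(
                job, new org.apache.hadoop.fs.Path(args[0]));
        org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(
                job, new org.apache.hadoop.fs.Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}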