package org.hipi.tools.downloader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;

public class DownloaderRecordReader extends RecordReader<LongWritable, Text> {

  private long startLine;      // first line of this split's range within the URL list
  private long linesRead;      // total lines read so far by this record reader instance
  private long numLines;       // number of lines assigned to this split
  private long linesPerRecord; // lines batched into each key/value pair
  private long currentKey;     // line number of the first URL in the current value
  private String urls;         // newline-separated batch of URLs for the current value
  private BufferedReader reader;

  @Override
  public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {

    // Obtain path to the input list of image URLs and open an input stream
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();
    FileSystem fileSystem = path.getFileSystem(context.getConfiguration());
    FSDataInputStream fileIn = fileSystem.open(path);

    // Note that the start and length fields in the FileSplit object are used to
    // convey a range of *lines* in the input list of image URLs, not byte offsets
    startLine = fileSplit.getStart();
    numLines = fileSplit.getLength();
    linesRead = 0;
    linesPerRecord = 100; // can be tuned to change key/value pair size (may improve efficiency)

    // If one exists, obtain the compression codec for this file
    CompressionCodecFactory codecFactory = new CompressionCodecFactory(context.getConfiguration());
    CompressionCodec codec = codecFactory.getCodec(path);

    // If a codec was found, use it to create a decompressed input stream;
    // otherwise, assume the input stream is uncompressed
    if (codec != null) {
      reader = new BufferedReader(new InputStreamReader(codec.createInputStream(fileIn)));
    } else {
      reader = new BufferedReader(new InputStreamReader(fileIn));
    }

    // Skip ahead to the first line of this split's range; without this, every
    // record reader over a shared input file would re-read the same leading lines
    for (long i = 0; i < startLine; i++) {
      if (reader.readLine() == null) {
        throw new IOException("Unexpected EOF while skipping to start of input split.");
      }
    }
  }

  // Report progress through this split's range of lines
  @Override
  public float getProgress() {
    return (numLines == 0 ? 0.0f : ((float) linesRead) / ((float) numLines));
  }

  @Override
  public void close() throws IOException {
    if (reader != null) {
      reader.close();
    }
  }

  @Override
  public LongWritable getCurrentKey() throws IOException, InterruptedException {
    return new LongWritable(currentKey);
  }

  @Override
  public Text getCurrentValue() throws IOException, InterruptedException {
    return new Text(urls);
  }

  @Override
  public boolean nextKeyValue() throws IOException, InterruptedException {

    // If the record reader has reached the end of its range, stop now
    if (linesRead >= numLines) {
      return false;
    }

    // The key is the line number of the first URL in this batch
    currentKey = startLine + linesRead;

    // Collect up to linesPerRecord lines (set in initialize() above) into a
    // single newline-separated value; a StringBuilder avoids the quadratic
    // cost of repeated string concatenation
    StringBuilder builder = new StringBuilder();
    for (int i = 0; (i < linesPerRecord) && (linesRead < numLines); i++) {
      String line = reader.readLine();
      if (line == null) {
        throw new IOException("Unexpected EOF while retrieving next line from input split.");
      }
      builder.append(line).append("\n");
      linesRead++;
    }
    urls = builder.toString();

    // At least one line was successfully read, so a valid record is available
    return true;
  }
}
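
// ---------------------------------------------------------------------------
// The sketch below is illustrative only and is NOT part of HIPI: a minimal
// InputFormat showing how the line-range FileSplit convention consumed by
// DownloaderRecordReader above might be produced. The class name, the
// LINES_PER_SPLIT value, and the countLines() helper are all hypothetical;
// fully qualified names are used for types not imported above, and the line
// counter ignores compression for brevity.
// ---------------------------------------------------------------------------
class LineRangeInputFormatSketch
    extends org.apache.hadoop.mapreduce.lib.input.FileInputFormat<LongWritable, Text> {

  private static final long LINES_PER_SPLIT = 1000; // hypothetical tuning knob

  @Override
  public java.util.List<InputSplit> getSplits(org.apache.hadoop.mapreduce.JobContext context)
      throws IOException {
    java.util.List<InputSplit> splits = new java.util.ArrayList<InputSplit>();
    for (org.apache.hadoop.fs.FileStatus status : listStatus(context)) {
      Path path = status.getPath();
      long totalLines = countLines(path, context.getConfiguration());
      for (long start = 0; start < totalLines; start += LINES_PER_SPLIT) {
        long length = Math.min(LINES_PER_SPLIT, totalLines - start);
        // start and length carry LINE numbers, not byte offsets: the same
        // convention that DownloaderRecordReader.initialize() decodes
        splits.add(new FileSplit(path, start, length, new String[0]));
      }
    }
    return splits;
  }

  @Override
  public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
      TaskAttemptContext context) {
    return new DownloaderRecordReader();
  }

  // Count the lines in the URL list with one pass over the file
  private static long countLines(Path path, Configuration conf) throws IOException {
    BufferedReader lineReader =
        new BufferedReader(new InputStreamReader(path.getFileSystem(conf).open(path)));
    try {
      long count = 0;
      while (lineReader.readLine() != null) {
        count++;
      }
      return count;
    } finally {
      lineReader.close();
    }
  }
}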