package org.archive.hadoop.pig;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.logging.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.LineReader;
import org.apache.pig.tools.counters.PigCounterHelper;
import org.archive.util.zip.OpenJDK7GZIPInputStream;
/**
 * A {@link RecordReader} that opens an HTTP connection to a URL and emits the
 * response body one line at a time.
 *
 * <p>The key is the cumulative number of bytes consumed so far (as reported by
 * {@link LineReader#readLine(Text)}), updated <em>after</em> each line is read;
 * the value is the text of the line. When gzip is enabled in the job
 * configuration, {@code HttpTextLoader.GZIP_PARAM} is appended to the URL and
 * the response stream is wrapped in a gzip-decoding stream.
 */
public class HttpInputLineRecordReader extends RecordReader<LongWritable, Text> {

    // NOTE(review): the logger is registered under HttpTextLoader's name, not this
    // class's. Left unchanged so existing logging configuration keeps working.
    private final static Logger LOGGER =
        Logger.getLogger(HttpTextLoader.class.getName());

    /** Cumulative bytes consumed from the line reader so far. */
    protected LongWritable key;
    /** The most recently read line. */
    protected Text value;
    /** Number of lines returned so far. */
    protected int linesRead = 0;
    /** Optional cap on lines to return; values &lt;= 0 mean "no limit". */
    protected long maxLines = 0;
    /** Line-count estimate taken from the response header; 0 when unknown. */
    protected long totalLines = 0;
    protected String splitInfo;
    // Not assigned or read anywhere in this class; kept for subclasses.
    protected Counter counter;
    protected String urlString;
    protected HttpURLConnection conn;
    protected LineReader reader;
    protected PigCounterHelper counterHelper;

    protected final static String HTTP_INPUT_COUNTER_GROUP = "Http Input";
    protected final static String LINE_COUNTER = "Lines Read";
    protected final static String BYTE_COUNTER = "Bytes Read";

    /**
     * Creates a reader for the given URL. No connection is opened until
     * {@link #initialize(InputSplit, TaskAttemptContext)} is called.
     *
     * @param urlString the URL to fetch
     * @param split     the split number, used only to build {@code splitInfo}
     * @throws IOException declared for interface compatibility with callers
     */
    public HttpInputLineRecordReader(String urlString, int split) throws IOException
    {
        this.urlString = urlString;
        this.key = new LongWritable(0);
        this.value = new Text("");
        splitInfo = "Split #" + split + " ";
        counterHelper = new PigCounterHelper();
    }

    /**
     * Closes the line reader and disconnects the HTTP connection. Safe to call
     * repeatedly, and safe to call before {@code initialize}.
     *
     * @throws IOException if closing the line reader fails; the connection is
     *                     still disconnected in that case
     */
    @Override
    public synchronized void close() throws IOException {
        try {
            if (reader != null) {
                reader.close();
            }
        } finally {
            // Always release the connection, even if closing the reader failed.
            reader = null;
            if (conn != null) {
                conn.disconnect();
                conn = null;
            }
        }
    }

    /** @return the cumulative byte count after the last line read */
    @Override
    public LongWritable getCurrentKey() {
        return key;
    }

    /** @return the last line read */
    @Override
    public Text getCurrentValue() {
        return value;
    }

    /**
     * Reads the next line into the current value and advances the key by the
     * number of bytes consumed.
     *
     * @return {@code true} if a line was read; {@code false} at end of input or
     *         once {@code maxLines} (when positive) has been reached
     * @throws IOException if the reader has not been initialized (or has been
     *                     closed), or if the underlying read fails
     */
    @Override
    public boolean nextKeyValue() throws IOException {
        if (reader == null) {
            throw new IOException(
                "Record reader is not initialized or already closed: " + urlString);
        }

        if ((maxLines > 0) && (linesRead >= maxLines)) {
            return false;
        }

        long bytesRead = reader.readLine(value);
        if (bytesRead <= 0) {
            return false;
        }

        linesRead++;
        incCounters(bytesRead);
        key.set(key.get() + bytesRead);
        return true;
    }

    /**
     * Increments the line counter by one and the byte counter by
     * {@code bytesRead} in the {@value #HTTP_INPUT_COUNTER_GROUP} group.
     *
     * @param bytesRead number of bytes consumed for the line just read
     */
    public void incCounters(long bytesRead) {
        counterHelper.incrCounter(HTTP_INPUT_COUNTER_GROUP, LINE_COUNTER, 1);
        counterHelper.incrCounter(HTTP_INPUT_COUNTER_GROUP, BYTE_COUNTER, bytesRead);
    }

    /**
     * Reports progress as lines read over the estimated total.
     *
     * @return a value in {@code [0, 1]}; 0 when no estimate is available
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        if (totalLines > 0) {
            // The total is only an estimate, so clamp in case more lines arrive.
            return Math.min(1.0f, (float) linesRead / (float) totalLines);
        }
        return 0;
    }

    /**
     * Opens the HTTP connection and prepares the line reader. Any previously
     * opened connection is closed first. If anything fails part-way, the
     * connection opened here is released before the exception propagates.
     *
     * @param split   unused by this implementation
     * @param context supplies the job configuration (gzip on/off)
     * @throws IOException if the URL is malformed or the connection fails
     */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {

        close();

        Configuration conf = context.getConfiguration();
        boolean useGzip = conf.getBoolean(HttpTextLoader.HTTP_TEXTLOADER_GZIP, true);

        // Guard against appending the parameter again on re-initialization.
        if (useGzip && !urlString.endsWith(HttpTextLoader.GZIP_PARAM)) {
            urlString += HttpTextLoader.GZIP_PARAM;
        }

        URL url = new URL(urlString);
        LOGGER.info("Loader initialize - " + urlString);

        boolean initialized = false;
        try {
            conn = (HttpURLConnection) url.openConnection();
            conn.connect();

            String linesEstimate = conn.getHeaderField(HttpTextLoader.NUM_LINES_HEADER_FIELD);
            if (linesEstimate != null) {
                try {
                    // totalLines is a long; parse as such so large counts are not lost.
                    totalLines = Long.parseLong(linesEstimate.trim());
                } catch (NumberFormatException n) {
                    // The estimate only drives progress reporting, so carry on without it.
                    LOGGER.warning("Ignoring unparseable "
                        + HttpTextLoader.NUM_LINES_HEADER_FIELD
                        + " header value: " + linesEstimate);
                }
            }

            InputStream is = conn.getInputStream();
            if (useGzip) {
                is = new OpenJDK7GZIPInputStream(is);
            }

            reader = new LineReader(is);
            initialized = true;
        } finally {
            if (!initialized) {
                // reader is still null here, so this only disconnects conn.
                close();
            }
        }
    }

    /**
     * @return the URL this reader fetches; after {@code initialize} with gzip
     *         enabled this includes the appended gzip parameter
     */
    public String getUrl()
    {
        return urlString;
    }
}