package org.archive.hadoop.pig;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.builtin.TextLoader;
/**
 * Pig text loader whose input is a paged HTTP endpoint rather than a file
 * system path.
 *
 * <p>{@link #setLocation(String, Job)} stores the (URL-decoded) location in
 * the job configuration and determines how many pages the endpoint exposes.
 * The input format returned by {@link #getInputFormat()} then creates one
 * {@code HttpClusterInputSplit} per page and reads each split through either
 * an {@code HttpZipNumDerefLineRecordReader} or an
 * {@code HttpInputLineRecordReader}.
 */
public class HttpTextLoader extends TextLoader {

    private static final Logger LOGGER =
        Logger.getLogger(HttpTextLoader.class.getName());

    // ---- Job configuration keys -------------------------------------------

    /** Configuration key holding the decoded base URL to load from. */
    protected static final String HTTP_TEXTLOADER_URL = "httptextloader.url";

    /** Configuration key holding the number of pages, i.e. input splits. */
    protected static final String HTTP_TEXTLOADER_NUM_SPLITS = "httptextloader.numSplits";

    /** Configuration key for gzip handling; not referenced in this class. */
    protected static final String HTTP_TEXTLOADER_GZIP = "httptextloader.gzip";

    /** Configuration key holding the cluster URI reported by the server. */
    protected static final String HTTP_TEXTLOADER_ZIPNUM_CLUSTER = "httptextloader.clusterUri";

    /** Configuration key passed to the ZipNum record reader (default 1). */
    protected static final String HTTP_TEXTLOADER_MAX_AGGREGATE_BLOCKS = "httptextloader.maxAggregateBlocks";

    /** Configuration key forcing the plain HTTP line reader when {@code true}. */
    protected static final String HTTP_TEXTLOADER_HTTP_LOAD_CDX = "httptextloader.httpLoadCdx";

    /** Configuration key holding the value sent as the {@code cdx_auth_token} cookie. */
    protected static final String HTTP_TEXTLOADER_AUTH = "httptextloader.authCookie";

    // ---- Query-string fragments appended to the base URL ------------------

    protected static final String SHOW_NUM_PAGES = "&showNumPages=true";
    protected static final String GZIP_PARAM = "&gzip=true";
    protected static final String INDEX_ONLY_PARAM = "&showPagedIndex=true";
    protected static final String PAGE_PARAM = "&page=";

    // ---- Response headers read by queryLineCount --------------------------

    protected static final String NUM_PAGES_HEADER_FIELD = "X-CDX-Num-Pages";
    protected static final String CLUSTER_URI_HEADER_FIELD = "X-CDX-Cluster-Uri";

    /** Creates a loader with default settings. */
    public HttpTextLoader()
    {
        super();
    }

    /**
     * Creates a loader from an option/value pair.
     *
     * <p>Both arguments are accepted but currently ignored: the earlier
     * handling of the {@code "splits"} and {@code "maxLines"} options is
     * disabled, so this behaves like the no-argument constructor.
     *
     * @param option option name (ignored)
     * @param param  option value (ignored)
     */
    public HttpTextLoader(String option, String param)
    {
        super();
    }

    /**
     * Builds the URL for a single page by appending {@code &page=<page>} to
     * the given base URL.
     *
     * @param url  base URL; must not be {@code null}
     * @param page page index to request
     * @return the base URL with the page parameter appended
     */
    public static String getPageUrl(String url, int page)
    {
        return new StringBuilder(url)
            .append(PAGE_PARAM)
            .append(page)
            .toString();
    }

    /**
     * Returns an input format that produces one split per page of the
     * configured URL and reads each split over HTTP.
     */
    @Override
    public InputFormat<LongWritable, Text> getInputFormat() {
        return new InputFormat<LongWritable, Text>()
        {
            /**
             * Creates the reader for one page.
             *
             * <p>The ZipNum dereferencing reader is used when a cluster URI
             * is present in the configuration and
             * {@code httptextloader.httpLoadCdx} is not enabled; otherwise
             * the plain HTTP line reader is used.
             *
             * @throws RuntimeException if the split is not an
             *         {@code HttpClusterInputSplit}, or wrapping any
             *         {@link IOException} raised while creating the reader
             */
            @Override
            public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
                    TaskAttemptContext job) {

                if (!(split instanceof HttpClusterInputSplit)) {
                    throw new RuntimeException("Wrong Input Split, must be HttpClusterInputSplit");
                }

                HttpClusterInputSplit clusterSplit = (HttpClusterInputSplit) split;
                Configuration conf = job.getConfiguration();
                String clusterUri = conf.get(HTTP_TEXTLOADER_ZIPNUM_CLUSTER);
                boolean httpLoadCdx = conf.getBoolean(HTTP_TEXTLOADER_HTTP_LOAD_CDX, false);

                try {
                    if (httpLoadCdx || clusterUri == null) {
                        return new HttpInputLineRecordReader(
                            clusterSplit.getUrl(), clusterSplit.getSplit());
                    }

                    // Read via conf.getInt, consistent with the other integer
                    // settings in this class.
                    int maxAggBlocks = conf.getInt(HTTP_TEXTLOADER_MAX_AGGREGATE_BLOCKS, 1);
                    return new HttpZipNumDerefLineRecordReader(
                        clusterUri,
                        clusterSplit.getUrl() + INDEX_ONLY_PARAM,
                        clusterSplit.getSplit(),
                        maxAggBlocks);
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }

            /**
             * Creates one split per page, using the URL and page count
             * stored in the job configuration.
             *
             * @throws IOException if no URL has been stored in the
             *         configuration
             */
            @Override
            public List<InputSplit> getSplits(JobContext context)
                    throws IOException {

                Configuration conf = context.getConfiguration();
                String url = conf.get(HTTP_TEXTLOADER_URL);
                if (url == null) {
                    // Fail with a descriptive error instead of a bare
                    // NullPointerException from getPageUrl.
                    throw new IOException(
                        HTTP_TEXTLOADER_URL + " is not set in the job configuration");
                }

                int numPages = conf.getInt(HTTP_TEXTLOADER_NUM_SPLITS, 1);
                LOGGER.info("getSplits - " + numPages + " " + url);

                List<InputSplit> splits =
                    new ArrayList<InputSplit>(Math.max(numPages, 0));
                for (int page = 0; page < numPages; page++) {
                    splits.add(new HttpClusterInputSplit(getPageUrl(url, page), page, numPages));
                }
                return splits;
            }
        };
    }

    /**
     * Records the load location and page count in the job configuration.
     *
     * <p>If a URL is already stored under {@code httptextloader.url} it is
     * reused and {@code location} is ignored; otherwise {@code location} is
     * URL-decoded (UTF-8) and stored. If no page count is stored yet, the
     * server is queried for it via {@link #queryLineCount}.
     *
     * @param location the location string supplied to the loader
     * @param job      the job whose configuration is updated
     * @throws IOException if the location cannot be decoded
     */
    @Override
    public void setLocation(String location, Job job) throws IOException {
        Configuration conf = job.getConfiguration();

        String savedLoc = conf.get(HTTP_TEXTLOADER_URL);
        if (savedLoc != null) {
            location = savedLoc;
        } else {
            location = URLDecoder.decode(location, "UTF-8");
            conf.set(HTTP_TEXTLOADER_URL, location);
        }

        int numPages = conf.getInt(HTTP_TEXTLOADER_NUM_SPLITS, -1);
        if (numPages == -1) {
            numPages = queryLineCount(location, conf);
        }

        // NOTE(review): queryLineCount returns 0 (not a negative value) when
        // the request fails or the header is absent, so this fallback only
        // applies to negative values; a count of 0 yields zero splits.
        // Confirm whether 0 should also fall back to a single page.
        if (numPages < 0) {
            numPages = 1;
        }

        conf.setInt(HTTP_TEXTLOADER_NUM_SPLITS, numPages);
        LOGGER.info("setLocation - " + numPages + " " + location);
    }

    /**
     * Asks the server how many pages the given URL spans.
     *
     * <p>Issues a {@code HEAD} request to {@code url + "&showNumPages=true"},
     * sending the configured auth cookie if one is set. Despite the method
     * name, the returned value is read from the {@code X-CDX-Num-Pages}
     * response header. As a side effect, a {@code X-CDX-Cluster-Uri} header,
     * when present, is stored in {@code conf} under
     * {@code httptextloader.clusterUri}.
     *
     * @param url  base URL to query
     * @param conf configuration read for the auth cookie and updated with
     *             the cluster URI
     * @return the page count from the response header, or 0 if the header is
     *         missing or unparsable or the request fails
     */
    protected int queryLineCount(String url, Configuration conf)
    {
        HttpURLConnection conn = null;
        int numPages = 0;

        try {
            URL theURL = new URL(url + SHOW_NUM_PAGES);
            conn = (HttpURLConnection) theURL.openConnection();
            conn.setRequestMethod("HEAD");

            String authCookie = conf.get(HttpTextLoader.HTTP_TEXTLOADER_AUTH);
            if (authCookie != null) {
                conn.setRequestProperty("Cookie", "cdx_auth_token=" + authCookie);
            }

            conn.connect();

            numPages = conn.getHeaderFieldInt(NUM_PAGES_HEADER_FIELD, 0);

            String clusterUri = conn.getHeaderField(CLUSTER_URI_HEADER_FIELD);
            if (clusterUri != null) {
                conf.set(HTTP_TEXTLOADER_ZIPNUM_CLUSTER, clusterUri);
            }
        } catch (IOException io) {
            // Best effort: report through the class logger (the auth cookie
            // is deliberately not logged) and fall through to return 0.
            LOGGER.log(Level.WARNING, "Unable to query number of pages for " + url, io);
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }

        return numPages;
    }

    /**
     * Delegates unchanged to {@link TextLoader#prepareToRead}.
     *
     * <p>The raw {@code RecordReader} parameter type mirrors the signature
     * of the overridden method.
     */
    @SuppressWarnings("rawtypes")
    @Override
    public void prepareToRead(RecordReader reader, PigSplit split) {
        super.prepareToRead(reader, split);
    }

    /**
     * Returns {@code location} unchanged; it is not resolved against
     * {@code curDir}.
     */
    @Override
    public String relativeToAbsolutePath(String location, Path curDir)
            throws IOException {
        return location;
    }
}