package org.archive.hadoop.pig;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.builtin.TextLoader;

/**
 * Pig loader that reads newline-delimited text from an HTTP endpoint rather
 * than from HDFS.
 *
 * <p>The endpoint is expected to support query parameters for parallel reads
 * ({@code &split=i&numSplits=n}) and, optionally, a HEAD request with
 * {@code &countLines=true} that reports the total line count in the
 * {@code X-Cluster-Num-Lines} response header. The split count is either
 * fixed (constructor option {@code "splits"}) or derived from a per-split
 * line cap (constructor option {@code "maxLines"}).
 */
public class HttpTextLoader extends TextLoader {

    private final static Logger LOGGER = Logger.getLogger(HttpTextLoader.class.getName());

    // When > 0, the maximum number of lines each split should carry;
    // numSplits is then derived from the server-reported total line count.
    protected int maxLinesPerSplit = 0;

    // Number of parallel splits requested from the HTTP endpoint.
    protected int numSplits = 1;

    // Job configuration keys used to pass state from front-end to back-end.
    protected final static String HTTP_TEXTLOADER_URL = "httptextloader.url";
    protected final static String HTTP_TEXTLOADER_NUM_SPLITS = "httptextloader.numSplits";
    protected final static String HTTP_TEXTLOADER_MAX_LINES = "httptextloader.maxLines";
    protected final static String HTTP_TEXTLOADER_GZIP = "httptextloader.gzip";
    protected final static String HTTP_TEXTLOADER_ZIPNUM_CLUSTER = "httptextloader.clusterUri";
    protected final static String HTTP_TEXTLOADER_MAX_AGGREGATE_BLOCKS = "httptextloader.maxAggregateBlocks";

    // Query-string fragments appended to the endpoint URL.
    protected final static String COUNT_LINES_PARAM = "&countLines=true";
    protected final static String GZIP_PARAM = "&output=gzip";
    protected final static String CDX_PARAM = "&cdx=true";
    protected final static String SPLIT_PARAM = "&split=";
    protected final static String NUM_SPLIT_PARAM = "&numSplits=";

    // Response header carrying the total line count for a HEAD count query.
    protected final static String NUM_LINES_HEADER_FIELD = "X-Cluster-Num-Lines";

    public HttpTextLoader() {
        super();
    }

    /**
     * Constructs a loader configured via a single option/value pair.
     *
     * @param option {@code "splits"} to fix the split count directly, or
     *               {@code "maxLines"} to derive the split count from a
     *               per-split line cap; any other value is silently ignored
     *               (defaults are kept)
     * @param param  integer value for the chosen option
     * @throws NumberFormatException if a recognized option's param is not a
     *                               valid integer
     */
    public HttpTextLoader(String option, String param) {
        if (option != null) {
            if (option.equalsIgnoreCase("splits")) {
                this.numSplits = Integer.parseInt(param);
                this.maxLinesPerSplit = 0;
            } else if (option.equalsIgnoreCase("maxLines")) {
                this.numSplits = 1;
                this.maxLinesPerSplit = Integer.parseInt(param);
            }
        }
    }

    /**
     * Appends the split index and split count query parameters to the base URL.
     *
     * @param url       base endpoint URL (assumed to already contain a query string,
     *                  since the appended params start with '&amp;')
     * @param split     zero-based split index
     * @param numSplits total number of splits
     * @return the per-split request URL
     */
    public static String getSplitUrl(String url, int split, int numSplits) {
        StringBuilder builder = new StringBuilder(url);
        builder.append(SPLIT_PARAM);
        builder.append(split);
        builder.append(NUM_SPLIT_PARAM);
        builder.append(numSplits);
        return builder.toString();
    }

    @Override
    public InputFormat<LongWritable, Text> getInputFormat() {
        return new InputFormat<LongWritable, Text>() {

            @Override
            public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext job) {
                if (!(split instanceof HttpClusterInputSplit)) {
                    throw new RuntimeException("Wrong Input Split, must be HttpClusterInputSplit");
                }

                Configuration conf = job.getConfiguration();
                String clusterUri = conf.get(HTTP_TEXTLOADER_ZIPNUM_CLUSTER);

                HttpClusterInputSplit clusterSplit = (HttpClusterInputSplit) split;

                try {
                    if (clusterUri != null) {
                        // ZipNum cluster mode: dereference compressed blocks
                        // through the configured cluster URI.
                        int maxAggBlocks = Integer.parseInt(conf.get(HTTP_TEXTLOADER_MAX_AGGREGATE_BLOCKS, "1"));
                        return new HttpZipNumDerefLineRecordReader(clusterUri, clusterSplit.getUrl(),
                                clusterSplit.getSplit(), maxAggBlocks);
                    } else {
                        // Plain mode: stream lines directly, requesting cdx output.
                        return new HttpInputLineRecordReader(clusterSplit.getUrl() + CDX_PARAM,
                                clusterSplit.getSplit());
                    }
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }

            @Override
            public List<InputSplit> getSplits(JobContext context) throws IOException {
                ArrayList<InputSplit> array = new ArrayList<InputSplit>();

                Configuration conf = context.getConfiguration();
                String url = conf.get(HTTP_TEXTLOADER_URL);
                int numSplits = conf.getInt(HTTP_TEXTLOADER_NUM_SPLITS, 1);

                LOGGER.info("getSplits - " + numSplits + " " + url);

                for (int i = 0; i < numSplits; i++) {
                    array.add(new HttpClusterInputSplit(getSplitUrl(url, i, numSplits), i, numSplits));
                }

                return array;
            }
        };
    }

    /**
     * Records the endpoint URL and split count in the job configuration.
     *
     * <p>Pig may call this multiple times; the URL is decoded and stored only
     * on the first call, and subsequent calls reuse the saved value. When
     * {@code maxLinesPerSplit} is set, the total line count is queried from
     * the server (once, then cached in the configuration) and the split count
     * is derived from it.
     *
     * @param location endpoint URL (URL-encoded on first call)
     * @param job      the job whose configuration is updated
     * @throws IOException if URL decoding fails
     */
    @Override
    public void setLocation(String location, Job job) throws IOException {
        Configuration conf = job.getConfiguration();

        String savedLoc = conf.get(HTTP_TEXTLOADER_URL);

        if (savedLoc == null) {
            location = URLDecoder.decode(location, "UTF-8");
            conf.set(HTTP_TEXTLOADER_URL, location);
        } else {
            location = savedLoc;
        }

        if (maxLinesPerSplit > 0) {
            int totalLineCount = conf.getInt(HTTP_TEXTLOADER_MAX_LINES, -1);

            if (totalLineCount == -1) {
                totalLineCount = queryLineCount(location);
                conf.set(HTTP_TEXTLOADER_MAX_LINES, String.valueOf(totalLineCount));
            }

            if (totalLineCount > 0) {
                // Round up so no split exceeds maxLinesPerSplit, and always
                // request at least one split. The previous truncating division
                // yielded 0 splits whenever totalLineCount < maxLinesPerSplit,
                // which made getSplits() return no work at all.
                numSplits = Math.max(1, (totalLineCount + maxLinesPerSplit - 1) / maxLinesPerSplit);
                LOGGER.info("Total Line Count / maxLinesPerSplit = " + totalLineCount + " / "
                        + maxLinesPerSplit + " = " + numSplits);
            } else {
                LOGGER.info("Total Line Count Not Available");
            }
        }

        conf.setInt(HTTP_TEXTLOADER_NUM_SPLITS, numSplits);
        LOGGER.info("setLocation - " + numSplits + " " + location);
    }

    /**
     * Issues a HEAD request with the count-lines parameter and reads the total
     * line count from the {@code X-Cluster-Num-Lines} response header.
     *
     * @param url base endpoint URL
     * @return the reported line count, or 0 if the header is absent or the
     *         request fails (failures are logged, not propagated — a missing
     *         count merely disables line-based splitting)
     */
    protected int queryLineCount(String url) {
        HttpURLConnection conn = null;
        int numLines = 0;

        try {
            URL theURL = new URL(url + COUNT_LINES_PARAM);
            conn = (HttpURLConnection) theURL.openConnection();
            conn.setRequestMethod("HEAD");
            conn.connect();
            numLines = conn.getHeaderFieldInt(NUM_LINES_HEADER_FIELD, 0);
        } catch (IOException io) {
            // Best-effort: log instead of printStackTrace; caller treats 0 as
            // "count unavailable".
            LOGGER.log(Level.WARNING, "Failed to query line count from " + url, io);
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }

        return numLines;
    }

    // Raw RecordReader type is dictated by the Pig LoadFunc API.
    @SuppressWarnings("rawtypes")
    @Override
    public void prepareToRead(RecordReader reader, PigSplit split) {
        super.prepareToRead(reader, split);
    }

    /**
     * HTTP locations are already absolute; return them unchanged rather than
     * resolving against the current HDFS directory.
     */
    @Override
    public String relativeToAbsolutePath(String location, Path curDir) throws IOException {
        return location;
    }
}