package org.archive.hadoop.pig; import java.io.IOException; import java.net.HttpURLConnection; import java.net.URL; import java.net.URLDecoder; import java.util.ArrayList; import java.util.List; import java.util.logging.Logger; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.InputFormat; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit; import org.apache.pig.builtin.TextLoader; public class HttpTextLoader extends TextLoader { private final static Logger LOGGER = Logger.getLogger(HttpTextLoader.class.getName()); protected final static String HTTP_TEXTLOADER_URL = "httptextloader.url"; protected final static String HTTP_TEXTLOADER_NUM_SPLITS = "httptextloader.numSplits"; //protected final static String HTTP_TEXTLOADER_MAX_LINES = "httptextloader.maxLines"; protected final static String HTTP_TEXTLOADER_GZIP = "httptextloader.gzip"; protected final static String HTTP_TEXTLOADER_ZIPNUM_CLUSTER = "httptextloader.clusterUri"; protected final static String HTTP_TEXTLOADER_MAX_AGGREGATE_BLOCKS = "httptextloader.maxAggregateBlocks"; protected final static String HTTP_TEXTLOADER_HTTP_LOAD_CDX = "httptextloader.httpLoadCdx"; protected final static String HTTP_TEXTLOADER_AUTH = "httptextloader.authCookie"; protected final static String SHOW_NUM_PAGES = "&showNumPages=true"; protected final static String GZIP_PARAM = "&gzip=true"; // protected final static String CDX_PARAM = "&cdx=true"; protected final static String INDEX_ONLY_PARAM = "&showPagedIndex=true"; protected final static String PAGE_PARAM = "&page="; // protected final static String NUM_SPLIT_PARAM = "&numSplits="; protected final static String NUM_PAGES_HEADER_FIELD = "X-CDX-Num-Pages"; protected final static String CLUSTER_URI_HEADER_FIELD = "X-CDX-Cluster-Uri"; public HttpTextLoader() { super(); } public HttpTextLoader(String option, String param) { // if (option != null) { // if (option.equalsIgnoreCase("splits")) { // this.numSplits = Integer.parseInt(param); // this.maxLinesPerSplit = 0; // } else if (option.equalsIgnoreCase("maxLines")) { // this.numSplits = 1; // this.maxLinesPerSplit = Integer.parseInt(param); // } // } } // public static String getSplitUrl(String url, int split, int numSplits) // { // StringBuilder builder = new StringBuilder(url); // builder.append(SPLIT_PARAM); // builder.append(split); // builder.append(NUM_SPLIT_PARAM); // builder.append(numSplits); // return builder.toString(); // } public static String getPageUrl(String url, int page) { StringBuilder builder = new StringBuilder(url); builder.append(PAGE_PARAM); builder.append(page); return builder.toString(); } @Override public InputFormat<LongWritable, Text> getInputFormat() { return new InputFormat<LongWritable, Text>() { @Override public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext job) { if (!(split instanceof HttpClusterInputSplit)) { throw new RuntimeException("Wrong Input Split, must be HttpClusterInputSplit"); } Configuration conf = job.getConfiguration(); String clusterUri = conf.get(HTTP_TEXTLOADER_ZIPNUM_CLUSTER); boolean httpLoadCdx = conf.getBoolean(HTTP_TEXTLOADER_HTTP_LOAD_CDX, false); HttpClusterInputSplit clusterSplit = (HttpClusterInputSplit)split; try { if (!httpLoadCdx && (clusterUri != null)) { int maxAggBlocks = Integer.parseInt(conf.get(HTTP_TEXTLOADER_MAX_AGGREGATE_BLOCKS, "1")); return new HttpZipNumDerefLineRecordReader(clusterUri, clusterSplit.getUrl() + INDEX_ONLY_PARAM, clusterSplit.getSplit(), maxAggBlocks); } else { return new HttpInputLineRecordReader(clusterSplit.getUrl(), clusterSplit.getSplit()); } } catch (IOException e) { throw new RuntimeException(e); } } @Override public List<InputSplit> getSplits(JobContext context) throws IOException { ArrayList<InputSplit> array = new ArrayList<InputSplit>(); Configuration conf = context.getConfiguration(); String url = conf.get(HTTP_TEXTLOADER_URL); int numPages = conf.getInt(HTTP_TEXTLOADER_NUM_SPLITS, 1); LOGGER.info("getSplits - " + numPages + " " + url); for (int i = 0; i < numPages; i++) { array.add(new HttpClusterInputSplit(getPageUrl(url, i), i, numPages)); } return array; } }; } @Override public void setLocation(String location, Job job) throws IOException { Configuration conf = job.getConfiguration(); String savedLoc = conf.get(HTTP_TEXTLOADER_URL); if (savedLoc == null) { location = URLDecoder.decode(location, "UTF-8"); conf.set(HTTP_TEXTLOADER_URL, location); } else { location = savedLoc; } int numPages = conf.getInt(HTTP_TEXTLOADER_NUM_SPLITS, -1); if (numPages == -1) { numPages = queryLineCount(location, conf); } if (numPages < 0) { numPages = 1; } // if (maxLinesPerSplit > 0) { // // int totalLineCount = Integer.parseInt(conf.get(HTTP_TEXTLOADER_MAX_LINES, "-1")); // // if (totalLineCount == -1) { // totalLineCount = queryLineCount(location, conf); // // conf.set(HTTP_TEXTLOADER_MAX_LINES, String.valueOf(totalLineCount)); // } // // if (totalLineCount > 0) { // numSplits = (totalLineCount / maxLinesPerSplit) + 1; // LOGGER.info("Total Line Count / maxLinesPerSplit = " + totalLineCount + " / " + maxLinesPerSplit + " = " + numSplits); // } else { // LOGGER.info("Total Line Count Not Available"); // } // } conf.setInt(HTTP_TEXTLOADER_NUM_SPLITS, numPages); LOGGER.info("setLocation - " + numPages + " " + location); } protected int queryLineCount(String url, Configuration conf) { HttpURLConnection conn = null; int numLines = 0; try { URL theURL = new URL(url + SHOW_NUM_PAGES); conn = (HttpURLConnection)theURL.openConnection(); conn.setRequestMethod("HEAD"); String authCookie = conf.get(HttpTextLoader.HTTP_TEXTLOADER_AUTH); if (authCookie != null) { conn.setRequestProperty("Cookie", "cdx_auth_token=" + authCookie); } conn.connect(); numLines = conn.getHeaderFieldInt(NUM_PAGES_HEADER_FIELD, 0); String clusterUri = conn.getHeaderField(CLUSTER_URI_HEADER_FIELD); if (clusterUri != null) { conf.set(HTTP_TEXTLOADER_ZIPNUM_CLUSTER, clusterUri); } } catch (IOException io) { io.printStackTrace(); } finally { if (conn != null) { conn.disconnect(); } } return numLines; } @Override public void prepareToRead(RecordReader reader, PigSplit split) { super.prepareToRead(reader, split); } @Override public String relativeToAbsolutePath(String location, Path curDir) throws IOException { return location; } }