package org.archive.hadoop.io; import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Comparator; import java.util.Iterator; import java.util.List; import java.util.logging.Logger; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.InputFormat; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.archive.hadoop.cdx.CDXCluster; import org.archive.hadoop.cdx.SplitFile; import org.archive.hadoop.util.PartitionName; import org.archive.util.iterator.CloseableIteratorUtil; import org.archive.util.iterator.SortedCompositeIterator; public class MergeClusterRangesInputFormat extends InputFormat<Long, Text> { private static final Logger LOG = Logger.getLogger(MergeClusterRangesInputFormat.class.getName()); private static final String SPLIT_CONFIG_KEY = "merge.cluster.split.path"; private static final String MERGE_CLUSTER_PATH_CONFIG_KEY = "merge.cluster.paths"; public static void setSplitPath(Configuration conf, String path) throws IOException { conf.set(SPLIT_CONFIG_KEY, path); LOG.warning(String.format("Setting Split path: %s",path)); SplitFile splitFile = new SplitFile(); Reader r = getSplitReader(conf); splitFile.read(r); for(int i = 0; i < splitFile.size(); i++) { PartitionName.setPartitionOutputName(conf, i, splitFile.getName(i)); } r.close(); } private static Reader getSplitReader(Configuration conf) throws IOException { String pathString = getSplitPath(conf); LOG.warning(String.format("Got Split path: %s",pathString)); Path splitPath = new Path(pathString); FileSystem fs = FileSystem.get(splitPath.toUri(), conf); FSDataInputStream fsdis = fs.open(splitPath); return new InputStreamReader(fsdis, Charset.forName("UTF-8")); } private static String getSplitPath(Configuration conf) { return conf.get(SPLIT_CONFIG_KEY); } public static void setClusterPaths(Configuration conf, String[] paths) { conf.setStrings(MERGE_CLUSTER_PATH_CONFIG_KEY, paths); } private static String[] getClusterPaths(Configuration conf) { return conf.getStrings(MERGE_CLUSTER_PATH_CONFIG_KEY); } @Override public RecordReader<Long, Text> createRecordReader(InputSplit split, TaskAttemptContext arg1) throws IOException, InterruptedException { // TODO!! return new MergeClusterRangesRecordReader(); } @Override public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); Reader r = getSplitReader(conf); SplitFile splitFile = new SplitFile(); splitFile.read(r); r.close(); ArrayList<InputSplit> splits = new ArrayList<InputSplit>(); for(int i = 0; i < splitFile.size(); i++) { MergeClusterRangesInputSplit split = new MergeClusterRangesInputSplit(splitFile.size() - i, splitFile.getStart(i), splitFile.getEnd(i), getClusterPaths(conf)); splits.add(split); LOG.warning(String.format("Added split(%d) (%s)-(%s)", splitFile.size() - i,splitFile.getStart(i),splitFile.getEnd(i))); } return splits; } public class MergeClusterRangesRecordReader extends RecordReader<Long, Text> { Iterator<String> itr; Long key = null; Text value = null; @Override public void close() throws IOException { CloseableIteratorUtil.attemptClose(itr); } @Override public Long getCurrentKey() throws IOException, InterruptedException { return key; } @Override public Text getCurrentValue() throws IOException, InterruptedException { return value; } @Override public float getProgress() throws IOException, InterruptedException { // TODO ... return 0; } @Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); MergeClusterRangesInputSplit mSplit = (MergeClusterRangesInputSplit) split; SortedCompositeIterator<String> itrS = new SortedCompositeIterator<String>(new Comparator<String>() { public int compare(String o1, String o2) { return o1.compareTo(o2); } }); String start = mSplit.getStart(); String end = mSplit.getEnd(); for(String clusterPath : mSplit.getClusterPaths()) { LOG.warning(String.format("Added range(%d) (%s)-(%s): %s", context.getTaskAttemptID().getId(),start,end,clusterPath)); CDXCluster cluster = new CDXCluster(conf, new Path(clusterPath)); itrS.addIterator(cluster.getRange(start, end)); } // TODO: filtering on SURTs: itr = itrS; } @Override public boolean nextKeyValue() throws IOException, InterruptedException { if(key == null) { key = new Long(0); } if(value == null) { value = new Text(); } if(itr.hasNext()) { key = new Long(key.longValue() + 1); value.set(itr.next()); return true; } return false; } } }