package hip.ch6.joins.replicated.framework;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class GenericReplicatedJoin
    extends Mapper<Object, Object, Object, Object> {

  private Map<Object, List<Pair>> cachedRecords =
      new HashMap<Object, List<Pair>>();
  private boolean distributedCacheIsSmaller;
  private Path[] distributedCacheFiles;

  /**
   * Transforms a record from the input split into a Pair object. The
   * input splits are ostensibly larger than the Distributed Cache file.
   * <p/>
   * This implementation works with keys and values produced with the
   * KeyValueTextInputFormat, or any InputFormat which yields keys and
   * values with meaningful toString methods. For other input formats,
   * this method should be overridden to convert the key/value into a
   * Pair where the key is text.
   *
   * @param key   the key emitted by the {@link org.apache.hadoop.mapreduce.InputFormat}
   * @param value the value emitted by the {@link org.apache.hadoop.mapreduce.InputFormat}
   * @return a Pair object, where the key contains the key used for
   *         joining purposes, and the value contains data which will
   *         be used when creating the composite output value
   */
  public Pair readFromInputFormat(Object key, Object value) {
    return new Pair<String, String>(key.toString(), value.toString());
  }

  /**
   * Get the object which will be used to read data from the
   * Distributed Cache file. The Distributed Cache files are referred
   * to as "R" files; they are ostensibly smaller than the "L" files
   * read from the input splits.
   * <p/>
   * The default implementation works with line-based text files,
   * where keys and values are separated by whitespace.
   *
   * @return a reader which can unmarshal data from the Distributed
   *         Cache
   */
  public DistributedCacheFileReader getDistributedCacheReader() {
    return new TextDistributedCacheFileReader();
  }

  /**
   * Join together a record from the input split and the Distributed
   * Cache, and return a new pair which will be emitted by the map.
   * <p/>
   * If null is returned, no output will be produced.
   * <p/>
   * The default implementation assumes that the Pair keys and values
   * are Strings and concatenates them together, delimited by the tab
   * character.
   * <p/>
   * This should be overridden in cases where the values aren't
   * Strings, or to change how the output value is created.
   *
   * @param inputSplitPair a record from the input split
   * @param distCachePair  a record from the Distributed Cache
   * @return a composite output value which is compatible with the
   *         expected value type for the
   *         {@link org.apache.hadoop.mapreduce.OutputFormat} used for
   *         this job
   */
  public Pair join(Pair inputSplitPair, Pair distCachePair) {
    StringBuilder sb = new StringBuilder();
    if (inputSplitPair.getData() != null) {
      sb.append(inputSplitPair.getData());
    }
    sb.append("\t");
    if (distCachePair.getData() != null) {
      sb.append(distCachePair.getData());
    }
    return new Pair<Text, Text>(
        new Text(inputSplitPair.getKey().toString()),
        new Text(sb.toString()));
  }

  @Override
  protected void setup(Context context)
      throws IOException, InterruptedException {
    distributedCacheFiles = DistributedCache.getLocalCacheFiles(
        context.getConfiguration());

    // Total the size of the Distributed Cache files so they can be
    // compared against the size of the input split. A long is used
    // to avoid overflowing on large cache files.
    long distCacheSizes = 0;
    for (Path distFile : distributedCacheFiles) {
      if (distFile.getName().startsWith("part")) {
        File distributedCacheFile = new File(distFile.toString());
        distCacheSizes += distributedCacheFile.length();
      }
    }

    if (context.getInputSplit() instanceof FileSplit) {
      FileSplit split = (FileSplit) context.getInputSplit();
      long inputSplitSize = split.getLength();
      distributedCacheIsSmaller = (distCacheSizes < inputSplitSize);
    } else {
      // If the input split isn't a FileSplit, assume the Distributed
      // Cache is smaller than the input split.
      distributedCacheIsSmaller = true;
    }

    System.out.println(
        "distributedCacheIsSmaller = " + distributedCacheIsSmaller);

    // If the Distributed Cache is the smaller dataset, load it into
    // memory up front so map records can be joined against it.
    if (distributedCacheIsSmaller) {
      for (Path distFile : distributedCacheFiles) {
        if (distFile.getName().startsWith("part")) {
          File distributedCacheFile = new File(distFile.toString());
          DistributedCacheFileReader reader =
              getDistributedCacheReader();
          reader.init(distributedCacheFile);
          for (Pair p : (Iterable<Pair>) reader) {
            addToCache(p);
          }
          reader.close();
        }
      }
    }
  }

  private void addToCache(Pair pair) {
    List<Pair> values = cachedRecords.get(pair.getKey());
    if (values == null) {
      values = new ArrayList<Pair>();
      cachedRecords.put(pair.getKey(), values);
    }
    values.add(pair);
  }

  @Override
  protected void map(Object key, Object value, Context context)
      throws IOException, InterruptedException {
    Pair pair = readFromInputFormat(key, value);
    if (distributedCacheIsSmaller) {
      // The Distributed Cache is already in memory; join and emit now.
      joinAndCollect(pair, context);
    } else {
      // The input split is the smaller dataset; cache its records and
      // defer the join to the cleanup method.
      addToCache(pair);
    }
  }

  public void joinAndCollect(Pair p, Context context)
      throws IOException, InterruptedException {
    List<Pair> cached = cachedRecords.get(p.getKey());
    if (cached != null) {
      for (Pair cp : cached) {
        Pair result;
        if (distributedCacheIsSmaller) {
          result = join(p, cp);
        } else {
          result = join(cp, p);
        }
        if (result != null) {
          context.write(result.getKey(), result.getData());
        }
      }
    }
  }

  @Override
  protected void cleanup(Context context)
      throws IOException, InterruptedException {
    // If the input split was the smaller dataset, stream the
    // Distributed Cache files and perform the join here.
    if (!distributedCacheIsSmaller) {
      for (Path distFile : distributedCacheFiles) {
        if (distFile.getName().startsWith("part")) {
          File distributedCacheFile = new File(distFile.toString());
          DistributedCacheFileReader reader =
              getDistributedCacheReader();
          reader.init(distributedCacheFile);
          for (Pair p : (Iterable<Pair>) reader) {
            joinAndCollect(p, context);
          }
          reader.close();
        }
      }
    }
  }
}
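
// What follows is a minimal sketch, not part of the original listing,
// showing how the join method could be overridden as its javadoc
// suggests -- here to emit a comma-delimited composite value instead of
// the default tab-delimited one. The class name CsvReplicatedJoin is a
// hypothetical example.
class CsvReplicatedJoin extends GenericReplicatedJoin {
  @Override
  public Pair join(Pair inputSplitPair, Pair distCachePair) {
    // Render null values as empty strings, mirroring the behavior of
    // the default implementation.
    String lhs = inputSplitPair.getData() == null
        ? "" : inputSplitPair.getData().toString();
    String rhs = distCachePair.getData() == null
        ? "" : distCachePair.getData().toString();
    return new Pair<Text, Text>(
        new Text(inputSplitPair.getKey().toString()),
        new Text(lhs + "," + rhs));
  }
}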
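
// A hedged sketch, also not in the original listing, of a custom reader
// for comma-delimited Distributed Cache files. It assumes the
// DistributedCacheFileReader contract is exactly what the mapper above
// exercises: init(File), close(), and iteration over Pair records.
// CsvDistributedCacheFileReader is a hypothetical name.
class CsvDistributedCacheFileReader
    implements DistributedCacheFileReader, java.util.Iterator<Pair> {

  private java.io.BufferedReader reader;
  private String line;

  public void init(File file) {
    try {
      reader = new java.io.BufferedReader(new java.io.FileReader(file));
      line = reader.readLine();  // prime the first record
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  public void close() {
    try {
      reader.close();
    } catch (IOException ignored) {
    }
  }

  public java.util.Iterator<Pair> iterator() {
    return this;
  }

  public boolean hasNext() {
    return line != null;
  }

  public Pair next() {
    // Split each line into a key and a value on the first comma.
    String[] parts = line.split(",", 2);
    try {
      line = reader.readLine();
    } catch (IOException e) {
      line = null;
    }
    return new Pair<String, String>(
        parts[0], parts.length > 1 ? parts[1] : "");
  }

  public void remove() {
    throw new UnsupportedOperationException();
  }
}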
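
// A hedged driver sketch, also not part of the original listing, showing
// one way GenericReplicatedJoin might be wired into a map-only job: the
// smaller "R" dataset is pushed into the Distributed Cache, and the
// larger "L" dataset is read through KeyValueTextInputFormat. The class
// name and argument layout (input dir, cache file, output dir) are
// hypothetical; fully-qualified names are used so the sketch compiles in
// the same file. Note that the mapper above only reads cache files whose
// names start with "part", so the cached file must follow that naming.
class ReplicatedJoinDriverExample {
  public static void main(String[] args) throws Exception {
    org.apache.hadoop.conf.Configuration conf =
        new org.apache.hadoop.conf.Configuration();
    org.apache.hadoop.mapreduce.Job job =
        new org.apache.hadoop.mapreduce.Job(conf);

    job.setJarByClass(ReplicatedJoinDriverExample.class);
    job.setMapperClass(GenericReplicatedJoin.class);
    job.setNumReduceTasks(0);  // replicated joins are map-only
    job.setInputFormatClass(
        org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Push the smaller dataset into the Distributed Cache; the mapper's
    // setup method retrieves it with getLocalCacheFiles.
    DistributedCache.addCacheFile(
        new Path(args[1]).toUri(), job.getConfiguration());

    org.apache.hadoop.mapreduce.lib.input.FileInputFormat
        .addInputPath(job, new Path(args[0]));
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
        .setOutputPath(job, new Path(args[2]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}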