package hip.ch6.joins.replicated.framework;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class GenericReplicatedJoin
    extends Mapper<Object, Object, Object, Object> {

  private Map<Object, List<Pair>> cachedRecords =
      new HashMap<Object, List<Pair>>();
  private boolean distributedCacheIsSmaller;
  private Path[] distributedCacheFiles;

  /**
   * Transforms a record from the input split into a Pair object. The
   * input splits are ostensibly larger than the Distributed Cache file.
   * <p/>
   * This implementation works with keys and values produced with the
   * KeyValueTextInputFormat, or any InputFormat which yields keys and
   * values with meaningful toString methods. For other input formats,
   * this method should be overridden to convert the key/value into a
   * Pair where the key is text.
   *
   * @param key   the key emitted by the {@link org.apache.hadoop.mapreduce.InputFormat}
   * @param value the value emitted by the {@link org.apache.hadoop.mapreduce.InputFormat}
   * @return a Pair object, where the key contains the key used for
   *         joining purposes, and the value contains data which will
   *         be used when creating the composite output value
   */
  public Pair readFromInputFormat(Object key, Object value) {
    return new Pair<String, String>(key.toString(), value.toString());
  }

  /**
   * Get the object which will be used to read data from the
   * Distributed Cache file. The Distributed Cache files are referred
   * to as "R" files; they are ostensibly smaller than the "L" files
   * read from the input splits.
   * <p/>
   * The default implementation works with line-based text files,
   * where keys and values are separated by whitespace.
   *
   * @return a reader which can unmarshal data from the Distributed
   *         Cache
   */
  public DistributedCacheFileReader getDistributedCacheReader() {
    return new TextDistributedCacheFileReader();
  }

  /**
   * Join together a record from the input split and the Distributed
   * Cache, and return a new pair which will be emitted by the map.
   * <p/>
   * If null is returned, no output will be produced.
   * <p/>
   * The default implementation assumes that the Pair keys and values
   * are Strings and concatenates them together, delimited by the tab
   * character.
   * <p/>
   * This should be overridden in cases where the values aren't
   * Strings, or to change how the output value is created.
   *
   * @param inputSplitPair a record from the input split
   * @param distCachePair  a record from the Distributed Cache
   * @return a composite output value which is compatible with the
   *         expected value type for the
   *         {@link org.apache.hadoop.mapreduce.OutputFormat} used for
   *         this job
   */
  public Pair join(Pair inputSplitPair, Pair distCachePair) {
    StringBuilder sb = new StringBuilder();
    if (inputSplitPair.getData() != null) {
      sb.append(inputSplitPair.getData());
    }
    sb.append("\t");
    if (distCachePair.getData() != null) {
      sb.append(distCachePair.getData());
    }
    return new Pair<Text, Text>(
        new Text(inputSplitPair.getKey().toString()),
        new Text(sb.toString()));
  }

  @Override
  protected void setup(Context context)
      throws IOException, InterruptedException {
    distributedCacheFiles = DistributedCache.getLocalCacheFiles(
        context.getConfiguration());

    // Total the size of the Distributed Cache files so they can be
    // compared against the size of the input split. A long is used
    // to avoid overflowing on large cache files.
    long distCacheSizes = 0;
    for (Path distFile : distributedCacheFiles) {
      if (distFile.getName().startsWith("part")) {
        File distributedCacheFile = new File(distFile.toString());
        distCacheSizes += distributedCacheFile.length();
      }
    }

    if (context.getInputSplit() instanceof FileSplit) {
      FileSplit split = (FileSplit) context.getInputSplit();
      long inputSplitSize = split.getLength();
      distributedCacheIsSmaller = (distCacheSizes < inputSplitSize);
    } else {
      // If the input split isn't a FileSplit, assume the Distributed
      // Cache is smaller than the input split.
      distributedCacheIsSmaller = true;
    }

    System.out.println(
        "distributedCacheIsSmaller = " + distributedCacheIsSmaller);

    // If the Distributed Cache is the smaller dataset, load it into
    // memory up front so map records can be joined against it.
    if (distributedCacheIsSmaller) {
      for (Path distFile : distributedCacheFiles) {
        if (distFile.getName().startsWith("part")) {
          File distributedCacheFile = new File(distFile.toString());
          DistributedCacheFileReader reader =
              getDistributedCacheReader();
          reader.init(distributedCacheFile);
          for (Pair p : (Iterable<Pair>) reader) {
            addToCache(p);
          }
          reader.close();
        }
      }
    }
  }

  private void addToCache(Pair pair) {
    List<Pair> values = cachedRecords.get(pair.getKey());
    if (values == null) {
      values = new ArrayList<Pair>();
      cachedRecords.put(pair.getKey(), values);
    }
    values.add(pair);
  }

  @Override
  protected void map(Object key, Object value, Context context)
      throws IOException, InterruptedException {
    Pair pair = readFromInputFormat(key, value);
    if (distributedCacheIsSmaller) {
      // The Distributed Cache is already in memory; join and emit now.
      joinAndCollect(pair, context);
    } else {
      // The input split is the smaller dataset; cache its records and
      // defer the join to the cleanup method.
      addToCache(pair);
    }
  }

  public void joinAndCollect(Pair p, Context context)
      throws IOException, InterruptedException {
    List<Pair> cached = cachedRecords.get(p.getKey());
    if (cached != null) {
      for (Pair cp : cached) {
        Pair result;
        if (distributedCacheIsSmaller) {
          result = join(p, cp);
        } else {
          result = join(cp, p);
        }
        if (result != null) {
          context.write(result.getKey(), result.getData());
        }
      }
    }
  }

  @Override
  protected void cleanup(Context context)
      throws IOException, InterruptedException {
    // If the input split was the smaller dataset, stream the
    // Distributed Cache files and perform the join here.
    if (!distributedCacheIsSmaller) {
      for (Path distFile : distributedCacheFiles) {
        if (distFile.getName().startsWith("part")) {
          File distributedCacheFile = new File(distFile.toString());
          DistributedCacheFileReader reader =
              getDistributedCacheReader();
          reader.init(distributedCacheFile);
          for (Pair p : (Iterable<Pair>) reader) {
            joinAndCollect(p, context);
          }
          reader.close();
        }
      }
    }
  }
}
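
// What follows is a minimal sketch, not part of the original listing,
// showing how the join method could be overridden as its javadoc
// suggests -- here to emit a comma-delimited composite value instead of
// the default tab-delimited one. The class name CsvReplicatedJoin is a
// hypothetical example.
class CsvReplicatedJoin extends GenericReplicatedJoin {
  @Override
  public Pair join(Pair inputSplitPair, Pair distCachePair) {
    // Render null values as empty strings, mirroring the behavior of
    // the default implementation.
    String lhs = inputSplitPair.getData() == null
        ? "" : inputSplitPair.getData().toString();
    String rhs = distCachePair.getData() == null
        ? "" : distCachePair.getData().toString();
    return new Pair<Text, Text>(
        new Text(inputSplitPair.getKey().toString()),
        new Text(lhs + "," + rhs));
  }
}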
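
// A hedged sketch, also not in the original listing, of a custom reader
// for comma-delimited Distributed Cache files. It assumes the
// DistributedCacheFileReader contract is exactly what the mapper above
// exercises: init(File), close(), and iteration over Pair records.
// CsvDistributedCacheFileReader is a hypothetical name.
class CsvDistributedCacheFileReader
    implements DistributedCacheFileReader, java.util.Iterator<Pair> {

  private java.io.BufferedReader reader;
  private String line;

  public void init(File file) {
    try {
      reader = new java.io.BufferedReader(new java.io.FileReader(file));
      line = reader.readLine();  // prime the first record
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  public void close() {
    try {
      reader.close();
    } catch (IOException ignored) {
    }
  }

  public java.util.Iterator<Pair> iterator() {
    return this;
  }

  public boolean hasNext() {
    return line != null;
  }

  public Pair next() {
    // Split each line into a key and a value on the first comma.
    String[] parts = line.split(",", 2);
    try {
      line = reader.readLine();
    } catch (IOException e) {
      line = null;
    }
    return new Pair<String, String>(
        parts[0], parts.length > 1 ? parts[1] : "");
  }

  public void remove() {
    throw new UnsupportedOperationException();
  }
}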
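
// A hedged driver sketch, also not part of the original listing, showing
// one way GenericReplicatedJoin might be wired into a map-only job: the
// smaller "R" dataset is pushed into the Distributed Cache, and the
// larger "L" dataset is read through KeyValueTextInputFormat. The class
// name and argument layout (input dir, cache file, output dir) are
// hypothetical; fully-qualified names are used so the sketch compiles in
// the same file. Note that the mapper above only reads cache files whose
// names start with "part", so the cached file must follow that naming.
class ReplicatedJoinDriverExample {
  public static void main(String[] args) throws Exception {
    org.apache.hadoop.conf.Configuration conf =
        new org.apache.hadoop.conf.Configuration();
    org.apache.hadoop.mapreduce.Job job =
        new org.apache.hadoop.mapreduce.Job(conf);

    job.setJarByClass(ReplicatedJoinDriverExample.class);
    job.setMapperClass(GenericReplicatedJoin.class);
    job.setNumReduceTasks(0);  // replicated joins are map-only
    job.setInputFormatClass(
        org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Push the smaller dataset into the Distributed Cache; the mapper's
    // setup method retrieves it with getLocalCacheFiles.
    DistributedCache.addCacheFile(
        new Path(args[1]).toUri(), job.getConfiguration());

    org.apache.hadoop.mapreduce.lib.input.FileInputFormat
        .addInputPath(job, new Path(args[0]));
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
        .setOutputPath(job, new Path(args[2]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}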