/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.trevni.avro.mapreduce;

import static org.junit.Assert.assertEquals;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.Pair;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.trevni.avro.WordCountUtil;
import org.junit.Test;

/**
 * Round-trip test for the Trevni key MapReduce formats: checkOutputFormat()
 * runs a word count that writes Pair&lt;String, Long&gt; records through
 * AvroTrevniKeyOutputFormat, and checkInputFormat() reads them back through
 * AvroTrevniKeyInputFormat using a projected reader schema.
 */
public class TestKeyWordCount {

  private static long total = 0;

  static final Schema STRING = Schema.create(Schema.Type.STRING);
  static {
    GenericData.setStringType(STRING, GenericData.StringType.String);
  }
  static final Schema LONG = Schema.create(Schema.Type.LONG);

  /** Splits each input line into tokens and emits (word, 1). */
  private static class WordCountMapper extends Mapper<AvroKey<String>, NullWritable, Text, LongWritable> {
    private LongWritable mCount = new LongWritable();
    private Text mText = new Text();

    @Override
    protected void setup(Context context) {
      mCount.set(1);
    }

    @Override
    protected void map(AvroKey<String> key, NullWritable value, Context context)
        throws IOException, InterruptedException {
      try {
        StringTokenizer tokens = new StringTokenizer(key.datum());
        while (tokens.hasMoreTokens()) {
          mText.set(tokens.nextToken());
          context.write(mText, mCount);
        }
      } catch (Exception e) {
        throw new RuntimeException(key + " " + key.datum(), e);
      }
    }
  }

  /** Sums the counts for each word and emits an Avro Pair(String, Long) record as the key. */
  private static class WordCountReducer
      extends Reducer<Text, LongWritable, AvroKey<GenericData.Record>, NullWritable> {

    private AvroKey<GenericData.Record> result;

    @Override
    protected void setup(Context context) {
      result = new AvroKey<GenericData.Record>();
      result.datum(new Record(Pair.getPairSchema(STRING, LONG)));
    }

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
        throws IOException, InterruptedException {
      long count = 0;
      for (LongWritable value : values) {
        count += value.get();
      }

      result.datum().put("key", key.toString());
      result.datum().put("value", count);

      context.write(result, NullWritable.get());
    }
  }

  /** Accumulates the "value" field of every record read back into the static total. */
  public static class Counter
      extends Mapper<AvroKey<GenericData.Record>, NullWritable, NullWritable, NullWritable> {
    @Override
    protected void map(AvroKey<GenericData.Record> key, NullWritable value, Context context)
        throws IOException, InterruptedException {
      total += (Long) key.datum().get("value");
    }
  }

  @Test
  public void testIOFormat() throws Exception {
    checkOutputFormat();
    checkInputFormat();
  }

  public void checkOutputFormat() throws Exception {
    Job job = new Job();

    WordCountUtil wordCountUtil = new WordCountUtil("trevniMapReduceKeyTest", "part-r-00000");

    wordCountUtil.writeLinesFile();

    AvroJob.setInputKeySchema(job, STRING);
    AvroJob.setOutputKeySchema(job, Pair.getPairSchema(STRING, LONG));

    job.setMapperClass(WordCountMapper.class);
    job.setReducerClass(WordCountReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    FileInputFormat.setInputPaths(job, new Path(wordCountUtil.getDir().toString() + "/in"));
    FileOutputFormat.setOutputPath(job, new Path(wordCountUtil.getDir().toString() + "/out"));
    FileOutputFormat.setCompressOutput(job, true);

    job.setInputFormatClass(AvroKeyInputFormat.class);
    job.setOutputFormatClass(AvroTrevniKeyOutputFormat.class);

    job.waitForCompletion(true);

    wordCountUtil.validateCountsFile();
  }

  public void checkInputFormat() throws Exception {
    Job job = new Job();

    WordCountUtil wordCountUtil = new WordCountUtil("trevniMapReduceKeyTest");

    job.setMapperClass(Counter.class);

    // Read back only the "value" column of each stored pair: Trevni is a
    // column file format, so a reader schema that is a subset of the writer
    // schema projects away the unneeded "key" column.
    Schema subSchema = Schema.parse("{\"type\":\"record\"," +
        "\"name\":\"PairValue\"," +
        "\"fields\": [ " +
        "{\"name\":\"value\", \"type\":\"long\"}" +
        "]}");
    AvroJob.setInputKeySchema(job, subSchema);

    FileInputFormat.setInputPaths(job, new Path(wordCountUtil.getDir().toString() + "/out/*"));
    job.setInputFormatClass(AvroTrevniKeyInputFormat.class);

    job.setNumReduceTasks(0);
    job.setOutputFormatClass(NullOutputFormat.class);

    // Reset the accumulator; the local job runner executes the mapper in this
    // JVM, so the static field is visible to the assertion below.
    total = 0;
    job.waitForCompletion(true);
    assertEquals(WordCountUtil.TOTAL, total);
  }
}