/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.  See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.apache.avro.mapreduce;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.avro.mapred.FsInput;
import org.apache.avro.reflect.ReflectData;
import org.apache.avro.reflect.ReflectDatumReader;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

public class TestWordCount {
  @Rule
  public TemporaryFolder tmpFolder = new TemporaryFolder();

  /** Schema of the job's output record: a count and a name. */
  public static final Schema STATS_SCHEMA = Schema.parse(
      "{\"name\":\"stats\",\"type\":\"record\","
      + "\"fields\":[{\"name\":\"count\",\"type\":\"int\"},"
      + "{\"name\":\"name\",\"type\":\"string\"}]}");

  /** A plain Java class read and written through Avro reflection. */
  public static class ReflectStats {
    String name;
    int count;
  }

  // Permit data written with the specific TextStats schema to be read as ReflectStats.
  private static final Schema REFLECT_STATS_SCHEMA = ReflectData.get().getSchema(ReflectStats.class);
  static {
    REFLECT_STATS_SCHEMA.addAlias(TextStats.SCHEMA$.getFullName());
  }

  /** Counts the lines of a text file by emitting (line, 1) pairs. */
  private static class LineCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private IntWritable mOne;

    @Override
    protected void setup(Context context) {
      mOne = new IntWritable(1);
    }

    @Override
    protected void map(LongWritable fileByteOffset, Text line, Context context)
        throws IOException, InterruptedException {
      context.write(line, mOne);
    }
  }

  /** Re-emits specific TextStats input records as (name, count) writable pairs. */
  private static class StatCountMapper
      extends Mapper<AvroKey<TextStats>, NullWritable, Text, IntWritable> {
    private IntWritable mCount;
    private Text mText;

    @Override
    protected void setup(Context context) {
      mCount = new IntWritable(0);
      mText = new Text("");
    }
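    // The upstream AvroKeyInputFormat delivers each TextStats datum wrapped in an
    // AvroKey with a NullWritable value; unwrap it into plain Hadoop writables here.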
    @Override
    protected void map(AvroKey<TextStats> record, NullWritable ignore, Context context)
        throws IOException, InterruptedException {
      mCount.set(record.datum().count);
      mText.set(record.datum().name.toString());
      context.write(mText, mCount);
    }
  }

  /** Like StatCountMapper, but consumes records materialized through Avro reflection. */
  private static class ReflectCountMapper
      extends Mapper<AvroKey<ReflectStats>, NullWritable, Text, IntWritable> {
    private IntWritable mCount;
    private Text mText;

    @Override
    protected void setup(Context context) {
      mCount = new IntWritable(0);
      mText = new Text("");
    }

    @Override
    protected void map(AvroKey<ReflectStats> record, NullWritable ignore, Context context)
        throws IOException, InterruptedException {
      mCount.set(record.datum().count);
      mText.set(record.datum().name);
      context.write(mText, mCount);
    }
  }

  /** Sums the counts for each word and emits the totals as AvroKey/AvroValue wrappers. */
  private static class AvroSumReducer
      extends Reducer<Text, IntWritable, AvroKey<CharSequence>, AvroValue<Integer>> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> counts, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable count : counts) {
        sum += count.get();
      }
      context.write(new AvroKey<CharSequence>(key.toString()), new AvroValue<Integer>(sum));
    }
  }

  /** Sums the counts for each word into a generic record conforming to STATS_SCHEMA. */
  private static class GenericStatsReducer
      extends Reducer<Text, IntWritable, AvroKey<GenericData.Record>, NullWritable> {
    private AvroKey<GenericData.Record> mStats;

    @Override
    protected void setup(Context context) {
      mStats = new AvroKey<GenericData.Record>(null);
    }

    @Override
    protected void reduce(Text line, Iterable<IntWritable> counts, Context context)
        throws IOException, InterruptedException {
      GenericData.Record record = new GenericData.Record(STATS_SCHEMA);
      int sum = 0;
      for (IntWritable count : counts) {
        sum += count.get();
      }
      record.put("name", new Utf8(line.toString()));
      record.put("count", sum);
      mStats.datum(record);
      context.write(mStats, NullWritable.get());
    }
  }

  /** Sums the counts for each word into a generated specific TextStats record. */
  private static class SpecificStatsReducer
      extends Reducer<Text, IntWritable, AvroKey<TextStats>, NullWritable> {
    private AvroKey<TextStats> mStats;

    @Override
    protected void setup(Context context) {
      mStats = new AvroKey<TextStats>(null);
    }

    @Override
    protected void reduce(Text line, Iterable<IntWritable> counts, Context context)
        throws IOException, InterruptedException {
      TextStats record = new TextStats();
      record.count = 0;
      for (IntWritable count : counts) {
        record.count += count.get();
      }
      record.name = line.toString();
      mStats.datum(record);
      context.write(mStats, NullWritable.get());
    }
  }

  /** Sums the counts for each word into a reflect-based ReflectStats record. */
  private static class ReflectStatsReducer
      extends Reducer<Text, IntWritable, AvroKey<ReflectStats>, NullWritable> {
    private AvroKey<ReflectStats> mStats;

    @Override
    protected void setup(Context context) {
      mStats = new AvroKey<ReflectStats>(null);
    }

    @Override
    protected void reduce(Text line, Iterable<IntWritable> counts, Context context)
        throws IOException, InterruptedException {
      ReflectStats record = new ReflectStats();
      record.count = 0;
      for (IntWritable count : counts) {
        record.count += count.get();
      }
      record.name = line.toString();
      mStats.datum(record);
      context.write(mStats, NullWritable.get());
    }
  }

  /** Identity mapper used to exercise AvroKey as the intermediate map-output key. */
  private static class SortMapper
      extends Mapper<AvroKey<TextStats>, NullWritable, AvroKey<TextStats>, NullWritable> {
    @Override
    protected void map(AvroKey<TextStats> key, NullWritable value, Context context)
        throws IOException, InterruptedException {
      context.write(key, value);
    }
  }

  /** Identity reducer that writes each distinct AvroKey once. */
  private static class SortReducer
      extends Reducer<AvroKey<TextStats>, NullWritable, AvroKey<TextStats>, NullWritable> {
    @Override
    protected void reduce(AvroKey<TextStats> key, Iterable<NullWritable> ignore, Context context)
        throws IOException, InterruptedException {
      context.write(key, NullWritable.get());
    }
  }
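  /**
   * Tests a word count from text input to an Avro container file of generic
   * records conforming to STATS_SCHEMA.
   */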
  @Test
  public void testAvroGenericOutput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(getClass()
        .getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt")
        .toURI().toString()));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(LineCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(GenericStatsReducer.class);
    AvroJob.setOutputKeySchema(job, STATS_SCHEMA);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-generic");
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<GenericData.Record> reader = new DataFileReader<GenericData.Record>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new GenericDatumReader<GenericData.Record>(STATS_SCHEMA));
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (GenericData.Record record : reader) {
      counts.put(((Utf8) record.get("name")).toString(), (Integer) record.get("count"));
    }
    reader.close();

    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
  }

  /** Tests the same word count with a generated specific record (TextStats) as output. */
  @Test
  public void testAvroSpecificOutput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(getClass()
        .getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt")
        .toURI().toString()));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(LineCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(SpecificStatsReducer.class);
    AvroJob.setOutputKeySchema(job, TextStats.SCHEMA$);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-specific");
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
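    // With the job's default single reduce task there should be exactly one part file;
    // the no-arg SpecificDatumReader picks up the writer schema from the file and
    // materializes records as the generated TextStats class.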
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<TextStats> reader = new DataFileReader<TextStats>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new SpecificDatumReader<TextStats>());
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (TextStats record : reader) {
      counts.put(record.name.toString(), record.count);
    }
    reader.close();

    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
  }

  /** Tests the same word count with a reflect-based record as output. */
  @Test
  public void testAvroReflectOutput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(getClass()
        .getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt")
        .toURI().toString()));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(LineCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(ReflectStatsReducer.class);
    AvroJob.setOutputKeySchema(job, REFLECT_STATS_SCHEMA);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-reflect");
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<ReflectStats> reader = new DataFileReader<ReflectStats>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new ReflectDatumReader<ReflectStats>());
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (ReflectStats record : reader) {
      counts.put(record.name.toString(), record.count);
    }
    reader.close();

    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
  }

  /** Tests reading specific TextStats records as the job's input. */
  @Test
  public void testAvroInput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(getClass()
        .getResource("/org/apache/avro/mapreduce/mapreduce-test-input.avro")
        .toURI().toString()));
    job.setInputFormatClass(AvroKeyInputFormat.class);
    AvroJob.setInputKeySchema(job, TextStats.SCHEMA$);

    job.setMapperClass(StatCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(SpecificStatsReducer.class);
    AvroJob.setOutputKeySchema(job, TextStats.SCHEMA$);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-specific-input");
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
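    // Verification mirrors testAvroSpecificOutput: one part file of TextStats records
    // whose per-word counts must match the expected totals.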
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<TextStats> reader = new DataFileReader<TextStats>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new SpecificDatumReader<TextStats>());
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (TextStats record : reader) {
      counts.put(record.name.toString(), record.count);
    }
    reader.close();

    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
  }

  /** Tests reading the same Avro input through reflection, relying on the schema alias. */
  @Test
  public void testReflectInput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(getClass()
        .getResource("/org/apache/avro/mapreduce/mapreduce-test-input.avro")
        .toURI().toString()));
    job.setInputFormatClass(AvroKeyInputFormat.class);
    AvroJob.setInputKeySchema(job, REFLECT_STATS_SCHEMA);

    job.setMapperClass(ReflectCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(ReflectStatsReducer.class);
    AvroJob.setOutputKeySchema(job, REFLECT_STATS_SCHEMA);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-reflect-input");
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<ReflectStats> reader = new DataFileReader<ReflectStats>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new ReflectDatumReader<ReflectStats>());
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (ReflectStats record : reader) {
      counts.put(record.name.toString(), record.count);
    }
    reader.close();

    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
  }

  /** Tests using AvroKey as the intermediate map-output key type. */
  @Test
  public void testAvroMapOutput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(getClass()
        .getResource("/org/apache/avro/mapreduce/mapreduce-test-input.avro")
        .toURI().toString()));
    job.setInputFormatClass(AvroKeyInputFormat.class);
    AvroJob.setInputKeySchema(job, TextStats.SCHEMA$);

    job.setMapperClass(SortMapper.class);
    AvroJob.setMapOutputKeySchema(job, TextStats.SCHEMA$);
    job.setMapOutputValueClass(NullWritable.class);

    job.setReducerClass(SortReducer.class);
    AvroJob.setOutputKeySchema(job, TextStats.SCHEMA$);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-map-output");
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
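    // AvroJob.setMapOutputKeySchema registered Avro serialization for the shuffle,
    // so the intermediate TextStats keys were compared with Avro's schema-defined ordering.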
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<TextStats> reader = new DataFileReader<TextStats>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new SpecificDatumReader<TextStats>());
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (TextStats record : reader) {
      counts.put(record.name.toString(), record.count);
    }
    reader.close();

    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
  }

  /**
   * Tests the MR output to text files when using AvroKey and AvroValue records.
   */
  @Test
  public void testAvroUsingTextFileOutput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(getClass()
        .getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt")
        .toURI().toString()));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(LineCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(AvroSumReducer.class);
    AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT));

    job.setOutputFormatClass(TextOutputFormat.class);
    Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-text");
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
    Assert.assertEquals(1, outputFiles.length);
    Path filePath = outputFiles[0].getPath();
    InputStream inputStream = filePath.getFileSystem(job.getConfiguration()).open(filePath);
    Assert.assertNotNull(inputStream);
    BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
    try {
      Assert.assertTrue(reader.ready());
      Assert.assertEquals("apple\t3", reader.readLine());
      Assert.assertEquals("banana\t2", reader.readLine());
      Assert.assertEquals("carrot\t1", reader.readLine());
      Assert.assertFalse(reader.ready());
    } finally {
      reader.close();
    }
  }
}