/**
 * (c) Copyright 2012 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.kiji.mapreduce;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.avro.Schema;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.junit.Test;

import org.kiji.mapreduce.framework.HBaseKijiTableInputFormat;
import org.kiji.mapreduce.framework.KijiTableInputFormat;
import org.kiji.schema.DecodedCell;
import org.kiji.schema.EntityId;
import org.kiji.schema.HBaseEntityId;
import org.kiji.schema.KijiDataRequest;
import org.kiji.schema.KijiDataRequestBuilder.ColumnsDef;
import org.kiji.schema.KijiRowData;
import org.kiji.schema.filter.ColumnValueEqualsRowFilter;
import org.kiji.schema.filter.KijiRowFilter;
import org.kiji.schema.testutil.FooTableIntegrationTest;

/** Tests for the KijiTableInputFormat. */
public class IntegrationTestKijiTableInputFormat extends FooTableIntegrationTest {
  /** Mapper that emits one (email domain, name) pair per input row. */
  public static class TestMapper extends Mapper<EntityId, KijiRowData, Text, Text> {
    @Override
    public void map(EntityId entityId, KijiRowData row, Context context)
        throws IOException, InterruptedException {
      final String name = row.getMostRecentValue("info", "name").toString();
      final String email = row.getMostRecentValue("info", "email").toString();

      // Build email domain regex.
      final Pattern emailRegex = Pattern.compile(".+@(.+)");
      final Matcher emailMatcher = emailRegex.matcher(email);

      // Extract domain from email.
      assertTrue(emailMatcher.find());
      final String emailDomain = emailMatcher.group(1);
      context.write(new Text(emailDomain), new Text(name));
    }
  }

  /** Reducer that joins the names for each email domain into a comma-separated list. */
  public static class TestReducer extends Reducer<Text, Text, Text, Text> {
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
      // Combine all the names.
      final Iterator<Text> iter = values.iterator();
      final StringBuilder names = new StringBuilder(iter.next().toString());
      while (iter.hasNext()) {
        final Text name = iter.next();
        names.append(",");
        names.append(name.toString());
      }

      // Write names to context.
      final Text output = new Text(names.toString().trim());
      context.write(key, output);
    }
  }

  /** Builds a test job that reads "info:name" and "info:email" from the foo table. */
  public Job setupJob(
      String jobName,
      Path outputFile,
      Class<? extends Mapper> mapperClass,
      Class<? extends Reducer> reducerClass,
      EntityId startKey,
      EntityId limitKey,
      KijiRowFilter filter) throws Exception {
    final Job job = new Job(createConfiguration());
    final Configuration conf = job.getConfiguration();

    // Get settings for test.
    final KijiDataRequest request = KijiDataRequest.builder()
        .addColumns(ColumnsDef.create().add("info", "name").add("info", "email"))
        .build();

    job.setJarByClass(IntegrationTestKijiTableInputFormat.class);

    // Setup the InputFormat.
    KijiTableInputFormat.configureJob(
        job, getFooTable().getURI(), request, startKey, limitKey, filter);
    job.setInputFormatClass(HBaseKijiTableInputFormat.class);

    // Duplicate functionality from MapReduceJobBuilder, since we are not using it here:
    final List<Path> jarFiles = Lists.newArrayList();
    final FileSystem fs = FileSystem.getLocal(conf);
    for (String cpEntry : System.getProperty("java.class.path").split(":")) {
      if (cpEntry.endsWith(".jar")) {
        jarFiles.add(fs.makeQualified(new Path(cpEntry)));
      }
    }
    DistributedCacheJars.addJarsToDistributedCache(job, jarFiles);

    // Create a test job.
    job.setJobName(jobName);

    // Setup the OutputFormat.
    TextOutputFormat.setOutputPath(job, outputFile.getParent());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Set the mapper class.
    if (null != mapperClass) {
      job.setMapperClass(mapperClass);
    }

    // Set the reducer class.
    if (null != reducerClass) {
      job.setReducerClass(reducerClass);
    }

    return job;
  }

  /** Returns a unique output path for the current test method. */
  private Path createOutputFile() {
    return new Path(String.format("/%s-%s-%d/part-r-00000",
        getClass().getName(), mTestName.getMethodName(), System.currentTimeMillis()));
  }

  /** Test KijiTableInputFormat in a map-only job. */
  @Test
  public void testMapJob() throws Exception {
    final Path outputFile = createOutputFile();

    // Create a test job.
    final Job job = setupJob(
        "testMapJob",
        outputFile,
        TestMapper.class,
        null, // reducer class
        null, // start key
        null, // limit key
        null); // filter

    // Run the job.
    assertTrue("Hadoop job failed", job.waitForCompletion(true));

    // Check to make sure output exists.
    final FileSystem fs = FileSystem.get(job.getConfiguration());
    assertTrue(fs.exists(outputFile.getParent()));

    // Verify that the output matches what's expected.
    final FSDataInputStream in = fs.open(outputFile);
    final Set<String> actual = Sets.newHashSet(IOUtils.toString(in).trim().split("\n"));
    final Set<String> expected = Sets.newHashSet(
        "usermail.example.com\tAaron Kimball",
        "gmail.com\tJohn Doe",
        "usermail.example.com\tChristophe Bisciglia",
        "usermail.example.com\tKiyan Ahmadizadeh",
        "gmail.com\tJane Doe",
        "usermail.example.com\tGarrett Wu");
    assertEquals("Result of job wasn't what was expected", expected, actual);

    // Clean up.
    fs.delete(outputFile.getParent(), true);
    IOUtils.closeQuietly(in);
    // NOTE: fs should get closed here, but doesn't because of a bug with FileSystem that
    //     causes it to close other threads' filesystem objects. For more information
    //     see: https://issues.apache.org/jira/browse/HADOOP-7973
  }

  /** Test KijiTableInputFormat in a map-only job with start and limit keys. */
  @Test
  public void testMapJobWithStartAndLimitKeys() throws Exception {
    final Path outputFile = createOutputFile();

    // Scan exactly one row: the limit key is exclusive, so use the start row's HBase
    // key padded with a trailing zero byte (the smallest key strictly greater than
    // the start key) as the limit.
    final EntityId startEntityId = getFooTable().getEntityId("jane.doe@gmail.com");
    final byte[] startRowKey = startEntityId.getHBaseRowKey();
    final EntityId rawLimitEntityId =
        HBaseEntityId.fromHBaseRowKey(Arrays.copyOf(startRowKey, startRowKey.length + 1));

    // Create a test job.
    final Job job = setupJob(
        "testMapJobWithStartAndLimitKeys",
        outputFile,
        TestMapper.class,
        null, // reducer class
        startEntityId,
        rawLimitEntityId,
        null); // filter

    // Run the job.
    assertTrue("Hadoop job failed", job.waitForCompletion(true));

    // Check to make sure output exists.
    final FileSystem fs = FileSystem.get(job.getConfiguration());
    assertTrue(fs.exists(outputFile.getParent()));

    // Verify that the output matches what's expected.
    final FSDataInputStream in = fs.open(outputFile);
    final Set<String> actual = Sets.newHashSet(IOUtils.toString(in).trim().split("\n"));
    final Set<String> expected = Sets.newHashSet("gmail.com\tJane Doe");
    assertEquals("Result of job wasn't what was expected", expected, actual);

    // Clean up.
    fs.delete(outputFile.getParent(), true);
    IOUtils.closeQuietly(in);
    // NOTE: fs should get closed here, but doesn't because of a bug with FileSystem that
    //     causes it to close other threads' filesystem objects. For more information
    //     see: https://issues.apache.org/jira/browse/HADOOP-7973
  }

  /** Test KijiTableInputFormat in a map-only job with a row filter. */
  @Test
  public void testMapJobWithFilter() throws Exception {
    final KijiRowFilter filter = new ColumnValueEqualsRowFilter("info", "email",
        new DecodedCell<String>(Schema.create(Schema.Type.STRING),
            "aaron@usermail.example.com"));
    final Path outputFile = createOutputFile();

    // Create a test job.
    final Job job = setupJob(
        "testMapJobWithFilter",
        outputFile,
        TestMapper.class,
        null, // reducer class
        null, // start key
        null, // limit key
        filter);

    // Run the job.
    assertTrue("Hadoop job failed", job.waitForCompletion(true));

    // Check to make sure output exists.
    final FileSystem fs = FileSystem.get(job.getConfiguration());
    assertTrue(fs.exists(outputFile.getParent()));

    // Verify that the output matches what's expected.
    final FSDataInputStream in = fs.open(outputFile);
    final Set<String> actual = Sets.newHashSet(IOUtils.toString(in).trim().split("\n"));
    final Set<String> expected = Sets.newHashSet("usermail.example.com\tAaron Kimball");
    assertEquals("Result of job wasn't what was expected", expected, actual);

    // Clean up.
    fs.delete(outputFile.getParent(), true);
    IOUtils.closeQuietly(in);
    // NOTE: fs should get closed here, but doesn't because of a bug with FileSystem that
    //     causes it to close other threads' filesystem objects. For more information
    //     see: https://issues.apache.org/jira/browse/HADOOP-7973
  }

  /** Test KijiTableInputFormat in a MapReduce job. */
  @Test
  public void testMapReduceJob() throws Exception {
    final Path outputFile = createOutputFile();

    // Create a test job.
    final Job job = setupJob(
        "testMapReduceJob",
        outputFile,
        TestMapper.class,
        TestReducer.class,
        null, // start key
        null, // limit key
        null); // filter

    // Run the job.
    assertTrue("Hadoop job failed", job.waitForCompletion(true));

    // Check to make sure output exists.
    final FileSystem fs = FileSystem.get(job.getConfiguration());
    assertTrue(fs.exists(outputFile.getParent()));

    // Verify that the output matches what's expected.
    final FSDataInputStream in = fs.open(outputFile);
    final Set<String> output = Sets.newHashSet(IOUtils.toString(in).trim().split("\n"));

    // Parse each "domain\tname1,name2,..." output line into a map from domain to names.
    final ImmutableMap.Builder<String, Set<String>> builder = ImmutableMap.builder();
    for (String line : output) {
      final String[] keyValue = line.split("\t");
      final String emailDomain = keyValue[0];
      final Set<String> names = Sets.newHashSet(keyValue[1].split(","));
      builder.put(emailDomain, names);
    }
    final Map<String, Set<String>> actual = builder.build();
    final Map<String, Set<String>> expected = ImmutableMap.<String, Set<String>>builder()
        .put("usermail.example.com", Sets.newHashSet(
            "Aaron Kimball", "Christophe Bisciglia", "Kiyan Ahmadizadeh", "Garrett Wu"))
        .put("gmail.com", Sets.newHashSet("John Doe", "Jane Doe"))
        .build();
    assertEquals("Result of job wasn't what was expected", expected, actual);

    // Clean up.
    fs.delete(outputFile.getParent(), true);
    IOUtils.closeQuietly(in);
    // NOTE: fs should get closed here, but doesn't because of a bug with FileSystem that
    //     causes it to close other threads' filesystem objects. For more information
    //     see: https://issues.apache.org/jira/browse/HADOOP-7973
  }
}