/**
 * (c) Copyright 2013 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kiji.mapreduce;

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.io.OutputStream;
import java.util.Random;

import com.google.common.base.Preconditions;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.kiji.mapreduce.bulkimport.KijiBulkImportJobBuilder;
import org.kiji.mapreduce.bulkimport.KijiBulkImporter;
import org.kiji.mapreduce.input.MapReduceJobInputs;
import org.kiji.mapreduce.output.MapReduceJobOutputs;
import org.kiji.schema.EntityId;
import org.kiji.schema.Kiji;
import org.kiji.schema.KijiDataRequest;
import org.kiji.schema.KijiDataRequestBuilder;
import org.kiji.schema.KijiRowData;
import org.kiji.schema.KijiRowScanner;
import org.kiji.schema.KijiTable;
import org.kiji.schema.KijiTableReader;
import org.kiji.schema.layout.KijiTableLayout;
import org.kiji.schema.testutil.AbstractKijiIntegrationTest;

public class IntegrationTestKijiBulkLoad extends AbstractKijiIntegrationTest {
  private static final Logger LOG =
      LoggerFactory.getLogger(IntegrationTestKijiBulkLoad.class);

  private Configuration mConf = null;
  private FileSystem mFS = null;
  private Path mBulkImportInputPath = null;
  private Kiji mKiji = null;
  private KijiTable mOutputTable = null;
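
  // Note: the bulk-import input resource written in setUp() is parsed as one "rowKey:name"
  // record per line. This format is inferred from the split(":") parsing in
  // TombstoningBulkImporter below; the resource file itself is not shown in this class.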

  /**
   * Generates a random HDFS path.
   *
   * @param prefix Prefix for the random file name.
   * @return a random HDFS path.
   * @throws Exception on error.
   */
  private Path makeRandomPath(String prefix) throws Exception {
    Preconditions.checkNotNull(mFS);
    final Path base = new Path(FileSystem.getDefaultUri(mConf));
    final Random random = new Random(System.nanoTime());
    return new Path(base, String.format("/%s-%s", prefix, random.nextLong()));
  }

  /** Writes the content of the given test resource to an HDFS file at the given path. */
  private void writeTestResource(Path path, String testResource) throws Exception {
    final OutputStream ostream = mFS.create(path);
    IOUtils.write(TestingResources.get(testResource), ostream);
    ostream.close();
  }

  @Before
  public void setUp() throws Exception {
    mConf = createConfiguration();
    mFS = FileSystem.get(mConf);
    mBulkImportInputPath = makeRandomPath("bulk-import-input");

    // Prepare input file:
    writeTestResource(mBulkImportInputPath, "org/kiji/mapreduce/TestBulkImportInput.txt");

    mKiji = Kiji.Factory.open(getKijiURI(), mConf);
    final KijiTableLayout layout = KijiTableLayout.newLayout(KijiMRTestLayouts.getTestLayout());
    mKiji.createTable("test", layout);

    mOutputTable = mKiji.openTable("test");
  }

  @After
  public void tearDown() throws Exception {
    mOutputTable.release();
    mKiji.release();
    mFS.delete(mBulkImportInputPath, false);

    // NOTE: fs should get closed here, but doesn't because of a bug with FileSystem that
    // causes it to close other threads' filesystem objects. For more information
    // see: https://issues.apache.org/jira/browse/HADOOP-7973

    mOutputTable = null;
    mKiji = null;
    mBulkImportInputPath = null;
    mFS = null;
    mConf = null;
  }

  @Test
  public void testBulkLoadHFile() throws Exception {
    final Path hfileDirPath = this.makeRandomPath("hfile-output");

    final KijiMapReduceJob mrjob = KijiBulkImportJobBuilder.create()
        .withConf(getConf())
        .withInput(MapReduceJobInputs.newTextMapReduceJobInput(mBulkImportInputPath))
        .withBulkImporter(TestBulkImporter.SimpleBulkImporter.class)
        .withOutput(MapReduceJobOutputs.newHFileMapReduceJobOutput(
            mOutputTable.getURI(), hfileDirPath))
        .build();
    assertTrue(mrjob.run());

    final HFileLoader loader = HFileLoader.create(mConf);
    // There is only one reducer, hence one HFile shard:
    final Path hfilePath = new Path(hfileDirPath, "part-r-00000.hfile");
    loader.load(hfilePath, mOutputTable);
  }
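
  /**
   * The bulk-load tests in this file reference {@code TestBulkImporter.SimpleBulkImporter},
   * which is defined in a separate test class and not shown here. The class below is a
   * minimal sketch of what such an importer could look like over the same "rowKey:name"
   * input format; it is an illustrative assumption, not the actual implementation, and is
   * not referenced by any test in this file.
   */
  public static class SimpleBulkImporterSketch extends KijiBulkImporter<LongWritable, Text> {
    /** {@inheritDoc} */
    @Override
    public void produce(LongWritable inputKey, Text value, KijiTableContext context)
        throws IOException {
      // Parse a "rowKey:name" line (assumed input format, mirroring TombstoningBulkImporter):
      final String line = value.toString();
      final String[] split = line.split(":");
      Preconditions.checkState(split.length == 2,
          String.format("Unable to parse bulk-import test input line: '%s'.", line));
      // Hypothetical behavior: emit the name into the "primitives:string" column at
      // timestamp 1, matching the column and timestamp used by TombstoningBulkImporter.
      context.put(context.getEntityId(split[0]), "primitives", "string", 1L, split[1]);
    }
  }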

  /**
   * Bulk importer intended to run on the generic KijiMR test layout. Uses the resource
   * org/kiji/mapreduce/layout/test.json.
   *
   * This importer emits cells, but immediately deletes some of them as well: we should
   * not see the deleted cells after bulk load.
   */
  public static class TombstoningBulkImporter extends KijiBulkImporter<LongWritable, Text> {
    /** {@inheritDoc} */
    @Override
    public void produce(LongWritable inputKey, Text value, KijiTableContext context)
        throws IOException {
      final String line = value.toString();
      final String[] split = line.split(":");
      Preconditions.checkState(split.length == 2,
          String.format("Unable to parse bulk-import test input line: '%s'.", line));
      final String rowKey = split[0];
      final String name = split[1];

      final EntityId eid = context.getEntityId(rowKey);
      context.put(eid, "primitives", "string", 1L, name);
      context.put(eid, "primitives", "long", 1L, inputKey.get());
      // Now delete the long cell:
      context.deleteCell(eid, "primitives", "long", 1L);
    }
  }

  @Test
  public void testBulkLoadHFilesWithTombstones() throws Exception {
    final Path hfileDirPath = this.makeRandomPath("hfile-output");

    final KijiMapReduceJob mrjob = KijiBulkImportJobBuilder.create()
        .withConf(getConf())
        .withInput(MapReduceJobInputs.newTextMapReduceJobInput(mBulkImportInputPath))
        .withBulkImporter(TombstoningBulkImporter.class)
        .withOutput(MapReduceJobOutputs.newHFileMapReduceJobOutput(
            mOutputTable.getURI(), hfileDirPath))
        .build();
    assertTrue(mrjob.run());

    final HFileLoader loader = HFileLoader.create(mConf);
    loader.load(hfileDirPath, mOutputTable);

    final KijiDataRequestBuilder dataRequestBuilder = KijiDataRequest.builder();
    dataRequestBuilder
        .newColumnsDef()
        .add("primitives", "string")
        .add("primitives", "long");
    final KijiDataRequest dataRequest = dataRequestBuilder.build();

    final KijiTableReader tableReader = mOutputTable.openTableReader();
    try {
      final KijiRowScanner scanner = tableReader.getScanner(dataRequest);
      try {
        int rows = 0;
        for (KijiRowData row : scanner) {
          rows++;
          // The string cell survives the bulk load; the deleted long cell must not:
          assertTrue(row.containsColumn("primitives", "string"));
          assertFalse(row.containsColumn("primitives", "long"));
        }
        assertTrue("At least one row should have been found.", rows > 0);
      } finally {
        scanner.close();
      }
    } finally {
      tableReader.close();
    }
  }

  @Test
  public void testBulkLoadDirectory() throws Exception {
    final Path hfileDirPath = this.makeRandomPath("hfile-output");

    final KijiMapReduceJob mrjob = KijiBulkImportJobBuilder.create()
        .withConf(getConf())
        .withInput(MapReduceJobInputs.newTextMapReduceJobInput(mBulkImportInputPath))
        .withBulkImporter(TestBulkImporter.SimpleBulkImporter.class)
        .withOutput(MapReduceJobOutputs.newHFileMapReduceJobOutput(
            mOutputTable.getURI(), hfileDirPath))
        .build();
    assertTrue(mrjob.run());

    final HFileLoader loader = HFileLoader.create(mConf);
    // Load the entire directory of HFiles instead of a single named shard:
    loader.load(hfileDirPath, mOutputTable);
  }

  @Test
  public void testBulkLoadMultipleSplits() throws Exception {
    final int nSplits = 3;
    final Path hfileDirPath = this.makeRandomPath("hfile-output");

    final KijiMapReduceJob mrjob = KijiBulkImportJobBuilder.create()
        .withConf(getConf())
        .withInput(MapReduceJobInputs.newTextMapReduceJobInput(mBulkImportInputPath))
        .withBulkImporter(TestBulkImporter.SimpleBulkImporter.class)
        .withOutput(MapReduceJobOutputs.newHFileMapReduceJobOutput(
            mOutputTable.getURI(), hfileDirPath, nSplits))
        .build();
    assertTrue(mrjob.run());

    final HFileLoader loader = HFileLoader.create(mConf);
    loader.load(hfileDirPath, mOutputTable);
  }
}