/**
 * (c) Copyright 2012 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kiji.mapreduce.tools;

import java.io.IOException;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hbase.HConstants;

import org.kiji.annotations.ApiAudience;
import org.kiji.common.flags.Flag;
import org.kiji.mapreduce.HFileLoader;
import org.kiji.schema.Kiji;
import org.kiji.schema.KijiTable;
import org.kiji.schema.KijiURI;
import org.kiji.schema.tools.BaseTool;
import org.kiji.schema.tools.KijiToolLauncher;
import org.kiji.schema.util.ResourceUtils;

/** Bulk loads HFiles into a Kiji table. */
@ApiAudience.Private
public final class KijiBulkLoad extends BaseTool {
  /** ExecutorService to execute the callables when bulk-loading. */
  private ExecutorService mExecutorService = Executors.newCachedThreadPool();

  @Flag(name="hfile", usage="Path of the directory containing HFile(s) to bulk-load. "
      + "Typically --hfile=hdfs://hdfs-cluster-address/path/to/hfile.dir/")
  private String mHFileFlag = null;

  @Flag(name="table", usage="URI of the Kiji table to bulk-load into.")
  private String mTableURIFlag = null;

  @Flag(name="timeout-milliseconds", usage="Timeout in milliseconds to wait for a bulk-load to "
      + "succeed. Default 10 seconds (10000 milliseconds).")
  private final Long mLoadTimeoutMilliseconds = 10000L;  // 10 seconds

  @Flag(name="chmod-interactive", usage="When false, does not prompt for confirmation before "
      + "chmod'ing the HFile directory.")
  private Boolean mChmodInteractive = true;

  /** URI of the Kiji table to bulk-load into. */
  private KijiURI mTableURI = null;

  /** Path of the HFile(s) to bulk-load. */
  private Path mHFile = null;

  /** {@inheritDoc} */
  @Override
  public String getName() {
    return "bulk-load";
  }

  /** {@inheritDoc} */
  @Override
  public String getDescription() {
    return "Bulk load HFiles into a table";
  }

  /** {@inheritDoc} */
  @Override
  public String getCategory() {
    return "Bulk";
  }

  /**
   * Recursively sets the permissions to 777 on the HFiles. There is no built-in way in the
   * Hadoop Java API to recursively set permissions on a directory, so we implement it here.
   *
   * @param path The Path to the directory to chmod.
   * @throws IOException on I/O error.
   */
  private void recursiveGrantAllHFilePermissions(Path path) throws IOException {
    FileSystem hdfs = path.getFileSystem(getConf());
    // Set the permissions on the path itself.
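    // Note: 0777 (read/write/execute for user, group, and others) is deliberately blunt:
    // the HFiles must be readable and movable by whichever user HBase runs as, which is
    // generally not the user running this tool.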
    hdfs.setPermission(path, FsPermission.createImmutable((short) 0777));
    // Recurse into any files and directories in the path.
    // We must use listStatus because listFiles does not list subdirectories.
    FileStatus[] fileStatuses = hdfs.listStatus(path);
    for (FileStatus fileStatus : fileStatuses) {
      if (!fileStatus.getPath().equals(path)) {
        recursiveGrantAllHFilePermissions(fileStatus.getPath());
      }
    }
  }

  /** {@inheritDoc} */
  @Override
  protected void validateFlags() throws Exception {
    super.validateFlags();
    Preconditions.checkArgument((mTableURIFlag != null) && !mTableURIFlag.isEmpty(),
        "Specify the table to bulk-load into with "
        + "--table=kiji://hbase-address/kiji-instance/table");
    mTableURI = KijiURI.newBuilder(mTableURIFlag).build();
    Preconditions.checkArgument(mTableURI.getTable() != null,
        "Specify the table to bulk-load into with "
        + "--table=kiji://hbase-address/kiji-instance/table");
    Preconditions.checkArgument((mHFileFlag != null) && !mHFileFlag.isEmpty(),
        "Specify the HFiles to bulk-load. "
        + "E.g. --hfile=hdfs://hdfs-cluster-address/path/to/hfile.dir/");
    mHFile = new Path(mHFileFlag);
  }

  /** {@inheritDoc} */
  @Override
  protected int run(List<String> nonFlagArgs) throws Exception {
    final Kiji kiji = Kiji.Factory.open(mTableURI, getConf());
    try {
      final KijiTable table = kiji.openTable(mTableURI.getTable());
      try {
        // Load the HFiles.
        //
        // TODO: Consolidate this logic in a single central place: the logic that
        // properly initializes a Configuration object to target a specific HBase
        // cluster (hence the manual override of the ZooKeeper quorum/client-port)
        // should not be duplicated in individual tools.
        //
        // The manual override is needed here because KijiBulkLoad needs a
        // Configuration to create an HFileLoader for the HBase instance targeted
        // by the table URI. KijiTable does not expose its internal Configuration,
        // and Kiji.getConf() is deprecated, so we have to construct one externally.
        final Configuration conf = getConf();
        conf.set(HConstants.ZOOKEEPER_QUORUM,
            Joiner.on(",").join(mTableURI.getZookeeperQuorumOrdered()));
        conf.setInt(HConstants.ZOOKEEPER_CLIENT_PORT, mTableURI.getZookeeperClientPort());

        final HFileLoader hFileLoader = HFileLoader.create(conf);

        // Create a new Callable for loading the HFiles. It is submitted to the executor
        // so that the load runs in a separate thread; the main thread can then check
        // whether it is done and, if necessary, chmod 777 the HFile directory
        // concurrently while it runs.
        Callable<Void> hFileLoadCallable = new Callable<Void>() {
          /** {@inheritDoc} */
          @Override
          public Void call() throws Exception {
            hFileLoader.load(mHFile, table);
            return null;
          }
        };
        Future<Void> hFileLoadTask = mExecutorService.submit(hFileLoadCallable);
        final long startTime = System.currentTimeMillis();

        // Poll until the load completes or mLoadTimeoutMilliseconds has elapsed.
        // The task is deliberately not cancelled on timeout: after a chmod of the
        // HFile directory, the same pending load is given a second chance below.
        while (System.currentTimeMillis() < startTime + mLoadTimeoutMilliseconds
            && !hFileLoadTask.isDone()) {
          Thread.sleep(100);
        }
        if (hFileLoadTask.isDone()) {
          return SUCCESS;
        }

        // The load did not complete in mLoadTimeoutMilliseconds: try to chmod the directory.
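        // A common cause of such a hang is that the HBase region servers (which often
        // run as a different user, e.g. 'hbase') lack permission to read or move the
        // HFiles; granting world-wide permissions and waiting one more timeout period
        // frequently lets the pending load complete.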
        if (mChmodInteractive && !inputConfirmation(
            "First attempt at bulk-load timed out after " + mLoadTimeoutMilliseconds
            + " milliseconds. Do you want to chmod -R 777 the HFile directory?",
            mHFile.getName())) {
          getPrintStream().println("Bulk-load timed out, not retrying.");
          return FAILURE;
        }
        // Chmod (the user either confirmed above, or --chmod-interactive=false skipped
        // the prompt) and give the pending load one more timeout period to finish.
        recursiveGrantAllHFilePermissions(mHFile);
        try {
          hFileLoadTask.get(mLoadTimeoutMilliseconds, TimeUnit.MILLISECONDS);
        } catch (TimeoutException e) {
          getPrintStream().println("Bulk-load failed due to a second timeout.");
          return FAILURE;
        }
        return SUCCESS;
      } finally {
        ResourceUtils.releaseOrLog(table);
      }
    } finally {
      ResourceUtils.releaseOrLog(kiji);
    }
  }

  /**
   * Program entry point.
   *
   * @param args The command-line arguments.
   * @throws Exception If there is an error.
   */
  public static void main(String[] args) throws Exception {
    System.exit(new KijiToolLauncher().run(new KijiBulkLoad(), args));
  }
}