/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to You under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.blur.hive;

import java.io.IOException;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.blur.manager.BlurPartitioner;
import org.apache.blur.mapreduce.lib.BlurColumn;
import org.apache.blur.mapreduce.lib.BlurOutputFormat;
import org.apache.blur.mapreduce.lib.BlurRecord;
import org.apache.blur.thirdparty.thrift_0_9_0.TException;
import org.apache.blur.thrift.BlurClient;
import org.apache.blur.thrift.generated.Blur.Iface;
import org.apache.blur.thrift.generated.BlurException;
import org.apache.blur.thrift.generated.Column;
import org.apache.blur.thrift.generated.Record;
import org.apache.blur.thrift.generated.RecordMutation;
import org.apache.blur.thrift.generated.RecordMutationType;
import org.apache.blur.thrift.generated.RowMutation;
import org.apache.blur.thrift.generated.RowMutationType;
import org.apache.blur.thrift.generated.TableDescriptor;
import org.apache.blur.utils.BlurConstants;
import org.apache.blur.utils.ShardUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Writer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.Progressable;

/**
 * {@link HiveOutputFormat} that writes Hive rows into Blur. Depending on the
 * job configuration, records are either staged as sequence files under the MR
 * bulk-update working path or streamed directly to the Blur shard servers as
 * batched bulk row mutations.
 */
public class BlurHiveOutputFormat implements HiveOutputFormat<Text, BlurRecord> {

  private static final String BLUR_USER_PROXY = "blur.user.proxy";
  private static final String BLUR = "blur";
  private static final String BLUR_USER_NAME = "blur.user.name";
  private static final String BLUR_BULK_MUTATE_ID = "blur.bulk.mutate.id";

  public static String getBulkId(Configuration conf) {
    return conf.get(BLUR_BULK_MUTATE_ID);
  }

  public static void setBulkId(Configuration conf, String bulkId) {
    conf.set(BLUR_BULK_MUTATE_ID, bulkId);
  }

  @Override
  public void checkOutputSpecs(FileSystem fileSystem, JobConf jobConf) throws IOException {
    // No output specs to validate.
  }

  @Override
  public RecordWriter<Text, BlurRecord> getRecordWriter(FileSystem fileSystem, JobConf jobConf, String name,
      Progressable progressable) throws IOException {
    // Hive only invokes getHiveRecordWriter, never the plain MapReduce method.
    throw new RuntimeException("Should never be called.");
  }
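  /**
   * Chooses between the two write paths based on the job configuration: when
   * {@code BlurSerDe.shouldUseMRWorkingPath(jc)} returns true, records are
   * appended to sequence files under the bulk-update working path; otherwise
   * they are sent to the shard servers as batched bulk mutations.
   */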
  @Override
  public org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter getHiveRecordWriter(JobConf jc,
      Path finalOutPath, Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties,
      Progressable progress) throws IOException {
    if (BlurSerDe.shouldUseMRWorkingPath(jc)) {
      return getMrWorkingPathWriter(jc);
    }
    return getBulkRecordWriter(jc);
  }

  private org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter getMrWorkingPathWriter(
      final Configuration configuration) throws IOException {
    PrivilegedExceptionAction<org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter> privilegedExceptionAction = new PrivilegedExceptionAction<org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter>() {
      @Override
      public org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter run() throws Exception {
        String workingPathStr = configuration.get(BlurConstants.BLUR_BULK_UPDATE_WORKING_PATH);
        Path workingPath = new Path(workingPathStr);
        Path tmpDir = new Path(workingPath, "tmp");
        FileSystem fileSystem = tmpDir.getFileSystem(configuration);
        String loadId = configuration.get(BlurSerDe.BLUR_MR_LOAD_ID);
        Path loadPath = new Path(tmpDir, loadId);
        // Each writer gets its own uniquely named sequence file under the load id directory.
        final Writer writer = new SequenceFile.Writer(fileSystem, configuration, new Path(loadPath, UUID.randomUUID()
            .toString()), Text.class, BlurRecord.class);
        return new org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter() {

          @Override
          public void write(Writable w) throws IOException {
            BlurRecord blurRecord = (BlurRecord) w;
            String rowId = blurRecord.getRowId();
            writer.append(new Text(rowId), blurRecord);
          }

          @Override
          public void close(boolean abort) throws IOException {
            writer.close();
          }
        };
      }
    };
    UserGroupInformation userGroupInformation = getUGI(configuration);
    try {
      // Create the staging file as the configured Blur user.
      return userGroupInformation.doAs(privilegedExceptionAction);
    } catch (InterruptedException e) {
      throw new IOException(e);
    }
  }

  public static UserGroupInformation getUGI(final Configuration configuration) throws IOException {
    String user = getBlurUser(configuration);
    UserGroupInformation userGroupInformation;
    UserGroupInformation currentUser = UserGroupInformation.getCurrentUser();
    if (user.equals(currentUser.getUserName())) {
      userGroupInformation = currentUser;
    } else {
      if (BlurHiveOutputFormat.isBlurUserAsProxy(configuration)) {
        userGroupInformation = UserGroupInformation.createProxyUser(user, currentUser);
      } else {
        userGroupInformation = UserGroupInformation.createRemoteUser(user);
      }
    }
    return userGroupInformation;
  }

  public static boolean isBlurUserAsProxy(Configuration configuration) {
    return configuration.getBoolean(BLUR_USER_PROXY, false);
  }

  public static void setBlurUserAsProxy(Configuration configuration, boolean blurUserProxy) {
    configuration.setBoolean(BLUR_USER_PROXY, blurUserProxy);
  }

  public static String getBlurUser(Configuration configuration) {
    return configuration.get(BLUR_USER_NAME, BLUR);
  }

  public static void setBlurUser(Configuration configuration, String blurUser) {
    configuration.set(BLUR_USER_NAME, blurUser);
  }
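  /**
   * Builds the direct write path: each row id is partitioned to a shard with
   * {@link BlurPartitioner}, the shard is resolved to a server through the
   * controller's shardServerLayout, and mutations are buffered per server and
   * flushed via bulkMutateAddMultiple once a batch reaches capacity.
   */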
  private org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter getBulkRecordWriter(Configuration configuration)
      throws IOException {
    TableDescriptor tableDescriptor = BlurOutputFormat.getTableDescriptor(configuration);
    String conStr = configuration.get(BlurSerDe.BLUR_CONTROLLER_CONNECTION_STR);
    final Iface controllerClient = BlurClient.getClient(conStr);
    final String table = tableDescriptor.getName();
    final int numberOfShardsInTable = tableDescriptor.getShardCount();
    final String bulkId = getBulkId(configuration);
    return new org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter() {

      private BlurPartitioner _blurPartitioner = new BlurPartitioner();
      private Map<String, List<RowMutation>> _serverBatches = new ConcurrentHashMap<String, List<RowMutation>>();
      private int _capacity = 100;
      private Map<String, String> _shardToServerLayout;

      @Override
      public void write(Writable w) throws IOException {
        BlurRecord blurRecord = (BlurRecord) w;
        String rowId = blurRecord.getRowId();
        RowMutation rowMutation = new RowMutation();
        rowMutation.setTable(table);
        rowMutation.setRowId(rowId);
        rowMutation.setRowMutationType(RowMutationType.UPDATE_ROW);
        rowMutation.addToRecordMutations(new RecordMutation(RecordMutationType.REPLACE_ENTIRE_RECORD,
            toRecord(blurRecord)));
        try {
          String server = getServer(rowId);
          List<RowMutation> batch = _serverBatches.get(server);
          if (batch == null) {
            _serverBatches.put(server, batch = new ArrayList<RowMutation>(_capacity));
          }
          batch.add(rowMutation);
          // Flush any server batch that has reached capacity.
          checkForFlush(_capacity);
        } catch (BlurException e) {
          throw new IOException(e);
        } catch (TException e) {
          throw new IOException(e);
        }
      }

      @Override
      public void close(boolean abort) throws IOException {
        try {
          // Flush everything that is still buffered.
          checkForFlush(1);
        } catch (BlurException e) {
          throw new IOException(e);
        } catch (TException e) {
          throw new IOException(e);
        }
      }

      private void checkForFlush(int max) throws BlurException, TException {
        for (Entry<String, List<RowMutation>> e : _serverBatches.entrySet()) {
          String server = e.getKey();
          List<RowMutation> batch = e.getValue();
          if (batch.size() >= max) {
            Iface client = BlurClient.getClient(server);
            client.bulkMutateAddMultiple(bulkId, batch);
            batch.clear();
          }
        }
      }

      private String getServer(String rowId) throws BlurException, TException {
        int shard = _blurPartitioner.getShard(rowId, numberOfShardsInTable);
        String shardId = ShardUtil.getShardName(shard);
        return getServerFromShardId(table, shardId);
      }

      private String getServerFromShardId(String table, String shardId) throws BlurException, TException {
        if (_shardToServerLayout == null) {
          // The shard-to-server layout is fetched once and cached for the life of the writer.
          _shardToServerLayout = controllerClient.shardServerLayout(table);
        }
        return _shardToServerLayout.get(shardId);
      }
    };
  }

  protected Record toRecord(BlurRecord blurRecord) {
    return new Record(blurRecord.getRecordId(), blurRecord.getFamily(), toColumns(blurRecord.getColumns()));
  }

  private List<Column> toColumns(List<BlurColumn> columns) {
    List<Column> result = new ArrayList<Column>();
    for (BlurColumn blurColumn : columns) {
      result.add(new Column(blurColumn.getName(), blurColumn.getValue()));
    }
    return result;
  }

}
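/*
 * A minimal configuration sketch showing how a job might wire up this output
 * format through the static setters above:
 *
 *   JobConf jobConf = new JobConf();
 *   BlurHiveOutputFormat.setBulkId(jobConf, UUID.randomUUID().toString());
 *   BlurHiveOutputFormat.setBlurUser(jobConf, "blur");
 *   BlurHiveOutputFormat.setBlurUserAsProxy(jobConf, true);
 */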