/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.pinterest.secor.parser;

import com.pinterest.secor.common.*;
import com.pinterest.secor.message.Message;
import com.pinterest.secor.util.CompressionUtil;
import com.pinterest.secor.util.FileUtil;
import com.pinterest.secor.util.ReflectionUtil;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Stack;

/**
 * Partition finalizer writes _SUCCESS files to date partitions that very likely won't be receiving
 * any new messages.  It also adds those partitions to Hive.
* * @author Pawel Garbacki (pawel@pinterest.com) */ public class PartitionFinalizer { private static final Logger LOG = LoggerFactory.getLogger(PartitionFinalizer.class); private final SecorConfig mConfig; private final ZookeeperConnector mZookeeperConnector; private final TimestampedMessageParser mMessageParser; private final KafkaClient mKafkaClient; private final QuboleClient mQuboleClient; private final String mFileExtension; private final int mLookbackPeriods; public PartitionFinalizer(SecorConfig config) throws Exception { mConfig = config; mKafkaClient = new KafkaClient(mConfig); mZookeeperConnector = new ZookeeperConnector(mConfig); mMessageParser = (TimestampedMessageParser) ReflectionUtil.createMessageParser( mConfig.getMessageParserClass(), mConfig); mQuboleClient = new QuboleClient(mConfig); if (mConfig.getFileExtension() != null && !mConfig.getFileExtension().isEmpty()) { mFileExtension = mConfig.getFileExtension(); } else if (mConfig.getCompressionCodec() != null && !mConfig.getCompressionCodec().isEmpty()) { CompressionCodec codec = CompressionUtil.createCompressionCodec(mConfig.getCompressionCodec()); mFileExtension = codec.getDefaultExtension(); } else { mFileExtension = ""; } mLookbackPeriods = config.getFinalizerLookbackPeriods(); LOG.info("Lookback periods: " + mLookbackPeriods); } private String[] getFinalizedUptoPartitions(String topic) throws Exception { final int numPartitions = mKafkaClient.getNumPartitions(topic); List<Message> lastMessages = new ArrayList<Message>(numPartitions); List<Message> committedMessages = new ArrayList<Message>(numPartitions); for (int partition = 0; partition < numPartitions; ++partition) { TopicPartition topicPartition = new TopicPartition(topic, partition); Message lastMessage = mKafkaClient.getLastMessage(topicPartition); Message committedMessage = mKafkaClient.getCommittedMessage(topicPartition); if (lastMessage == null || committedMessage == null) { // This will happen if no messages have been posted to the 
given topic partition. LOG.error("For topic {} partition {}, lastMessage: {}, committed: {}", topicPartition.getTopic(), topicPartition.getPartition(), lastMessage, committedMessage); continue; } lastMessages.add(lastMessage); committedMessages.add(committedMessage); } return mMessageParser.getFinalizedUptoPartitions(lastMessages, committedMessages); } private void finalizePartitionsUpTo(String topic, String[] uptoPartitions) throws Exception { String prefix = FileUtil.getPrefix(topic, mConfig); LOG.info("Finalize up to (but not include) {}, dim: {}", uptoPartitions, uptoPartitions.length); String[] previous = mMessageParser.getPreviousPartitions(uptoPartitions); Stack<String[]> toBeFinalized = new Stack<String[]>(); // Walk backwards to collect all partitions which are previous to the upTo partition // Do not include the upTo partition // Stop at the first partition which already have the SUCCESS file for (int i = 0; i < mLookbackPeriods; i++) { LOG.info("Looking for partition: " + Arrays.toString(previous)); LogFilePath logFilePath = new LogFilePath(prefix, topic, previous, mConfig.getGeneration(), 0, 0, mFileExtension); if (FileUtil.s3PathPrefixIsAltered(logFilePath.getLogFilePath(), mConfig)) { logFilePath = logFilePath.withPrefix(FileUtil.getS3AlternativePrefix(mConfig)); } String logFileDir = logFilePath.getLogFileDir(); if (FileUtil.exists(logFileDir)) { String successFilePath = logFileDir + "/_SUCCESS"; if (FileUtil.exists(successFilePath)) { LOG.info( "SuccessFile exist already, short circuit return. 
" + successFilePath); break; } LOG.info("Folder {} exists and ready to be finalized.", logFileDir); toBeFinalized.push(previous); } else { LOG.info("Folder {} doesn't exist, skip", logFileDir); } previous = mMessageParser.getPreviousPartitions(previous); } LOG.info("To be finalized partitions: {}", toBeFinalized); if (toBeFinalized.isEmpty()) { LOG.warn("There is no partitions to be finalized."); return; } // Now walk forward the collected partitions to do the finalization // Note we are deliberately walking backwards and then forwards to make sure we don't // end up in a situation that a later date partition is finalized and then the system // crashes (which creates unfinalized partition folders in between) while (!toBeFinalized.isEmpty()) { String[] current = toBeFinalized.pop(); LOG.info("Finalizing partition: " + Arrays.toString(current)); // We only perform hive registration on the last dimension of the partition array // i.e. only do hive registration for the hourly folder, but not for the daily if (uptoPartitions.length == current.length) { try { StringBuilder sb = new StringBuilder(); for (int i = 0; i < current.length; i++) { String par = current[i]; // We expect the partition array in the form of key=value if // they need to go through hive registration String[] parts = par.split("="); assert parts.length == 2 : "wrong partition format: " + par; if (i > 0) { sb.append(","); } sb.append(parts[0]); sb.append("='"); sb.append(parts[1]); sb.append("'"); } LOG.info("Hive partition string: " + sb); String hiveTableName = mConfig.getHiveTableName(topic); LOG.info("Hive table name from config: {}", hiveTableName); if (hiveTableName == null) { String hivePrefix = null; try { hivePrefix = mConfig.getHivePrefix(); hiveTableName = hivePrefix + topic; LOG.info("Hive table name from prefix: {}", hiveTableName); } catch (RuntimeException ex) { LOG.warn("HivePrefix is not defined. 
Skip hive registration"); } } if (hiveTableName != null && mConfig.getQuboleEnabled()) { mQuboleClient.addPartition(hiveTableName, sb.toString()); } } catch (Exception e) { LOG.error("failed to finalize topic " + topic, e); continue; } } // Generate the SUCCESS file at the end LogFilePath logFilePath = new LogFilePath(prefix, topic, current, mConfig.getGeneration(), 0, 0, mFileExtension); if (FileUtil.s3PathPrefixIsAltered(logFilePath.getLogFilePath(), mConfig)) { logFilePath = logFilePath.withPrefix(FileUtil.getS3AlternativePrefix(mConfig)); LOG.info("Will finalize alternative s3 logFilePath {}", logFilePath); } String logFileDir = logFilePath.getLogFileDir(); String successFilePath = logFileDir + "/_SUCCESS"; LOG.info("touching file {}", successFilePath); FileUtil.touch(successFilePath); } } public void finalizePartitions() throws Exception { List<String> topics = mZookeeperConnector.getCommittedOffsetTopics(); for (String topic : topics) { if (!topic.matches(mConfig.getKafkaTopicFilter())) { LOG.info("skipping topic {}", topic); } else { LOG.info("finalizing topic {}", topic); String[] partitions = getFinalizedUptoPartitions(topic); LOG.info("finalized timestamp for topic {} is {}", topic , partitions); if (partitions != null) { finalizePartitionsUpTo(topic, partitions); } } } } }