/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.pig.piggybank.storage.hiverc;

import java.io.IOException;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.Properties;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.piggybank.storage.HiveColumnarLoader;
import org.apache.pig.piggybank.storage.partition.PathPartitionHelper;

/**
 * HiveRCInputFormat used by HiveColumnarLoader as the InputFormat;
 * <p/>
 * Reasons for implementing a new InputFormat sub class:<br/>
 * <ul>
 * <li>The current RCFileInputFormat uses the old InputFormat mapred interface,
 * and the pig load store design used the new InputFormat mapreduce classes.</li>
 * <li>The splits are calculated by the InputFormat, HiveColumnarLoader supports
 * date partitions, the filtering is done here.</li>
 * </ul>
 */
public class HiveRCInputFormat extends
        FileInputFormat<LongWritable, BytesRefArrayWritable> {

    /** Helper used by {@link #listStatus(JobContext)} to list the input files. */
    transient PathPartitionHelper partitionHelper = new PathPartitionHelper();

    /** Loader signature used as the key when looking up the UDF properties. */
    String signature = "";

    /**
     * Creates an input format with a <code>null</code> signature.
     */
    public HiveRCInputFormat() {
        this(null);
    }

    /**
     * Creates an input format bound to the given loader signature.
     * <p/>
     * If no partition filter expression has been stored in the UDF properties
     * but a date range has, a filter expression is derived from the date range
     * and written back to the properties.
     *
     * @param signature
     *            signature identifying the HiveColumnarLoader UDF properties
     * @throws RuntimeException
     *             if a date range is present but cannot be converted into a
     *             filter expression
     */
    public HiveRCInputFormat(String signature) {
        this.signature = signature;

        Properties properties = UDFContext.getUDFContext().getUDFProperties(
                HiveColumnarLoader.class, new String[] { signature });

        // This expression is passed in the
        // HiveColumnarLoader.setPartitionExpression method by the Pig Loader
        // Classes.
        String partitionExpression = properties
                .getProperty(PathPartitionHelper.PARITITION_FILTER_EXPRESSION);

        // backwards compatibility
        String dateRange = properties
                .getProperty(HiveColumnarLoader.DATE_RANGE);

        if (partitionExpression == null && dateRange != null) {
            partitionExpression = buildFilterExpressionFromDatePartition(dateRange);
            properties.setProperty(
                    PathPartitionHelper.PARITITION_FILTER_EXPRESSION,
                    partitionExpression);
        }
    }

    /**
     * Lists the input files via the partition helper, falling back to the
     * default FileInputFormat listing when the helper returns
     * <code>null</code>.
     *
     * @param jobContext
     *            JobContext
     * @return List of FileStatus
     * @throws IOException
     */
    @Override
    protected List<FileStatus> listStatus(JobContext jobContext)
            throws IOException {

        List<FileStatus> files = partitionHelper.listStatus(jobContext,
                HiveColumnarLoader.class, signature);

        if (files == null) {
            files = super.listStatus(jobContext);
        }

        return files;
    }

    /**
     * If the date range was supplied in the loader constructor we need to build
     * our own filter expression.<br/>
     *
     * @param dateRange
     *            date range with format yyyy-MM-dd:yyyy-MM-dd
     * @return String the partition filter expression
     * @throws RuntimeException
     *             if no partition columns are defined, if the partition columns
     *             contain no supported date partition, or if the date range is
     *             malformed
     */
    private String buildFilterExpressionFromDatePartition(String dateRange) {
        Properties properties = UDFContext.getUDFContext().getUDFProperties(
                HiveColumnarLoader.class, new String[] { signature });

        String partitionColumnStr = properties
                .getProperty(PathPartitionHelper.PARTITION_COLUMNS);

        // Fail with a descriptive message instead of a bare
        // NullPointerException when the property has not been set.
        if (partitionColumnStr == null) {
            throw new RuntimeException(
                    "A date range was supplied but no partition columns are defined");
        }

        boolean isYearMonthDayFormat = false;

        // only 3 partition types are supported (it's impossible with date
        // partitions to support all possible combinations here).
        // 1) yyyy-MM-dd which is as /daydate=[date]/files
        // 2) yyyy-MM-dd which is as /date=[date]/files
        // 3) yyyy-MM-dd which is as /year=[year]/month=[month]/day=[day]

        String key = null;

        if (partitionColumnStr.contains("daydate")) {
            key = "daydate"; // use daydate as key
        } else if (partitionColumnStr.contains("date")) {
            key = "date"; // use date as key
        } else if (partitionColumnStr.contains("year")
                && partitionColumnStr.contains("month")
                && partitionColumnStr.contains("day")) {
            isYearMonthDayFormat = true;
        } else {
            throw new RuntimeException(
                    "Not date partitions where found for partitions: "
                            + partitionColumnStr);
        }

        String[] split = dateRange.split(":");

        if (split.length != 2) {
            throw new RuntimeException(
                    "The date range must have format yyyy-MM-dd:yyyy-MM-dd");
        }

        String partitionExpression = null;

        if (isYearMonthDayFormat) {
            // extract the year, month and day from the two dates;
            DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
            Calendar cal = Calendar.getInstance();

            cal.setTime(parseDate(dateFormat, split[0]));
            String lowerBound = buildYearMonthDayBound(cal, true);

            cal.setTime(parseDate(dateFormat, split[1]));
            String upperBound = buildYearMonthDayBound(cal, false);

            partitionExpression = lowerBound + " and " + upperBound;
        } else {
            partitionExpression = key + " >= '" + split[0] + "' and " + key
                    + " <= '" + split[1] + "'";
        }

        return partitionExpression;
    }

    /**
     * Builds one side of a year/month/day range as a lexicographic comparison
     * on (year, month, day).
     * <p/>
     * Comparing the three fields independently (year &gt;= Y and month &gt;= M
     * and day &gt;= D) is only correct when both ends of the range fall in the
     * same month; e.g. 2010-01-20:2010-02-05 would require day &gt;= '20' and
     * day &lt;= '05' and match nothing.
     * <p/>
     * NOTE(review): relies on the partition filter evaluator supporting
     * <code>or</code>, <code>==</code>, <code>&gt;</code> and
     * <code>&lt;</code> in addition to <code>and</code>, <code>&gt;=</code>
     * and <code>&lt;=</code> - confirm against PathPartitionHelper.
     *
     * @param cal
     *            Calendar positioned on the boundary date
     * @param lowerBound
     *            true to build the inclusive lower bound, false to build the
     *            inclusive upper bound
     * @return String parenthesised bound expression
     */
    private static String buildYearMonthDayBound(Calendar cal,
            boolean lowerBound) {
        String strictOp = lowerBound ? ">" : "<";
        String inclusiveOp = lowerBound ? ">=" : "<=";

        String year = "'" + cal.get(Calendar.YEAR) + "'";
        // Calendar.MONTH is zero based
        String month = "'" + formatNumber(cal.get(Calendar.MONTH) + 1) + "'";
        String day = "'" + formatNumber(cal.get(Calendar.DAY_OF_MONTH)) + "'";

        return "(year " + strictOp + " " + year
                + " or (year == " + year
                + " and (month " + strictOp + " " + month
                + " or (month == " + month
                + " and day " + inclusiveOp + " " + day + "))))";
    }

    /**
     * Formats a number as a string, left padding values below 10 with a single
     * zero.
     *
     * @param numb
     *            int
     * @return String
     */
    private static String formatNumber(int numb) {
        if (numb < 10) {
            return "0" + numb;
        } else {
            return "" + numb;
        }
    }

    /**
     * Initialises an instance of HiveRCRecordReader.
     */
    @Override
    public RecordReader<LongWritable, BytesRefArrayWritable> createRecordReader(
            InputSplit split, TaskAttemptContext ctx) throws IOException,
            InterruptedException {
        return new HiveRCRecordReader();
    }

    /**
     * Parse a date string with format yyyy-MM-dd.
     *
     * @param dateFormat
     *            DateFormat
     * @param dateString
     *            String
     * @return Date
     * @throws RuntimeException
     *             wrapping the ParseException if the string cannot be parsed
     */
    private static Date parseDate(DateFormat dateFormat, String dateString) {
        try {
            return dateFormat.parse(dateString);
        } catch (ParseException e) {
            // The ParseException is kept as the cause, so its stack trace is
            // still available to the caller.
            throw new RuntimeException("Invalid date '" + dateString
                    + "', expected format yyyy-MM-dd", e);
        }
    }

    /**
     * The input split size should never be smaller than the
     * RCFile.SYNC_INTERVAL
     */
    @Override
    protected long getFormatMinSplitSize() {
        return RCFile.SYNC_INTERVAL;
    }
}