package brickhouse.udf.collect; /** * Copyright 2012 Klout, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * **/ import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; /** * GroupCountUDF provides a sequence number for all rows which have the * same value for a particular grouping. * This allows us to count how many rows are in a grouping and cap them * off after a certain point. * <p/> * <p>For example, we can cap-off the number of records per ks_uid with something like * <p/> * select * ks_uid, val, group_count(ks_uid) as rank * from * ( select ks_uid, val from table1 * distribute by ks_uid * sort by ks_uid, val ) ordered_keys * where group_count( ks_uid ) < 100 */ @Description( name = "group_count", value = " A sequence id for all rows with the same value for a specific grouping" ) public class GroupCountUDF extends UDF { private String lastGrouping = null; private int lastCount = 0; public Integer evaluate(String grouping) { // First time through ... if (lastGrouping == null) { lastGrouping = grouping; lastCount = 1; return 0; } if (lastGrouping != null && lastGrouping.equals(grouping)) { int retVal = lastCount; lastCount++; return retVal; } else { lastCount = 1; lastGrouping = grouping; return 0; } } }