/* * Copyright [2012-2015] PayPal Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package ml.shifu.shifu.core.autotype; import java.io.IOException; import java.util.HashSet; import java.util.Iterator; import java.util.Set; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Reducer; import com.clearspring.analytics.stream.cardinality.CardinalityMergeException; import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus; /** * To merge all mapper {@link HyperLogLogPlus} statistics together according to variable id. */ public class AutoTypeDistinctCountReducer extends Reducer<IntWritable, CountAndFrequentItemsWritable, IntWritable, Text> { private Text outputValue = new Text(); @Override protected void reduce(IntWritable key, Iterable<CountAndFrequentItemsWritable> values, Context context) throws IOException, InterruptedException { HyperLogLogPlus hyperLogLogPlus = null; Set<String> fis = new HashSet<String>(); long count = 0, invalidCount = 0, validNumCount = 0; for(CountAndFrequentItemsWritable cfiw: values) { count += cfiw.getCount(); invalidCount += cfiw.getInvalidCount(); validNumCount += cfiw.getValidNumCount(); fis.addAll(cfiw.getFrequetItems()); if(hyperLogLogPlus == null) { hyperLogLogPlus = HyperLogLogPlus.Builder.build(cfiw.getHyperBytes()); } else { try { hyperLogLogPlus = (HyperLogLogPlus) hyperLogLogPlus.merge(HyperLogLogPlus.Builder.build(cfiw .getHyperBytes())); } catch (CardinalityMergeException e) { throw new RuntimeException(e); } } } outputValue.set(count + ":" + invalidCount + ":" + validNumCount + ":" + hyperLogLogPlus.cardinality() + ":" + limitedFrequentItems(fis)); context.write(key, outputValue); } private static String limitedFrequentItems(Set<String> fis) { StringBuilder sb = new StringBuilder(200); int size = Math.min(fis.size(), CountAndFrequentItemsWritable.FREQUET_ITEM_MAX_SIZE * 10); Iterator<String> iterator = fis.iterator(); int i = 0; while(i < size) { String next = iterator.next().replaceAll(":", " ").replace(",", " "); sb.append(next); if(i != size - 1) { sb.append(","); } i += 1; } return sb.toString(); } }