/* * Copyright (C) 2014 Indeed Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either * express or implied. See the License for the specific language governing permissions and * limitations under the License. */ package com.indeed.imhotep.iql; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import com.indeed.imhotep.api.ImhotepOutOfMemoryException; import com.indeed.imhotep.ez.EZImhotepSession; import com.indeed.imhotep.ez.EZImhotepSession.FTGSCallback; import com.indeed.imhotep.ez.Field; import com.indeed.imhotep.ez.GroupKey; import com.indeed.imhotep.ez.StatReference; import com.indeed.imhotep.ez.Stats.Stat; import gnu.trove.TIntObjectHashMap; import it.unimi.dsi.fastutil.doubles.DoubleArrayList; import it.unimi.dsi.fastutil.doubles.DoubleList; import it.unimi.dsi.fastutil.ints.Int2LongMap; import it.unimi.dsi.fastutil.ints.Int2LongOpenHashMap; import it.unimi.dsi.fastutil.ints.Int2ObjectMap; import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap; import it.unimi.dsi.fastutil.longs.LongArrayList; import it.unimi.dsi.fastutil.longs.LongList; import java.util.Arrays; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; /** * @author jsgroth */ public class PercentileGrouping extends Grouping { private final Stat countStat; private final List<Field> fields = Lists.newArrayList(); private final List<Double> percentiles = Lists.newArrayList(); private final List<Integer> fieldProjectionPositions = Lists.newArrayList(); public PercentileGrouping(final Stat countStat) { this.countStat = countStat; } public void addPercentileQuery(final Field field, final double percentile, final int fieldProjectionPosition) { fields.add(field); percentiles.add(percentile); fieldProjectionPositions.add(fieldProjectionPosition); } @Override public Map<Integer, GroupKey> regroup(final EZImhotepSession session, final Map<Integer, GroupKey> groupKeys) throws ImhotepOutOfMemoryException { throw new UnsupportedOperationException("Percentiles must be used as the last group"); } @Override public Iterator<GroupStats> getGroupStats(final EZImhotepSession session, final Map<Integer, GroupKey> groupKeys, final List<StatReference> statRefs, final long timeoutTS) throws ImhotepOutOfMemoryException { if(groupKeys.isEmpty()) { // we don't have any parent groups probably because all docs were filtered out return Collections.<GroupStats>emptyList().iterator(); } final StatReference countStatRef = session.pushStat(countStat); final long[] counts = getCounts(countStatRef); final Int2ObjectMap<Int2LongMap> groupToPositionToStats = getPercentileStats(session, groupKeys, countStatRef, counts); final List<GroupStats> result = Lists.newArrayList(); final int statCount = statRefs.size(); final int groupCount = session.getNumGroups(); // get values for the normal stats final TIntObjectHashMap<double[]> statsResults = (statCount > 0) ? getGroupStatsValues(session, statRefs, groupCount) : null; // combine normal stats with distinct counts for (int groupNum = 1; groupNum < groupCount; groupNum++) { final Int2LongMap groupPercentileData = groupToPositionToStats.get(groupNum); double[] statsVals = statsResults != null ? statsResults.get(groupNum) : null; double[] values = new double[statCount + fields.size()]; for(int i = 0, statsValsIndex = 0; i < values.length; i++) { if(groupPercentileData != null && groupPercentileData.containsKey(i)) { // percentile value values[i] = groupPercentileData.get(i); } else if(statsVals != null && statsValsIndex < statsVals.length) { values[i] = statsVals[statsValsIndex++]; // normal stat value available } else { values[i] = 0; // normal stat not in stats array } } GroupKey groupKey = groupKeys.get(groupNum); result.add(new GroupStats(groupKey, values)); } return result.iterator(); } private Int2ObjectMap<Int2LongMap> getPercentileStats(final EZImhotepSession session, final Map<Integer, GroupKey> groupKeys, final StatReference countStatRef, final long[] counts) { final Set<Field> uniqueFields = Sets.newHashSet(fields); final Int2ObjectMap<Int2LongMap> groupToPositionToStats = new Int2ObjectOpenHashMap<Int2LongMap>(); for (final int group : groupKeys.keySet()) { groupToPositionToStats.put(group, new Int2LongOpenHashMap()); } for (final Field f : uniqueFields) { final List<Double> fieldPercentiles = Lists.newArrayList(); final List<Integer> projectionPositions = Lists.newArrayList(); for (int i = 0; i < fields.size(); ++i) { if (f.equals(fields.get(i))) { fieldPercentiles.add(percentiles.get(i)); projectionPositions.add(fieldProjectionPositions.get(i)); } } final Int2ObjectMap<DoubleList> percentileValues = new Int2ObjectOpenHashMap<DoubleList>(); for (final int group : groupKeys.keySet()) { final DoubleList groupPercentileValues = new DoubleArrayList(); for (final double percentile : fieldPercentiles) { groupPercentileValues.add(percentile / 100 * counts[group]); } percentileValues.put(group, groupPercentileValues); } final PercentileFTGSCallback callback = new PercentileFTGSCallback(session.getStackDepth(), countStatRef, percentileValues); // hack for ramses indexes, it's slower to iterate over a string field as an int field but it's better than // doing a 2D metric regroup like ramhotep does final Field ftgsField = f.isIntField() ? f : Field.intField(f.getFieldName()); session.ftgsIterate(Arrays.asList(ftgsField), callback); final Int2ObjectMap<LongList> groupToPercentileStats = callback.finalizeAndGetGroupToPercentileStats(); for (final int group : groupToPercentileStats.keySet()) { final LongList percentileStats = groupToPercentileStats.get(group); for (int i = 0; i < percentileStats.size(); ++i) { final int position = projectionPositions.get(i); groupToPositionToStats.get(group).put(position, percentileStats.getLong(i)); } } } session.popStat(); return groupToPositionToStats; } private static TIntObjectHashMap<double[]> getGroupStatsValues(EZImhotepSession session, List<StatReference> statRefs, int groupCount) { final int statCount = statRefs.size(); final double[][] statGroupValues = new double[statCount][]; for (int i = 0; i < statCount; i++) { statGroupValues[i] = session.getGroupStats(statRefs.get(i)); } final TIntObjectHashMap<double[]> ret = new TIntObjectHashMap<double[]>(groupCount); for (int group = 1; group <= groupCount; group++) { final double[] groupStats = new double[statCount]; for (int statNum = 0; statNum < groupStats.length; statNum++) { if(group < statGroupValues[statNum].length) { groupStats[statNum] = statGroupValues[statNum][group]; } } ret.put(group, groupStats); } return ret; } private static long[] getCounts(final StatReference countStatRef) { final double[] doubleGroupStats = countStatRef.getGroupStats(); final long[] groupStats = new long[doubleGroupStats.length]; for (int i = 0; i < doubleGroupStats.length; ++i) { groupStats[i] = Math.round(doubleGroupStats[i]); } return groupStats; } private static class PercentileFTGSCallback extends FTGSCallback { private final Int2ObjectMap<LongList> groupToPercentileStats; private final StatReference statRef; private final Int2ObjectMap<DoubleList> percentileValues; private Int2LongMap groupToPrevCount = new Int2LongOpenHashMap(); private Int2LongMap groupToPrevTerm = new Int2LongOpenHashMap(); private PercentileFTGSCallback(final int numStats, final StatReference statRef, final Int2ObjectMap<DoubleList> percentileValues) { super(numStats); this.statRef = statRef; this.percentileValues = percentileValues; groupToPercentileStats = new Int2ObjectOpenHashMap<LongList>(); for (final int group : percentileValues.keySet()) { final LongList stats = new LongArrayList(); for (int i = 0; i < percentileValues.get(group).size(); ++i) { stats.add(Long.MIN_VALUE); } groupToPercentileStats.put(group, stats); } } @Override protected void intTermGroup(final String field, final long term, final int group) { final long prevCount = groupToPrevCount.get(group); final long countForTerm = Math.round(getStat(statRef)); final long newCount = prevCount + countForTerm; final DoubleList groupPercentileValues = percentileValues.get(group); for (int i = 0; i < groupPercentileValues.size(); ++i) { final double percentileValue = groupPercentileValues.get(i); if (percentileValue > prevCount && percentileValue <= newCount) { groupToPercentileStats.get(group).set(i, term); } } groupToPrevCount.put(group, newCount); groupToPrevTerm.put(group, term); } @Override protected void stringTermGroup(final String field, final String term, final int group) { throw new UnsupportedOperationException("Percentiles do not work with string fields"); } public Int2ObjectMap<LongList> finalizeAndGetGroupToPercentileStats() { for (final int group : groupToPercentileStats.keySet()) { final LongList stats = groupToPercentileStats.get(group); for (int i = 0; i < stats.size(); ++i) { if (stats.getLong(i) == Long.MIN_VALUE) { stats.set(i, groupToPrevTerm.get(group)); } } } return groupToPercentileStats; } } }