/* * Copyright (C) 2014 Indeed Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either * express or implied. See the License for the specific language governing permissions and * limitations under the License. */ package com.indeed.imhotep.ez; import com.google.common.base.Predicate; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.indeed.imhotep.RemoteImhotepMultiSession; import com.indeed.util.core.Pair; import com.indeed.util.core.io.Closeables2; import com.indeed.util.serialization.Stringifier; import com.indeed.flamdex.query.Query; import com.indeed.imhotep.GroupMultiRemapRule; import com.indeed.imhotep.QueryRemapRule; import com.indeed.imhotep.RegroupCondition; import com.indeed.imhotep.TermCount; import com.indeed.imhotep.api.FTGSIterator; import com.indeed.imhotep.api.ImhotepOutOfMemoryException; import com.indeed.imhotep.api.ImhotepSession; import gnu.trove.TIntObjectHashMap; import gnu.trove.TIntObjectIterator; import gnu.trove.TLongArrayList; import org.apache.log4j.Logger; import javax.annotation.Nonnull; import javax.annotation.Nullable; import java.io.Closeable; import java.text.DecimalFormat; import java.util.ArrayDeque; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.Deque; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.PriorityQueue; import static com.indeed.imhotep.ez.Field.IntField; import static com.indeed.imhotep.ez.Field.StringField; import static com.indeed.imhotep.ez.Stats.BinOpStat; import static 
com.indeed.imhotep.ez.Stats.CachedStat; import static com.indeed.imhotep.ez.Stats.ConstantStat; import static com.indeed.imhotep.ez.Stats.CountStat; import static com.indeed.imhotep.ez.Stats.DynamicMetricStat; import static com.indeed.imhotep.ez.Stats.ExpStat; import static com.indeed.imhotep.ez.Stats.HasIntStat; import static com.indeed.imhotep.ez.Stats.HasStringStat; import static com.indeed.imhotep.ez.Stats.IntFieldStat; import static com.indeed.imhotep.ez.Stats.Stat; import static com.indeed.imhotep.ez.Stats.StatRefStat; import static com.indeed.imhotep.ez.Stats.requireValid; /** * @author jwolfe */ public class EZImhotepSession implements Closeable { private static final Logger log = Logger.getLogger(EZImhotepSession.class); public static int GROUP_LIMIT = 1000000; // 1 mil private final ImhotepSession session; private final Deque<StatReference> statStack = new ArrayDeque<StatReference>(); private final Map<String, DynamicMetric> dynamicMetrics = Maps.newHashMap(); private int stackDepth = 0; private int numGroups = 2; private boolean closed = false; public EZImhotepSession(ImhotepSession session) { this.session = session; } public StatReference pushStatGeneric(Stat stat) throws ImhotepOutOfMemoryException { if(stat instanceof Stats.AggregateBinOpStat) { return pushStatComposite((Stats.AggregateBinOpStat) stat); } else { return pushStat(stat); } } public SingleStatReference pushStat(Stat stat) throws ImhotepOutOfMemoryException { if(stat instanceof Stats.AggregateBinOpStat) { throw new IllegalArgumentException("Aggregate operations have to be pushed with pushStatGeneric"); } final int initialDepth = stackDepth; for (String statToPush : stat.pushes(this)) { stackDepth = session.pushStat(statToPush); } if (initialDepth + 1 != stackDepth) { throw new RuntimeException("Bug! 
Did not change stack depth by exactly 1."); } SingleStatReference statReference = new SingleStatReference(initialDepth, stat.toString(), this); if(stat instanceof Stats.AggregateBinOpConstStat) { // hacks for handling division by a constant final Stats.AggregateBinOpConstStat statAsConstAggregate = (Stats.AggregateBinOpConstStat) stat; if(!"/".equals(statAsConstAggregate.getOp())) { throw new IllegalArgumentException("Only aggregate division is currently supported"); } statReference = new ConstantDivideSingleStatReference(statReference, statAsConstAggregate.getValue(), this); } statStack.push(statReference); return statReference; } public CompositeStatReference pushStatComposite(Stats.AggregateBinOpStat stat) throws ImhotepOutOfMemoryException { final int initialDepth = stackDepth; for (String statToPush : stat.pushes(this)) { stackDepth = session.pushStat(statToPush); } if (initialDepth + 2 != stackDepth) { throw new RuntimeException("Bug! Did not change stack depth by exactly 2."); } final SingleStatReference stat1 = new SingleStatReference(initialDepth, stat.toString(), this); final SingleStatReference stat2 = new SingleStatReference(initialDepth + 1, stat.toString(), this); final CompositeStatReference statReference = new CompositeStatReference(stat1, stat2); statStack.push(statReference); return statReference; } public StatReference popStat() { stackDepth = session.popStat(); final StatReference poppedStat = statStack.pop(); poppedStat.invalidate(); return poppedStat; } public int getStackDepth() { return this.stackDepth; } public int getNumGroups() { return numGroups; } public double[] getGroupStats(StatReference statReference) { return statReference.getGroupStats(); } long[] getGroupStats(int depth) { return session.getGroupStats(depth); } /** * Returns the number of bytes written to the temp files for this session locally. * Returns -1 if tempFileSizeBytesLeft was set to null or if the session is not a RemoteImhotepMultiSession. 
*/
    public long getTempFilesBytesWritten() {
        if (session instanceof RemoteImhotepMultiSession) {
            return ((RemoteImhotepMultiSession) session).getTempFilesBytesWritten();
        }
        return -1;
    }

    /**
     * Creates a dynamic metric on the session and records it locally so that a second
     * request for the same name is rejected.
     *
     * @param name name of the metric to create
     * @return the handle for the new metric
     * @throws IllegalArgumentException if a metric with this name was already created through this wrapper
     */
    public DynamicMetric createDynamicMetric(String name) throws ImhotepOutOfMemoryException {
        if (dynamicMetrics.containsKey(name)) {
            throw new IllegalArgumentException("Dynamic metric with name "+name+" already exists!");
        }
        session.createDynamicMetric(name);
        final DynamicMetric metric = new DynamicMetric(name);
        // Register the metric; without this the duplicate check above could never trigger.
        dynamicMetrics.put(name, metric);
        return metric;
    }

    /**
     * Marks the metric as invalid and then always fails, since deletion is not supported.
     *
     * @throws UnsupportedOperationException always
     */
    public void deleteDynamicMetric(DynamicMetric metric) {
        metric.valid = false;
        throw new UnsupportedOperationException("Sorry, this isn't actually possible yet");
    }

    /**
     * Iterates over the given term subsets of the given fields, invoking the callback once
     * per (field, term, group) and closing the iterator afterwards.
     *
     * @param fieldsToTermsSubsets terms to visit, per field
     * @param callback             receives every (field, term, group) with its stats filled in
     */
    public void ftgsSubsetIterate(Map<Field, List<?>> fieldsToTermsSubsets, FTGSCallback callback) {
        performIteration(callback, getFtgsSubsetIterator(fieldsToTermsSubsets));
    }

    /**
     * Iterates over all terms of the given fields, invoking the callback once per
     * (field, term, group) and closing the iterator afterwards.
     *
     * @param fields   fields to visit
     * @param callback receives every (field, term, group) with its stats filled in
     */
    public void ftgsIterate(List<Field> fields, FTGSCallback callback) {
        performIteration(callback, getFtgsIterator(fields));
    }

    /**
     * Drains the iterator field by field, term by term, group by group. Before each callback
     * invocation the group's stats are copied into {@code callback.stats}. The iterator is
     * closed quietly whether or not the iteration completes.
     */
    private void performIteration(FTGSCallback callback, FTGSIterator ftgsIterator) {
        try {
            while (ftgsIterator.nextField()) {
                final String field = ftgsIterator.fieldName();
                final boolean intType = ftgsIterator.fieldIsIntType();
                while (ftgsIterator.nextTerm()) {
                    if (intType) {
                        final long term = ftgsIterator.termIntVal();
                        while (ftgsIterator.nextGroup()) {
                            final int group = ftgsIterator.group();
                            ftgsIterator.groupStats(callback.stats);
                            callback.intTermGroup(field, term, group);
                        }
                    } else {
                        final String term = ftgsIterator.termStringVal();
                        while (ftgsIterator.nextGroup()) {
                            final int group = ftgsIterator.group();
                            ftgsIterator.groupStats(callback.stats);
                            callback.stringTermGroup(field, term, group);
                        }
                    }
                }
            }
        } finally {
            Closeables2.closeQuietly(ftgsIterator, log);
        }
    }

    /**
     * Returns a lazy iterator over the callback's results for the given term subsets.
     *
     * @param fieldsToTermsSubsets terms to visit, per field
     * @param callback             produces one element per (field, term, group)
     */
    public <E> Iterator<E> ftgsGetSubsetIterator(Map<Field, List<?>> fieldsToTermsSubsets, final FTGSIteratingCallback<E> callback) {
        final FTGSIterator ftgsIterator =
getFtgsSubsetIterator(fieldsToTermsSubsets); // TODO: make sure ftgsIterator gets closed return new FTGSCallbackIterator<E>(callback, ftgsIterator); } private FTGSIterator getFtgsSubsetIterator(Map<Field, List<?>> fieldsToTermsSubsets) { final Map<String, long[]> intFields = Maps.newHashMap(); final Map<String, String[]> stringFields = Maps.newHashMap(); for (Field field : fieldsToTermsSubsets.keySet()) { final List<?> terms = fieldsToTermsSubsets.get(field); if (field.isIntField()) { final long[] intTermsSubset = new long[terms.size()]; for(int i = 0; i < intTermsSubset.length; i++) { final Object term = terms.get(i); if(term instanceof Long) { intTermsSubset[i] = (Long) term; } else if(term instanceof String) { try { intTermsSubset[i] = Long.valueOf((String) term); } catch (NumberFormatException e) { // TODO: move throw new IllegalArgumentException("IN grouping for int field " + field.getFieldName() + " has a non integer argument: " + term); } } } Arrays.sort(intTermsSubset); intFields.put(field.fieldName, intTermsSubset); } else { final String[] stringTermsSubset = new String[terms.size()]; for(int i = 0; i < stringTermsSubset.length; i++) { stringTermsSubset[i] = (String)terms.get(i); } Arrays.sort(stringTermsSubset); stringFields.put(field.fieldName, stringTermsSubset); } } return session.getSubsetFTGSIterator(intFields, stringFields); } public <E> Iterator<E> ftgsGetIterator(List<Field> fields, final FTGSIteratingCallback<E> callback) { final FTGSIterator ftgsIterator = getFtgsIterator(fields); // TODO: make sure ftgsIterator gets closed return new FTGSCallbackIterator<E>(callback, ftgsIterator); } private FTGSIterator getFtgsIterator(List<Field> fields) { final List<String> intFields = Lists.newArrayList(); final List<String> stringFields = Lists.newArrayList(); for (Field field : fields) { if (field.isIntField()) { intFields.add(field.fieldName); } else { stringFields.add(field.fieldName); } } return session.getFTGSIterator( intFields.toArray(new 
String[intFields.size()]), stringFields.toArray(new String[stringFields.size()]) ); } public void filter(IntField field, Predicate<Long> predicate) throws ImhotepOutOfMemoryException { if (numGroups > 2) { System.err.println("WARNING: performing a term filter with more than one group. Consider filtering before regrouping."); } final TLongArrayList intTerms = intFieldTerms(field, session, predicate); final long[] longs = intTerms.toNativeArray(); for (int group = 1; group < numGroups; group++) { session.intOrRegroup(field.getFieldName(), longs, group, 0, group); } } public void filterNegation(IntField field, Predicate<Long> predicate) throws ImhotepOutOfMemoryException { if (numGroups > 2) { System.err.println("WARNING: performing a term filter with more than one group. Consider filtering before regrouping."); } final TLongArrayList intTerms = intFieldTerms(field, session, predicate); final long[] longs = intTerms.toNativeArray(); for (int group = 1; group < numGroups; group++) { session.intOrRegroup(field.getFieldName(), longs, group, group, 0); } } public void filter(IntField field, long[] terms) throws ImhotepOutOfMemoryException { if (numGroups > 2) { System.err.println("WARNING: performing a term filter with more than one group. Consider filtering before regrouping."); } for (int group = 1; group < numGroups; group++) { session.intOrRegroup(field.getFieldName(), terms, group, 0, group); } } public void filterNegation(IntField field, long[] terms) throws ImhotepOutOfMemoryException { if (numGroups > 2) { System.err.println("WARNING: performing a term filter with more than one group. Consider filtering before regrouping."); } for (int group = 1; group < numGroups; group++) { session.intOrRegroup(field.getFieldName(), terms, group, group, 0); } } public void filter(StringField field, Predicate<String> predicate) throws ImhotepOutOfMemoryException { if (numGroups > 2) { System.err.println("WARNING: performing a term filter with more than one group. 
Consider filtering before regrouping."); } final List<String> stringTerms = stringFieldTerms(field, session, predicate); for (int group = 1; group < numGroups; group++) { session.stringOrRegroup(field.getFieldName(), stringTerms.toArray(new String[stringTerms.size()]), group, 0, group); } } public void filterNegation(StringField field, Predicate<String> predicate) throws ImhotepOutOfMemoryException { if (numGroups > 2) { System.err.println("WARNING: performing a term filter with more than one group. Consider filtering before regrouping."); } final List<String> stringTerms = stringFieldTerms(field, session, predicate); for (int group = 1; group < numGroups; group++) { session.stringOrRegroup(field.getFieldName(), stringTerms.toArray(new String[stringTerms.size()]), group, group, 0); } } public void filter(StringField field, String[] terms) throws ImhotepOutOfMemoryException { if (numGroups > 2) { System.err.println("WARNING: performing a term filter with more than one group. Consider filtering before regrouping."); } for (int group = 1; group < numGroups; group++) { session.stringOrRegroup(field.getFieldName(), terms, group, 0, group); } } public void filterNegation(StringField field, String[] terms) throws ImhotepOutOfMemoryException { if (numGroups > 2) { System.err.println("WARNING: performing a term filter with more than one group. 
Consider filtering before regrouping."); } for (int group = 1; group < numGroups; group++) { session.stringOrRegroup(field.getFieldName(), terms, group, group, 0); } } public void filter(SingleStatReference stat, long min, long max) throws ImhotepOutOfMemoryException { requireValid(stat); numGroups = session.metricFilter(stat.depth, min, max, false); } public void filterNegation(SingleStatReference stat, long min, long max) throws ImhotepOutOfMemoryException { requireValid(stat); numGroups = session.metricFilter(stat.depth, min, max, true); } public void filter(Query query) throws ImhotepOutOfMemoryException { if (numGroups > 2) { System.err.println("WARNING: performing a query filter with more than one group. Consider filtering before regrouping."); } for (int group = 1; group < numGroups; group++) { session.regroup(new QueryRemapRule(group, query, 0, group)); } } public void filterNegation(Query query) throws ImhotepOutOfMemoryException { if (numGroups > 2) { System.err.println("WARNING: performing a query filter with more than one group. Consider filtering before regrouping."); } for (int group = 1; group < numGroups; group++) { session.regroup(new QueryRemapRule(group, query, group, 0)); } } /** * @param field field to sample by * @param p ratio of terms to remove. In the range [0,1] * @param salt the salt to use for hashing. Providing a constant salt will lead to a reproducible result. */ public void filterSample(Field field, double p, String salt) throws ImhotepOutOfMemoryException { if (numGroups > 2) { System.err.println("WARNING: performing a term filter with more than one group. 
Consider filtering before regrouping."); } for (int group = 1; group < numGroups; group++) { session.randomRegroup(field.getFieldName(), field.isIntField(), salt, p, group, 0, group); } } /** * @param field field to filter on * @param regex regex to test with */ public void filterRegex(Field field, String regex) throws ImhotepOutOfMemoryException { if (numGroups > 2) { System.err.println("WARNING: performing a filter with more than one group. Consider filtering before regrouping."); } for (int group = 1; group < numGroups; group++) { session.regexRegroup(field.getFieldName(), regex, group, 0, group); } } /** * @param field field to filter on * @param regex regex to test with */ public void filterRegexNegation(Field field, String regex) throws ImhotepOutOfMemoryException { if (numGroups > 2) { System.err.println("WARNING: performing a filter with more than one group. Consider filtering before regrouping."); } for (int group = 1; group < numGroups; group++) { session.regexRegroup(field.getFieldName(), regex, group, group, 0); } } public static Map<Integer, GroupKey> newGroupKeys() { final Map<Integer, GroupKey> ret = Maps.newHashMap(); ret.put(1, GroupKey.empty()); return ret; } public @Nullable Map<Integer, GroupKey> explodeEachGroup(IntField field, long[] terms, @Nullable Map<Integer, GroupKey> groupKeys) throws ImhotepOutOfMemoryException { if(terms.length == 0) { return Maps.newHashMap(); } checkGroupLimitWithFactor(terms.length); final GroupMultiRemapRule[] rules = new GroupMultiRemapRule[numGroups-1]; final Map<Integer, GroupKey> ret = groupKeys == null ? 
null : Maps.<Integer, GroupKey>newHashMap(); int positiveGroup = 1; for (int group = 1; group < numGroups; group++) { final RegroupCondition[] conditions = new RegroupCondition[terms.length]; final int[] positiveGroups = new int[terms.length]; for (int i = 0; i < terms.length; i++) { final long term = terms[i]; final int newGroup = positiveGroup++; positiveGroups[i] = newGroup; if (groupKeys != null) { ret.put(newGroup, groupKeys.get(group).add(term)); } conditions[i] = new RegroupCondition(field.getFieldName(), true, term, null, false); } rules[group - 1] = new GroupMultiRemapRule(group, 0, positiveGroups, conditions); } numGroups = session.regroup(rules, true); return ret; } public @Nullable Map<Integer, GroupKey> explodeEachGroup(StringField field, String[] terms, @Nullable Map<Integer, GroupKey> groupKeys) throws ImhotepOutOfMemoryException { if(terms.length == 0) { return Maps.newHashMap(); } checkGroupLimitWithFactor(terms.length); final GroupMultiRemapRule[] rules = new GroupMultiRemapRule[numGroups-1]; final Map<Integer, GroupKey> ret = groupKeys == null ? 
null : Maps.<Integer, GroupKey>newHashMap(); int positiveGroup = 1; for (int group = 1; group < numGroups; group++) { final RegroupCondition[] conditions = new RegroupCondition[terms.length]; final int[] positiveGroups = new int[terms.length]; for (int i = 0; i < terms.length; i++) { final String term = terms[i]; final int newGroup = positiveGroup++; positiveGroups[i] = newGroup; if (groupKeys != null) { ret.put(newGroup, groupKeys.get(group).add(term)); } conditions[i] = new RegroupCondition(field.getFieldName(), false, 0, term, false); } rules[group - 1] = new GroupMultiRemapRule(group, 0, positiveGroups, conditions); } numGroups = session.regroup(rules, true); return ret; } private void checkGroupLimitWithFactor(int factor) { final double newNumGroups = (double)(numGroups-1) * factor; checkGroupLimit(newNumGroups); } private void checkGroupLimit(double newNumGroups) { if(newNumGroups > GROUP_LIMIT) { DecimalFormat df = new DecimalFormat("###,###"); throw new IllegalArgumentException("Number of groups " + df.format(newNumGroups) + " exceeds the limit " + df.format(GROUP_LIMIT)+ ". Please simplify the query."); } } public @Nullable Map<Integer, GroupKey> splitAll(Field field, @Nullable Map<Integer, GroupKey> groupKeys) throws ImhotepOutOfMemoryException { final Map<Integer, GroupKey> ret = groupKeys == null ? 
null : Maps.<Integer, GroupKey>newHashMap(); if (field.isIntField()) { final IntField intField = (IntField) field; final TIntObjectHashMap<TLongArrayList> termListsMap = getIntGroupTerms(intField); int newGroupCount = 0; for(TIntObjectIterator<TLongArrayList> iterator = termListsMap.iterator(); iterator.hasNext();) { iterator.advance(); TLongArrayList list = iterator.value(); newGroupCount += list.size(); } checkGroupLimit(newGroupCount); final GroupMultiRemapRule[] rules = new GroupMultiRemapRule[termListsMap.size()]; int ruleIndex = 0; int positiveGroup = 1; for (int group = 1; group < numGroups; group++) { final TLongArrayList termList = termListsMap.get(group); if (termList != null) { final long[] nativeArray = termList.toNativeArray(); positiveGroup = getIntRemapRules(field, groupKeys, ret, rules, ruleIndex, positiveGroup, group, nativeArray); ruleIndex++; } } if(newGroupCount > 0) { numGroups = session.regroup(rules, false); } } else { final StringField stringField = (StringField) field; final TIntObjectHashMap<List<String>> termListsMap = getStringGroupTerms(stringField); int newGroupCount = 0; for(TIntObjectIterator<List<String>> iterator = termListsMap.iterator(); iterator.hasNext();) { iterator.advance(); List<String> list = iterator.value(); newGroupCount += list.size(); } checkGroupLimit(newGroupCount); final GroupMultiRemapRule[] rules = new GroupMultiRemapRule[termListsMap.size()]; int ruleIndex = 0; int positiveGroup = 1; for (int group = 1; group < numGroups; group++) { final List<String> termList = termListsMap.get(group); if (termList != null) { positiveGroup = getStringRemapRules(field, groupKeys, ret, rules, ruleIndex, positiveGroup, group, termList); ruleIndex++; } } if(newGroupCount > 0) { numGroups = session.regroup(rules, false); } } return ret; } public @Nullable Map<Integer, GroupKey> splitAllExplode(Field field, @Nullable Map<Integer, GroupKey> groupKeys) throws ImhotepOutOfMemoryException { if (field.isIntField()) { final IntField 
intField = (IntField) field; final TLongArrayList terms = intFieldTerms(intField, session, null); return explodeEachGroup(intField, terms.toNativeArray(), groupKeys); } else { final StringField stringField = (StringField) field; final List<String> terms = stringFieldTerms(stringField, session, null); return explodeEachGroup(stringField, terms.toArray(new String[terms.size()]), groupKeys); } } private void checkGroupLimitForTerms(TIntObjectHashMap<Collection> groupToTerms) { int newNumGroups = 0; TIntObjectIterator<Collection> iterator = groupToTerms.iterator(); while(iterator.hasNext()) { iterator.advance(); Collection termsForGroup = iterator.value(); newNumGroups += termsForGroup.size(); } checkGroupLimit(newNumGroups); } @SuppressWarnings("unchecked") public @Nullable Map<Integer, GroupKey> splitAllTopK(Field field, @Nullable Map<Integer, GroupKey> groupKeys, int topK, Stat stat, boolean bottom) throws ImhotepOutOfMemoryException { final Map<Integer, GroupKey> ret = groupKeys == null ? 
null : Maps.<Integer, GroupKey>newHashMap();

        if (field.isIntField()) {
            final TIntObjectHashMap<PriorityQueue<Pair<Double, Long>>> queuesByGroup =
                    getIntGroupTermsTopK((IntField) field, topK, stat, bottom);
            checkGroupLimitForTerms((TIntObjectHashMap<Collection>)(Object) queuesByGroup);

            final GroupMultiRemapRule[] rules = new GroupMultiRemapRule[queuesByGroup.size()];
            int nextRule = 0;
            int nextGroup = 1;
            for (int group = 1; group < numGroups; group++) {
                final PriorityQueue<Pair<Double, Long>> queue = queuesByGroup.get(group);
                if (queue == null) {
                    continue;
                }
                // Drain the queue into the array back to front, so the queue head ends up last.
                final long[] orderedTerms = new long[queue.size()];
                for (int slot = orderedTerms.length - 1; slot >= 0; slot--) {
                    orderedTerms[slot] = queue.remove().getSecond();
                }
                nextGroup = getIntRemapRules(field, groupKeys, ret, rules, nextRule, nextGroup, group, orderedTerms);
                nextRule++;
            }
            numGroups = session.regroup(rules, true);
        } else {
            final TIntObjectHashMap<PriorityQueue<Pair<Double, String>>> queuesByGroup =
                    getStringGroupTermsTopK((StringField) field, topK, stat, bottom);
            checkGroupLimitForTerms((TIntObjectHashMap<Collection>)(Object) queuesByGroup);

            final GroupMultiRemapRule[] rules = new GroupMultiRemapRule[queuesByGroup.size()];
            int nextRule = 0;
            int nextGroup = 1;
            for (int group = 1; group < numGroups; group++) {
                final PriorityQueue<Pair<Double, String>> queue = queuesByGroup.get(group);
                if (queue == null) {
                    continue;
                }
                // Drain the queue into the array back to front, so the queue head ends up last.
                final String[] orderedTerms = new String[queue.size()];
                for (int slot = orderedTerms.length - 1; slot >= 0; slot--) {
                    orderedTerms[slot] = queue.remove().getSecond();
                }
                nextGroup = getStringRemapRules(field, groupKeys, ret, rules, nextRule, nextGroup, group, Arrays.asList(orderedTerms));
                nextRule++;
            }
            numGroups = session.regroup(rules, true);
        }
        return ret;
    }

    /**
     * Builds the remap rule that splits {@code group} by the given string terms and stores
     * it at {@code rules[ruleIndex]}.
     *
     * @return the next unused positive group number
     */
    private int getStringRemapRules(final Field field, final @Nullable Map<Integer, GroupKey> groupKeys, Map<Integer, GroupKey> newGroupKeys, final GroupMultiRemapRule[] rules, final int ruleIndex, int positiveGroup, final int group, final List<String> termList) {
        final int termCount = termList.size();
        final int[] targets = new int[termCount];
        final RegroupCondition[] termConditions = new RegroupCondition[termCount];
        final int nextGroup = getStringRegroupConditions(field, groupKeys, newGroupKeys, positiveGroup, group, termList, termConditions, targets);
        rules[ruleIndex] = new GroupMultiRemapRule(group, 0, targets, termConditions);
        return nextGroup;
    }

    /**
     * Builds the remap rule that splits {@code group} by the given int terms and stores it
     * at {@code rules[ruleIndex]}.
     *
     * @return the next unused positive group number
     */
    private int getIntRemapRules(final Field field, final @Nullable Map<Integer, GroupKey> groupKeys, final Map<Integer, GroupKey> newGroupKeys, final GroupMultiRemapRule[] rules, final int ruleIndex, int positiveGroup, final int group, final long[] nativeArray) {
        final int[] targets = new int[nativeArray.length];
        final RegroupCondition[] termConditions = new RegroupCondition[nativeArray.length];
        final int nextGroup = getIntRegroupConditions(field, groupKeys, newGroupKeys, positiveGroup, group, termConditions, targets, nativeArray);
        rules[ruleIndex] = new GroupMultiRemapRule(group, 0, targets, termConditions);
        return nextGroup;
    }

    /**
     * Fills {@code conditions} and {@code positiveGroups} with one entry per string term,
     * assigning consecutive group numbers starting at {@code positiveGroup}, and records the
     * key of each new group when keys are tracked.
     *
     * @return the next unused positive group number
     */
    private int getStringRegroupConditions(
            final Field field,
            final @Nullable Map<Integer, GroupKey> groupKeys,
            final Map<Integer, GroupKey> newGroupKeys,
            int positiveGroup,
            final int group,
            final List<String> termList,
            final RegroupCondition[] conditions,
            final int[] positiveGroups
    ) {
        int index = 0;
        while (index < termList.size()) {
            final String term = termList.get(index);
            conditions[index] = new RegroupCondition(field.getFieldName(), false, 0, term, false);
            if (groupKeys != null) {
                newGroupKeys.put(positiveGroup, groupKeys.get(group).add(term));
            }
            positiveGroups[index] = positiveGroup;
            positiveGroup++;
            index++;
        }
        return positiveGroup;
    }

    /**
     * Fills {@code conditions} and {@code positiveGroups} with one entry per int term,
     * assigning consecutive group numbers starting at {@code positiveGroup}, and records the
     * key of each new group when keys are tracked.
     *
     * @return the next unused positive group number
     */
    private int getIntRegroupConditions(
            final Field field,
            final @Nullable Map<Integer, GroupKey> groupKeys,
            final Map<Integer, GroupKey> newGroupKeys,
            int positiveGroup,
            final int group,
            final RegroupCondition[] conditions,
            final int[]
positiveGroups, final long[] nativeArray ) { for (int i = 0; i < nativeArray.length; i++) { final long term = nativeArray[i]; conditions[i] = new RegroupCondition(field.getFieldName(), true, term, null, false); final int newGroup = positiveGroup++; if (groupKeys != null) { newGroupKeys.put(newGroup, groupKeys.get(group).add(term)); } positiveGroups[i] = newGroup; } return positiveGroup; } public Map<Integer, GroupKey> metricRegroup(SingleStatReference statRef, long min, long max, long intervalSize, boolean noGutters, Stringifier<Long> stringifier, @Nullable Map<Integer, GroupKey> groupKeys) throws ImhotepOutOfMemoryException { final Map<Integer, GroupKey> ret = Maps.newHashMap(); final int gutterBuckets = noGutters ? 0 : 2; final int numBuckets = (int)((max-min-1)/intervalSize + 1 + gutterBuckets); for (int group = 1; group < numGroups; group++) { int bucket = 1; int newGroupOffset = (group - 1) * numBuckets; final GroupKey<String> groupKey = groupKeys != null ? groupKeys.get(group) : GroupKey.empty(); for (long i = min; i < max; i += intervalSize, bucket++) { final String bucketString = String.format("[%s, %s)", stringifier.toString(i), stringifier.toString(i + intervalSize)); ret.put(newGroupOffset + bucket, groupKey.add(bucketString)); } if(!noGutters) { ret.put(newGroupOffset + numBuckets - 1, groupKey.add(String.format("< %s", stringifier.toString(min)))); ret.put(newGroupOffset + numBuckets, groupKey.add(String.format(">= %s", stringifier.toString(max)))); } } final int newExpectedNumberOfGroups = (numGroups-1) * numBuckets; numGroups = session.metricRegroup(statRef.depth, min, max, intervalSize, noGutters); // Delete the keys for trailing groups that don't exist on the server for (int group = numGroups; group <= newExpectedNumberOfGroups; group++) { ret.remove(group); } return ret; } public Map<Integer, GroupKey> metricRegroup2D(SingleStatReference xStat, long xMin, long xMax, long xIntervalSize, SingleStatReference yStat, long yMin, long yMax, long 
yIntervalSize) throws ImhotepOutOfMemoryException { final Map<Integer, GroupKey> ret = Maps.newTreeMap(); numGroups = session.metricRegroup2D(xStat.depth, xMin, xMax, xIntervalSize, yStat.depth, yMin, yMax, yIntervalSize); final int xBuckets = (int)(((xMax - 1) - xMin) / xIntervalSize + 3); final int yBuckets = (int)(((yMax - 1) - yMin) / yIntervalSize + 3); final int numBuckets = xBuckets * yBuckets; ret.put(1, GroupKey.singleton(String.format("< %d, < %d", xMin, yMin))); ret.put(numBuckets, GroupKey.singleton(String.format(">= %d, >= %d", xMax, yMax))); ret.put(xBuckets, GroupKey.singleton(String.format(">= %d, < %d", xMax, yMin))); ret.put((yBuckets-1)*xBuckets+1, GroupKey.singleton(String.format("< %d, >= %d", xMin, yMax))); { int index = 2; for (long x = xMin; x < xMax; x+=xIntervalSize) { ret.put(index, GroupKey.singleton(String.format("[%d, %d), < %d", x, x+xIntervalSize, yMin))); ret.put(index+(yBuckets-1)*xBuckets, GroupKey.singleton(String.format("[%d, %d), >= %d", x, x+xIntervalSize, yMax))); index++; } } { int index = 1; for (long y = yMin; y < yMax; y+=yIntervalSize) { ret.put(index*xBuckets+1, GroupKey.singleton(String.format("< %d, [%d, %d)", xMin, y, y+yIntervalSize))); ret.put((index+1)*xBuckets, GroupKey.singleton(String.format(">= %d, [%d, %d)", xMax, y, y+yIntervalSize))); index++; } } { for (int xBucket = 2; xBucket < xBuckets; xBucket++) { final long xStart = (xBucket-2)*xIntervalSize; final long xEnd = xStart+xIntervalSize; for (int yBucket = 1; yBucket < yBuckets-1; yBucket++) { final long yStart = (yBucket-1)*yIntervalSize; final long yEnd = yStart+yIntervalSize; ret.put(yBucket*xBuckets+xBucket, GroupKey.singleton(String.format("[%d, %d), [%d, %d)", xStart, xEnd, yStart, yEnd))); } } } for (int group = numGroups; group <= numBuckets; group++) { ret.remove(group); } return ret; } public Map<String, Long> topTerms(StringField field, int k) { final List<TermCount> termCounts = session.approximateTopTerms(field.getFieldName(), false, k); final 
Map<String, Long> ret = Maps.newHashMap();
        for (TermCount termCount : termCounts) {
            ret.put(termCount.getTerm().getTermStringVal(), termCount.getCount());
        }
        return ret;
    }

    /**
     * Asks the underlying session for the approximate top {@code k} terms of an int field.
     *
     * @param field int field to inspect
     * @param k     number of terms requested from the session
     * @return map from term value to its reported count; backed by a hash map, so the
     *         ranking order of the terms is not preserved
     */
    public Map<Long, Long> topTerms(IntField field, int k) {
        final List<TermCount> termCounts = session.approximateTopTerms(field.getFieldName(), true, k);
        final Map<Long, Long> ret = Maps.newHashMap();
        for (TermCount termCount : termCounts) {
            ret.put(termCount.getTerm().getTermIntVal(), termCount.getCount());
        }
        return ret;
    }

    /**
     * FTGS callback that collects, per group, every term it is handed.
     * Int terms and string terms are kept in separate maps keyed by group number.
     */
    private static final class GetGroupTermsCallback extends FTGSCallback {
        final TIntObjectHashMap<TLongArrayList> intTermListsMap = new TIntObjectHashMap<TLongArrayList>();
        final TIntObjectHashMap<List<String>> stringTermListsMap = new TIntObjectHashMap<List<String>>();

        public GetGroupTermsCallback(final int numStats) {
            super(numStats);
        }

        @Override
        public void intTermGroup(final String field, final long term, int group) {
            // single lookup; the list is created lazily the first time a group is seen
            TLongArrayList terms = intTermListsMap.get(group);
            if (terms == null) {
                terms = new TLongArrayList();
                intTermListsMap.put(group, terms);
            }
            terms.add(term);
        }

        @Override
        public void stringTermGroup(final String field, final String term, int group) {
            List<String> terms = stringTermListsMap.get(group);
            if (terms == null) {
                terms = Lists.<String>newArrayList();
                stringTermListsMap.put(group, terms);
            }
            terms.add(term);
        }
    }

    /**
     * Iterates the given string field and returns the terms seen for each group.
     *
     * @param field string field to iterate
     * @return map from group number to the list of terms reported for that group
     */
    private TIntObjectHashMap<List<String>> getStringGroupTerms(StringField field) {
        final GetGroupTermsCallback callback = new GetGroupTermsCallback(stackDepth);
        ftgsIterate(Arrays.asList((Field)field), callback);
        return callback.stringTermListsMap;
    }

    /**
     * Iterates the given int field and returns the terms seen for each group.
     *
     * @param field int field to iterate
     * @return map from group number to the list of terms reported for that group
     */
    private TIntObjectHashMap<TLongArrayList> getIntGroupTerms(IntField field) {
        final GetGroupTermsCallback callback = new GetGroupTermsCallback(stackDepth);
        ftgsIterate(Arrays.asList((Field)field), callback);
        return callback.intTermListsMap;
    }

    /**
     * FTGS callback that keeps, per group, at most {@code k} (stat value, term) pairs in a
     * bounded priority queue. The head of each queue is the entry that gets evicted when a
     * better candidate arrives: a candidate replaces the head when its value is greater
     * (top mode) or smaller (bottom mode) than the head's value.
     */
    private static final class GetGroupTermsCallbackTopK extends FTGSCallback {
        final TIntObjectHashMap<PriorityQueue<Pair<Double, Long>>> intTermListsMap = new TIntObjectHashMap<PriorityQueue<Pair<Double, Long>>>();
        final TIntObjectHashMap<PriorityQueue<Pair<Double, String>>> stringTermListsMap = new TIntObjectHashMap<PriorityQueue<Pair<Double, String>>>();
        final Comparator<Pair> halfPairComparator;
        private final StatReference count;
        private final int k;
        private final boolean isBottom;

        /**
         * @param numStats size of the stats buffer handed to {@link FTGSCallback}
         * @param count    reference to the stat whose value ranks the terms
         * @param k        maximum number of terms retained per group; nothing is retained when
         *                 {@code k <= 0}
         * @param isBottom {@code true} to keep the smallest values, {@code false} for the largest
         */
        public GetGroupTermsCallbackTopK(final int numStats, StatReference count, int k, boolean isBottom) {
            super(numStats);
            this.count = count;
            this.k = k;
            this.isBottom = isBottom;
            // presumably HalfPairComparator orders pairs by their first element only - TODO confirm;
            // the order is reversed in bottom mode so the queue head is the largest retained value
            final Comparator<Pair> baseComparator = new Pair.HalfPairComparator();
            halfPairComparator = isBottom ? Collections.reverseOrder(baseComparator) : baseComparator;
        }

        @Override
        public void intTermGroup(final String field, final long term, int group) {
            PriorityQueue<Pair<Double, Long>> terms = intTermListsMap.get(group);
            if (terms == null) {
                terms = new PriorityQueue<Pair<Double, Long>>(10, halfPairComparator);
                intTermListsMap.put(group, terms);
            }
            offer(terms, Long.valueOf(term));
        }

        @Override
        public void stringTermGroup(final String field, final String term, int group) {
            PriorityQueue<Pair<Double, String>> terms = stringTermListsMap.get(group);
            if (terms == null) {
                terms = new PriorityQueue<Pair<Double, String>>(10, halfPairComparator);
                stringTermListsMap.put(group, terms);
            }
            offer(terms, term);
        }

        /**
         * Adds the term to the bounded queue, evicting the current head when the queue is
         * already full and the new value beats the head's value.
         */
        private <T> void offer(final PriorityQueue<Pair<Double, T>> terms, final T term) {
            if (k <= 0) {
                // nothing may be retained; without this guard peek() on the empty queue
                // returns null and the comparison below throws a NullPointerException
                return;
            }
            final Double value = getStat(this.count);
            if (terms.size() < k) {
                terms.add(Pair.of(value, term));
                return;
            }
            final Double headValue = terms.peek().getFirst();
            final boolean beatsHead = isBottom ? value < headValue : value > headValue;
            if (beatsHead) {
                terms.remove();
                terms.add(Pair.of(value, term));
            }
        }
    }

    /**
     * Pushes {@code stat}, iterates the string field keeping the top (or bottom) {@code k}
     * terms per group ranked by that stat, and pops the stat again.
     *
     * @param field  string field to iterate
     * @param k      maximum number of terms kept per group
     * @param stat   stat used to rank the terms
     * @param bottom {@code true} to keep the lowest-ranked terms instead of the highest
     * @return map from group number to a queue of (stat value, term) pairs
     * @throws ImhotepOutOfMemoryException if pushing the stat fails for lack of memory
     */
    private TIntObjectHashMap<PriorityQueue<Pair<Double, String>>> getStringGroupTermsTopK(StringField field, int k, Stat stat, boolean bottom) throws ImhotepOutOfMemoryException {
        final StatReference statRef = pushStat(stat);
        try {
            final GetGroupTermsCallbackTopK callback = new GetGroupTermsCallbackTopK(stackDepth, statRef, k, bottom);
            ftgsIterate(Arrays.asList((Field)field), callback);
            return callback.stringTermListsMap;
        } finally {
            // always undo the push, even when iteration fails, so the stat stack stays balanced
            popStat();
        }
    }

    /**
     * Pushes {@code stat}, iterates the int field keeping the top (or bottom) {@code k}
     * terms per group ranked by that stat, and pops the stat again.
     *
     * @param field  int field to iterate
     * @param k      maximum number of terms kept per group
     * @param stat   stat used to rank the terms
     * @param bottom {@code true} to keep the lowest-ranked terms instead of the highest
     * @return map from group number to a queue of (stat value, term) pairs
     * @throws ImhotepOutOfMemoryException if pushing the stat fails for lack of memory
     */
    private TIntObjectHashMap<PriorityQueue<Pair<Double, Long>>> getIntGroupTermsTopK(IntField field, int k, Stat stat, boolean bottom) throws ImhotepOutOfMemoryException {
        final StatReference statRef = pushStat(stat);
        try {
            final GetGroupTermsCallbackTopK callback = new GetGroupTermsCallbackTopK(stackDepth, statRef, k, bottom);
            ftgsIterate(Arrays.asList((Field)field), callback);
            return callback.intTermListsMap;
        } finally {
            // always undo the push, even when iteration fails, so the stat stack stays balanced
            popStat();
        }
    }

    /**
     * Convenience entry point: applies the instance-level string-term filter to a raw session
     * by wrapping it in a temporary {@link EZImhotepSession}. The wrapper is deliberately not
     * closed, because {@link #close()} would close the caller's session.
     */
    public static void filter(StringField field, String[] terms, ImhotepSession session) throws ImhotepOutOfMemoryException {
        new EZImhotepSession(session).filter(field, terms);
    }

    /**
     * FTGS callback that collects the distinct terms of a field, optionally filtered by a
     * predicate. De-duplication only compares against the previously seen term, so it relies
     * on terms arriving in sorted order.
     */
    private static final class FieldTermsCallback extends FTGSCallback {
        final TLongArrayList intTerms = new TLongArrayList();
        final List<String> stringTerms = Lists.newArrayList();
        private final Predicate<Long> predicateInt;
        private final Predicate<String> predicateString;
        private long lastIntTerm = Long.MIN_VALUE;
        private String lastStringTerm = null;
        // distinguishes "no term seen yet" from a genuine first term equal to the initial marker
        private boolean firstIteration = true;

        /**
         * @param numStats        size of the stats buffer handed to {@link FTGSCallback}
         * @param predicateInt    filter for int terms, or {@code null} to accept all
         * @param predicateString filter for string terms, or {@code null} to accept all
         */
        public FieldTermsCallback(final int numStats, @Nullable Predicate<Long> predicateInt, @Nullable Predicate<String> predicateString) {
            super(numStats);
            this.predicateInt = predicateInt;
            this.predicateString = predicateString;
        }

        @Override
        public void intTermGroup(final String field, final long term, int group) {
            // expecting incoming terms to be in sorted order
            if (firstIteration || term != lastIntTerm) {
                firstIteration = false;
                lastIntTerm = term;
                if (predicateInt == null || predicateInt.apply(term)) {
                    intTerms.add(term);
                }
            }
        }

        @Override
        public void stringTermGroup(final String field, final String term, int group) {
            // expecting incoming terms to be in sorted order
            if (firstIteration || !term.equals(lastStringTerm)) {
                firstIteration = false;
                lastStringTerm = term;
                if (predicateString == null || predicateString.apply(term)) {
                    stringTerms.add(term);
                }
            }
        }
    }

    /**
     * Collects the distinct terms of an int field by iterating the supplied session.
     *
     * @param field           int field to iterate
     * @param session         session to iterate; wrapped in a temporary {@link EZImhotepSession}
     * @param filterPredicate optional filter; {@code null} accepts every term
     * @return the accepted terms in the order they were reported
     * @throws ImhotepOutOfMemoryException declared for callers; propagated from the iteration
     */
    public TLongArrayList intFieldTerms(IntField field, ImhotepSession session, @Nullable Predicate<Long> filterPredicate) throws ImhotepOutOfMemoryException {
        // NOTE(review): the callback is sized with this instance's stackDepth while the
        // iteration runs on a fresh wrapper around the passed session - confirm this is intended
        final FieldTermsCallback callback = new FieldTermsCallback(stackDepth, filterPredicate, null);
        new EZImhotepSession(session).ftgsIterate(Arrays.asList((Field)field), callback);
        return callback.intTerms;
    }

    /**
     * Collects the distinct terms of a string field by iterating the supplied session.
     *
     * @param field           string field to iterate
     * @param session         session to iterate; wrapped in a temporary {@link EZImhotepSession}
     * @param filterPredicate optional filter; {@code null} accepts every term
     * @return the accepted terms in the order they were reported
     * @throws ImhotepOutOfMemoryException declared for callers; propagated from the iteration
     */
    public List<String> stringFieldTerms(StringField field, ImhotepSession session, @Nullable Predicate<String> filterPredicate) throws ImhotepOutOfMemoryException {
        // NOTE(review): the callback is sized with this instance's stackDepth while the
        // iteration runs on a fresh wrapper around the passed session - confirm this is intended
        final FieldTermsCallback callback = new FieldTermsCallback(stackDepth, null, filterPredicate);
        new EZImhotepSession(session).ftgsIterate(Arrays.asList((Field)field), callback);
        return callback.stringTerms;
    }

    /**
     * Base class for callbacks invoked once per (field, term, group) during FTGS iteration.
     * Holds a stats buffer of {@code numStats} entries; presumably the iteration code fills it
     * before each callback invocation (not visible here).
     */
    public static abstract class FTGSCallback {
        private final long[] stats;

        public FTGSCallback(int numStats) {
            stats = new long[numStats];
        }

        /** Validates the reference and reads its value out of the stats buffer. */
        protected final double getStat(StatReference ref) {
            requireValid(ref);
            return ref.getValue(stats);
        }

        protected abstract void intTermGroup(String field, long term, int group);

        protected abstract void stringTermGroup(String field, String term, int group);
    }

    /**
     * Variant of {@link FTGSCallback} whose callbacks return a value of type {@code E} for
     * each (field, term, group) visited.
     */
    public static abstract class FTGSIteratingCallback <E> {
        final long[] stats;

        public FTGSIteratingCallback(int numStats) {
            stats = new long[numStats];
        }

        /** Validates the reference and reads its value out of the stats buffer. */
        protected final double getStat(StatReference ref) {
            requireValid(ref);
            return ref.getValue(stats);
        }

        public abstract E intTermGroup(String field, long term, int group);

        public abstract E stringTermGroup(String field, String term, int group);
    }

    /** Callback that ignores every term it is given. */
    public class FTGSDoNothingCallback extends FTGSCallback {
        public FTGSDoNothingCallback(final int numStats) {
            super(numStats);
        }

        @Override
        protected void intTermGroup(final String field, final long term, int group) {}

        @Override
        protected void stringTermGroup(final String field, final String term, int group) {}
    }

    /** Closes the underlying session once; subsequent calls are no-ops. */
    @Override
    public void close() {
        if (!closed) {
            session.close();
            closed = true;
        }
    }

    /** Builds a {@code "+"} binary-operation stat over the given operands. */
    public static Stat add(Stat... stats) {
        return new BinOpStat("+", stats);
    }

    /** Builds a {@code "-"} binary-operation stat over the given operands. */
    public static Stat sub(Stat... stats) {
        return new BinOpStat("-", stats);
    }

    /** Builds a {@code "*"} binary-operation stat over the given operands. */
    public static Stat mult(Stat... stats) {
        return new BinOpStat("*", stats);
    }

    /** Builds a {@code "/"} binary-operation stat over the given operands. */
    public static Stat div(Stat... stats) {
        return new BinOpStat("/", stats);
    }

    /** Builds a {@code "%"} binary-operation stat over the given operands. */
    public static Stat mod(Stat... stats) {
        return new BinOpStat("%", stats);
    }

    /** Builds a {@code "<"} binary-operation stat over the given operands. */
    public static Stat less(Stat... stats) {
        return new BinOpStat("<", stats);
    }

    /** Builds a {@code "<="} binary-operation stat over the given operands. */
    public static Stat lessEq(Stat... stats) {
        return new BinOpStat("<=", stats);
    }

    /**
     * Builds an equality stat. When called with exactly an int-field stat followed by a
     * constant, returns the equivalent {@link #hasInt(String, long)} stat instead of a
     * generic {@code "="} binary operation.
     */
    public static Stat isEqual(Stat... stats) {
        // try to optimize it as a hasint stat
        if (stats.length == 2 && stats[0] instanceof IntFieldStat && stats[1] instanceof ConstantStat) {
            return hasInt(((IntFieldStat)stats[0]).getFieldName(), ((ConstantStat) stats[1]).getValue());
        }
        return new BinOpStat("=", stats);
    }

    /** Builds a {@code "!="} binary-operation stat over the given operands. */
    public static Stat isNotEqual(Stat... stats) {
        return new BinOpStat("!=", stats);
    }

    /** Builds a {@code ">"} binary-operation stat over the given operands. */
    public static Stat greater(Stat... stats) {
        return new BinOpStat(">", stats);
    }

    /** Builds a {@code ">="} binary-operation stat over the given operands. */
    public static Stat greaterEq(Stat... stats) {
        return new BinOpStat(">=", stats);
    }

    /** Builds a {@code "min()"} binary-operation stat over the given operands. */
    public static Stat min(Stat... stats) {
        return new BinOpStat("min()", stats);
    }

    /** Builds a {@code "max()"} binary-operation stat over the given operands. */
    public static Stat max(Stat... stats) {
        return new BinOpStat("max()", stats);
    }

    /** Wraps {@code ref} in an {@code ExpStat} with the given scale factor. */
    public static Stat exp(Stat ref, int scaleFactor) {
        return new ExpStat(ref, scaleFactor);
    }

    /** Creates a stat holding the constant {@code value}. */
    public static Stat constant(long value) {
        return new ConstantStat(value);
    }

    /** Creates a stat over the int field with the given name. */
    public static Stat intField(String name) {
        return new IntFieldStat(name);
    }

    /** Creates a stat over the given int field, identified by its field name. */
    public static Stat intField(IntField field) {
        return new IntFieldStat(field.getFieldName());
    }

    /** Creates a stat backed by the given dynamic metric. */
    public static Stat dynamic(DynamicMetric metric) {
        return new DynamicMetricStat(metric);
    }

    /** Creates a {@code HasIntStat} for the given field and value. */
    public static Stat hasInt(String field, long value) {
        return new HasIntStat(field, value);
    }

    /** Creates a {@code HasStringStat} for the given field and value. */
    public static Stat hasString(String field, String value) {
        return new HasStringStat(field, value);
    }

    /** Creates a stat from the given query. */
    public static Stat lucene(Query luceneQuery) {
        return new Stats.LuceneQueryStat(luceneQuery);
    }

    /** Creates a stat that refers to an already pushed single stat. */
    public static Stat ref(SingleStatReference ref) {
        return new StatRefStat(ref);
    }

    /** Creates a {@code CountStat}. */
    public static Stat counts() {
        return new CountStat();
    }

    /** Wraps {@code stat} in a {@code CachedStat}. */
    public static Stat cached(Stat stat) {
        return new CachedStat(stat);
    }

    /** Wraps {@code stat} in an absolute-value stat. */
    public static Stat abs(Stat stat) {
        return new Stats.AbsoluteValueStat(stat);
    }

    /** Creates a {@code FloatScaleStat} for the named field with the given multiplier and offset. */
    public static Stat floatScale(String intField, int mult, int add) {
        return new Stats.FloatScaleStat(intField, mult, add);
    }

    /** Creates a {@code MultiplyShiftRight} stat over the two operands with the given shift. */
    public static Stat multiplyShiftRight(int shift, Stat stat1, Stat stat2) {
        return new Stats.MultiplyShiftRight(shift, stat1, stat2);
    }

    /** Creates a {@code ShiftLeftDivide} stat over the two operands with the given shift. */
    public static Stat shiftLeftDivide(int shift, Stat stat1, Stat stat2) {
        return new Stats.ShiftLeftDivide(shift, stat1, stat2);
    }

    /** Creates an aggregate {@code "/"} stat of two stats; must be pushed via {@code pushStatGeneric}. */
    public static Stat aggDiv(Stat stat1, Stat stat2) {
        return new Stats.AggregateBinOpStat("/", stat1, stat2);
    }

    /** Creates an aggregate {@code "/"} stat dividing {@code stat1} by a constant. */
    public static Stat aggDivConst(Stat stat1, long value) {
        return new Stats.AggregateBinOpConstStat("/", stat1, value);
    }

    /**
     * Widens every element of an {@code int} array into a new {@code long} array of the same length.
     *
     * @param a source array, must not be {@code null}
     * @return newly allocated array holding the widened values
     */
    @Nonnull
    private static long[] intArrayToLongArray(@Nonnull final int[] a) {
        final long[] ret = new long[a.length];
        for (int i = 0; i < a.length; ++i) {
            ret[i] = a[i];
        }
        return ret;
    }
}