/*
* Copyright (C) 2014 Indeed Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.indeed.imhotep.iql;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.indeed.imhotep.api.ImhotepOutOfMemoryException;
import com.indeed.imhotep.ez.EZImhotepSession;
import com.indeed.imhotep.ez.Field;
import com.indeed.imhotep.ez.GroupKey;
import com.indeed.imhotep.ez.StatReference;
import org.apache.log4j.Logger;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import static com.indeed.imhotep.ez.Stats.Stat;
/**
 * Grouping that splits every parent group by the terms of a single {@link Field}.
 *
 * <p>The behavior depends on how the instance was constructed:
 * <ul>
 *   <li>{@code topK > 0}: only the top (or bottom) K terms ranked by {@code sortStat} are kept;</li>
 *   <li>non-empty term subset: only the listed terms are considered;</li>
 *   <li>{@code noExplode}: selects the "no explode" session calls / callback instead of the
 *       exploding ones.</li>
 * </ul>
 *
 * @author jplaisance
 */
public final class FieldGrouping extends Grouping {
    private static final Logger log = Logger.getLogger(FieldGrouping.class);
    /** Stat used for ranking terms when the caller does not supply one. */
    private static final Stat DEFAULT_SORT_STAT = EZImhotepSession.counts();

    private final Field field;
    private final int topK;
    private final Stat sortStat;
    private final boolean isBottom;
    private final boolean noExplode;
    /** De-duplicated terms to restrict the grouping to; empty means "no restriction". */
    private final List<String> termSubset;

    /** Groups by all terms of {@code field} with default settings (no top-K, no term subset). */
    public FieldGrouping(final Field field) {
        this(field, 0);
    }

    /** Groups by all terms of {@code field}, optionally in "no explode" mode. */
    public FieldGrouping(final Field field, final boolean noExplode) {
        this(field, 0, DEFAULT_SORT_STAT, false, noExplode, Collections.<String>emptyList());
    }

    /** Groups by the given subset of terms of {@code field}, optionally in "no explode" mode. */
    public FieldGrouping(final Field field, boolean noExplode, List<String> termSubset) {
        this(field, 0, DEFAULT_SORT_STAT, false, noExplode, termSubset);
    }

    /** Groups by the top {@code topK} terms of {@code field}, ranked by the default sort stat. */
    public FieldGrouping(final Field field, int topK) {
        this(field, topK, false);
    }

    /** Groups by the top or bottom {@code topK} terms of {@code field}, ranked by the default sort stat. */
    public FieldGrouping(final Field field, int topK, boolean isBottom) {
        this(field, topK, DEFAULT_SORT_STAT, isBottom);
    }

    /** Groups by the top {@code topK} terms of {@code field}, ranked by {@code sortStat}. */
    public FieldGrouping(final Field field, int topK, Stat sortStat) {
        this(field, topK, sortStat, false);
    }

    /** Groups by the top or bottom {@code topK} terms of {@code field}, ranked by {@code sortStat}. */
    public FieldGrouping(final Field field, int topK, Stat sortStat, boolean isBottom) {
        this(field, topK, sortStat, isBottom, false, Collections.<String>emptyList());
    }

    /**
     * Fully parameterized constructor; all other constructors delegate here.
     *
     * @param field      field whose terms define the groups
     * @param topK       number of terms to keep; only values greater than 0 enable top-K handling
     *                   in {@link #regroup} and {@link #getGroupStats}
     * @param sortStat   stat used to rank terms in top-K mode
     * @param isBottom   forwarded to the top-K session call / callback
     *                   (presumably selects the lowest-ranked terms instead of the highest)
     * @param noExplode  selects the "no explode" session calls / callback
     * @param termSubset terms to restrict the grouping to; duplicates are dropped, first
     *                   occurrence order is kept; empty means no restriction
     * @throws IllegalArgumentException if {@code topK} exceeds {@link EZImhotepSession#GROUP_LIMIT}
     */
    public FieldGrouping(final Field field, int topK, Stat sortStat, boolean isBottom, boolean noExplode, List<String> termSubset) {
        this.field = field;
        this.topK = topK;
        this.sortStat = sortStat;
        this.isBottom = isBottom;
        this.noExplode = noExplode;
        // remove duplicated terms as it makes Imhotep complain
        this.termSubset = Lists.newArrayList(Sets.newLinkedHashSet(termSubset));

        // validation
        if (topK > EZImhotepSession.GROUP_LIMIT) {
            final DecimalFormat df = new DecimalFormat("###,###");
            throw new IllegalArgumentException("Number of requested top terms (" + df.format(topK) + ") for field " +
                    field.getFieldName() + " exceeds the limit (" + df.format(EZImhotepSession.GROUP_LIMIT) +
                    "). Please simplify the query.");
        }
    }

    /**
     * Splits the given parent groups by this grouping's field.
     *
     * <p>Mode precedence: top-K, then term subset, then "no explode", then the exploding split.
     *
     * @param session   session to perform the regroup on
     * @param groupKeys current groups keyed by group number; returned unchanged when empty
     * @return the new group keys, never {@code null}
     * @throws IllegalArgumentException if a term subset is set for an int field and one of the
     *                                  terms cannot be parsed as a {@code long}
     * @throws ImhotepOutOfMemoryException propagated from the session
     */
    public Map<Integer, GroupKey> regroup(final EZImhotepSession session, final Map<Integer, GroupKey> groupKeys) throws ImhotepOutOfMemoryException {
        if (groupKeys.isEmpty()) {
            return groupKeys;
        }
        if (topK > 0) {
            return Preconditions.checkNotNull(session.splitAllTopK(field, groupKeys, topK, sortStat, isBottom));
        }
        if (isTermSubset()) {
            if (field.isIntField()) {
                final Field.IntField intField = (Field.IntField) field;
                return Preconditions.checkNotNull(session.explodeEachGroup(intField, parseIntTermSubset(), groupKeys));
            }
            final String[] termsArray = termSubset.toArray(new String[termSubset.size()]);
            return Preconditions.checkNotNull(session.explodeEachGroup((Field.StringField) field, termsArray, groupKeys));
        }
        if (noExplode) {
            return Preconditions.checkNotNull(session.splitAll(field, groupKeys));
        }
        return Preconditions.checkNotNull(session.splitAllExplode(field, groupKeys));
    }

    /**
     * Runs an FTGS pass over this grouping's field and returns the per-group stats.
     *
     * <p>Mode precedence: top-K, then "no explode", then the default callback. In the latter two
     * modes a non-empty term subset restricts the iteration to those terms. The term subset is
     * ignored in top-K mode.
     *
     * @param session   session to iterate on
     * @param groupKeys parent groups; when empty an empty iterator is returned without any FTGS
     * @param statRefs  stats to collect for every group
     * @param timeoutTS not used by this implementation
     * @return iterator over the resulting group stats
     * @throws ImhotepOutOfMemoryException propagated from the session
     */
    public Iterator<GroupStats> getGroupStats(final EZImhotepSession session, final Map<Integer, GroupKey> groupKeys, final List<StatReference> statRefs, long timeoutTS) throws ImhotepOutOfMemoryException {
        if (groupKeys.isEmpty()) { // we don't have any parent groups probably because all docs were filtered out
            return Collections.<GroupStats>emptyList().iterator(); // so no point doing FTGS
        }
        if (topK > 0) {
            //TODO have some way of not potentially pushing counts() twice
            final StatReference countStat = session.pushStatGeneric(sortStat);
            final TopKGroupingFTGSCallback callback = new TopKGroupingFTGSCallback(session.getStackDepth(), topK, countStat, statRefs, groupKeys, isBottom);
            session.ftgsIterate(Arrays.asList(field), callback);
            return callback.getResults().iterator();
        }
        if (noExplode) {
            final GroupingFTGSCallbackNoExplode callback = new GroupingFTGSCallbackNoExplode(session.getStackDepth(), statRefs, groupKeys);
            if (isTermSubset()) {
                return session.ftgsGetSubsetIterator(termSubsetByField(), callback);
            }
            return session.ftgsGetIterator(Arrays.asList(field), callback);
        }
        final GroupingFTGSCallback callback = new GroupingFTGSCallback(session.getStackDepth(), statRefs, groupKeys);
        if (isTermSubset()) {
            session.ftgsSubsetIterate(termSubsetByField(), callback);
        } else {
            session.ftgsIterate(Arrays.asList(field), callback);
        }
        return callback.getResults().iterator();
    }

    /** Parses every term of the subset as a {@code long}, failing with a descriptive error on the first bad term. */
    private long[] parseIntTermSubset() {
        final long[] terms = new long[termSubset.size()];
        for (int i = 0; i < terms.length; i++) {
            final String term = termSubset.get(i);
            try {
                terms[i] = Long.parseLong(term);
            } catch (NumberFormatException e) {
                // keep the original parse failure as the cause so it shows up in stack traces
                throw new IllegalArgumentException("IN grouping for int field " + field.getFieldName() +
                        " has a non integer argument: " + term, e);
            }
        }
        return terms;
    }

    /** Builds the single-entry field-to-terms map expected by the session's subset FTGS methods. */
    private Map<Field, List<?>> termSubsetByField() {
        final Map<Field, List<?>> fieldsToTermsSubsets = Maps.newHashMap();
        fieldsToTermsSubsets.put(field, termSubset);
        return fieldsToTermsSubsets;
    }

    /** @return the field this grouping splits on */
    public Field getField() {
        return field;
    }

    /** @return the requested number of top/bottom terms as passed to the constructor */
    public int getTopK() {
        return topK;
    }

    /** @return whether "no explode" mode was requested */
    public boolean isNoExplode() {
        return noExplode;
    }

    /**
     * @return {@code true} when {@code topK} is non-zero
     */
    // NOTE(review): regroup/getGroupStats test topK > 0, so a negative topK reports true here
    // but is not handled as top-K there — confirm whether negative values can occur.
    public boolean isTopK() {
        return topK != 0;
    }

    /** @return {@code true} when this grouping is restricted to an explicit, non-empty list of terms */
    public boolean isTermSubset() {
        return !termSubset.isEmpty();
    }
}