/*
* Copyright (C) 2014 Indeed Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.indeed.imhotep.local;
import com.google.common.collect.ComparisonChain;
import com.indeed.util.core.sort.Quicksortable;
import com.indeed.util.core.sort.Quicksortables;
import com.indeed.flamdex.api.DocIdStream;
import com.indeed.flamdex.api.FlamdexReader;
import com.indeed.flamdex.api.IntTermIterator;
import com.indeed.flamdex.api.StringTermIterator;
import com.indeed.flamdex.datastruct.FastBitSet;
import com.indeed.imhotep.GroupMultiRemapRule;
import com.indeed.imhotep.RegroupCondition;
import com.indeed.imhotep.api.ImhotepOutOfMemoryException;
import java.util.Arrays;
/**
* @author jwolfe
*/
class MultiRegroupInternals {
/**
 * Totals the number of conditions carried by the supplied rules.
 *
 * @param rules the remap rules to inspect
 * @return the sum of {@code conditions.length} over every rule (0 for an empty array)
 */
static int countRemapConditions(GroupMultiRemapRule[] rules) {
    int total = 0;
    for (int ruleIx = 0; ruleIx < rules.length; ruleIx++) {
        total += rules[ruleIx].conditions.length;
    }
    return total;
}
/**
 * Finds the largest group number mentioned anywhere in the rules: as a target group,
 * a negative group, or any positive group.
 *
 * @param rules the remap rules to inspect
 * @return the largest group number referenced, or 0 if none is larger than 0
 */
static int findMaxGroup(GroupMultiRemapRule[] rules) {
    int highest = 0;
    for (int ruleIx = 0; ruleIx < rules.length; ruleIx++) {
        final GroupMultiRemapRule rule = rules[ruleIx];
        final int[] positives = rule.positiveGroups;
        for (int p = 0; p < positives.length; p++) {
            if (positives[p] > highest) {
                highest = positives[p];
            }
        }
        if (rule.targetGroup > highest) {
            highest = rule.targetGroup;
        }
        if (rule.negativeGroup > highest) {
            highest = rule.negativeGroup;
        }
    }
    return highest;
}
/**
 * Finds the largest number of conditions held by any single rule.
 *
 * @param rules the remap rules to inspect
 * @return the maximum {@code conditions.length} over all rules (0 for an empty array)
 */
public static int findMaxIntermediateGroup(GroupMultiRemapRule[] rules) {
    int longest = 0;
    for (int ruleIx = 0; ruleIx < rules.length; ruleIx++) {
        final int conditionCount = rules[ruleIx].conditions.length;
        if (conditionCount > longest) {
            longest = conditionCount;
        }
    }
    return longest;
}
/**
 * Sorts the four parallel "sorted" arrays in lock step. Entries are ordered by field,
 * then intType (false first), then inequality (false first), then the target group of the
 * owning rule, and finally by the condition's position within its rule.
 *
 * @param rules                 all rules; looked up through {@code sortedRuleIndices} for target groups
 * @param numConditions         number of leading entries of the parallel arrays to sort
 * @param sortedConditions      conditions, reordered in place
 * @param sortedPositiveGroups  positive groups, reordered in place alongside the conditions
 * @param sortedInternalIndices position of each condition within its rule, reordered in place
 * @param sortedRuleIndices     index into {@code rules} for each condition, reordered in place
 */
static void reorderRegroupConditions(final GroupMultiRemapRule[] rules, int numConditions, final RegroupCondition[] sortedConditions, final int[] sortedPositiveGroups, final int[] sortedInternalIndices, final int[] sortedRuleIndices) {
    final Quicksortable parallelArraySorter = new Quicksortable() {
        @Override
        public int compare(int i, int j) {
            final RegroupCondition left = sortedConditions[i];
            final RegroupCondition right = sortedConditions[j];
            final int leftTarget = rules[sortedRuleIndices[i]].targetGroup;
            final int rightTarget = rules[sortedRuleIndices[j]].targetGroup;
            return ComparisonChain.start()
                    .compare(left.field, right.field)
                    .compareFalseFirst(left.intType, right.intType)
                    .compareFalseFirst(left.inequality, right.inequality)
                    .compare(leftTarget, rightTarget)
                    .compare(sortedInternalIndices[i], sortedInternalIndices[j])
                    .result();
        }

        @Override
        public void swap(int i, int j) {
            // All four arrays must move together so that index k keeps describing one condition.
            Quicksortables.swap(sortedConditions, i, j);
            Quicksortables.swap(sortedPositiveGroups, i, j);
            Quicksortables.swap(sortedInternalIndices, i, j);
            Quicksortables.swap(sortedRuleIndices, i, j);
        }
    };
    Quicksortables.sort(parallelArraySorter, numConditions);
}
/**
 * Sorts the sub-range {@code [start, end)} of the four parallel arrays in lock step by term:
 * by {@code intTerm} when {@code intType} is true, otherwise by {@code stringTerm}.
 *
 * @param start                 first index of the range to sort (inclusive)
 * @param end                   end of the range to sort (exclusive)
 * @param intType               whether the conditions in the range are compared by their int term
 * @param sortedConditions      conditions, reordered in place
 * @param sortedPositiveGroups  positive groups, reordered in place alongside the conditions
 * @param sortedInternalIndices position of each condition within its rule, reordered in place
 * @param sortedRuleIndices     index of the owning rule for each condition, reordered in place
 */
static void reorderOnTerm(final int start, int end, final boolean intType, final RegroupCondition[] sortedConditions, final int[] sortedPositiveGroups, final int[] sortedInternalIndices, final int[] sortedRuleIndices) {
    final Quicksortable rangeSorter = new Quicksortable() {
        @Override
        public int compare(int i, int j) {
            // The sorter works on offsets relative to the start of the range.
            final RegroupCondition left = sortedConditions[start + i];
            final RegroupCondition right = sortedConditions[start + j];
            final ComparisonChain chain = ComparisonChain.start();
            final ComparisonChain compared = intType
                    ? chain.compare(left.intTerm, right.intTerm)
                    : chain.compare(left.stringTerm, right.stringTerm);
            return compared.result();
        }

        @Override
        public void swap(int i, int j) {
            final int a = start + i;
            final int b = start + j;
            Quicksortables.swap(sortedConditions, a, b);
            Quicksortables.swap(sortedPositiveGroups, a, b);
            Quicksortables.swap(sortedInternalIndices, a, b);
            Quicksortables.swap(sortedRuleIndices, a, b);
        }
    };
    Quicksortables.sort(rangeSorter, end - start);
}
/**
 * Verifies that the target groups of the rules are unique and all >= 1, and returns the
 * highest target group found.
 *
 * @param rules the remap rules whose target groups are validated
 * @return the highest target group among the rules, or 0 when {@code rules} is empty
 * @throws IllegalArgumentException if any target group is <= 0 or if two rules share a target group
 */
static int validateTargets(GroupMultiRemapRule[] rules) {
    if (rules.length == 0) {
        // Nothing to validate. Without this guard the targetGroups[0] access below would throw
        // ArrayIndexOutOfBoundsException; 0 matches what findMaxGroup reports for no rules.
        return 0;
    }
    final int[] targetGroups = new int[rules.length];
    for (int i = 0; i < rules.length; i++) {
        targetGroups[i] = rules[i].targetGroup;
    }
    // After sorting, the smallest target is first and any duplicates are adjacent.
    Arrays.sort(targetGroups);
    if (targetGroups[0] <= 0) {
        throw new IllegalArgumentException("All groups must be >= 1");
    }
    for (int i = 0; i < targetGroups.length - 1; i++) {
        if (targetGroups[i] == targetGroups[i + 1]) {
            throw new IllegalArgumentException("Cannot have duplicate target groups");
        }
    }
    return targetGroups[targetGroups.length - 1];
}
/**
 * Verifies that no rule contains an equality condition that can never be matched because
 * an identical equality condition (same field, same type, same term) appears in the same rule.
 *
 * @param rules the remap rules to validate; their condition arrays are not modified
 * @throws IllegalArgumentException if the same rule has multiple equality conditions targeting
 *                                  the same term in the same field
 */
static void validateEqualitySplits(GroupMultiRemapRule[] rules) {
    for (int ruleIx = 0; ruleIx < rules.length; ruleIx++) {
        // Sort a private copy so that identical equality conditions end up next to each other.
        final RegroupCondition[] ordered = rules[ruleIx].conditions.clone();
        sortConditions(ordered);
        for (int k = 1; k < ordered.length; k++) {
            final RegroupCondition previous = ordered[k - 1];
            final RegroupCondition current = ordered[k];
            final boolean comparable = previous.field.equals(current.field)
                    && !previous.inequality
                    && !current.inequality
                    && (previous.intType == current.intType);
            if (!comparable) {
                continue;
            }
            if (previous.intType) {
                if (previous.intTerm == current.intTerm) {
                    throw new IllegalArgumentException("Duplicate equality split term "+previous.intTerm+" in int field "+previous.field);
                }
            } else if (previous.stringTerm.equals(current.stringTerm)) {
                throw new IllegalArgumentException("Duplicate equality split term \""+previous.stringTerm+"\" in string field "+previous.field);
            }
        }
    }
}
/**
 * Sorts the given conditions in place by field, then inequality (false first), then intType
 * (false first), and finally by term (int term for int conditions, string term otherwise).
 *
 * @param sortedConditions the array to sort in place
 */
private static void sortConditions(final RegroupCondition[] sortedConditions) {
    final Quicksortable conditionSorter = new Quicksortable() {
        @Override
        public int compare(int i, int j) {
            final RegroupCondition left = sortedConditions[i];
            final RegroupCondition right = sortedConditions[j];
            final ComparisonChain prefix = ComparisonChain.start()
                    .compare(left.field, right.field)
                    .compareFalseFirst(left.inequality, right.inequality)
                    .compareFalseFirst(left.intType, right.intType);
            // The term only decides the order when everything before it tied, in which case both
            // conditions have the same intType; a chain that is already decided ignores this step.
            final ComparisonChain withTerm = left.intType
                    ? prefix.compare(left.intTerm, right.intTerm)
                    : prefix.compare(left.stringTerm, right.stringTerm);
            return withTerm.result();
        }

        @Override
        public void swap(int i, int j) {
            final RegroupCondition held = sortedConditions[j];
            sortedConditions[j] = sortedConditions[i];
            sortedConditions[i] = held;
        }
    };
    Quicksortables.sort(conditionSorter, sortedConditions.length);
}
/**
 * Builds, per target group, the ascending list of string barriers for one inequality clump.
 * Upon returning, {@code barriers[g]} holds the differentiation points for target group g,
 * {@code resultingIndex[g]} is a parallel array giving the condition index (within its rule)
 * for each barrier, and {@code barrierLengths[g]} is the number of entries actually filled in.
 *
 * @param rules                    all rules, looked up through {@code ruleIndices}
 * @param allConditions            the sorted conditions
 * @param internalConditionIndices position of each condition within its rule
 * @param ruleIndices              index into {@code rules} for each condition
 * @param fieldStartIndex          first condition of the clump (inclusive)
 * @param conditionIndex           end of the clump (exclusive)
 * @param barrierLengths           output: effective length of each group's barrier list
 * @param barriers                 output: barrier terms per target group
 * @param resultingIndex           output: condition index per barrier, per target group
 * @throws IllegalArgumentException if a barrier is not strictly greater than the previous
 *                                  barrier for the same target group
 */
static void formStringDividers(GroupMultiRemapRule[] rules, RegroupCondition[] allConditions, int[] internalConditionIndices, int[] ruleIndices, int fieldStartIndex, int conditionIndex, int[] barrierLengths, String[][] barriers, int[][] resultingIndex) {
    for (int ix = fieldStartIndex; ix < conditionIndex; ix++) {
        final GroupMultiRemapRule rule = rules[ruleIndices[ix]];
        final int group = rule.targetGroup;
        final String term = allConditions[ix].stringTerm;
        final int filled = barrierLengths[group];
        if (filled == 0) {
            // First barrier for this group: allocate room for as many barriers as the rule has conditions.
            final int capacity = rule.conditions.length;
            barriers[group] = new String[capacity];
            resultingIndex[group] = new int[capacity];
        } else if (term.compareTo(barriers[group][filled - 1]) <= 0) {
            throw new IllegalArgumentException("String inequality conditions that can never be met.");
        }
        barriers[group][filled] = term;
        resultingIndex[group][filled] = internalConditionIndices[ix];
        barrierLengths[group] = filled + 1;
    }
}
/**
 * Builds, per target group, the ascending list of int barriers for one inequality clump.
 * Upon returning, {@code barriers[g]} holds the differentiation points for target group g,
 * {@code resultingIndex[g]} is a parallel array giving the condition index (within its rule)
 * for each barrier, and {@code barrierLengths[g]} is the number of entries actually filled in.
 *
 * @param rules                    all rules, looked up through {@code ruleIndices}
 * @param allConditions            the sorted conditions
 * @param internalConditionIndices position of each condition within its rule
 * @param ruleIndices              index into {@code rules} for each condition
 * @param fieldStartIndex          first condition of the clump (inclusive)
 * @param conditionIndex           end of the clump (exclusive)
 * @param barrierLengths           output: effective length of each group's barrier list
 * @param barriers                 output: barrier terms per target group
 * @param resultingIndex           output: condition index per barrier, per target group
 * @throws IllegalArgumentException if a barrier is not strictly greater than the previous
 *                                  barrier for the same target group
 */
static void formIntDividers(GroupMultiRemapRule[] rules, RegroupCondition[] allConditions, int[] internalConditionIndices, int[] ruleIndices, int fieldStartIndex, int conditionIndex, int[] barrierLengths, long[][] barriers, int[][] resultingIndex) {
    for (int ix = fieldStartIndex; ix < conditionIndex; ix++) {
        final GroupMultiRemapRule rule = rules[ruleIndices[ix]];
        final int group = rule.targetGroup;
        final RegroupCondition condition = allConditions[ix];
        final int filled = barrierLengths[group];
        if (filled == 0) {
            // First barrier for this group: allocate room for as many barriers as the rule has conditions.
            final int capacity = rule.conditions.length;
            barriers[group] = new long[capacity];
            resultingIndex[group] = new int[capacity];
        } else if (condition.intTerm <= barriers[group][filled - 1]) {
            throw new IllegalArgumentException("int inequality conditions that can never be met.");
        }
        barriers[group][filled] = condition.intTerm;
        resultingIndex[group][filled] = internalConditionIndices[ix];
        barrierLengths[group] = filled + 1;
    }
}
/**
 * Drains {@code docIdStream} and, for every document whose current group is non-zero, lowers
 * its entry in {@code newLookup} to {@code remappings[currentGroup]} if that value is smaller.
 *
 * @param docIdToGroup     current group of each document
 * @param newLookup        per-document value being built up; updated in place
 * @param docIdBuf         scratch buffer used to read doc ids from the stream
 * @param docIdStream      stream of doc ids, already positioned by the caller
 * @param remappings       value to apply for each current group
 * @param placeHolderGroup when > 0, the only value a document may hold in {@code newLookup}
 *                         before being remapped here; any other value is rejected
 * @throws IllegalArgumentException if {@code placeHolderGroup > 0} and a visited document
 *                                  already holds a value other than {@code placeHolderGroup}
 */
private static void remapDocsInTargetGroups(GroupLookup docIdToGroup, GroupLookup newLookup, int[] docIdBuf, DocIdStream docIdStream, int[] remappings, int placeHolderGroup) {
    final boolean rejectCollisions = placeHolderGroup > 0;
    int filled;
    do {
        filled = docIdStream.fillDocIdBuffer(docIdBuf);
        for (int pos = 0; pos < filled; pos++) {
            final int doc = docIdBuf[pos];
            final int sourceGroup = docIdToGroup.get(doc);
            if (sourceGroup == 0) {
                continue;
            }
            final int assigned = newLookup.get(doc);
            if (rejectCollisions && assigned != placeHolderGroup) {
                throw new IllegalArgumentException("Regrouping on a multi-valued field doesn't work correctly so the operation is rejected.");
            }
            newLookup.set(doc, Math.min(assigned, remappings[sourceGroup]));
        }
        // A buffer that was not completely filled marks the end of the stream.
    } while (filled >= docIdBuf.length);
}
/**
 * Applies {@code remappings} to the documents containing exactly {@code term}. Does nothing
 * if the term iterator, after being reset to {@code term}, does not land on that exact term.
 *
 * @param docIdToGroup     current group of each document
 * @param newLookup        per-document value being built up; updated in place
 * @param docIdBuf         scratch buffer used to read doc ids
 * @param docIdStream      doc id stream, reset here onto the located term
 * @param termIterator     string term iterator for the field being processed
 * @param remappings       value to apply for each current group
 * @param term             the term to look up
 * @param placeHolderGroup forwarded to the per-document remapping step
 */
static void performStringMultiEqualityRegroup(GroupLookup docIdToGroup, GroupLookup newLookup, int[] docIdBuf, DocIdStream docIdStream, StringTermIterator termIterator, int[] remappings, String term, int placeHolderGroup) {
    termIterator.reset(term);
    if (!termIterator.next()) {
        return;
    }
    if (!termIterator.term().equals(term)) {
        return;
    }
    docIdStream.reset(termIterator);
    remapDocsInTargetGroups(docIdToGroup, newLookup, docIdBuf, docIdStream, remappings, placeHolderGroup);
}
/**
 * Applies {@code remappings} to the documents containing exactly {@code term}. Does nothing
 * if the term iterator, after being reset to {@code term}, does not land on that exact term.
 *
 * @param docIdToGroup     current group of each document
 * @param newLookup        per-document value being built up; updated in place
 * @param docIdBuf         scratch buffer used to read doc ids
 * @param docIdStream      doc id stream, reset here onto the located term
 * @param termIterator     int term iterator for the field being processed
 * @param remappings       value to apply for each current group
 * @param term             the term to look up
 * @param placeHolderGroup forwarded to the per-document remapping step
 */
static void performIntMultiEqualityRegroup(GroupLookup docIdToGroup, GroupLookup newLookup, int[] docIdBuf, DocIdStream docIdStream, IntTermIterator termIterator, int[] remappings, long term, int placeHolderGroup) {
    termIterator.reset(term);
    if (!termIterator.next()) {
        return;
    }
    if (termIterator.term() != term) {
        return;
    }
    docIdStream.reset(termIterator);
    remapDocsInTargetGroups(docIdToGroup, newLookup, docIdBuf, docIdStream, remappings, placeHolderGroup);
}
/**
 * Evaluates every condition of every rule against the index and records in
 * {@code newDocIdToGroup}, for each affected document, the smallest index (position within its
 * rule) of a condition that the document matches.
 *
 * <p>The conditions of all rules are flattened into parallel arrays and sorted into clumps that
 * share a field, a type (int/string) and a kind (equality/inequality). Each clump is processed
 * with one doc id stream and one term iterator, both of which are closed even if processing
 * the clump throws.
 *
 * @param docIdToGroup      current group of each document
 * @param newDocIdToGroup   per-document matched condition index; updated in place
 * @param docIdBuf          scratch buffer used to read doc ids
 * @param flamdexReader     source of doc id streams and term iterators
 * @param rules             the remap rules to apply
 * @param highestTarget     highest target group of any rule; sizes the inequality barrier tables
 * @param numConditions     total number of conditions across all rules; nothing is done when 0
 * @param placeholderGroup  value stored in the remapping table for groups that have no condition
 *                          on the term currently being processed
 * @param maxGroup          highest group number; sizes the remapping table
 * @param errorOnCollisions when true, equality regroups reject documents that already hold a
 *                          value other than {@code placeholderGroup}
 * @throws IllegalArgumentException    if inequality barriers are not strictly increasing within
 *                                     a rule, or on a rejected collision
 * @throws ImhotepOutOfMemoryException declared by the inequality regroup helpers
 */
static void internalMultiRegroup(GroupLookup docIdToGroup, GroupLookup newDocIdToGroup, int[] docIdBuf, FlamdexReader flamdexReader, GroupMultiRemapRule[] rules, int highestTarget, int numConditions, int placeholderGroup, int maxGroup, boolean errorOnCollisions) throws ImhotepOutOfMemoryException {
    if (numConditions == 0) {
        // Without any condition there is no clump to process, and reading sortedConditions[0]
        // below would fail with ArrayIndexOutOfBoundsException.
        return;
    }

    // Make a bunch of parallel arrays so we can sort. Memory claimed in parallelArrayBytes.
    final RegroupCondition[] sortedConditions = new RegroupCondition[numConditions];
    final int[] sortedPositiveGroups = new int[numConditions];
    final int[] sortedInternalIndices = new int[numConditions];
    final int[] sortedRuleIndices = new int[numConditions];
    {
        int i = 0;
        for (int ruleIndex = 0; ruleIndex < rules.length; ruleIndex++) {
            final GroupMultiRemapRule rule = rules[ruleIndex];
            for (int conditionIndex = 0; conditionIndex < rule.conditions.length; conditionIndex++) {
                final RegroupCondition condition = rule.conditions[conditionIndex];
                sortedConditions[i] = condition;
                sortedPositiveGroups[i] = rule.positiveGroups[conditionIndex];
                sortedInternalIndices[i] = conditionIndex;
                sortedRuleIndices[i] = ruleIndex;
                i++;
            }
        }
    }

    // memory claimed in remappingBytes
    final int[] remappings = new int[maxGroup + 1];
    Arrays.fill(remappings, placeholderGroup);
    remappings[0] = 0;

    // Arrange in order of field, and within field in order of field type;
    reorderRegroupConditions(rules, numConditions, sortedConditions, sortedPositiveGroups, sortedInternalIndices, sortedRuleIndices);

    int fieldStartIndex = 0;
    String field = sortedConditions[0].field;
    boolean intType = sortedConditions[0].intType;
    boolean inequality = sortedConditions[0].inequality;
    // conditionIndex runs one past the end so that the final clump is flushed as well.
    for (int conditionIndex = 1; conditionIndex <= numConditions; conditionIndex++) {
        if ((conditionIndex != numConditions)
                && sortedConditions[conditionIndex].field.equals(field)
                && (sortedConditions[conditionIndex].intType == intType)
                && (sortedConditions[conditionIndex].inequality == inequality)) {
            continue;
        }

        // End of field/field type clump; perform regroups.
        final DocIdStream docIdStream = flamdexReader.getDocIdStream();
        try {
            if (inequality) {
                // These two branches both rely on the fact that these parallel arrays
                // are sorted within this subarray by condition index within their rule.
                // Handle inequalities by finding, for each document, which range it falls
                // in to, and reassigning if it is an earlier rule than the one it presently
                // matches (or if it presently matches none at all).
                if (intType) {
                    // Memory for these claimed earlier (see maxInequalityBytes)
                    final int[] barrierLengths = new int[highestTarget + 1];
                    final long[][] barriers = new long[highestTarget + 1][];
                    final int[][] resultingIndex = new int[highestTarget + 1][];
                    formIntDividers(rules, sortedConditions, sortedInternalIndices, sortedRuleIndices, fieldStartIndex, conditionIndex, barrierLengths, barriers, resultingIndex);
                    final IntTermIterator termIterator = flamdexReader.getIntTermIterator(field);
                    try {
                        performIntMultiInequalityRegroup(docIdToGroup, newDocIdToGroup, docIdBuf, docIdStream, barrierLengths, barriers, resultingIndex, termIterator);
                    } finally {
                        termIterator.close();
                    }
                } else {
                    // Memory for these claimed earlier (see maxInequalityBytes)
                    final int[] barrierLengths = new int[highestTarget + 1];
                    final String[][] barriers = new String[highestTarget + 1][];
                    final int[][] resultingIndex = new int[highestTarget + 1][];
                    formStringDividers(rules, sortedConditions, sortedInternalIndices, sortedRuleIndices, fieldStartIndex, conditionIndex, barrierLengths, barriers, resultingIndex);
                    final StringTermIterator termIterator = flamdexReader.getStringTermIterator(field);
                    try {
                        performStringMultiInequalityRegroup(docIdToGroup, newDocIdToGroup, docIdBuf, docIdStream, barrierLengths, barriers, resultingIndex, termIterator);
                    } finally {
                        termIterator.close();
                    }
                }
            } else {
                // Handle term splits by going to the term directly and applying the rule.
                reorderOnTerm(fieldStartIndex, conditionIndex, intType, sortedConditions, sortedPositiveGroups, sortedInternalIndices, sortedRuleIndices);
                final int fieldEndIndex = conditionIndex;
                if (intType) {
                    final IntTermIterator termIterator = flamdexReader.getIntTermIterator(field);
                    try {
                        long currentTerm = sortedConditions[fieldStartIndex].intTerm;
                        int termStartIndex = fieldStartIndex;
                        // ix runs one past the end so that the final run of equal terms is flushed.
                        for (int ix = fieldStartIndex; ix <= fieldEndIndex; ix++) {
                            if (ix != fieldEndIndex) {
                                if (sortedConditions[ix].intTerm == currentTerm) {
                                    // Same term as the current run: just record the mapping.
                                    final int targetGroup = rules[sortedRuleIndices[ix]].targetGroup;
                                    remappings[targetGroup] = sortedInternalIndices[ix];
                                    continue;
                                }
                            }
                            performIntMultiEqualityRegroup(docIdToGroup, newDocIdToGroup, docIdBuf, docIdStream, termIterator, remappings, currentTerm, errorOnCollisions ? placeholderGroup : -1);
                            // Reset the remapping entries to placeholderGroup
                            for (int ix2 = termStartIndex; ix2 < ix; ix2++) {
                                final int targetGroup = rules[sortedRuleIndices[ix2]].targetGroup;
                                remappings[targetGroup] = placeholderGroup;
                            }
                            if (ix != fieldEndIndex) {
                                // Start the next run of equal terms.
                                termStartIndex = ix;
                                currentTerm = sortedConditions[ix].intTerm;
                                final int targetGroup = rules[sortedRuleIndices[ix]].targetGroup;
                                remappings[targetGroup] = sortedInternalIndices[ix];
                            }
                        }
                    } finally {
                        termIterator.close();
                    }
                } else {
                    final StringTermIterator termIterator = flamdexReader.getStringTermIterator(field);
                    try {
                        String currentTerm = sortedConditions[fieldStartIndex].stringTerm;
                        int termStartIndex = fieldStartIndex;
                        // ix runs one past the end so that the final run of equal terms is flushed.
                        for (int ix = fieldStartIndex; ix <= fieldEndIndex; ix++) {
                            if (ix != fieldEndIndex) {
                                if (sortedConditions[ix].stringTerm.equals(currentTerm)) {
                                    // Same term as the current run: just record the mapping.
                                    final int targetGroup = rules[sortedRuleIndices[ix]].targetGroup;
                                    remappings[targetGroup] = sortedInternalIndices[ix];
                                    continue;
                                }
                            }
                            performStringMultiEqualityRegroup(docIdToGroup, newDocIdToGroup, docIdBuf, docIdStream, termIterator, remappings, currentTerm, errorOnCollisions ? placeholderGroup : -1);
                            // Reset the remapping entries to placeholderGroup
                            for (int ix2 = termStartIndex; ix2 < ix; ix2++) {
                                final int targetGroup = rules[sortedRuleIndices[ix2]].targetGroup;
                                remappings[targetGroup] = placeholderGroup;
                            }
                            if (ix != fieldEndIndex) {
                                // Start the next run of equal terms.
                                termStartIndex = ix;
                                currentTerm = sortedConditions[ix].stringTerm;
                                final int targetGroup = rules[sortedRuleIndices[ix]].targetGroup;
                                remappings[targetGroup] = sortedInternalIndices[ix];
                            }
                        }
                    } finally {
                        termIterator.close();
                    }
                }
            }
        } finally {
            // Release the stream even when a helper rejects the regroup with an exception.
            docIdStream.close();
        }

        if (conditionIndex != numConditions) {
            // Identify next clump
            fieldStartIndex = conditionIndex;
            field = sortedConditions[conditionIndex].field;
            intType = sortedConditions[conditionIndex].intType;
            inequality = sortedConditions[conditionIndex].inequality;
        }
    }
}
/**
 * Walks every term of {@code termIterator} and, for each document with that term whose current
 * group has barriers, finds the first barrier that is not less than the term and lowers the
 * document's entry in {@code newDocIdToGroup} to that barrier's condition index if smaller.
 * Documents whose term exceeds every barrier of their group are left untouched.
 *
 * @param docIdToGroup    current group of each document
 * @param newDocIdToGroup per-document matched condition index; updated in place
 * @param docIdBuf        scratch buffer used to read doc ids
 * @param docIdStream     doc id stream, reset here onto each term
 * @param barrierLengths  effective number of barriers per group
 * @param barriers        ascending barrier terms per group; {@code null} for groups without barriers
 * @param resultingIndex  condition index for each barrier, per group
 * @param termIterator    string term iterator for the field being processed
 * @throws ImhotepOutOfMemoryException declared for callers; not thrown by the code in this method
 */
private static void performStringMultiInequalityRegroup(GroupLookup docIdToGroup, GroupLookup newDocIdToGroup, int[] docIdBuf, DocIdStream docIdStream, int[] barrierLengths, String[][] barriers, int[][] resultingIndex, StringTermIterator termIterator) throws ImhotepOutOfMemoryException {
    while (termIterator.next()) {
        final String term = termIterator.term();
        docIdStream.reset(termIterator);
        // Memory claimed in regroup(GroupMultiRemapRule[]) (maxBarrierIndexBytes)
        final int[] cursors = new int[barriers.length];
        int filled;
        do {
            filled = docIdStream.fillDocIdBuffer(docIdBuf);
            for (int pos = 0; pos < filled; pos++) {
                final int doc = docIdBuf[pos];
                final int group = docIdToGroup.get(doc);
                final String[] groupBarriers = barriers[group];
                if (groupBarriers == null) {
                    continue;
                }
                // Advance this group's cursor past every barrier smaller than the term.
                final int limit = barrierLengths[group];
                int cursor = cursors[group];
                while (cursor < limit && term.compareTo(groupBarriers[cursor]) > 0) {
                    cursor++;
                }
                cursors[group] = cursor;
                if (cursor != limit) {
                    final int conditionIx = resultingIndex[group][cursor];
                    newDocIdToGroup.set(doc, Math.min(newDocIdToGroup.get(doc), conditionIx));
                }
            }
            // A buffer that was not completely filled marks the end of this term's documents.
        } while (filled >= docIdBuf.length);
    }
}
/**
 * Walks every term of {@code termIterator} and, for each document with that term whose current
 * group has barriers, finds the first barrier that is not less than the term and lowers the
 * document's entry in {@code newDocIdToGroup} to that barrier's condition index if smaller.
 * Documents whose term exceeds every barrier of their group are left untouched.
 *
 * @param docIdToGroup    current group of each document
 * @param newDocIdToGroup per-document matched condition index; updated in place
 * @param docIdBuf        scratch buffer used to read doc ids
 * @param docIdStream     doc id stream, reset here onto each term
 * @param barrierLengths  effective number of barriers per group
 * @param barriers        ascending barrier terms per group; {@code null} for groups without barriers
 * @param resultingIndex  condition index for each barrier, per group
 * @param termIterator    int term iterator for the field being processed
 * @throws ImhotepOutOfMemoryException declared for callers; not thrown by the code in this method
 */
private static void performIntMultiInequalityRegroup(GroupLookup docIdToGroup, GroupLookup newDocIdToGroup, int[] docIdBuf, DocIdStream docIdStream, int[] barrierLengths, long[][] barriers, int[][] resultingIndex, IntTermIterator termIterator) throws ImhotepOutOfMemoryException {
    while (termIterator.next()) {
        final long term = termIterator.term();
        docIdStream.reset(termIterator);
        // Memory claimed in regroup(GroupMultiRemapRule[]) (maxBarrierIndexBytes)
        final int[] cursors = new int[barriers.length];
        int filled;
        do {
            filled = docIdStream.fillDocIdBuffer(docIdBuf);
            for (int pos = 0; pos < filled; pos++) {
                final int doc = docIdBuf[pos];
                final int group = docIdToGroup.get(doc);
                final long[] groupBarriers = barriers[group];
                if (groupBarriers == null) {
                    continue;
                }
                // Advance this group's cursor past every barrier smaller than the term.
                final int limit = barrierLengths[group];
                int cursor = cursors[group];
                while (cursor < limit && term > groupBarriers[cursor]) {
                    cursor++;
                }
                cursors[group] = cursor;
                if (cursor != limit) {
                    final int conditionIx = resultingIndex[group][cursor];
                    newDocIdToGroup.set(doc, Math.min(newDocIdToGroup.get(doc), conditionIx));
                }
            }
            // A buffer that was not completely filled marks the end of this term's documents.
        } while (filled >= docIdBuf.length);
    }
}
/**
 * Converts the per-document values in {@code newDocIdToGroup} into final groups and writes them
 * into {@code docIdToGroup}. A document whose current group is targeted by a rule goes to that
 * rule's negative group if its value is still {@code placeholderGroup}; otherwise the value is
 * used as an index into the rule's positive groups. Documents in untargeted groups are left alone.
 *
 * @param docIdToGroup     current group of each document; updated in place
 * @param numGroups        number of groups; together with {@code highestTarget} sizes the lookup table
 * @param rules            the remap rules that were applied
 * @param highestTarget    highest target group of any rule
 * @param newDocIdToGroup  per-document value: placeholder or index into the rule's positive groups
 * @param placeholderGroup value marking a document for which no condition was recorded
 */
static void internalMultiRegroupCleanup(GroupLookup docIdToGroup, int numGroups, GroupMultiRemapRule[] rules, int highestTarget, GroupLookup newDocIdToGroup, int placeholderGroup) {
    // Memory claimed in regroup(GroupMultiRemapRule[])
    final int tableSize = Math.max(highestTarget + 1, numGroups);
    final GroupMultiRemapRule[] ruleByTarget = new GroupMultiRemapRule[tableSize];
    for (int ruleIx = 0; ruleIx < rules.length; ruleIx++) {
        ruleByTarget[rules[ruleIx].targetGroup] = rules[ruleIx];
    }
    for (int doc = 0; doc < docIdToGroup.size(); doc++) {
        final GroupMultiRemapRule rule = ruleByTarget[docIdToGroup.get(doc)];
        if (rule != null) {
            final int matched = newDocIdToGroup.get(doc);
            final int destination = (matched == placeholderGroup)
                    ? rule.negativeGroup
                    : rule.positiveGroups[matched];
            docIdToGroup.set(doc, destination);
        }
    }
}
/**
 * Moves every document whose current group is not the target group of any rule to group 0.
 *
 * @param docIdToGroup current group of each document; updated in place
 * @param numGroups    number of groups; used as the size of the "move to zero" bit set
 * @param rules        the remap rules whose target groups must be preserved
 */
public static void moveUntargeted(GroupLookup docIdToGroup, int numGroups, GroupMultiRemapRule[] rules) {
    final FastBitSet moveToZero = new FastBitSet(numGroups);
    moveToZero.setAll();
    for (GroupMultiRemapRule rule : rules) {
        final int target = rule.targetGroup;
        // A target outside [0, numGroups) has no bit within the set's declared size, so there
        // is nothing to clear for it; skipping it avoids indexing the bit set out of range.
        if (target >= 0 && target < numGroups) {
            moveToZero.clear(target);
        }
    }
    for (int i = 0; i < docIdToGroup.size(); i++) {
        final int group = docIdToGroup.get(i);
        if (moveToZero.get(group)) {
            docIdToGroup.set(i, 0);
        }
    }
}
}