/*
* Copyright (C) 2014 Indeed Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.indeed.imhotep.api;
import com.indeed.imhotep.GroupMultiRemapRule;
import com.indeed.imhotep.GroupRemapRule;
import com.indeed.imhotep.QueryRemapRule;
import com.indeed.imhotep.RegroupCondition;
import com.indeed.imhotep.TermCount;
import javax.annotation.concurrent.NotThreadSafe;
import java.io.Closeable;
import java.net.InetSocketAddress;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
/**
 * Operations available on an imhotep session: reading per-group stats and FTGS iterators,
 * regrouping documents, and managing the stat stack and dynamic metrics.
 *
 * The interface is annotated {@link NotThreadSafe}; implementations are not expected to be safe
 * for concurrent use from multiple threads.
 */
@NotThreadSafe
public interface ImhotepSession extends Closeable {
    /**
     * get the sum of the docFreq of all terms in all of the given fields
     * @param intFields int fields to iterate over
     * @param stringFields string fields to iterate over
     * @return the sum of all terms' docFreq
     */
    long getTotalDocFreq(String[] intFields, String[] stringFields);

    /**
     * get the current total of a given metric for each group
     * Trailing groups with 0 values can cause the returned array to be shorter than the total number of groups.
     * @param stat the index of the metric
     * @return an array with the metric values, indexed by group
     */
    long[] getGroupStats(int stat);

    /**
     * get an iterator over all (field, term, group, stat) tuples for the given fields
     * @param intFields list of int fields
     * @param stringFields list of string fields
     * @return an iterator
     */
    FTGSIterator getFTGSIterator(String[] intFields, String[] stringFields);

    /**
     * NOTE(review): not documented in the original source. Presumably the same as
     * {@link #getFTGSIterator(String[], String[])} but restricted to a subset of terms per field
     * (map key = field name, map value = terms of interest) -- confirm against an implementation.
     *
     * @param intFields map keyed by int field name (values presumably the terms to include -- confirm)
     * @param stringFields map keyed by string field name (values presumably the terms to include -- confirm)
     * @return an iterator
     */
    FTGSIterator getSubsetFTGSIterator(Map<String, long[]> intFields, Map<String, String[]> stringFields);

    /**
     * NOTE(review): not documented in the original source. Presumably the subset counterpart of
     * {@link #getFTGSIteratorSplits(String[], String[])} -- confirm against an implementation.
     *
     * @param intFields map keyed by int field name
     * @param stringFields map keyed by string field name
     * @return an array of raw iterators
     */
    RawFTGSIterator[] getSubsetFTGSIteratorSplits(Map<String, long[]> intFields, Map<String, String[]> stringFields);

    /**
     * NOTE(review): not documented in the original source; the exact iteration semantics should be
     * confirmed against an implementation.
     *
     * @param intFields list of int fields
     * @param stringFields list of string fields
     * @return a document iterator
     * @throws ImhotepOutOfMemoryException presumably if performing this operation would cause imhotep to run out of memory -- confirm
     */
    DocIterator getDocIterator(String[] intFields, String[] stringFields) throws ImhotepOutOfMemoryException;

    /**
     * NOTE(review): not documented in the original source. Presumably returns every split of the FTGS
     * iteration at once, as raw iterators -- confirm against an implementation.
     *
     * @param intFields list of int fields
     * @param stringFields list of string fields
     * @return an array of raw iterators
     */
    RawFTGSIterator[] getFTGSIteratorSplits(String[] intFields, String[] stringFields);

    /**
     * note: this call is weird.
     * it is intended to be called numSplits times, once with each splitIndex from [0..numSplits)
     * if it is not called numSplits times the returned iterators will deadlock
     * if it is called with different values for numSplits it will throw an error
     * if it is called with the same value for splitIndex twice it will throw an error
     * if you are a client of imhotep this is not the call you are looking for
     *
     * @param intFields list of int fields
     * @param stringFields list of string fields
     * @param splitIndex index of the split you want
     * @param numSplits total number of splits
     * @return iterator
     */
    RawFTGSIterator getFTGSIteratorSplit(String[] intFields, String[] stringFields, int splitIndex, int numSplits);

    /**
     * NOTE(review): not documented in the original source. Presumably the subset counterpart of
     * {@link #getFTGSIteratorSplit(String[], String[], int, int)}, with the same calling protocol
     * (one call per splitIndex) -- confirm against an implementation.
     *
     * @param intFields map keyed by int field name
     * @param stringFields map keyed by string field name
     * @param splitIndex index of the split you want
     * @param numSplits total number of splits
     * @return iterator
     */
    RawFTGSIterator getSubsetFTGSIteratorSplit(Map<String, long[]> intFields, Map<String, String[]> stringFields, int splitIndex, int numSplits);

    /**
     * this is only really here to be called on ImhotepRemoteSession by RemoteImhotepMultiSession
     */
    RawFTGSIterator mergeFTGSSplit(String[] intFields, String[] stringFields, String sessionId, InetSocketAddress[] nodes, int splitIndex);

    /**
     * NOTE(review): not documented in the original source. Presumably the subset counterpart of
     * {@link #mergeFTGSSplit(String[], String[], String, InetSocketAddress[], int)} and likewise not
     * meant for general client use -- confirm.
     */
    RawFTGSIterator mergeSubsetFTGSSplit(Map<String, long[]> intFields, Map<String, String[]> stringFields, String sessionId, InetSocketAddress[] nodes, int splitIndex);

    /**
     * apply the list of remap rules to remap documents into a different group. Preconditions:
     *
     * <ul>
     * <li>Each rule has a different targetGroup</li>
     * <li>All targetGroups are positive</li>
     * <li>All inequality conditions have the potential to be matched -- i.e., there is not an earlier inequality
     * condition in the same GroupMultiRemapRule that targets the same field with a greater term.</li>
     * <li>All equality conditions have the potential to be matched -- i.e., there is not an earlier equality
     * condition in the same GroupMultiRemapRule that targets the same field with the same term</li>
     * </ul>
     *
     * After the regroup operation:
     *
     * <ul>
     * <li>If a document was matched by some rule (its original group was equal to the rule's targetGroup)
     * then its new group is the positiveGroup of the earliest condition in the rule that matches the document,
     * or the negativeGroup of the rule, depending on whether it matched any of the rule's conditions</li>
     * <li>Otherwise, the document's new group is 0.</li>
     * </ul>
     *
     * @param rawRules list of remap rules
     * @return the number of groups after applying the regroup
     * @throws ImhotepOutOfMemoryException if performing this operation would cause imhotep to run out of memory
     * @throws IllegalArgumentException if there are duplicate targetGroups, non-positive targetGroups, or regroup
     * conditions do not meet the above prescribed requirements
     */
    int regroup(GroupMultiRemapRule[] rawRules) throws ImhotepOutOfMemoryException;

    /**
     * NOTE(review): not documented in the original source. Presumably the same operation as
     * {@link #regroup(GroupMultiRemapRule[])} with the rules supplied through an iterator -- confirm.
     *
     * @param numRawRules presumably the number of rules the iterator will produce -- confirm
     * @param rawRules iterator over the remap rules
     * @return presumably the number of groups after applying the regroup -- confirm
     * @throws ImhotepOutOfMemoryException presumably if performing this operation would cause imhotep to run out of memory -- confirm
     */
    int regroup(int numRawRules, Iterator<GroupMultiRemapRule> rawRules) throws ImhotepOutOfMemoryException;

    /**
     * NOTE(review): not documented in the original source. Presumably the same operation as
     * {@link #regroup(GroupMultiRemapRule[])}; the exact meaning of errorOnCollisions is not stated
     * here and should be confirmed against an implementation.
     *
     * @param rawRules list of remap rules
     * @param errorOnCollisions TODO(review): document what counts as a collision and what happens when false
     * @return presumably the number of groups after applying the regroup -- confirm
     * @throws ImhotepOutOfMemoryException presumably if performing this operation would cause imhotep to run out of memory -- confirm
     */
    int regroup(GroupMultiRemapRule[] rawRules, boolean errorOnCollisions) throws ImhotepOutOfMemoryException;

    /**
     * NOTE(review): not documented in the original source. Presumably the iterator-based form of
     * {@link #regroup(GroupMultiRemapRule[], boolean)} -- confirm.
     *
     * @param numRawRules presumably the number of rules the iterator will produce -- confirm
     * @param rawRules iterator over the remap rules
     * @param errorOnCollisions TODO(review): document what counts as a collision and what happens when false
     * @return presumably the number of groups after applying the regroup -- confirm
     * @throws ImhotepOutOfMemoryException presumably if performing this operation would cause imhotep to run out of memory -- confirm
     */
    int regroup(int numRawRules, Iterator<GroupMultiRemapRule> rawRules, boolean errorOnCollisions) throws ImhotepOutOfMemoryException;

    /**
     * apply the list of remap rules to remap documents into a different group. Preconditions:
     *
     * <ul>
     * <li>Each rule has a different targetGroup</li>
     * <li>All targetGroups are non-negative</li>
     * </ul>
     *
     * After the regroup operation:
     *
     * <ul>
     * <li>If a document was matched by some rule (its original group was equal to the rule's targetGroup)
     * then its new group is the rule's positiveGroup or negativeGroup, depending on whether it matched
     * the rule's condition.</li>
     * <li>Otherwise, the document's new group is 0.</li>
     * </ul>
     *
     * @param rawRules list of remap rules
     * @return the number of groups after applying the regroup
     * @throws ImhotepOutOfMemoryException if performing this operation would cause imhotep to run out of memory
     */
    int regroup(GroupRemapRule[] rawRules) throws ImhotepOutOfMemoryException;

    /**
     * NOTE(review): not documented in the original source. Presumably the iterator-based form of
     * {@link #regroup(GroupRemapRule[])} -- confirm against an implementation.
     *
     * @param numRawRules presumably the number of rules the iterator will produce -- confirm
     * @param iterator iterator over the remap rules
     * @return presumably the number of groups after applying the regroup -- confirm
     * @throws ImhotepOutOfMemoryException presumably if performing this operation would cause imhotep to run out of memory -- confirm
     */
    int regroup2(int numRawRules, Iterator<GroupRemapRule> iterator) throws ImhotepOutOfMemoryException;

    /**
     * apply this query to the dataset and regroup based on whether or not a document matches the query
     *
     * After the regroup operation:
     *
     * <ul>
     * <li>All documents in the rule's targetGroup that were matched by the rule's query will now be in the rule's positiveGroup</li>
     * <li>All documents in the rule's targetGroup that were not matched by the rule's query will now be in the rule's negativeGroup</li>
     * <li>All documents not in the rule's targetGroup will remain in the same group they were in before the regroup operation</li>
     * </ul>
     *
     * @param rule the query to execute and the group parameters
     * @return the number of groups after applying the regroup
     * @throws ImhotepOutOfMemoryException if performing this operation would cause imhotep to run out of memory
     */
    int regroup(QueryRemapRule rule) throws ImhotepOutOfMemoryException;

    /**
     * a regroup for doing OR queries over int fields
     * @param field the int field
     * @param terms sorted list of terms, any doc matching any of these terms will be remapped
     * @param targetGroup group to map from
     * @param negativeGroup group into which to map docs that contain none of the terms
     * @param positiveGroup group into which to map docs that contain any of the terms
     * @throws ImhotepOutOfMemoryException if performing this operation would cause imhotep to run out of memory
     */
    void intOrRegroup(String field, long[] terms, int targetGroup, int negativeGroup, int positiveGroup) throws ImhotepOutOfMemoryException;

    /**
     * a regroup for doing OR queries over string fields
     * @param field the string field
     * @param terms sorted list of terms, any doc matching any of these terms will be remapped
     * @param targetGroup group to map from
     * @param negativeGroup group into which to map docs that contain none of the terms
     * @param positiveGroup group into which to map docs that contain any of the terms
     * @throws ImhotepOutOfMemoryException if performing this operation would cause imhotep to run out of memory
     */
    void stringOrRegroup(String field, String[] terms, int targetGroup, int negativeGroup, int positiveGroup) throws ImhotepOutOfMemoryException;

    /**
     * a regroup for doing regex filtering over string fields
     * @param field the string field
     * @param regex the regex to test the terms against
     * @param targetGroup group to map from
     * @param negativeGroup group into which to map docs that don't have any terms matching the regex
     * @param positiveGroup group into which to map docs that contain terms that match the regex
     * @throws ImhotepOutOfMemoryException if performing this operation would cause imhotep to run out of memory
     */
    void regexRegroup(String field, String regex, int targetGroup, int negativeGroup, int positiveGroup) throws ImhotepOutOfMemoryException;

    /**
     * perform a random regrouping of documents based on a specific field
     * this is done by applying the salt to each term, hashing, and mapping the hash to a value between 0 and 1
     * all terms with {@code value < p} go in negativeGroup, all terms with {@code value >= p} go in positiveGroup
     * the actual grouping is only as random as the salt
     *
     * @param field the field to use
     * @param isIntField whether the field is int or string type
     * @param salt the salt to use
     * @param p the minimum value to go into positiveGroup
     * @param targetGroup the group to apply the random regroup to
     * @param negativeGroup the group where terms with {@code value < p} will go
     * @param positiveGroup the group where terms with {@code value >= p} will go
     * @throws ImhotepOutOfMemoryException if performing this operation would cause imhotep to run out of memory
     */
    void randomRegroup(String field, boolean isIntField, String salt, double p, int targetGroup, int negativeGroup, int positiveGroup) throws ImhotepOutOfMemoryException;

    /**
     * Performs a random regroup, except instead of a binary decision, partitions into groups based on a percentage map.
     *
     * The percentage map is an array of split points for the result groups. Each document whose random value is less
     * than or equal to percentages[i] will be mapped to resultGroups[i], and all remaining documents will be mapped to
     * resultGroups[resultGroups.length - 1]. Therefore, it is required that the percentage map be (1) in ascending
     * order between 0.0 and 1.0, and (2) one element shorter than resultGroups.
     *
     * For example, if percentages = [0.40, 0.80] and resultGroups = [3, 4, 6], then 40% of documents currently in the
     * target group will be placed in group 3, 40% will be placed in group 4, and 20% will be placed in group 6.
     *
     * All other behavior is the same as randomRegroup(). As always, the grouping is only as random as the salt.
     *
     * @param field Field to split randomly over
     * @param isIntField true if 'field' is an integer field, false if it is a string field
     * @param salt The salt to use
     * @param targetGroup The group to apply the random regroup to
     * @param percentages The group cutoff percentages, works together with resultGroups
     * @param resultGroups The groups to regroup into, works together with percentages
     * @throws ImhotepOutOfMemoryException if performing this operation would cause imhotep to run out of memory
     */
    void randomMultiRegroup(String field, boolean isIntField, String salt, int targetGroup, double[] percentages, int[] resultGroups) throws ImhotepOutOfMemoryException;

    /**
     * NOTE(review): not documented in the original source. Presumably regroups documents into buckets
     * of width intervalSize according to the value of the given metric between min and max -- confirm
     * the bucket numbering and the handling of out-of-range values against an implementation.
     *
     * @param stat the index of the metric
     * @param min presumably the lower bound of the bucketed range -- confirm inclusivity
     * @param max presumably the upper bound of the bucketed range -- confirm inclusivity
     * @param intervalSize presumably the width of each bucket -- confirm
     * @return presumably the number of groups after applying the regroup -- confirm
     * @throws ImhotepOutOfMemoryException presumably if performing this operation would cause imhotep to run out of memory -- confirm
     */
    int metricRegroup(int stat, long min, long max, long intervalSize) throws ImhotepOutOfMemoryException;

    /**
     * NOTE(review): not documented in the original source. Presumably the same as
     * {@link #metricRegroup(int, long, long, long)} with control over "gutter" groups; the exact
     * meaning of noGutters should be confirmed against an implementation.
     *
     * @param stat the index of the metric
     * @param min presumably the lower bound of the bucketed range -- confirm inclusivity
     * @param max presumably the upper bound of the bucketed range -- confirm inclusivity
     * @param intervalSize presumably the width of each bucket -- confirm
     * @param noGutters TODO(review): document what a gutter is and what happens to out-of-range documents
     * @return presumably the number of groups after applying the regroup -- confirm
     * @throws ImhotepOutOfMemoryException presumably if performing this operation would cause imhotep to run out of memory -- confirm
     */
    int metricRegroup(int stat, long min, long max, long intervalSize, boolean noGutters) throws ImhotepOutOfMemoryException;

    /**
     * NOTE(review): not documented in the original source. Presumably a two-dimensional form of
     * {@link #metricRegroup(int, long, long, long)} that buckets by two metrics at once -- confirm the
     * resulting group layout against an implementation.
     *
     * @param xStat the index of the first metric
     * @param xMin presumably the lower bound for the first metric -- confirm
     * @param xMax presumably the upper bound for the first metric -- confirm
     * @param xIntervalSize presumably the bucket width for the first metric -- confirm
     * @param yStat the index of the second metric
     * @param yMin presumably the lower bound for the second metric -- confirm
     * @param yMax presumably the upper bound for the second metric -- confirm
     * @param yIntervalSize presumably the bucket width for the second metric -- confirm
     * @return presumably the number of groups after applying the regroup -- confirm
     * @throws ImhotepOutOfMemoryException presumably if performing this operation would cause imhotep to run out of memory -- confirm
     */
    int metricRegroup2D(int xStat, long xMin, long xMax, long xIntervalSize,
                        int yStat, long yMin, long yMax, long yIntervalSize) throws ImhotepOutOfMemoryException;

    /**
     * NOTE(review): not documented in the original source. Presumably filters documents by whether the
     * given metric falls between min and max, inverted when negate is true -- confirm which group the
     * filtered-out documents end up in and whether the bounds are inclusive.
     *
     * @param stat the index of the metric
     * @param min presumably the lower bound of the accepted range -- confirm
     * @param max presumably the upper bound of the accepted range -- confirm
     * @param negate presumably inverts the filter -- confirm
     * @return presumably the number of groups after applying the filter -- confirm
     * @throws ImhotepOutOfMemoryException presumably if performing this operation would cause imhotep to run out of memory -- confirm
     */
    int metricFilter(int stat, long min, long max, boolean negate) throws ImhotepOutOfMemoryException;

    /**
     * Return a list of the top k terms for a field, sorted by document frequency descending.
     *
     * This method will be significantly faster than pushing count() and doing FTGS iteration but the results
     * are not guaranteed to be exact.
     *
     * Additionally, this method is not guaranteed to respect values of k larger than 1000.
     *
     * @param field the field to retrieve top terms for
     * @param isIntField whether or not the field is an int field
     * @param k the desired number of terms
     * @return approximate top terms
     */
    List<TermCount> approximateTopTerms(String field, boolean isIntField, int k);

    /**
     * push the metric specified by statName
     * @param statName the metric to push
     * @return the number of stats after pushing this metric
     * @throws ImhotepOutOfMemoryException if performing this operation would cause imhotep to run out of memory
     */
    int pushStat(String statName) throws ImhotepOutOfMemoryException;

    /**
     * push the metrics specified by statNames
     * @param statNames the metrics to push
     * @return the number of stats after pushing the last metric
     * @throws ImhotepOutOfMemoryException if performing this operation would cause imhotep to run out of memory
     */
    int pushStats(List<String> statNames) throws ImhotepOutOfMemoryException;

    /**
     * pop the most recently-added metric
     * @return the number of stats after popping the metric
     */
    int popStat();

    /**
     * @return the number of stats currently on the stack.
     */
    int getNumStats();

    /**
     * @return number of groups including zero group (maxGroup+1)
     */
    int getNumGroups();

    /**
     * create a per-document dynamic metric
     * @param name the name of the metric to create
     * @throws ImhotepOutOfMemoryException in case there's not enough memory
     */
    void createDynamicMetric(String name) throws ImhotepOutOfMemoryException;

    /**
     * add a per-group constant to each element of a dynamic metric, using saturating arithmetic
     * @param name the name of the metric to update
     * @param deltas an array of constant values to add for each group
     * @throws ImhotepOutOfMemoryException in case there's not enough memory
     */
    void updateDynamicMetric(String name, int[] deltas) throws ImhotepOutOfMemoryException;

    /**
     * Adjusts the given dynamic metric on a per-document basis where the delta for each condition that matches
     * is summed up and applied.
     * The group that a document is in is irrelevant.
     * Does not currently support inequality conditions.
     * @param name the name of the metric to update
     * @param conditions conditions to match against
     * @param deltas deltas to adjust document by if the corresponding condition matches
     */
    void conditionalUpdateDynamicMetric(String name, RegroupCondition[] conditions, int[] deltas);

    /**
     * NOTE(review): not documented in the original source. Presumably like
     * {@link #conditionalUpdateDynamicMetric(String, RegroupCondition[], int[])} but with each
     * condition additionally tied to a group via the parallel groups array -- confirm.
     *
     * @param name the name of the metric to update
     * @param groups presumably the group each corresponding condition applies to -- confirm
     * @param conditions conditions to match against
     * @param deltas deltas to adjust document by if the corresponding condition matches
     */
    void groupConditionalUpdateDynamicMetric(String name, int[] groups, RegroupCondition[] conditions, int[] deltas);

    /**
     * close the session and free up any associated resources
     *
     * Re-declares {@link Closeable#close()} without a throws clause, so callers holding an
     * ImhotepSession reference do not have to handle IOException.
     */
    @Override
    void close();

    /**
     * reset groups to their original state (all documents in group 1)
     * @throws ImhotepOutOfMemoryException if performing this operation would cause imhotep to run out of memory
     */
    void resetGroups() throws ImhotepOutOfMemoryException;

    /**
     * Rebuilds the Indexes and removes all docs in group 0. May make
     * future FTGS passes more efficient.
     * @param intFields int fields (presumably the ones to keep in the rebuilt indexes -- confirm)
     * @param stringFields string fields (presumably the ones to keep in the rebuilt indexes -- confirm)
     * @throws ImhotepOutOfMemoryException if performing this operation would cause imhotep to run out of memory
     */
    void rebuildAndFilterIndexes(List<String> intFields, List<String> stringFields) throws ImhotepOutOfMemoryException;
}