ImhotepSession.java example

Explorer
imhotep-master
/*
 * Copyright (C) 2014 Indeed Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the
 * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package com.indeed.imhotep.api;

import com.indeed.imhotep.GroupMultiRemapRule;
import com.indeed.imhotep.GroupRemapRule;
import com.indeed.imhotep.QueryRemapRule;
import com.indeed.imhotep.RegroupCondition;
import com.indeed.imhotep.TermCount;

import javax.annotation.concurrent.NotThreadSafe;

import java.io.Closeable;
import java.net.InetSocketAddress;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

@NotThreadSafe
public interface ImhotepSession extends Closeable {
    /**
     * get the sum of the docFreq of all terms in all of the given fields
     * @param intFields int fields to iterate over
     * @param stringFields string fields to iterate over
     * @return the sum of all terms' docFreq
     */
    long getTotalDocFreq(String[] intFields, String[] stringFields);

    /**
     * get the current total of a given metric for each group
     * Trailing groups with 0 values can cause the returned array to be shorter than the total number of groups.
     * @param stat the index of the metric
     * @return an array with the metric values, indexed by group
     */
    long[] getGroupStats(int stat);

    /**
     * get an iterator over all (field, term, group, stat) tuples for the given fields
     * @param intFields list of int fields
     * @param stringFields list of string fields
     * @return an iterator
     */
    FTGSIterator getFTGSIterator(String[] intFields, String[] stringFields);

    FTGSIterator getSubsetFTGSIterator(Map<String, long[]> intFields, Map<String, String[]> stringFields);

    RawFTGSIterator[] getSubsetFTGSIteratorSplits(Map<String, long[]> intFields, Map<String, String[]> stringFields);

    DocIterator getDocIterator(String[] intFields, String[] stringFields) throws ImhotepOutOfMemoryException;

    RawFTGSIterator[] getFTGSIteratorSplits(String[] intFields, String[] stringFields);

    /**
     * note: this call is weird.
     * it is intended to be called numSplits times, once with each splitIndex from [0..n)
     * if it is not called numSplits times the returned iterators will deadlock
     * if it is called with different values for numSplits it will throw an error
     * if it is called with the same value for splitIndex twice it will throw an error
     * if you are a client of imhotep this is not the call you are looking for
     *
     * @param intFields list of int fields
     * @param stringFields list of string fields
     * @param splitIndex index of the split you want
     * @param numSplits total number of splits
     * @return iterator
     */
    RawFTGSIterator getFTGSIteratorSplit(String[] intFields, String[] stringFields, int splitIndex, int numSplits);

    RawFTGSIterator getSubsetFTGSIteratorSplit(Map<String, long[]> intFields, Map<String, String[]> stringFields, int splitIndex, int numSplits);

    /**
     * this is only really here to be called on ImhotepRemoteSession by RemoteImhotepMultiSession
     */
    RawFTGSIterator mergeFTGSSplit(String[] intFields, String[] stringFields, String sessionId, InetSocketAddress[] nodes, int splitIndex);

    RawFTGSIterator mergeSubsetFTGSSplit(Map<String, long[]> intFields, Map<String, String[]> stringFields, String sessionId, InetSocketAddress[] nodes, int splitIndex);

    /**
     * apply the list of remap rules to remap documents into a different group. Preconditions:
     *
     * <ul>
     *     <li>Each rule has a different targetGroup
     *     <li>All targetGroups are positive
     *     <li>All inequality conditions have the potential to be matched -- i.e., there is not an earlier inequality
     *         condition in the same GroupMultiRemapRule that targets the same field with a greater term.
     *     <li>All equality conditions have the potential to be matched -- i.e., there is not an earlier equality
     *         condition in the same GroupMultiRemapRule that targets the same field with the same term
     * </ul>
     *
     * After the regroup operation:
     *
     * <ul>
     *     <li>If a document was matched by some rule (its original group was equal to the rule's targetGroup)
     *     then its new group is the positiveGroup of the earliest condition in the rule that matches the document,
     *     or the negativeGroup of the rule, depending on whether it matched any rules
     *     <li>Otherwise, the document's new group is 0.
     * </ul>
     *
     * @param rawRules list of remap rules
     * @return the number of groups after applying the regroup
     * @throws ImhotepOutOfMemoryException if performing this operation would cause imhotep to run out of memory
     * @throws IllegalArgumentException if there are duplicate targetGroups, non-positive targetGroups, or regroup
     *                                  conditions do not meet the above prescribed requirements
     */
    int regroup(GroupMultiRemapRule[] rawRules) throws ImhotepOutOfMemoryException;

    int regroup(int numRawRules, Iterator<GroupMultiRemapRule> rawRules) throws ImhotepOutOfMemoryException;

    int regroup(GroupMultiRemapRule[] rawRules, boolean errorOnCollisions) throws ImhotepOutOfMemoryException;

    int regroup(int numRawRules, Iterator<GroupMultiRemapRule> rawRules, boolean errorOnCollisions) throws ImhotepOutOfMemoryException;

    /**
     * apply the list of remap rules to remap documents into a different group. Preconditions:
     *
     * <ul>
     *     <li>Each rule has a different targetGroup
     *     <li>All targetGroups are non-negative
     * </ul>
     *
     * After the regroup operation:
     *
     * <ul>
     *     <li>If a document was matched by some rule (its original group was equal to the rule's targetGroup)
     *     then its new group is the rule's positiveGroup or negativeGroup, depending on whether it matched
     *     the rule's condition.
     *     <li>Otherwise, the document's new group is 0.
     * </ul>
     *
     * @param rawRules list of remap rules
     * @return the number of groups after applying the regroup
     * @throws ImhotepOutOfMemoryException if performing this operation would cause imhotep to run out of memory
     */
    int regroup(GroupRemapRule[] rawRules) throws ImhotepOutOfMemoryException;

    int regroup2(int numRawRules, Iterator<GroupRemapRule> iterator) throws ImhotepOutOfMemoryException;

    /**
     * apply this query to the dataset and regroup based on whether or not a document matches the query
     *
     * After the regroup operation:
     *
     * <ul>
     *      <li>All documents in the rule's targetGroup that were matched by the rule's query will now be in the rule's positiveGroup</li>
     *      <li>All documents in the rule's targetGroup that were not matched by the rule's query will now be in the rule's negativeGroup</li>
     *      <li>All documents not in the rule's targetGroup will remain in the same group they were in before the regroup operation</li>
     * </ul>
     *
     * @param rule the query to execute and the group parameters
     * @return the number of groups after applying the regroup
     * @throws ImhotepOutOfMemoryException if performing this operation would cause imhotep to run out of memory
     */
    int regroup(QueryRemapRule rule) throws ImhotepOutOfMemoryException;

    /**
     * a regroup for doing OR queries over int fields
     * @param field the int field
     * @param terms sorted list of terms, any doc matching any of these terms will be remapped
     * @param targetGroup group to map from
     * @param negativeGroup group into which to map docs that contain none of the terms
     * @param positiveGroup group into witch to map docs that contain any of the terms
     * @throws ImhotepOutOfMemoryException if performing this operation would cause imhotep to run out of memory
     */
    void intOrRegroup(String field, long[] terms, int targetGroup, int negativeGroup, int positiveGroup) throws ImhotepOutOfMemoryException;

    /**
     * a regroup for doing OR queries over string fields
     * @param field the string field
     * @param terms sorted list of terms, any doc matching any of these terms will be remapped
     * @param targetGroup group to map from
     * @param negativeGroup group into which to map docs that contain none of the terms
     * @param positiveGroup group into witch to map docs that contain any of the terms
     * @throws ImhotepOutOfMemoryException if performing this operation would cause imhotep to run out of memory
     */
    void stringOrRegroup(String field, String[] terms, int targetGroup, int negativeGroup, int positiveGroup) throws ImhotepOutOfMemoryException;

    /**
     * a regroup for doing regex filtering over string fields
     * @param field the string field
     * @param regex the regex to test the terms against
     * @param targetGroup group to map from
     * @param negativeGroup group into which to map docs that don't have any terms matching the regex
     * @param positiveGroup group into witch to map docs that contain terms that match the regex
     * @throws ImhotepOutOfMemoryException if performing this operation would cause imhotep to run out of memory
     */
    void regexRegroup(String field, String regex, int targetGroup, int negativeGroup, int positiveGroup) throws ImhotepOutOfMemoryException;


    /**
     * perform a random regrouping of documents based on a specific field
     * this is done by applying the salt to each term, hashing, and mapping the hash to a value between 0 and 1
     * all terms with value < p go in negativeGroup, all terms with value >= p go in positiveGroup
     * the actual grouping is only as random as the salt
     *
     * @param field the field to use
     * @param isIntField whether the field is int or string type
     * @param salt the salt to use
     * @param p the minimum value to go into positiveGroup
     * @param targetGroup the group to apply the random regroup to
     * @param negativeGroup the group where terms with values < p will go
     * @param positiveGroup the group where terms with values >= p will go
     * @throws ImhotepOutOfMemoryException if performing this operation would cause imhotep to go out of memory
     */
    void randomRegroup(String field, boolean isIntField, String salt, double p, int targetGroup, int negativeGroup, int positiveGroup) throws ImhotepOutOfMemoryException;

    /**
     * Performs a random regroup, except instead of a binary decision, partitions into groups based on a percentage map.
     *
     * The percentage map is an array of split points for the result groups. Each document whose random value is less
     * than or equal to percentages[i] will be mapped to groups[i], and all remaining documents will be mapped to
     * groups[groups.length - 1]. Therefore, it is required that the percentage map be (1) in ascending order between
     * 0.0 and 1.0, and (2) one element shorter than resultGroups.
     *
     * For example, if percentages = [0.40, 0.80] and resultGroups = [3, 4, 6], then 40% of documents currently in the
     * target group will be placed in group 3, 40% will be placed in group 4, and 20% will be placed in group 6.
     *
     * All other behavior is the same as randomRegroup(). As always, the grouping is only as random as the salt.
     *
     * @param field Field to split randomly over
     * @param isIntField true if 'field' is an integer field, false if it is a string field
     * @param salt The salt to use
     * @param targetGroup The group to apply the random regroup to
     * @param percentages The group cutoff percentages, works together with resultGroups
     * @param resultGroups The groups to regroup into, works together with percentages
     */
    void randomMultiRegroup(String field, boolean isIntField, String salt, int targetGroup, double[] percentages, int[] resultGroups) throws ImhotepOutOfMemoryException;

    int metricRegroup(int stat, long min, long max, long intervalSize) throws ImhotepOutOfMemoryException;

    int metricRegroup(int stat, long min, long max, long intervalSize, boolean noGutters) throws ImhotepOutOfMemoryException;

    int metricRegroup2D(int xStat, long xMin, long xMax, long xIntervalSize,
                        int yStat, long yMin, long yMax, long yIntervalSize) throws ImhotepOutOfMemoryException;

    int metricFilter(int stat, long min, long max, boolean negate) throws ImhotepOutOfMemoryException;

    /**
     * Return a list of the top k terms for a field, sorted by document frequency descending.
     *
     * This method will be significantly faster than pushing count() and doing FTGS iteration but the results
     * are not guaranteed to be exact.
     *
     * Additionally, this method is not guaranteed to respect values of k larger than 1000.
     * 
     * @param field the field to retrieve top terms for
     * @param isIntField whether or not the field is an int field
     * @param k the desired number of terms
     * @return approximate top terms
     */
    List<TermCount> approximateTopTerms(String field, boolean isIntField, int k);

    /**
     * push the metric specified by statName
     * @param statName the metric to push
     * @return the number of stats after pushing this metric
     * @throws ImhotepOutOfMemoryException if performing this operation would cause imhotep to run out of memory
     */
    int pushStat(String statName) throws ImhotepOutOfMemoryException;

    /**
     * push the metrics specified by statNames
     * @param statNames the metrics to push
     * @return the number of stats after pushing the last metric
     * @throws ImhotepOutOfMemoryException if performing this operation would cause imhotep to run out of memory
     */
    int pushStats(List<String> statNames) throws ImhotepOutOfMemoryException;

    /**
     * pop the most recently-added metric
     * @return the number of stats after popping the metric
     */
    int popStat();

    /**
     * @return the number of stats currently on the stack.
     */
    int getNumStats();

    /**
     * @return number of groups including zero group (maxGroup+1)
     */
    int getNumGroups();

    /**
     * create a per-document dynamic metric
     * @param name the name of the metric to create
     * @throws ImhotepOutOfMemoryException in case there's not enough memory
     */
    void createDynamicMetric(String name) throws ImhotepOutOfMemoryException;

    /**
     * add a per-group constant to each element of a dynamic metric, using saturating arithmetic
     * @param name the name of the metric to update
     * @param deltas an array of constant values to add for each group
     * @throws ImhotepOutOfMemoryException in case there's not enough memory
     */
    void updateDynamicMetric(String name, int[] deltas) throws ImhotepOutOfMemoryException;

    /**
     * Adjusts the given dynamic metric on a per-document basis where the delta for each condition that matches
     * is summed up and applied.
     * The group that a document is in is irrelevant.
     * Does not currently support inequality conditions.
     * @param name the name of the metric to update
     * @param conditions conditions to match against
     * @param deltas deltas to adjust document by if the corresponding condition matches
     */
    void conditionalUpdateDynamicMetric(String name, RegroupCondition[] conditions, int[] deltas);

    void groupConditionalUpdateDynamicMetric(String name, int[] groups, RegroupCondition[] conditions, int[] deltas);

    /**
     * close the session and free up any associated resources
     */
    void close();

    /**
     * reset groups to their original state (all documents in group 1)
     */
    void resetGroups() throws ImhotepOutOfMemoryException;

    /**
     * Rebuilds the Indexes and removes all docs in group 0. May make 
     * future FTGS passes more efficent.
     * @throws ImhotepOutOfMemoryException 
     */
    void rebuildAndFilterIndexes(List<String> intFields, List<String> stringFields) throws ImhotepOutOfMemoryException;
}