/*
* Copyright (C) 2014 Indeed Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.indeed.imhotep.local;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableListMultimap;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimaps;
import com.google.common.primitives.Ints;
import com.google.common.primitives.Longs;
import com.google.protobuf.InvalidProtocolBufferException;
import com.indeed.flamdex.api.DocIdStream;
import com.indeed.flamdex.api.FlamdexOutOfMemoryException;
import com.indeed.flamdex.api.FlamdexReader;
import com.indeed.flamdex.api.IntTermIterator;
import com.indeed.flamdex.api.IntValueLookup;
import com.indeed.flamdex.api.RawFlamdexReader;
import com.indeed.flamdex.api.StringTermDocIterator;
import com.indeed.flamdex.api.StringTermIterator;
import com.indeed.flamdex.api.StringValueLookup;
import com.indeed.flamdex.datastruct.FastBitSet;
import com.indeed.flamdex.datastruct.FastBitSetPooler;
import com.indeed.flamdex.fieldcache.ByteArrayIntValueLookup;
import com.indeed.flamdex.fieldcache.IntArrayIntValueLookup;
import com.indeed.flamdex.query.Query;
import com.indeed.flamdex.query.Term;
import com.indeed.flamdex.reader.FlamdexMetadata;
import com.indeed.flamdex.search.FlamdexSearcher;
import com.indeed.flamdex.simple.SimpleFlamdexReader;
import com.indeed.flamdex.simple.SimpleFlamdexWriter;
import com.indeed.flamdex.utils.FlamdexUtils;
import com.indeed.imhotep.AbstractImhotepSession;
import com.indeed.imhotep.FTGSSplitter;
import com.indeed.imhotep.GroupMultiRemapRule;
import com.indeed.imhotep.GroupRemapRule;
import com.indeed.imhotep.ImhotepMemoryPool;
import com.indeed.imhotep.MemoryReservationContext;
import com.indeed.imhotep.MemoryReserver;
import com.indeed.imhotep.QueryRemapRule;
import com.indeed.imhotep.RegroupCondition;
import com.indeed.imhotep.TermCount;
import com.indeed.imhotep.api.DocIterator;
import com.indeed.imhotep.api.FTGSIterator;
import com.indeed.imhotep.api.ImhotepOutOfMemoryException;
import com.indeed.imhotep.api.RawFTGSIterator;
import com.indeed.imhotep.group.ImhotepChooser;
import com.indeed.imhotep.marshal.ImhotepDaemonMarshaller;
import com.indeed.imhotep.metrics.AbsoluteValue;
import com.indeed.imhotep.metrics.Addition;
import com.indeed.imhotep.metrics.CachedInterleavedMetrics;
import com.indeed.imhotep.metrics.CachedMetric;
import com.indeed.imhotep.metrics.Constant;
import com.indeed.imhotep.metrics.Count;
import com.indeed.imhotep.metrics.DelegatingMetric;
import com.indeed.imhotep.metrics.Division;
import com.indeed.imhotep.metrics.Equal;
import com.indeed.imhotep.metrics.Exponential;
import com.indeed.imhotep.metrics.GreaterThan;
import com.indeed.imhotep.metrics.GreaterThanOrEqual;
import com.indeed.imhotep.metrics.LessThan;
import com.indeed.imhotep.metrics.LessThanOrEqual;
import com.indeed.imhotep.metrics.Log;
import com.indeed.imhotep.metrics.Log1pExp;
import com.indeed.imhotep.metrics.Logistic;
import com.indeed.imhotep.metrics.Max;
import com.indeed.imhotep.metrics.Min;
import com.indeed.imhotep.metrics.Modulus;
import com.indeed.imhotep.metrics.Multiplication;
import com.indeed.imhotep.metrics.MultiplyAndShiftRight;
import com.indeed.imhotep.metrics.NotEqual;
import com.indeed.imhotep.metrics.ShiftLeftAndDivide;
import com.indeed.imhotep.metrics.Subtraction;
import com.indeed.imhotep.protobuf.QueryMessage;
import com.indeed.imhotep.service.CachedFlamdexReader;
import com.indeed.imhotep.service.RawCachedFlamdexReader;
import com.indeed.util.core.Pair;
import com.indeed.util.core.Throwables2;
import com.indeed.util.core.io.Closeables2;
import com.indeed.util.core.reference.SharedReference;
import com.indeed.util.core.threads.ThreadSafeBitSet;
import dk.brics.automaton.Automaton;
import dk.brics.automaton.RegExp;
import it.unimi.dsi.fastutil.PriorityQueue;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.longs.Long2ObjectMap;
import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectHeapPriorityQueue;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;
import javax.annotation.Nonnull;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* This class isn't even close to remotely thread safe, do not use it
* simultaneously from multiple threads
*/
public final class ImhotepLocalSession extends AbstractImhotepSession {
static final Logger log = Logger.getLogger(ImhotepLocalSession.class);
// When true, timing information is logged; controlled by the system property below.
static final boolean logTiming;
static {
    logTiming =
        "true".equals(System.getProperty("com.indeed.imhotep.local.ImhotepLocalSession.logTiming"));
}
// Hard cap on the number of simultaneously-pushed stats; sizes the arrays below.
private static final int MAX_NUMBER_STATS = 64;
static final int BUFFER_SIZE = 2048;
// Remaining budget for temp files written by FTGS splitters; may be null (no accounting).
private final AtomicLong tempFileSizeBytesLeft;
// Number of docs in the currently-active reader; changes after an optimize/reset.
private int numDocs;
// buffers that will be reused to avoid excessive allocations
final int[] docIdBuf = new int[BUFFER_SIZE];
final long[] valBuf = new long[BUFFER_SIZE];
final int[] docGroupBuffer = new int[BUFFER_SIZE];
// do not close flamdexReader, it is separately refcounted
private FlamdexReader flamdexReader;
private SharedReference<FlamdexReader> flamdexReaderRef;
// Pre-optimization reader/ref, kept so resetOptimizedReaders() can restore them; null
// until the first rebuildAndFilterIndexes() call.
private FlamdexReader originalReader;
private SharedReference<FlamdexReader> originalReaderRef;
// Parent directory for optimized temp shards and the optimization log.
private final String optimizedIndexesDir;
final MemoryReservationContext memory;
// Per-document group assignment; replaced wholesale by regroup operations.
GroupLookup docIdToGroup;
// groupDocCount[g] == number of docs currently in group g.
int[] groupDocCount;
int numStats;
// groupStats[stat][group] accumulators; lazily recomputed per needToReCalcGroupStats.
private long[][] groupStats = new long[MAX_NUMBER_STATS][];
final IntValueLookup[] statLookup = new IntValueLookup[MAX_NUMBER_STATS];
// History of push/pop commands so stats can be replayed after an optimize or reset.
private final List<String> statCommands;
private final boolean[] needToReCalcGroupStats = new boolean[MAX_NUMBER_STATS];
private boolean closed = false;
@VisibleForTesting
private Map<String, DynamicMetric> dynamicMetrics = Maps.newHashMap();
// (fieldName, isIntField) -> bitset of docs in group 0; null when the optimization is off.
Map<Pair<String, Boolean>, FastBitSet> fieldZeroDocBitsets;
// Captured at construction purely to aid debugging of leaked/unclosed sessions.
private final Exception constructorStackTrace;
// Append-only log of OptimizationRecords, replayed by resetOptimizedReaders().
private final File optimizationLog;
private FTGSSplitter ftgsIteratorSplits;
/**
 * Convenience constructor: unbounded memory pool, group-zero bitset optimization
 * disabled, no temp-file size accounting.
 */
public ImhotepLocalSession(final FlamdexReader flamdexReader) throws ImhotepOutOfMemoryException {
    this(flamdexReader, null,
         new MemoryReservationContext(new ImhotepMemoryPool(Long.MAX_VALUE)), false, null);
}
/**
 * Convenience constructor with an unbounded memory pool.
 *
 * @param optimizeGroupZeroLookups when true, per-field bitsets of group-0 docs are
 *        maintained so FTGS iteration can skip filtered documents
 */
public ImhotepLocalSession(FlamdexReader flamdexReader, boolean optimizeGroupZeroLookups) throws ImhotepOutOfMemoryException {
    this(flamdexReader, null,
         new MemoryReservationContext(new ImhotepMemoryPool(Long.MAX_VALUE)),
         optimizeGroupZeroLookups, null);
}
/**
 * Full constructor.
 *
 * @param flamdexReader            the shard to serve; NOT closed by this session (refcounted separately)
 * @param optimizedIndexDirectory  parent dir for optimized temp shards and the optimization log;
 *                                 may be null if rebuildAndFilterIndexes is never used
 * @param memory                   memory accounting context; all allocations are claimed against it
 * @param optimizeGroupZeroLookups enable per-field group-zero bitsets for FTGS
 * @param tempFileSizeBytesLeft    shared temp-file budget for FTGS splitters, or null
 * @throws ImhotepOutOfMemoryException if the initial buffers cannot be claimed
 */
public ImhotepLocalSession(final FlamdexReader flamdexReader,
                           String optimizedIndexDirectory,
                           final MemoryReservationContext memory,
                           boolean optimizeGroupZeroLookups,
                           AtomicLong tempFileSizeBytesLeft) throws ImhotepOutOfMemoryException {
    this.tempFileSizeBytesLeft = tempFileSizeBytesLeft;
    // kept only for debugging leaked sessions
    constructorStackTrace = new Exception();
    flamdexReaderRef = SharedReference.create(flamdexReader);
    this.flamdexReader = flamdexReader;
    this.memory = memory;
    this.numDocs = flamdexReader.getNumDocs();
    this.optimizedIndexesDir = optimizedIndexDirectory;
    // Claim for docIdBuf + valBuf + docGroupBuffer plus two array headers.
    // NOTE(review): valBuf is a long[] (8 bytes/entry) but this claims only 4 per
    // entry for each buffer — possibly undercounts by BUFFER_SIZE*4 bytes; confirm
    // against the pool's accounting convention.
    if (!memory.claimMemory(BUFFER_SIZE * (4 + 4 + 4) + 12 * 2)) {
        throw new ImhotepOutOfMemoryException();
    }
    // every document starts in group 1; group 0 means "filtered out"
    docIdToGroup = new ConstantGroupLookup(this, 1, numDocs);
    docIdToGroup.recalculateNumGroups();
    groupDocCount = clearAndResize((int[]) null, docIdToGroup.getNumGroups(), memory);
    groupDocCount[1] = numDocs;
    this.statCommands = new ArrayList<String>();
    // unique per-session log file; File(null, child) resolves to just the child path
    this.optimizationLog =
        new File(this.optimizedIndexesDir, UUID.randomUUID().toString()
                + ".optimization_log");
    if (optimizeGroupZeroLookups) {
        fieldZeroDocBitsets = Maps.newHashMap();
    } else {
        fieldZeroDocBitsets = null;
    }
}
// Package-private accessor for the live (possibly optimized) reader; callers must
// not close it — lifetime is managed via flamdexReaderRef.
FlamdexReader getReader() {
    return this.flamdexReader;
}
/** Returns the live dynamic-metric map (not a copy); mutations affect this session. */
public Map<String, DynamicMetric> getDynamicMetrics() {
    return dynamicMetrics;
}
// Number of docs in the currently-active reader (shrinks after an optimize pass).
int getNumDocs() {
    return this.numDocs;
}
/*
* record structure to store the info necessary to rebuild the
* DynamicMetrics after one or more optimizes and a reset
*/
public static class OptimizationRecord implements Serializable {
    private static final long serialVersionUID = 1L;
    // wall-clock millis at which the optimization pass started
    public long time;
    // fields that were carried into the optimized shard
    List<String> intFieldsMerged;
    List<String> stringFieldsMerged;
    // output directory of the optimized shard written by this pass
    String shardLocation;
    // per-input-shard state needed to unwind the merge on reset()
    List<ShardMergeInfo> mergedShards;
}
public static class ShardMergeInfo implements Serializable {
    private static final long serialVersionUID = 1L;
    // doc count of the pre-merge shard
    int numDocs;
    // dynamic metrics as they were before the merge
    Map<String, DynamicMetric> dynamicMetrics;
    // newDocIdToOldDocId[newId] == docId in the pre-merge shard
    int[] newDocIdToOldDocId;
}
/*
 * Finds a good place to store the new, optimized shard and opens a
 * SimpleFlamdexWriter to it.
 */
/**
 * Creates a uniquely-named temp shard directory under {@code optimizedIndexesDir}
 * and opens a {@link SimpleFlamdexWriter} positioned at it.
 *
 * @param maxDocs maximum number of documents the new shard may contain
 * @return an open writer for the new, empty shard directory
 * @throws IOException if the directory cannot be created or the writer cannot open
 */
private SimpleFlamdexWriter createNewTempWriter(int maxDocs) throws IOException {
    final String newShardName = "temp." + UUID.randomUUID().toString();
    final File tempIdxDir = new File(this.optimizedIndexesDir);
    final File newShardDir = new File(tempIdxDir, newShardName);
    // The previous code ignored mkdir()'s return value, deferring a creation
    // failure to a less obvious error inside the writer. mkdirs() also creates
    // any missing parent directories.
    if (!newShardDir.mkdirs() && !newShardDir.isDirectory()) {
        throw new IOException("unable to create temp shard directory " + newShardDir);
    }
    return new SimpleFlamdexWriter(newShardDir.getCanonicalPath(), maxDocs);
}
/* wrapper for SimpleFlamdexReader which deletes the on disk data on close() */
private static class AutoDeletingReader extends SimpleFlamdexReader {
    public AutoDeletingReader(String directory,
                              int numDocs,
                              Collection<String> intFields,
                              Collection<String> stringFields,
                              boolean useMMapMetrics) {
        super(directory, numDocs, intFields, stringFields, useMMapMetrics);
    }

    /** Opens with default config; see {@link #open(String, Config)}. */
    public static AutoDeletingReader open(String directory) throws IOException {
        return open(directory, new Config());
    }

    // Mirrors SimpleFlamdexReader.open but returns the self-deleting subclass:
    // discovers fields from term-file extensions and optionally builds B-trees.
    public static AutoDeletingReader open(String directory, Config config) throws IOException {
        final FlamdexMetadata metadata = FlamdexMetadata.readMetadata(directory);
        final Collection<String> intFields = scan(directory, ".intterms");
        final Collection<String> stringFields = scan(directory, ".strterms");
        if (config.isWriteBTreesIfNotExisting()) {
            buildIntBTrees(directory, Lists.newArrayList(intFields));
            buildStringBTrees(directory, Lists.newArrayList(stringFields));
        }
        return new AutoDeletingReader(directory, metadata.numDocs, intFields, stringFields,
                                      config.isUseMMapMetrics());
    }

    @Override
    public void close() throws IOException {
        // capture the directory before super.close() in case it clears state,
        // then remove the temp shard from disk
        File dir = new File(this.directory);
        super.close();
        FileUtils.deleteDirectory(dir);
    }
}
/* Tweak to ObjectOutputStream which allows it to append to an existing file */
public static class AppendingObjectOutputStream extends ObjectOutputStream {
    public AppendingObjectOutputStream(OutputStream out) throws IOException {
        super(out);
    }

    @Override
    protected void writeStreamHeader() throws IOException {
        // do not write a header, but reset:
        // a second stream header mid-file would break a single ObjectInputStream
        // reading the whole log; the reset marker keeps the stream well-formed
        reset();
    }
}
/**
 * Rewrites this session's shard to a new on-disk index containing only the listed
 * fields and only documents currently in a non-zero group, then swaps the session
 * over to the rewritten (smaller) index. Pushed stats are popped before the rewrite
 * and re-pushed afterwards. A record of the merge is appended to the optimization
 * log so resetOptimizedReaders() can later undo the whole chain.
 */
@Override
public synchronized void rebuildAndFilterIndexes(@Nonnull final List<String> intFields,
                                                 @Nonnull final List<String> stringFields) throws ImhotepOutOfMemoryException {
    final IndexReWriter rewriter;
    final ObjectOutputStream oos;
    final SimpleFlamdexWriter w;
    final ArrayList<String> statsCopy;
    long time = System.currentTimeMillis();
    /* pop off all the stats, they will be repushed after the optimization */
    statsCopy = new ArrayList<String>(this.statCommands);
    while (this.numStats > 0) {
        this.popStat();
    }
    this.statCommands.clear();
    // sub-context so the rewriter's claims can be hoisted into our accounting below
    MemoryReservationContext rewriterMemory = new MemoryReservationContext(memory);
    rewriter = new IndexReWriter(Arrays.asList(this), this, rewriterMemory);
    try {
        // NOTE(review): w and oos are not closed on exception paths between their
        // creation and the explicit close() calls below — a failure mid-rewrite
        // leaks a file handle; consider try-with-resources.
        w = createNewTempWriter(this.numDocs);
        rewriter.optimizeIndecies(intFields, stringFields, w);
        w.close();
        /*
         * save a record of the merge, so it can be unwound later if the
         * shards are reset
         */
        if (this.optimizationLog.exists()) {
            /* plain ObjectOutputStream does not append correctly */
            oos =
                new AppendingObjectOutputStream(new FileOutputStream(this.optimizationLog,
                                                                     true));
        } else {
            oos = new ObjectOutputStream(new FileOutputStream(this.optimizationLog, true));
        }
        OptimizationRecord record = new OptimizationRecord();
        record.time = time;
        record.intFieldsMerged = intFields;
        record.stringFieldsMerged = stringFields;
        record.shardLocation = w.getOutputDirectory();
        record.mergedShards = new ArrayList<ShardMergeInfo>();
        ShardMergeInfo info = new ShardMergeInfo();
        info.numDocs = this.flamdexReader.getNumDocs();
        info.dynamicMetrics = this.getDynamicMetrics();
        info.newDocIdToOldDocId = rewriter.getPerSessionMappings().get(0);
        record.mergedShards.add(info);
        oos.writeObject(record);
        oos.close();
        /* use rebuilt structures */
        // release the old lookup's memory, then transfer ownership of the new
        // lookup's claim from the rewriter's sub-context into ours
        memory.releaseMemory(this.docIdToGroup.memoryUsed());
        rewriterMemory.hoist(rewriter.getNewGroupLookup().memoryUsed());
        this.docIdToGroup = rewriter.getNewGroupLookup();
        for (DynamicMetric dm : this.dynamicMetrics.values()) {
            memory.releaseMemory(dm.memoryUsed());
        }
        for (DynamicMetric dm : rewriter.getDynamicMetrics().values()) {
            rewriterMemory.hoist(dm.memoryUsed());
        }
        this.dynamicMetrics = rewriter.getDynamicMetrics();
        /* release memory used by the index rewriter */
        rewriterMemory.close();
        /*
         * replace flamdexReader pointers, but keep the originals in case
         * there is a reset() call
         */
        if (this.originalReader == null) {
            this.originalReader = this.flamdexReader;
        }
        if (this.originalReaderRef == null) {
            this.originalReaderRef = this.flamdexReaderRef;
        } else {
            /* close the unnecessary optimized index */
            // a second optimize: the previous optimized reader is now dead weight
            this.flamdexReaderRef.close();
        }
        // the new reader deletes its on-disk data when finally closed
        FlamdexReader flamdex = AutoDeletingReader.open(w.getOutputDirectory());
        if (flamdex instanceof RawFlamdexReader) {
            this.flamdexReader =
                new RawCachedFlamdexReader(new MemoryReservationContext(memory),
                                           (RawFlamdexReader) flamdex, null, null, null,
                                           null);
        } else {
            this.flamdexReader =
                new CachedFlamdexReader(new MemoryReservationContext(memory), flamdex,
                                        null, null, null, null);
        }
        this.flamdexReaderRef = SharedReference.create(this.flamdexReader);
        /* alter tracking fields to reflect the removal of group 0 docs */
        this.numDocs = this.flamdexReader.getNumDocs();
        this.groupDocCount[0] = 0;
        /* push the stats back on */
        for (String stat : statsCopy) {
            if ("pop".equals(stat)) {
                this.popStat();
            } else {
                this.pushStat(stat);
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
/*
 * Resets the Flamdex readers to the original un-optimized versions and
 * constructs the DynamicMetrics to match what they should be if no
 * optimization had taken place.
 *
 * The GroupLookup does not have to be reconstructed since it will be set to
 * a constant value as a result of a reset() call
 */
private synchronized void resetOptimizedReaders() throws ImhotepOutOfMemoryException {
    ObjectInputStream ois = null;
    ArrayList<OptimizationRecord> records = new ArrayList<OptimizationRecord>();
    final long memoryUse;
    final ArrayList<String> statsCopy;
    /* check if this session has been optimized */
    if (this.originalReader == null) {
        return;
    }
    /* check for space in memory */
    // log file length is used as a proxy for the deserialized records' footprint
    memoryUse = this.optimizationLog.length();
    if (!this.memory.claimMemory(memoryUse)) {
        throw new ImhotepOutOfMemoryException();
    }
    /* pop off all the stats, they will be repushed after the flamdex reset */
    statsCopy = new ArrayList<String>(this.statCommands);
    while (this.numStats > 0) {
        this.popStat();
    }
    this.statCommands.clear();
    /* read in all the optimization records */
    try {
        ois = new ObjectInputStream(new FileInputStream(this.optimizationLog));
        // loop exits via EOFException when the log is exhausted
        while (true) {
            /*
             * adds the records so the last written record is first in the
             * list
             */
            records.add(0, (OptimizationRecord) ois.readObject());
        }
    } catch (EOFException e) {
        // read all the records
        try {
            if (ois != null) {
                ois.close();
                this.optimizationLog.delete();
            }
        } catch (IOException e1) {
            /* do nothing */
            e.printStackTrace();
        }
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        /* the log is no longer needed, so remove it */
        // NOTE(review): delete() may run twice (also in the EOF handler above) —
        // harmless but redundant; also ois is not closed on the RuntimeException
        // paths, leaking a descriptor.
        this.optimizationLog.delete();
    }
    /* reconstruct the dynamic metrics */
    // Walk the records newest-to-oldest, mapping each metric's values back
    // through newDocIdToOldDocId onto the progressively older doc-id space.
    Map<String, DynamicMetric> newMetrics;
    Map<String, DynamicMetric> oldMetrics;
    int numNewDocs;
    int numOldDocs;
    newMetrics = this.dynamicMetrics;
    numNewDocs = this.flamdexReader.getNumDocs();
    for (OptimizationRecord opRec : records) {
        int[] newToOldIdMapping = opRec.mergedShards.get(0).newDocIdToOldDocId;
        oldMetrics = opRec.mergedShards.get(0).dynamicMetrics;
        numOldDocs = opRec.mergedShards.get(0).numDocs;
        for (Map.Entry<String, DynamicMetric> e : newMetrics.entrySet()) {
            DynamicMetric oldMetric = oldMetrics.get(e.getKey());
            DynamicMetric newMetric = e.getValue();
            if (oldMetric == null) {
                // metric was created after this optimize pass; start from zeros
                oldMetric = new DynamicMetric(numOldDocs);
            }
            for (int i = 0; i < numNewDocs; i++) {
                int oldId = newToOldIdMapping[i];
                int value = newMetric.lookupSingleVal(i);
                oldMetric.set(oldId, value);
            }
            oldMetrics.put(e.getKey(), oldMetric);
        }
        numNewDocs = numOldDocs;
        newMetrics = oldMetrics;
    }
    /* adjust the memory tracking */
    for (DynamicMetric dm : this.dynamicMetrics.values()) {
        memory.releaseMemory(dm.memoryUsed());
    }
    for (DynamicMetric dm : newMetrics.values()) {
        // NOTE(review): claimMemory()'s boolean result is ignored here, unlike
        // everywhere else in this class — a failed claim goes unnoticed; confirm
        // whether this should throw ImhotepOutOfMemoryException.
        memory.claimMemory(dm.memoryUsed());
    }
    this.dynamicMetrics = newMetrics;
    try {
        /* close temp index reader */
        // AutoDeletingReader removes its on-disk data as part of close()
        this.flamdexReaderRef.close();
    } catch (IOException e) {
        log.error("Could not close optimized reader");
    }
    /* reopen the original flamdex readers */
    this.flamdexReader = this.originalReader;
    this.flamdexReaderRef = this.originalReaderRef;
    this.originalReader = null;
    this.originalReaderRef = null;
    this.numDocs = this.flamdexReader.getNumDocs();
    /* push the stats back on */
    for (String stat : statsCopy) {
        if ("pop".equals(stat)) {
            this.popStat();
        } else {
            this.pushStat(stat);
        }
    }
    /* release the memory used by the log reading */
    this.memory.releaseMemory(memoryUse);
}
/**
 * Copies the current docId -&gt; group mapping into the caller-supplied array.
 *
 * @param array destination buffer; its length must equal {@code docIdToGroup.size()}
 * @throws IllegalArgumentException if the array length does not match
 */
public synchronized void exportDocIdToGroupId(int[] array) {
    if (array.length != docIdToGroup.size()) {
        throw new IllegalArgumentException("array length is invalid");
    }
    // every slot is overwritten, so iteration order is irrelevant
    for (int docId = 0; docId < array.length; ++docId) {
        array[docId] = docIdToGroup.get(docId);
    }
}
/**
 * Sums the total document frequency of every listed int and string field,
 * as reported by the underlying flamdex reader.
 */
@Override
public synchronized long getTotalDocFreq(String[] intFields, String[] stringFields) {
    long total = 0L;
    for (final String field : intFields) {
        total += flamdexReader.getIntTotalDocFreq(field);
    }
    for (final String field : stringFields) {
        total += flamdexReader.getStringTotalDocFreq(field);
    }
    return total;
}
/**
 * Returns an FTGS (field-term-group-stats) iterator over the given fields.
 * When group-zero bitset optimization is enabled, lazily allocates one bitset
 * per field (keyed by (fieldName, isIntField)) so iteration can skip docs in
 * group 0; allocation is skipped with a warning if memory cannot be claimed.
 */
@Override
public synchronized FTGSIterator getFTGSIterator(String[] intFields, String[] stringFields) {
    if (fieldZeroDocBitsets != null) {
        for (String intField : intFields) {
            if (!fieldZeroDocBitsets.containsKey(Pair.of(intField, true))) {
                if (memory.claimMemory(FastBitSet.calculateMemoryUsage(32))) {
                    fieldZeroDocBitsets.put(Pair.of(intField, true), new FastBitSet(32));
                } else {
                    log.warn("Insufficient memory, not allocating ftgs group zero bitset for field "
                            + intField);
                }
            }
        }
        for (String stringField : stringFields) {
            // BUG FIX: the presence check previously used Pair.of(stringField, true)
            // while the bitset was stored under Pair.of(stringField, false), so the
            // check never matched and memory was re-claimed (and a fresh bitset
            // stored) on every call for string fields.
            if (!fieldZeroDocBitsets.containsKey(Pair.of(stringField, false))) {
                if (memory.claimMemory(FastBitSet.calculateMemoryUsage(32))) {
                    fieldZeroDocBitsets.put(Pair.of(stringField, false), new FastBitSet(32));
                } else {
                    log.warn("Insufficient memory, not allocating ftgs group zero bitset for field "
                            + stringField);
                }
            }
        }
    }
    // raw readers get the raw (byte-oriented) iterator implementation
    if (flamdexReader instanceof RawFlamdexReader) {
        return new RawFlamdexFTGSIterator(this, flamdexReaderRef.copy(), intFields,
                                          stringFields);
    }
    return new FlamdexFTGSIterator(this, flamdexReaderRef.copy(), intFields, stringFields);
}
/**
 * Returns an FTGS iterator restricted to the given per-field term subsets.
 * Raw readers get the raw iterator implementation.
 */
@Override
public FTGSIterator getSubsetFTGSIterator(Map<String, long[]> intFields, Map<String, String[]> stringFields) {
    final SharedReference<FlamdexReader> readerCopy = flamdexReaderRef.copy();
    if (flamdexReader instanceof RawFlamdexReader) {
        return new RawFlamdexSubsetFTGSIterator(this, readerCopy, intFields, stringFields);
    }
    return new FlamdexSubsetFTGSIterator(this, readerCopy, intFields, stringFields);
}
/**
 * Returns an iterator over all documents in a non-zero group, exposing the
 * requested int metrics and string lookups per document. Group lookups are
 * buffered 1024 docs at a time. Returns an empty iterator immediately if no
 * document is in a non-zero group.
 *
 * @throws ImhotepOutOfMemoryException if a metric/string lookup cannot be loaded
 */
public DocIterator getDocIterator(final String[] intFields, final String[] stringFields) throws ImhotepOutOfMemoryException {
    boolean shardOnlyContainsGroupZero = true;
    for (int group = 1; group < groupDocCount.length; group++) {
        if (groupDocCount[group] != 0) {
            shardOnlyContainsGroupZero = false;
            break;
        }
    }
    if (shardOnlyContainsGroupZero) {
        return emptyDocIterator();
    }
    final IntValueLookup[] intValueLookups = new IntValueLookup[intFields.length];
    final StringValueLookup[] stringValueLookups = new StringValueLookup[stringFields.length];
    try {
        for (int i = 0; i < intFields.length; i++) {
            intValueLookups[i] = flamdexReader.getMetric(intFields[i]);
        }
        for (int i = 0; i < stringFields.length; i++) {
            stringValueLookups[i] = flamdexReader.getStringLookup(stringFields[i]);
        }
    } catch (FlamdexOutOfMemoryException e) {
        // close whatever was opened before the failure, then convert the exception
        for (IntValueLookup lookup : intValueLookups) {
            if (lookup != null) {
                lookup.close();
            }
        }
        for (StringValueLookup lookup : stringValueLookups) {
            if (lookup != null) {
                lookup.close();
            }
        }
        throw new ImhotepOutOfMemoryException();
    }
    return new DocIterator() {
        // sliding window of group ids: groups[docId - bufferStart] is docId's group
        int[] groups = new int[1024];
        int n = groups.length;          // valid entries in the current window
        int bufferStart = -groups.length; // docId of groups[0]; primed so first next() refills
        int docId = -1;
        boolean done = false;

        // Advances to the next doc with a non-zero group; false once exhausted.
        public boolean next() {
            if (done) {
                return false;
            }
            while (true) {
                docId++;
                if (docId - bufferStart >= n) {
                    if (!readGroups()) {
                        return endOfData();
                    }
                }
                if (groups[docId - bufferStart] != 0) {
                    return true;
                }
            }
        }

        boolean endOfData() {
            done = true;
            return false;
        }

        // Slides the window forward by n docs; false when past the last doc.
        public boolean readGroups() {
            bufferStart += n;
            n = Math.min(numDocs - bufferStart, groups.length);
            if (n <= 0) {
                return false;
            }
            docIdToGroup.fillDocGrpBufferSequential(bufferStart, groups, n);
            return true;
        }

        public int getGroup() {
            return groups[docId - bufferStart];
        }

        // single-element scratch buffers for the batch-oriented lookup API
        int[] docIdRef = new int[1];
        long[] valueRef = new long[1];

        public long getInt(final int index) {
            docIdRef[0] = docId;
            intValueLookups[index].lookup(docIdRef, valueRef, 1);
            return valueRef[0];
        }

        public String getString(final int index) {
            return stringValueLookups[index].getString(docId);
        }

        public void close() throws IOException {
            for (IntValueLookup lookup : intValueLookups) {
                if (lookup != null) {
                    lookup.close();
                }
            }
            for (StringValueLookup lookup : stringValueLookups) {
                if (lookup != null) {
                    lookup.close();
                }
            }
        }
    };
}
/** A DocIterator over zero documents; every accessor returns a neutral value. */
private static DocIterator emptyDocIterator() {
    return new DocIterator() {
        @Override
        public boolean next() {
            // nothing to iterate
            return false;
        }

        @Override
        public int getGroup() {
            return 0;
        }

        @Override
        public long getInt(int index) {
            return 0;
        }

        @Override
        public String getString(int index) {
            return null;
        }

        @Override
        public void close() {
            // no resources held
        }
    };
}
/** Materializes all 16 FTGS splits for the given fields in split order. */
public RawFTGSIterator[] getFTGSIteratorSplits(final String[] intFields, final String[] stringFields) {
    final int numSplits = 16;
    final RawFTGSIterator[] splits = new RawFTGSIterator[numSplits];
    for (int splitIndex = 0; splitIndex < numSplits; ++splitIndex) {
        splits[splitIndex] = getFTGSIteratorSplit(intFields, stringFields, splitIndex, numSplits);
    }
    return splits;
}
/**
 * Returns split {@code splitIndex} of {@code numSplits} disjoint FTGS streams.
 * The underlying splitter is created lazily and cached until closed.
 * NOTE(review): while a cached splitter is live, the intFields/stringFields
 * arguments of subsequent calls are ignored — confirm callers always request
 * the same fields for every split of one pass.
 */
public synchronized RawFTGSIterator getFTGSIteratorSplit(final String[] intFields,
                                                         final String[] stringFields,
                                                         final int splitIndex,
                                                         final int numSplits) {
    if (ftgsIteratorSplits == null || ftgsIteratorSplits.isClosed()) {
        try {
            // 969168349: fixed hash seed passed to the splitter — presumably keeps
            // term-to-split routing consistent across sessions; confirm before changing
            ftgsIteratorSplits = new FTGSSplitter(getFTGSIterator(intFields, stringFields), numSplits, numStats, "getIteratorSplitsLocalSession", 969168349, tempFileSizeBytesLeft);
        } catch (IOException e) {
            throw Throwables.propagate(e);
        }
    }
    return ftgsIteratorSplits.getFtgsIterators()[splitIndex];
}
/** Materializes all 16 subset-FTGS splits for the given term subsets in split order. */
@Override
public RawFTGSIterator[] getSubsetFTGSIteratorSplits(Map<String, long[]> intFields, Map<String, String[]> stringFields) {
    final int numSplits = 16;
    final RawFTGSIterator[] splits = new RawFTGSIterator[numSplits];
    for (int splitIndex = 0; splitIndex < numSplits; ++splitIndex) {
        splits[splitIndex] = getSubsetFTGSIteratorSplit(intFields, stringFields, splitIndex, numSplits);
    }
    return splits;
}
/**
 * Subset-FTGS variant of {@link #getFTGSIteratorSplit}. The splitter is created
 * lazily and cached until closed; NOTE(review): while cached, the field-subset
 * arguments of subsequent calls are ignored — confirm callers pass the same
 * subsets for every split of one pass.
 */
@Override
public synchronized RawFTGSIterator getSubsetFTGSIteratorSplit(Map<String, long[]> intFields, Map<String, String[]> stringFields, int splitIndex, int numSplits) {
    if (ftgsIteratorSplits == null || ftgsIteratorSplits.isClosed()) {
        try {
            // same fixed hash seed as getFTGSIteratorSplit so routing matches
            ftgsIteratorSplits = new FTGSSplitter(getSubsetFTGSIterator(intFields, stringFields), numSplits, numStats, "getIteratorSplitsLocalSession", 969168349, tempFileSizeBytesLeft);
        } catch (IOException e) {
            throw Throwables.propagate(e);
        }
    }
    return ftgsIteratorSplits.getFtgsIterators()[splitIndex];
}
/**
 * Merging FTGS splits across remote nodes is not meaningful for a purely
 * local session.
 *
 * @throws UnsupportedOperationException always
 */
public RawFTGSIterator mergeFTGSSplit(final String[] intFields,
                                      final String[] stringFields,
                                      final String sessionId,
                                      final InetSocketAddress[] nodes,
                                      final int splitIndex) {
    throw new UnsupportedOperationException();
}
/**
 * Subset variant of {@link #mergeFTGSSplit}; likewise unsupported locally.
 *
 * @throws UnsupportedOperationException always
 */
@Override
public RawFTGSIterator mergeSubsetFTGSSplit(Map<String, long[]> intFields, Map<String, String[]> stringFields, String sessionId, InetSocketAddress[] nodes, int splitIndex) {
    throw new UnsupportedOperationException();
}
/**
 * Multi-condition regroup: each rule moves docs in its target group to new groups
 * based on per-doc condition matches. Implemented in two passes over a temporary
 * lookup seeded with a placeholder group, with explicit memory claims bracketing
 * each allocation-heavy phase.
 *
 * @param rules             remap rules; an empty array resets all docs to group 0
 * @param errorOnCollisions when true, a doc matching rules in multiple ways is an error
 * @return the resulting number of groups
 * @throws ImhotepOutOfMemoryException if any working-memory claim fails
 */
@Override
public synchronized int regroup(final GroupMultiRemapRule[] rules, boolean errorOnCollisions) throws ImhotepOutOfMemoryException {
    final int numRules = rules.length;
    if (numRules == 0) {
        resetGroupsTo(0);
        return docIdToGroup.getNumGroups();
    }
    final int numConditions = MultiRegroupInternals.countRemapConditions(rules);
    final int highestTarget;
    // working memory used transiently by validation
    final int targetGroupBytes = Math.max(numRules * 4, numConditions * 8);
    if (!memory.claimMemory(targetGroupBytes)) {
        throw new ImhotepOutOfMemoryException();
    }
    try {
        highestTarget = MultiRegroupInternals.validateTargets(rules);
        MultiRegroupInternals.validateEqualitySplits(rules);
    } finally {
        memory.releaseMemory(targetGroupBytes);
    }
    // the lookup must be able to hold both current and post-regroup group ids
    final int maxIntermediateGroup = Math.max(docIdToGroup.getNumGroups(), highestTarget);
    final int maxNewGroup = MultiRegroupInternals.findMaxGroup(rules);
    docIdToGroup =
        GroupLookupFactory.resize(docIdToGroup,
                                  Math.max(maxIntermediateGroup, maxNewGroup),
                                  memory);
    MultiRegroupInternals.moveUntargeted(docIdToGroup, maxIntermediateGroup, rules);
    // placeholder group marks "targeted but not yet remapped" docs in the temp lookup
    final int maxConditionIndex = MultiRegroupInternals.findMaxIntermediateGroup(rules);
    final int placeholderGroup = maxConditionIndex + 1;
    final int parallelArrayBytes = 3 * 4 * numConditions + 8 * numConditions;
    // int[highestTarget+1], int[highestTarget+1][], <int or
    // string>[highestTarget+1][]
    // The last two are jagged arrays, and the cardinality of the subarrays
    // sums to numConditions at most
    final int maxInequalityBytes = (highestTarget + 1) * (4 + 8 + 8) + numConditions * (4 + 8);
    final int maxBarrierIndexBytes = numConditions * 4;
    final int remappingBytes = (maxIntermediateGroup + 1) * 4;
    final int totalInternalRegroupBytes =
        parallelArrayBytes + maxInequalityBytes + maxBarrierIndexBytes + remappingBytes;
    final GroupLookup newDocIdToGroup;
    newDocIdToGroup = newGroupLookupWithPlaceholders(placeholderGroup);
    try {
        if (!memory.claimMemory(totalInternalRegroupBytes)) {
            throw new ImhotepOutOfMemoryException();
        }
        try {
            // pass 1: evaluate conditions and write new groups into the temp lookup
            MultiRegroupInternals.internalMultiRegroup(docIdToGroup,
                                                       newDocIdToGroup,
                                                       docIdBuf,
                                                       flamdexReader,
                                                       rules,
                                                       highestTarget,
                                                       numConditions,
                                                       placeholderGroup,
                                                       maxIntermediateGroup,
                                                       errorOnCollisions);
        } finally {
            memory.releaseMemory(totalInternalRegroupBytes);
        }
        final int targetGroupToRuleBytes =
            Math.max(highestTarget + 1, docIdToGroup.getNumGroups()) * 8;
        if (!memory.claimMemory(targetGroupToRuleBytes)) {
            throw new ImhotepOutOfMemoryException();
        }
        try {
            // pass 2: resolve placeholders back into docIdToGroup
            MultiRegroupInternals.internalMultiRegroupCleanup(docIdToGroup,
                                                              docIdToGroup.getNumGroups(),
                                                              rules,
                                                              highestTarget,
                                                              newDocIdToGroup,
                                                              placeholderGroup);
        } finally {
            memory.releaseMemory(targetGroupToRuleBytes);
        }
    } finally {
        memory.releaseMemory(newDocIdToGroup.memoryUsed());
    }
    finalizeRegroup();
    return docIdToGroup.getNumGroups();
}
/**
 * Builds a new GroupLookup in which every document that currently has a
 * non-zero group is assigned {@code placeholderGroup}; docs in group 0 stay 0.
 */
private synchronized GroupLookup newGroupLookupWithPlaceholders(int placeholderGroup) throws ImhotepOutOfMemoryException {
    final GroupLookup placeholderLookup =
        GroupLookupFactory.create(placeholderGroup, docIdToGroup.size(), this, memory);
    final int size = placeholderLookup.size();
    for (int docId = 0; docId < size; ++docId) {
        if (docIdToGroup.get(docId) != 0) {
            placeholderLookup.set(docId, placeholderGroup);
        }
    }
    return placeholderLookup;
}
/**
 * Single-condition regroup: moves each targeted group's docs to its positive or
 * negative group depending on whether the doc matches the rule's condition.
 *
 * @return the resulting number of groups
 * @throws ImhotepOutOfMemoryException if the working bitset cannot be claimed
 */
@Override
public synchronized int regroup(final GroupRemapRule[] rawRules) throws ImhotepOutOfMemoryException {
    // any rule targeting group 0 invalidates the cached group-zero bitsets
    for (GroupRemapRule rule : rawRules) {
        if (rule.targetGroup == 0) {
            clearZeroDocBitsets();
        }
    }
    // one bit per document, rounded up
    final int bitSetBytes = numDocs / 8 + 1;
    if (!memory.claimMemory(bitSetBytes)) {
        throw new ImhotepOutOfMemoryException();
    }
    try {
        internalRegroup(rawRules);
    } finally {
        memory.releaseMemory(bitSetBytes);
    }
    return docIdToGroup.getNumGroups();
}
/**
 * Grows docIdToGroup (if needed) to hold the highest group id any non-null
 * rule can produce.
 */
private void ensureGroupLookupCapacity(GroupRemapRule[] cleanRules) throws ImhotepOutOfMemoryException {
    int highestGroup = 0;
    for (final GroupRemapRule rule : cleanRules) {
        if (rule == null) {
            continue;
        }
        highestGroup = Math.max(highestGroup, rule.positiveGroup);
        highestGroup = Math.max(highestGroup, rule.negativeGroup);
    }
    docIdToGroup = GroupLookupFactory.resize(docIdToGroup, highestGroup, memory);
}
/**
 * Core of the single-condition regroup: applies int then string conditions via
 * term iteration (marking matched docs in docRemapped), then sweeps the rest of
 * the docs into their rules' negative groups (or 0 if their group has no rule).
 */
private void internalRegroup(GroupRemapRule[] rawRules) throws ImhotepOutOfMemoryException {
    // index rules by target group; out-of-range/duplicate targets are dropped
    final GroupRemapRule[] cleanRules = cleanUpRules(rawRules, docIdToGroup.getNumGroups());
    ensureGroupLookupCapacity(cleanRules);
    final ThreadSafeBitSet docRemapped = new ThreadSafeBitSet(numDocs);
    final DocIdStream docIdStream = flamdexReader.getDocIdStream();
    applyIntConditions(cleanRules, docIdStream, docRemapped);
    applyStringConditions(cleanRules, docIdStream, docRemapped);
    docIdStream.close();
    // pick up everything else that was missed
    for (int i = 0; i < docIdToGroup.size(); i++) {
        if (docRemapped.get(i)) {
            continue;
        }
        final int group = docIdToGroup.get(i);
        final int newGroup;
        if (cleanRules[group] != null) {
            newGroup = cleanRules[group].negativeGroup;
        } else {
            // no rule targets this group: the doc is filtered out
            newGroup = 0;
        }
        docIdToGroup.set(i, newGroup);
    }
    finalizeRegroup();
}
/**
 * Post-regroup bookkeeping shared by every regroup path: recount groups,
 * re-account FTGS-iterator memory for the group-count change, shrink the
 * lookup to its minimal representation, and refresh per-group doc counts
 * and stats.
 */
private void finalizeRegroup() throws ImhotepOutOfMemoryException {
    final int oldNumGroups = docIdToGroup.getNumGroups();
    final int newNumGroups;
    docIdToGroup.recalculateNumGroups();
    newNumGroups = docIdToGroup.getNumGroups();
    accountForFlamdexFTGSIteratorMemChange(oldNumGroups, newNumGroups);
    // resize with target 0 lets the factory pick the smallest adequate lookup
    docIdToGroup = GroupLookupFactory.resize(docIdToGroup, 0, memory);
    recalcGroupCounts(newNumGroups);
    recalcGroupStats(newNumGroups);
}
/**
 * Adjusts the memory reservation held on behalf of FlamdexFTGSIterator, which
 * keeps (12 + 8 * numStats) bytes of state per group: claim on growth, release
 * on shrinkage, no-op when the group count is unchanged.
 *
 * @throws ImhotepOutOfMemoryException if the claim for additional groups fails
 */
private void accountForFlamdexFTGSIteratorMemChange(final int oldNumGroups,
                                                    final int newNumGroups) throws ImhotepOutOfMemoryException {
    final long bytesPerGroup = 12L + 8L * numStats;
    if (newNumGroups > oldNumGroups) {
        if (!memory.claimMemory(bytesPerGroup * (newNumGroups - oldNumGroups))) {
            throw new ImhotepOutOfMemoryException();
        }
    } else if (newNumGroups < oldNumGroups) {
        memory.releaseMemory(bytesPerGroup * (oldNumGroups - newNumGroups));
    }
}
/**
 * Query-based regroup: docs in the target group move to the positive group if
 * they match the rule's query, else to the negative group. The match bitset is
 * pooled so its memory is accounted and released even on failure.
 *
 * @return the resulting number of groups
 * @throws ImhotepOutOfMemoryException if the bitset or search cannot be allocated
 */
@Override
public int regroup(QueryRemapRule rule) throws ImhotepOutOfMemoryException {
    if (rule.getTargetGroup() == 0) {
        clearZeroDocBitsets();
    }
    docIdToGroup =
        GroupLookupFactory.resize(docIdToGroup, Math.max(rule.getNegativeGroup(),
                                                         rule.getPositiveGroup()), memory);
    final FastBitSetPooler bitSetPooler = new ImhotepBitSetPooler(memory);
    final FastBitSet bitSet;
    try {
        bitSet = bitSetPooler.create(flamdexReader.getNumDocs());
    } catch (FlamdexOutOfMemoryException e) {
        throw new ImhotepOutOfMemoryException(e);
    }
    try {
        // evaluate the query into the bitset, then remap in one pass
        final FlamdexSearcher searcher = new FlamdexSearcher(flamdexReader);
        final Query query = rule.getQuery();
        searcher.search(query, bitSet, bitSetPooler);
        docIdToGroup.bitSetRegroup(bitSet,
                                   rule.getTargetGroup(),
                                   rule.getNegativeGroup(),
                                   rule.getPositiveGroup());
    } catch (FlamdexOutOfMemoryException e) {
        throw new ImhotepOutOfMemoryException(e);
    } finally {
        bitSetPooler.release(bitSet.memoryUsage());
    }
    finalizeRegroup();
    return docIdToGroup.getNumGroups();
}
/**
 * Moves docs in targetGroup whose {@code field} contains any of the given
 * int terms to positiveGroup; the remaining targetGroup docs move to
 * negativeGroup. {@code terms} must be sorted ascending (merge-joined
 * against the field's sorted term stream).
 */
@Override
public synchronized void intOrRegroup(String field,
                                      long[] terms,
                                      int targetGroup,
                                      int negativeGroup,
                                      int positiveGroup) throws ImhotepOutOfMemoryException {
    if (targetGroup == 0) {
        clearZeroDocBitsets();
    }
    docIdToGroup =
            GroupLookupFactory.resize(docIdToGroup,
                                      Math.max(negativeGroup, positiveGroup),
                                      memory);
    final FastBitSetPooler bitSetPooler = new ImhotepBitSetPooler(memory);
    final FastBitSet matched;
    try {
        matched = bitSetPooler.create(numDocs);
    } catch (FlamdexOutOfMemoryException e) {
        throw new ImhotepOutOfMemoryException(e);
    }
    try {
        try (
            final IntTermIterator iter = flamdexReader.getIntTermIterator(field);
            final DocIdStream docIdStream = flamdexReader.getDocIdStream()
        ) {
            // merge-join of sorted field terms against sorted requested terms
            int next = 0;
            while (iter.next()) {
                final long fieldTerm = iter.term();
                while (next < terms.length && terms[next] < fieldTerm) {
                    ++next;
                }
                if (next < terms.length && terms[next] == fieldTerm) {
                    docIdStream.reset(iter);
                    remapPositiveDocs(docIdStream, matched, targetGroup, positiveGroup);
                    ++next;
                }
                if (next == terms.length) {
                    break;
                }
            }
        }
        remapNegativeDocs(matched, targetGroup, negativeGroup);
    } finally {
        bitSetPooler.release(matched.memoryUsage());
    }
    finalizeRegroup();
}
/**
 * Moves docs in targetGroup whose {@code field} contains any of the given
 * string terms to positiveGroup; the remaining targetGroup docs move to
 * negativeGroup. {@code terms} must be sorted ascending (merge-joined
 * against the field's sorted term stream).
 */
@Override
public synchronized void stringOrRegroup(String field,
                                         String[] terms,
                                         int targetGroup,
                                         int negativeGroup,
                                         int positiveGroup) throws ImhotepOutOfMemoryException {
    if (targetGroup == 0) {
        clearZeroDocBitsets();
    }
    docIdToGroup =
            GroupLookupFactory.resize(docIdToGroup,
                                      Math.max(negativeGroup, positiveGroup),
                                      memory);
    final FastBitSetPooler bitSetPooler = new ImhotepBitSetPooler(memory);
    final FastBitSet matched;
    try {
        matched = bitSetPooler.create(numDocs);
    } catch (FlamdexOutOfMemoryException e) {
        throw new ImhotepOutOfMemoryException(e);
    }
    try {
        try (
            final StringTermIterator iter = flamdexReader.getStringTermIterator(field);
            final DocIdStream docIdStream = flamdexReader.getDocIdStream()
        ) {
            // merge-join of sorted field terms against sorted requested terms
            int next = 0;
            while (iter.next()) {
                final String fieldTerm = iter.term();
                while (next < terms.length && terms[next].compareTo(fieldTerm) < 0) {
                    ++next;
                }
                if (next < terms.length && terms[next].equals(fieldTerm)) {
                    docIdStream.reset(iter);
                    remapPositiveDocs(docIdStream, matched, targetGroup, positiveGroup);
                    ++next;
                }
                if (next == terms.length) {
                    break;
                }
            }
        }
        remapNegativeDocs(matched, targetGroup, negativeGroup);
    } finally {
        bitSetPooler.release(matched.memoryUsage());
    }
    finalizeRegroup();
}
/**
 * Moves docs in targetGroup whose {@code field} has a term matching
 * {@code regex} to positiveGroup; the remaining targetGroup docs move to
 * negativeGroup. Only valid while the session has at most one non-zero
 * group.
 */
@Override
public void regexRegroup(String field, String regex, int targetGroup, int negativeGroup, int positiveGroup) throws ImhotepOutOfMemoryException {
    if (getNumGroups() > 2) {
        throw new IllegalStateException("regexRegroup should be applied as a filter when you have only one group");
    }
    if (targetGroup == 0) {
        clearZeroDocBitsets();
    }
    docIdToGroup =
            GroupLookupFactory.resize(docIdToGroup,
                                      Math.max(negativeGroup, positiveGroup),
                                      memory);
    final FastBitSetPooler bitSetPooler = new ImhotepBitSetPooler(memory);
    final FastBitSet matched;
    try {
        matched = bitSetPooler.create(numDocs);
    } catch (FlamdexOutOfMemoryException e) {
        throw new ImhotepOutOfMemoryException(e);
    }
    try {
        try (
            final StringTermIterator iter = flamdexReader.getStringTermIterator(field);
            final DocIdStream docIdStream = flamdexReader.getDocIdStream()
        ) {
            // compile the regex once, then test every term in the field
            final Automaton automaton = new RegExp(regex).toAutomaton();
            while (iter.next()) {
                if (automaton.run(iter.term())) {
                    docIdStream.reset(iter);
                    remapPositiveDocs(docIdStream, matched, targetGroup, positiveGroup);
                }
            }
        }
        remapNegativeDocs(matched, targetGroup, negativeGroup);
    } finally {
        bitSetPooler.release(matched.memoryUsage());
    }
    finalizeRegroup();
}
/**
 * Sends every doc still in targetGroup (i.e. not flagged in docRemapped as
 * a positive match) to negativeGroup.
 */
private void remapNegativeDocs(FastBitSet docRemapped, int targetGroup, int negativeGroup) {
    for (int docId = 0; docId < numDocs; docId++) {
        final boolean untouched = !docRemapped.get(docId);
        if (untouched && docIdToGroup.get(docId) == targetGroup) {
            docIdToGroup.set(docId, negativeGroup);
        }
    }
}
/**
 * Drains the doc-id stream, moving every doc currently in targetGroup to
 * positiveGroup and recording it in docRemapped so the negative pass can
 * skip it.
 */
private void remapPositiveDocs(DocIdStream docIdStream,
                               FastBitSet docRemapped,
                               int targetGroup,
                               int positiveGroup) {
    int filled;
    do {
        filled = docIdStream.fillDocIdBuffer(docIdBuf);
        for (int i = 0; i < filled; ++i) {
            final int doc = docIdBuf[i];
            if (docIdToGroup.get(doc) == targetGroup) {
                docIdToGroup.set(doc, positiveGroup);
                docRemapped.set(doc);
            }
        }
        // a short fill means the stream is exhausted
    } while (filled == docIdBuf.length);
}
/**
 * Randomly routes docs in targetGroup to positiveGroup (with probability p)
 * or negativeGroup. The choice is made per term of {@code field} by hashing
 * the term with {@code salt}, so all docs sharing a term move together and
 * the same salt reproduces the same split.
 *
 * Fix: the term iterator and doc-id stream are now closed via
 * try-with-resources; previously an exception thrown mid-iteration leaked
 * both (the original only closed them on the success path).
 */
@Override
public synchronized void randomRegroup(String field,
                                       boolean isIntField,
                                       String salt,
                                       double p,
                                       int targetGroup,
                                       int negativeGroup,
                                       int positiveGroup) throws ImhotepOutOfMemoryException {
    if (targetGroup == 0) {
        clearZeroDocBitsets();
    }
    docIdToGroup =
            GroupLookupFactory.resize(docIdToGroup,
                                      Math.max(negativeGroup, positiveGroup),
                                      memory);
    final ImhotepChooser chooser = new ImhotepChooser(salt, p);
    try (final DocIdStream docIdStream = flamdexReader.getDocIdStream()) {
        if (isIntField) {
            try (final IntTermIterator iter = flamdexReader.getIntTermIterator(field)) {
                while (iter.next()) {
                    final long term = iter.term();
                    final int newGroup =
                            chooser.choose(Long.toString(term)) ? positiveGroup : negativeGroup;
                    docIdStream.reset(iter);
                    remapTargetGroupDocs(docIdStream, targetGroup, newGroup);
                }
            }
        } else {
            try (final StringTermIterator iter = flamdexReader.getStringTermIterator(field)) {
                while (iter.next()) {
                    final String term = iter.term();
                    final int newGroup = chooser.choose(term) ? positiveGroup : negativeGroup;
                    docIdStream.reset(iter);
                    remapTargetGroupDocs(docIdStream, targetGroup, newGroup);
                }
            }
        }
    }
    finalizeRegroup();
}

/** Moves every doc in the stream that currently belongs to targetGroup into newGroup. */
private void remapTargetGroupDocs(DocIdStream docIdStream, int targetGroup, int newGroup) {
    while (true) {
        final int n = docIdStream.fillDocIdBuffer(docIdBuf);
        for (int i = 0; i < n; ++i) {
            final int doc = docIdBuf[i];
            if (docIdToGroup.get(doc) == targetGroup) {
                docIdToGroup.set(doc, newGroup);
            }
        }
        if (n < docIdBuf.length) {
            break;
        }
    }
}
/**
 * Splits docs in targetGroup across {@code resultGroups} according to the
 * cumulative boundaries in {@code percentages} (validated by
 * ensureValidMultiRegroupArrays). The bucket for each term of {@code field}
 * is chosen by hashing the term with {@code salt}, so all docs sharing a
 * term land in the same result group.
 *
 * Fix: the term iterator and doc-id stream are now closed via
 * try-with-resources; previously an exception thrown mid-iteration leaked
 * both (the original only closed them on the success path).
 */
@Override
public synchronized void randomMultiRegroup(String field,
                                            boolean isIntField,
                                            String salt,
                                            int targetGroup,
                                            double[] percentages,
                                            int[] resultGroups) throws ImhotepOutOfMemoryException {
    if (targetGroup == 0) {
        clearZeroDocBitsets();
    }
    ensureValidMultiRegroupArrays(percentages, resultGroups);
    docIdToGroup = GroupLookupFactory.resize(docIdToGroup, Ints.max(resultGroups), memory);
    // p = -1.0: only the chooser's hash value is used, not its percentage
    final ImhotepChooser chooser = new ImhotepChooser(salt, -1.0);
    try (final DocIdStream docIdStream = flamdexReader.getDocIdStream()) {
        if (isIntField) {
            try (final IntTermIterator iter = flamdexReader.getIntTermIterator(field)) {
                while (iter.next()) {
                    final long term = iter.term();
                    final int groupIndex =
                            indexOfFirstLessThan(chooser.getValue(Long.toString(term)), percentages);
                    final int newGroup = resultGroups[groupIndex];
                    docIdStream.reset(iter);
                    while (true) {
                        final int n = docIdStream.fillDocIdBuffer(docIdBuf);
                        for (int i = 0; i < n; ++i) {
                            final int doc = docIdBuf[i];
                            if (docIdToGroup.get(doc) == targetGroup) {
                                docIdToGroup.set(doc, newGroup);
                            }
                        }
                        if (n < docIdBuf.length) {
                            break;
                        }
                    }
                }
            }
        } else {
            try (final StringTermIterator iter = flamdexReader.getStringTermIterator(field)) {
                while (iter.next()) {
                    final String term = iter.term();
                    final int groupIndex = indexOfFirstLessThan(chooser.getValue(term), percentages);
                    final int newGroup = resultGroups[groupIndex];
                    docIdStream.reset(iter);
                    while (true) {
                        final int n = docIdStream.fillDocIdBuffer(docIdBuf);
                        for (int i = 0; i < n; ++i) {
                            final int doc = docIdBuf[i];
                            if (docIdToGroup.get(doc) == targetGroup) {
                                docIdToGroup.set(doc, newGroup);
                            }
                        }
                        if (n < docIdBuf.length) {
                            break;
                        }
                    }
                }
            }
        }
    }
    finalizeRegroup();
}
/**
 * Returns up to min(k, 1000) terms of {@code field} with the highest
 * document frequencies, ordered by descending frequency. Frequencies come
 * from the term dictionary, so the counts ignore any current group filter.
 */
@Override
public List<TermCount> approximateTopTerms(String field, boolean isIntField, int k) {
    final int limit = Math.min(k, 1000);
    if (isIntField) {
        // min-heap of the best `limit` terms seen so far; the smallest is on top
        final PriorityQueue<IntTermWithFreq> heap =
                new ObjectHeapPriorityQueue<IntTermWithFreq>(limit, INT_FREQ_COMPARATOR);
        try (final IntTermIterator iter = flamdexReader.getIntTermIterator(field)) {
            while (iter.next()) {
                final int docFreq = iter.docFreq();
                if (heap.size() < limit) {
                    heap.enqueue(new IntTermWithFreq(iter.term(), docFreq));
                } else {
                    final IntTermWithFreq smallest = heap.first();
                    if (docFreq > smallest.docFreq) {
                        // reuse the heap entry in place, then restore heap order
                        smallest.term = iter.term();
                        smallest.docFreq = docFreq;
                        heap.changed();
                    }
                }
            }
            final List<TermCount> result = Lists.newArrayListWithCapacity(heap.size());
            while (!heap.isEmpty()) {
                final IntTermWithFreq entry = heap.dequeue();
                result.add(new TermCount(new Term(field, true, entry.term, ""), entry.docFreq));
            }
            // dequeue order is ascending frequency; callers want descending
            Collections.reverse(result);
            return result;
        }
    } else {
        final PriorityQueue<StringTermWithFreq> heap =
                new ObjectHeapPriorityQueue<StringTermWithFreq>(limit, STRING_FREQ_COMPARATOR);
        try (final StringTermIterator iter = flamdexReader.getStringTermIterator(field)) {
            while (iter.next()) {
                final int docFreq = iter.docFreq();
                if (heap.size() < limit) {
                    heap.enqueue(new StringTermWithFreq(iter.term(), docFreq));
                } else {
                    final StringTermWithFreq smallest = heap.first();
                    if (docFreq > smallest.docFreq) {
                        smallest.term = iter.term();
                        smallest.docFreq = docFreq;
                        heap.changed();
                    }
                }
            }
            final List<TermCount> result = Lists.newArrayListWithCapacity(heap.size());
            while (!heap.isEmpty()) {
                final StringTermWithFreq entry = heap.dequeue();
                result.add(new TermCount(new Term(field, false, 0, entry.term), entry.docFreq));
            }
            Collections.reverse(result);
            return result;
        }
    }
}
/** Orders int-term heap entries by ascending document frequency (smallest on top). */
private static final Comparator<IntTermWithFreq> INT_FREQ_COMPARATOR =
        new Comparator<IntTermWithFreq>() {
            @Override
            public int compare(IntTermWithFreq o1, IntTermWithFreq o2) {
                return Ints.compare(o1.docFreq, o2.docFreq);
            }
        };
/**
 * Mutable (int term, docFreq) pair used as a heap entry by
 * approximateTopTerms; fields are reassigned in place when a better term
 * displaces the heap minimum.
 */
private static final class IntTermWithFreq {
    public long term;
    public int docFreq;
    private IntTermWithFreq(long term, int docFreq) {
        this.term = term;
        this.docFreq = docFreq;
    }
}
/** Orders string-term heap entries by ascending document frequency (smallest on top). */
private static final Comparator<StringTermWithFreq> STRING_FREQ_COMPARATOR =
        new Comparator<StringTermWithFreq>() {
            @Override
            public int compare(StringTermWithFreq o1, StringTermWithFreq o2) {
                return Ints.compare(o1.docFreq, o2.docFreq);
            }
        };
/**
 * Mutable (string term, docFreq) pair used as a heap entry by
 * approximateTopTerms; fields are reassigned in place when a better term
 * displaces the heap minimum.
 */
private static final class StringTermWithFreq {
    public String term;
    public int docFreq;
    private StringTermWithFreq(String term, int docFreq) {
        this.term = term;
        this.docFreq = docFreq;
    }
}
/**
 * Requires that array is non-null and sorted in ascending order.
 *
 * Returns the lowest index in the array such that value < array[index]. If
 * value is greater than or equal to every element in array, returns
 * array.length. Essentially, a wrapper around binarySearch to return an
 * index in all cases.
 */
protected int indexOfFirstLessThan(double value, double[] array) {
    int pos = Arrays.binarySearch(array, value);
    if (pos >= 0) {
        // value == array[pos]; skip forward past all equal elements so we
        // return the first strictly-greater index.
        // Fix: was "pos > 0", which sent an exact match at index 0 into the
        // negative branch and returned -(0 + 1) == -1, an invalid index.
        while (pos < array.length && array[pos] == value) {
            pos++;
        }
        return pos;
    } else {
        // when pos < 0, pos = (-(insertion point) - 1), and the insertion
        // point is exactly the first index holding an element > value
        return -(pos + 1);
    }
}
/**
 * Ensures that the percentages and resultGroups arrays are valid inputs for
 * a randomMultiRegroup; otherwise throws an IllegalArgumentException.
 * Specifically, checks that
 * <ul>
 * <li>percentages is in ascending order,</li>
 * <li>percentages contains only values between 0.0 &amp; 1.0, and</li>
 * <li>len(percentages) == len(resultGroups) - 1</li>
 * </ul>
 *
 * @see ImhotepLocalSession#randomMultiRegroup(String, boolean, String, int,
 *      double[], int[])
 */
protected void ensureValidMultiRegroupArrays(double[] percentages, int[] resultGroups) throws IllegalArgumentException {
    if (null == percentages || null == resultGroups) {
        throw new IllegalArgumentException(
                "received null percentages or resultGroups to randomMultiRegroup");
    }
    if (percentages.length != resultGroups.length - 1) {
        throw new IllegalArgumentException(
                "percentages should have 1 fewer element than resultGroups");
    }
    double previous = 0.0;
    for (int i = 0; i < percentages.length; i++) {
        // ascending (non-decreasing) order
        if (percentages[i] < previous) {
            throw new IllegalArgumentException("percentages values decreased between indices "
                    + (i - 1) + " and " + i);
        }
        // each boundary must be a valid probability
        if (percentages[i] < 0.0 || percentages[i] > 1.0) {
            throw new IllegalArgumentException("percentages values should be between 0 and 1");
        }
        previous = percentages[i];
    }
}
/**
 * Regroups every doc by bucketing its value of {@code stat} into intervals
 * of width {@code intervalSize} over [min, max). With gutters, each
 * existing group g fans out into (numBuckets + 2) new groups (two extra for
 * below-min and at-or-above-max); with noGutters, out-of-range docs drop to
 * group 0.
 *
 * @return the new number of groups
 * @throws IllegalArgumentException if {@code stat} is out of range
 */
@Override
public synchronized int metricRegroup(int stat, long min, long max, long intervalSize, boolean noGutters) throws ImhotepOutOfMemoryException {
    clearZeroDocBitsets();
    if (stat < 0 || stat >= statLookup.length) {
        throw new IllegalArgumentException("invalid stat index: " + stat
                + ", must be between [0," + statLookup.length + ")");
    }
    // ((max - 1) - min) / intervalSize rounds the top value down into the
    // last bucket, so [min, max) is covered by exactly numBuckets buckets
    final int numBuckets = (int) (((max - 1) - min) / intervalSize + 1);
    // highest possible result group: each pre-existing group fans out into
    // numBuckets (+2 gutter groups unless noGutters)
    final int newMaxGroup = (docIdToGroup.getNumGroups()-1)*(noGutters ? numBuckets : numBuckets+2);
    docIdToGroup = GroupLookupFactory.resize(docIdToGroup, newMaxGroup, memory);
    final IntValueLookup lookup = statLookup[stat];
    final int numDocs = docIdToGroup.size();
    // process docs in BUFFER_SIZE chunks through the shared scratch buffers
    for (int doc = 0; doc < numDocs; doc += BUFFER_SIZE) {
        final int n = Math.min(BUFFER_SIZE, numDocs - doc);
        docIdToGroup.fillDocGrpBufferSequential(doc, docGroupBuffer, n);
        // compact away group-0 (filtered-out) docs so the metric lookup and
        // batchSet only touch live docs
        int numNonZero = 0;
        for (int i = 0; i < n; ++i) {
            if (docGroupBuffer[i] != 0) {
                docGroupBuffer[numNonZero] = docGroupBuffer[i];
                docIdBuf[numNonZero++] = doc + i;
            }
        }
        if (numNonZero == 0) {
            continue;
        }
        lookup.lookup(docIdBuf, valBuf, numNonZero);
        // rewrite docGroupBuffer in place with the bucketed groups
        if (noGutters) {
            internalMetricRegroupNoGutters(min, max, intervalSize, numBuckets, numNonZero);
        } else {
            internalMetricRegroupGutters(min, max, intervalSize, numBuckets, numNonZero);
        }
        docIdToGroup.batchSet(docIdBuf, docGroupBuffer, numNonZero);
    }
    finalizeRegroup();
    return docIdToGroup.getNumGroups();
}
/**
 * Bucketing pass for metricRegroup with gutters: rewrites the first
 * numNonZero entries of docGroupBuffer from old group to new group, using
 * the metric values already staged in valBuf.
 */
private void internalMetricRegroupGutters(long min, long max, long intervalSize, int numBuckets, int numNonZero) {
    for (int i = 0; i < numNonZero; ++i) {
        final long value = valBuf[i];
        final int bucket;
        if (value < min) {
            bucket = numBuckets + 1;        // below-range gutter
        } else if (value >= max) {
            bucket = numBuckets + 2;        // above-range gutter
        } else {
            bucket = (int) ((value - min) / intervalSize + 1);
        }
        // each old group owns a contiguous run of (numBuckets + 2) new groups
        docGroupBuffer[i] = (docGroupBuffer[i] - 1) * (numBuckets + 2) + bucket;
    }
}
/**
 * Bucketing pass for metricRegroup without gutters: rewrites the first
 * numNonZero entries of docGroupBuffer from old group to new group, sending
 * out-of-range docs to group 0 instead of gutter groups.
 */
private void internalMetricRegroupNoGutters(long min, long max, long intervalSize, int numBuckets, int numNonZero) {
    for (int i = 0; i < numNonZero; ++i) {
        final long value = valBuf[i];
        if (value < min || value >= max) {
            // out-of-range values are dropped entirely
            docGroupBuffer[i] = 0;
        } else {
            final int bucket = (int) ((value - min) / intervalSize + 1);
            // each old group owns a contiguous run of numBuckets new groups
            docGroupBuffer[i] = (docGroupBuffer[i] - 1) * numBuckets + bucket;
        }
    }
}
/**
 * Two-dimensional metric regroup: buckets every non-zero-group doc by its
 * (xStat, yStat) value pair. The result groups form a row-major grid of
 * xBuckets columns by yBuckets rows (each axis has its in-range buckets
 * plus one gutter on each side, hence the +3), numbered from 1 at the
 * bottom-left.
 *
 * Note: unlike metricRegroup, the doc's previous group is discarded — all
 * non-zero groups are flattened into the same grid.
 *
 * @return the new number of groups (xBuckets * yBuckets)
 */
@Override
public synchronized int metricRegroup2D(int xStat,
                                        long xMin,
                                        long xMax,
                                        long xIntervalSize,
                                        int yStat,
                                        long yMin,
                                        long yMax,
                                        long yIntervalSize) throws ImhotepOutOfMemoryException {
    clearZeroDocBitsets();
    // +3 = in-range buckets plus a below-min and an at-or-above-max gutter
    final int xBuckets = (int) (((xMax - 1) - xMin) / xIntervalSize + 3);
    final int yBuckets = (int) (((yMax - 1) - yMin) / yIntervalSize + 3);
    final int numGroups = xBuckets * yBuckets;
    docIdToGroup = GroupLookupFactory.resize(docIdToGroup, numGroups, memory);
    // reserve space for the y-value scratch buffer allocated below
    if (!memory.claimMemory(BUFFER_SIZE * 8)) {
        throw new ImhotepOutOfMemoryException();
    }
    try {
        final long[] yValBuf = new long[BUFFER_SIZE];
        final IntValueLookup xLookup = statLookup[xStat];
        final IntValueLookup yLookup = statLookup[yStat];
        final int numDocs = docIdToGroup.size();
        for (int doc = 0; doc < numDocs; doc += BUFFER_SIZE) {
            final int n = Math.min(BUFFER_SIZE, numDocs - doc);
            docIdToGroup.fillDocGrpBufferSequential(doc, docGroupBuffer, n);
            // keep only docs not already filtered out (group != 0)
            int numNonZero = 0;
            for (int i = 0; i < n; ++i) {
                if (docGroupBuffer[i] != 0) {
                    docIdBuf[numNonZero++] = doc + i;
                }
            }
            if (numNonZero == 0) {
                continue;
            }
            xLookup.lookup(docIdBuf, valBuf, numNonZero);
            yLookup.lookup(docIdBuf, yValBuf, numNonZero);
            for (int i = 0; i < numNonZero; ++i) {
                final long xVal = valBuf[i];
                final long yVal = yValBuf[i];
                final int group;
                if (xVal < xMin) {
                    // left gutter column
                    if (yVal < yMin) {
                        group = 1;                                  // bottom-left corner
                    } else if (yVal >= yMax) {
                        group = (yBuckets - 1) * xBuckets + 1;      // top-left corner
                    } else {
                        group = (int) (((yVal - yMin) / yIntervalSize + 1) * xBuckets + 1);
                    }
                } else if (xVal >= xMax) {
                    // right gutter column
                    if (yVal < yMin) {
                        group = xBuckets;                           // bottom-right corner
                    } else if (yVal >= yMax) {
                        group = xBuckets * yBuckets;                // top-right corner
                    } else {
                        group = (int) (((yVal - yMin) / yIntervalSize + 2) * xBuckets);
                    }
                } else {
                    // in-range x: offset +2 skips the left gutter column
                    if (yVal < yMin) {
                        group = (int) ((xVal - xMin) / xIntervalSize + 2);
                    } else if (yVal >= yMax) {
                        group =
                                (int) ((yBuckets - 1) * xBuckets + (xVal - xMin)
                                        / xIntervalSize + 2);
                    } else {
                        group =
                                (int) (((yVal - yMin) / yIntervalSize + 1) * xBuckets
                                        + (xVal - xMin) / xIntervalSize + 2);
                    }
                }
                docGroupBuffer[i] = group;
            }
            docIdToGroup.batchSet(docIdBuf, docGroupBuffer, numNonZero);
        }
    } finally {
        memory.releaseMemory(BUFFER_SIZE * 8);
    }
    finalizeRegroup();
    return numGroups;
}
/**
 * Filters docs by the value of {@code stat}: docs whose value lies in
 * [min, max] keep their group and all others drop to group 0 — or exactly
 * the reverse when {@code negate} is set.
 *
 * @return the new number of groups
 * @throws IllegalArgumentException if {@code stat} is out of range
 */
public synchronized int metricFilter(int stat, long min, long max, final boolean negate) throws ImhotepOutOfMemoryException {
    clearZeroDocBitsets();
    if (stat < 0 || stat >= statLookup.length) {
        throw new IllegalArgumentException("invalid stat index: " + stat
                + ", must be between [0," + statLookup.length + ")");
    }
    docIdToGroup = GroupLookupFactory.resize(docIdToGroup, docIdToGroup.getNumGroups(), memory);
    final IntValueLookup lookup = statLookup[stat];
    final int totalDocs = docIdToGroup.size();
    for (int base = 0; base < totalDocs; base += BUFFER_SIZE) {
        final int chunk = Math.min(BUFFER_SIZE, totalDocs - base);
        docIdToGroup.fillDocGrpBufferSequential(base, docGroupBuffer, chunk);
        // compact docs with a non-zero group to the front of both buffers
        int live = 0;
        for (int i = 0; i < chunk; ++i) {
            final int group = docGroupBuffer[i];
            if (group != 0) {
                docIdBuf[live] = base + i;
                docGroupBuffer[live++] = group;
            }
        }
        if (live == 0) {
            continue;
        }
        lookup.lookup(docIdBuf, valBuf, live);
        // zero out docs on the rejected side of the range test
        for (int i = 0; i < live; ++i) {
            final boolean inRange = valBuf[i] >= min && valBuf[i] <= max;
            if (inRange == negate) {
                docGroupBuffer[i] = 0;
            }
        }
        docIdToGroup.batchSet(docIdBuf, docGroupBuffer, live);
    }
    finalizeRegroup();
    return docIdToGroup.getNumGroups();
}
/**
 * Returns the per-group totals for the given stat, lazily recomputing them
 * first if a regroup or metric update invalidated the cached array.
 */
@Override
public synchronized long[] getGroupStats(int stat) {
    if (!needToReCalcGroupStats[stat]) {
        return groupStats[stat];
    }
    updateGroupStatsAllDocs(statLookup[stat],
                            groupStats[stat],
                            docIdToGroup,
                            docGroupBuffer,
                            docIdBuf,
                            valBuf);
    needToReCalcGroupStats[stat] = false;
    return groupStats[stat];
}
/**
 * Indexes the raw rules by target group. Rules aimed at out-of-range groups
 * are silently dropped, and when several rules share a target the first one
 * wins -- or error?
 */
private static GroupRemapRule[] cleanUpRules(GroupRemapRule[] rawRules, int numGroups) {
    final GroupRemapRule[] byTarget = new GroupRemapRule[numGroups];
    for (final GroupRemapRule rule : rawRules) {
        final int target = rule.targetGroup;
        if (target >= byTarget.length || byTarget[target] != null) {
            continue;
        }
        byTarget[target] = rule;
    }
    return byTarget;
}
/**
 * Resizes every per-stat group-stats array to the new group count and flags
 * each one for lazy recomputation on the next getGroupStats call.
 */
private void recalcGroupStats(int numGroups) throws ImhotepOutOfMemoryException {
    for (int stat = 0; stat < numStats; stat++) {
        groupStats[stat] = clearAndResize(groupStats[stat], numGroups, memory);
        needToReCalcGroupStats[stat] = true;
    }
}
/** Rebuilds the per-group document counts from the current group lookup. */
private void recalcGroupCounts(int numGroups) throws ImhotepOutOfMemoryException {
    groupDocCount = clearAndResize(groupDocCount, numGroups, memory);
    for (int doc = 0; doc < numDocs; doc++) {
        groupDocCount[docIdToGroup.get(doc)]++;
    }
}
// Matches a possibly-negative decimal number, e.g. "1", "-0.5", ".25".
private static final String decimalPattern = "-?[0-9]*\\.?[0-9]+";
// Parses "floatscale field*scale+offset" metric names in pushStat:
// group(1) = field, group(2) = scale, group(3) = offset.
private static final Pattern floatScalePattern =
        Pattern.compile("floatscale\\s+(\\w+)\\s*\\*\\s*(" + decimalPattern + ")\\s*\\+\\s*("
                + decimalPattern + ")");
/**
 * Pushes a metric onto the stat stack, parsing {@code statName} as one of
 * the prefix-dispatched composite metrics (hasstr/hasint/regex/floatscale/
 * dynamic/exp/log/ref/interleave/mulshr/shldiv/log1pexp/logistic/lucene),
 * a named arithmetic Metric (which pops its operands from the stack), an
 * integer constant, or a raw Flamdex field metric.
 *
 * Fix: in the "lucene " branch the return value of memory.claimMemory() was
 * ignored, so the query bitset was built even when the reservation failed;
 * it now throws ImhotepOutOfMemoryException like every other claim site.
 *
 * @return the new stat stack depth
 * @throws IllegalArgumentException if {@code statName} cannot be parsed
 */
@Override
public synchronized int pushStat(String statName) throws ImhotepOutOfMemoryException {
    if (numStats == MAX_NUMBER_STATS) {
        throw new IllegalArgumentException("Maximum number of stats exceeded");
    }
    if (statName.startsWith("hasstr ")) {
        // "hasstr field:term" -> 1 for docs containing the term, else 0
        final String s = statName.substring(7).trim();
        final String[] split = s.split(":", 2);
        if (split.length < 2) {
            throw new IllegalArgumentException("invalid hasstr metric: " + statName);
        }
        statLookup[numStats] = hasStringTermFilter(split[0], split[1]);
    } else if (statName.startsWith("hasint ")) {
        // "hasint field:term" -> 1 for docs containing the term, else 0
        // NOTE(review): term parsed as int; 64-bit terms would overflow -- confirm intended
        final String s = statName.substring(7).trim();
        final String[] split = s.split(":", 2);
        if (split.length < 2) {
            throw new IllegalArgumentException("invalid hasint metric: " + statName);
        }
        statLookup[numStats] = hasIntTermFilter(split[0], Integer.parseInt(split[1]));
    } else if (statName.startsWith("regex ")) {
        // "regex field:pattern" -> 1 for docs with a term matching the pattern
        final String s = statName.substring(6).trim();
        final String[] split = s.split(":", 2);
        if (split.length < 2) {
            throw new IllegalArgumentException("invalid regex metric: " + statName);
        }
        statLookup[numStats] = hasRegexFilter(split[0], split[1]);
    } else if (statName.startsWith("inttermcount ")) {
        final String field = statName.substring(13).trim();
        statLookup[numStats] = intTermCountLookup(field);
    } else if (statName.startsWith("strtermcount ")) {
        final String field = statName.substring(13).trim();
        statLookup[numStats] = stringTermCountLookup(field);
    } else if (statName.startsWith("floatscale ")) {
        final Matcher matcher = floatScalePattern.matcher(statName);
        // accepted format is 'floatscale field*scale+offset' (or just look
        // at the pattern)
        if (!matcher.matches()) {
            throw new IllegalArgumentException("invalid floatscale metric: " + statName);
        }
        final String field = matcher.group(1);
        final double scale;
        final double offset;
        try {
            scale = Double.parseDouble(matcher.group(2));
            offset = Double.parseDouble(matcher.group(3));
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException("invalid offset or scale constant for metric: "
                    + statName, e);
        }
        statLookup[numStats] = scaledFloatLookup(field, scale, offset);
    } else if (statName.startsWith("dynamic ")) {
        final String name = statName.substring(8).trim();
        final DynamicMetric metric = getDynamicMetrics().get(name);
        if (metric == null) {
            throw new IllegalArgumentException("invalid dynamic metric: " + name);
        }
        statLookup[numStats] = metric;
    } else if (statName.startsWith("exp ")) {
        // unary: consumes the current top of the stack
        final int scaleFactor = Integer.valueOf(statName.substring(4).trim());
        final IntValueLookup operand = popLookup();
        statLookup[numStats] = new Exponential(operand, scaleFactor);
    } else if (statName.startsWith("log ")) {
        final int scaleFactor = Integer.valueOf(statName.substring(4).trim());
        final IntValueLookup operand = popLookup();
        statLookup[numStats] = new Log(operand, scaleFactor);
    } else if (statName.startsWith("ref ")) {
        // duplicates the stack entry `depth` positions below the top
        final int depth = Integer.valueOf(statName.substring(4).trim());
        statLookup[numStats] = new DelegatingMetric(statLookup[numStats - depth - 1]);
    } else if (is32BitInteger(statName)) {
        final int constant = Integer.parseInt(statName); // guaranteed not
                                                         // to fail
        statLookup[numStats] = new Constant(constant);
    } else if (statName.startsWith("interleave ")) {
        // replaces the top `count` metrics in place with cached copies;
        // does NOT grow the stack, hence the early return
        final int count = Integer.valueOf(statName.substring(11).trim());
        final IntValueLookup[] originals = new IntValueLookup[count];
        final int start = numStats - count;
        if (start < 0) {
            throw new IllegalArgumentException(statName + ": expected at least " + count
                    + " metrics on stack, found " + numStats);
        }
        for (int i = 0; i < count; i++) {
            originals[i] = statLookup[start + i];
        }
        final IntValueLookup[] cached =
                new CachedInterleavedMetrics(memory, flamdexReader.getNumDocs(), originals).getLookups();
        for (int i = 0; i < count; i++) {
            statLookup[start + i].close();
            statLookup[start + i] = cached[i];
        }
        /* this request is valid, so keep track of the command */
        this.statCommands.add(statName);
        return numStats; // cleanup below only applies if we're increasing
                         // the number of metrics
    } else if (statName.startsWith("mulshr ")) {
        // binary: (a * b) >> shift
        final int shift = Integer.valueOf(statName.substring(7).trim());
        if (shift < 0 || shift > 31) {
            throw new IllegalArgumentException(
                    "mulshr shift value must be between 0 and 31 (inclusive)");
        }
        final IntValueLookup b = popLookup();
        final IntValueLookup a = popLookup();
        statLookup[numStats] = new MultiplyAndShiftRight(a, b, shift);
    } else if (statName.startsWith("shldiv ")) {
        // binary: (a << shift) / b
        final int shift = Integer.valueOf(statName.substring(7).trim());
        if (shift < 0 || shift > 31) {
            throw new IllegalArgumentException(
                    "shldiv shift value must be between 0 and 31 (inclusive)");
        }
        final IntValueLookup b = popLookup();
        final IntValueLookup a = popLookup();
        statLookup[numStats] = new ShiftLeftAndDivide(a, b, shift);
    } else if (statName.startsWith("log1pexp ")) {
        final int scale = Integer.valueOf(statName.substring(9).trim());
        final IntValueLookup operand = popLookup();
        statLookup[numStats] = new Log1pExp(operand, scale);
    } else if (statName.startsWith("logistic ")) {
        final String[] params = statName.substring(9).split(" ");
        if (params.length != 2) {
            throw new IllegalArgumentException("logistic requires 2 arguments: "+statName);
        }
        final double scaleDown;
        final double scaleUp;
        try {
            scaleDown = Double.parseDouble(params[0]);
            scaleUp = Double.parseDouble(params[1]);
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException("invalid scale factor for metric: "
                    + statName, e);
        }
        final IntValueLookup operand = popLookup();
        statLookup[numStats] = new Logistic(operand, scaleDown, scaleUp);
    } else if (statName.startsWith("lucene ")) {
        // "lucene <base64 QueryMessage>" -> 1 for docs matching the query
        final String queryBase64 = statName.substring(7);
        final byte[] queryBytes = Base64.decodeBase64(queryBase64.getBytes());
        final QueryMessage queryMessage;
        try {
            queryMessage = QueryMessage.parseFrom(queryBytes);
        } catch (InvalidProtocolBufferException e) {
            throw Throwables.propagate(e);
        }
        final Query query = ImhotepDaemonMarshaller.marshal(queryMessage);
        final int bitSetMemory = (flamdexReader.getNumDocs() + 64) / 64 * 8;
        // was: claimMemory() result ignored, so the bitset was allocated
        // even when the reservation failed
        if (!memory.claimMemory(bitSetMemory)) {
            throw new ImhotepOutOfMemoryException();
        }
        try {
            final FastBitSet bitSet = new FastBitSet(flamdexReader.getNumDocs());
            final FastBitSetPooler bitSetPooler = new ImhotepBitSetPooler(memory);
            final FlamdexSearcher searcher = new FlamdexSearcher(flamdexReader);
            searcher.search(query, bitSet, bitSetPooler);
            statLookup[numStats] = new com.indeed.flamdex.fieldcache.BitSetIntValueLookup(bitSet);
        } catch (Throwable t) {
            memory.releaseMemory(bitSetMemory);
            if (t instanceof FlamdexOutOfMemoryException) throw new ImhotepOutOfMemoryException(t);
            throw Throwables2.propagate(t, ImhotepOutOfMemoryException.class);
        }
    } else if (Metric.getMetric(statName) != null) {
        // stack-based arithmetic metrics; binary ops pop b then a
        final IntValueLookup a;
        final IntValueLookup b;
        switch (Metric.getMetric(statName)) {
        case COUNT:
            statLookup[numStats] = new Count();
            break;
        case CACHED:
            a = popLookup();
            try {
                statLookup[numStats] = new CachedMetric(a, flamdexReader.getNumDocs(), memory);
            } finally {
                a.close();
            }
            break;
        case ADD:
            b = popLookup();
            a = popLookup();
            statLookup[numStats] = new Addition(a, b);
            break;
        case SUBTRACT:
            b = popLookup();
            a = popLookup();
            statLookup[numStats] = new Subtraction(a, b);
            break;
        case MULTIPLY:
            b = popLookup();
            a = popLookup();
            statLookup[numStats] = new Multiplication(a, b);
            break;
        case DIVIDE:
            b = popLookup();
            a = popLookup();
            statLookup[numStats] = new Division(a, b);
            break;
        case MODULUS:
            b = popLookup();
            a = popLookup();
            statLookup[numStats] = new Modulus(a, b);
            break;
        case ABSOLUTE_VALUE:
            a = popLookup();
            statLookup[numStats] = new AbsoluteValue(a);
            break;
        case MIN:
            b = popLookup();
            a = popLookup();
            statLookup[numStats] = new Min(a, b);
            break;
        case MAX:
            b = popLookup();
            a = popLookup();
            statLookup[numStats] = new Max(a, b);
            break;
        case EQ:
            b = popLookup();
            a = popLookup();
            statLookup[numStats] = new Equal(a, b);
            break;
        case NE:
            b = popLookup();
            a = popLookup();
            statLookup[numStats] = new NotEqual(a, b);
            break;
        case LT:
            b = popLookup();
            a = popLookup();
            statLookup[numStats] = new LessThan(a, b);
            break;
        case LTE:
            b = popLookup();
            a = popLookup();
            statLookup[numStats] = new LessThanOrEqual(a, b);
            break;
        case GT:
            b = popLookup();
            a = popLookup();
            statLookup[numStats] = new GreaterThan(a, b);
            break;
        case GTE:
            b = popLookup();
            a = popLookup();
            statLookup[numStats] = new GreaterThanOrEqual(a, b);
            break;
        default:
            throw new RuntimeException("this is a bug");
        }
    } else {
        try {
            // Temporary hack to allow transition from Lucene to Flamdex shards where the time metric has a different name
            if(statName.equals("time") && flamdexReader.getIntFields().contains("unixtime")) {
                statName = "unixtime";
            } else if(statName.equals("unixtime") && !flamdexReader.getIntFields().contains("unixtime")) {
                statName = "time";
            }
            statLookup[numStats] = flamdexReader.getMetric(statName);
        } catch (FlamdexOutOfMemoryException e) {
            throw new ImhotepOutOfMemoryException(e);
        }
    }
    // todo: check if metric is invalid... ?
    groupStats[numStats] = clearAndResize((long[]) null, docIdToGroup.getNumGroups(), memory);
    needToReCalcGroupStats[numStats] = true;
    numStats++;
    // FlamdexFTGSIterator.termGrpStats
    // NOTE(review): if this claim fails the stat has already been pushed;
    // confirm callers treat the session as dead after an OOM here
    if (!memory.claimMemory(8L * docIdToGroup.getNumGroups())) {
        throw new ImhotepOutOfMemoryException();
    }
    /* this request is valid, so keep track of the command */
    this.statCommands.add(statName);
    return numStats;
}
/**
 * Pushes each metric in order via {@link #pushStat}; a failure part-way
 * through leaves the earlier stats on the stack.
 *
 * @return the new stat stack depth
 */
@Override
public synchronized int pushStats(final List<String> statNames) throws ImhotepOutOfMemoryException {
    for (final String statName : statNames) {
        pushStat(statName);
    }
    return numStats;
}
/** Returns true iff {@code s} parses as a 32-bit signed integer. */
private static boolean is32BitInteger(String s) {
    try {
        Integer.parseInt(s);
    } catch (NumberFormatException notAnInt) {
        return false;
    }
    return true;
}
/**
 * Pops the top metric off the stat stack, releasing the memory that was
 * claimed for its group-stats buffer and FTGS accounting when it was
 * pushed.
 *
 * Fix: the group-stats term used an int multiply ({@code length * 8}) that
 * could overflow before widening to long; now computed in long arithmetic
 * to mirror the claim side.
 *
 * @return the popped lookup; the caller is responsible for closing it
 * @throws IllegalStateException if the stack is empty
 */
private IntValueLookup popLookup() {
    if (numStats == 0) {
        throw new IllegalStateException("no stat to pop");
    }
    --numStats;
    final IntValueLookup ret = statLookup[numStats];
    statLookup[numStats] = null;
    final long memFreed =
            8L * groupStats[numStats].length + 8L * docIdToGroup.getNumGroups();
    groupStats[numStats] = null;
    memory.releaseMemory(memFreed);
    return ret;
}
/**
 * Pops and closes the top metric on the stat stack.
 *
 * @return the new stat stack depth
 */
@Override
public synchronized int popStat() {
    final IntValueLookup popped = popLookup();
    popped.close();
    /* this request is valid, so keep track of the command */
    this.statCommands.add("pop");
    return numStats;
}
/** Returns the current depth of the stat stack. */
@Override
public synchronized int getNumStats() {
    return numStats;
}
/** Returns the current number of groups, including group 0. */
@Override
public int getNumGroups() {
    return docIdToGroup.getNumGroups();
}
/**
 * Registers a new, zero-initialized dynamic metric under the given name,
 * claiming 4 bytes per doc for its backing storage.
 *
 * @throws RuntimeException if a metric with this name already exists
 */
@Override
public synchronized void createDynamicMetric(String name) throws ImhotepOutOfMemoryException {
    if (getDynamicMetrics().containsKey(name)) {
        throw new RuntimeException("dynamic metric \"" + name + "\" already exists");
    }
    final long bytesNeeded = flamdexReader.getNumDocs() * 4L;
    if (!memory.claimMemory(bytesNeeded)) {
        throw new ImhotepOutOfMemoryException();
    }
    getDynamicMetrics().put(name, new DynamicMetric(flamdexReader.getNumDocs()));
}
/**
 * Adds deltas[group] to the named dynamic metric for every doc, indexed by
 * the doc's current group; docs whose group falls outside the deltas array
 * are left untouched. All cached group stats are invalidated afterwards.
 *
 * @throws RuntimeException if no dynamic metric with this name exists
 */
@Override
public synchronized void updateDynamicMetric(String name, int[] deltas) throws ImhotepOutOfMemoryException {
    final DynamicMetric metric = getDynamicMetrics().get(name);
    if (metric == null) {
        throw new RuntimeException("dynamic metric \"" + name + "\" does not exist");
    }
    final int docCount = flamdexReader.getNumDocs();
    for (int doc = 0; doc < docCount; doc++) {
        final int group = docIdToGroup.get(doc);
        if (group >= 0 && group < deltas.length) {
            metric.add(doc, deltas[group]);
        }
    }
    // pessimistically recompute all stats -- other metrics may indirectly
    // refer to this one
    for (int stat = 0; stat < numStats; stat++) {
        needToReCalcGroupStats[stat] = true;
        groupStats[stat] = clearAndResize(groupStats[stat], docIdToGroup.getNumGroups(), memory);
    }
}
/**
 * Applies deltas[i] to the named dynamic metric for every doc matching
 * conditions[i] (equality conditions only). Conditions are grouped by
 * (field, int/string type) and sorted by term within each field so each
 * field's term dictionary is walked once, in order.
 *
 * Fix: the doc-id stream and term iterators created per field are now
 * closed via try-with-resources; previously they were never closed at all.
 *
 * @throws RuntimeException if no dynamic metric with this name exists
 * @throws IllegalArgumentException on mismatched lengths or inequality conditions
 */
@Override
public synchronized void conditionalUpdateDynamicMetric(String name,
                                                        final RegroupCondition[] conditions,
                                                        final int[] deltas) {
    validateConditionalUpdateDynamicMetricInput(conditions, deltas);
    final DynamicMetric metric = getDynamicMetrics().get(name);
    if (metric == null) {
        throw new RuntimeException("dynamic metric \"" + name + "\" does not exist");
    }
    final List<Integer> indexes = Lists.newArrayList();
    for (int i = 0; i < conditions.length; i++) {
        indexes.add(i);
    }
    // I don't think it's really worth claiming memory for this, so I won't.
    final ImmutableListMultimap<Pair<String, Boolean>, Integer> fieldIndexMap =
            Multimaps.index(indexes, new Function<Integer, Pair<String, Boolean>>() {
                @Override
                public Pair<String, Boolean> apply(Integer index) {
                    return Pair.of(conditions[index].field, conditions[index].intType);
                }
            });
    for (Pair<String, Boolean> field : fieldIndexMap.keySet()) {
        final String fieldName = field.getFirst();
        final boolean fieldIsIntType = field.getSecond();
        final List<Integer> indices = Lists.newArrayList(fieldIndexMap.get(field));
        // Sort by term within the field so the term iterator only moves forward
        Collections.sort(indices, new Comparator<Integer>() {
            @Override
            public int compare(Integer o1, Integer o2) {
                if (fieldIsIntType) {
                    return Longs.compare(conditions[o1].intTerm, conditions[o2].intTerm);
                } else {
                    return conditions[o1].stringTerm.compareTo(conditions[o2].stringTerm);
                }
            }
        });
        // was: stream and iterators leaked -- never closed
        try (final DocIdStream docIdStream = flamdexReader.getDocIdStream()) {
            if (fieldIsIntType) {
                try (final IntTermIterator termIterator = flamdexReader.getIntTermIterator(fieldName)) {
                    for (int index : indices) {
                        final long term = conditions[index].intTerm;
                        final int delta = deltas[index];
                        termIterator.reset(term);
                        if (termIterator.next() && termIterator.term() == term) {
                            docIdStream.reset(termIterator);
                            adjustDeltas(metric, docIdStream, delta);
                        }
                    }
                }
            } else {
                try (final StringTermIterator termIterator =
                        flamdexReader.getStringTermIterator(fieldName)) {
                    for (int index : indices) {
                        final String term = conditions[index].stringTerm;
                        final int delta = deltas[index];
                        termIterator.reset(term);
                        if (termIterator.next() && termIterator.term().equals(term)) {
                            docIdStream.reset(termIterator);
                            adjustDeltas(metric, docIdStream, delta);
                        }
                    }
                }
            }
        }
    }
}
/**
 * Sanity-checks a conditional-update request: one delta per condition, and only
 * exact-match (non-inequality) conditions are accepted.
 */
private void validateConditionalUpdateDynamicMetricInput(RegroupCondition[] conditions,
                                                         int[] deltas) {
    if (deltas.length != conditions.length) {
        throw new IllegalArgumentException("conditions and deltas must be of the same length");
    }
    for (int i = 0; i < conditions.length; i++) {
        if (conditions[i].inequality) {
            throw new IllegalArgumentException(
                "inequality conditions not currently supported by conditionalUpdateDynamicMetric!");
        }
    }
}
/**
 * Like conditionalUpdateDynamicMetric, but each condition is scoped to a group: for every i,
 * docs currently in groups[i] that exactly match conditions[i] get deltas[i] added to the
 * dynamic metric {@code name}. Conditions are bucketed by field and term so each field's
 * term iterator is walked once.
 *
 * NOTE(review): unlike updateDynamicMetric this method is not declared synchronized --
 * confirm callers hold the session lock before mutating docIdToGroup-derived state.
 */
public void groupConditionalUpdateDynamicMetric(String name, int[] groups, RegroupCondition[] conditions, int[] deltas) {
if (groups.length != conditions.length) {
throw new IllegalArgumentException("groups and conditions must be the same length");
}
validateConditionalUpdateDynamicMetricInput(conditions, deltas);
final DynamicMetric metric = getDynamicMetrics().get(name);
if (metric == null) {
throw new RuntimeException("dynamic metric \"" + name + "\" does not exist");
}
// groupsSet tracks which bits of groupsWithCurrentTerm are set so they can be
// cleared cheaply between terms; groupToDelta maps group -> delta for the current term.
final IntArrayList groupsSet = new IntArrayList();
final FastBitSet groupsWithCurrentTerm = new FastBitSet(docIdToGroup.getNumGroups());
final int[] groupToDelta = new int[docIdToGroup.getNumGroups()];
// Bucket the (group, delta) pairs: field -> term -> (groups, deltas), separately
// for int-typed and string-typed fields.
final Map<String, Long2ObjectMap<Pair<IntArrayList, IntArrayList>>> intFields = Maps.newHashMap();
final Map<String, Map<String, Pair<IntArrayList, IntArrayList>>> stringFields = Maps.newHashMap();
for (int i = 0; i < groups.length; i++) {
//if the last group(s) exist on other shards but not this one docIdToGroup.getNumGroups() is wrong
if (groups[i] >= groupToDelta.length) continue;
final RegroupCondition condition = conditions[i];
Pair<IntArrayList, IntArrayList> groupDeltas;
if (condition.intType) {
Long2ObjectMap<Pair<IntArrayList, IntArrayList>> termToGroupDeltas = intFields.get(condition.field);
if (termToGroupDeltas == null) {
termToGroupDeltas = new Long2ObjectOpenHashMap<Pair<IntArrayList, IntArrayList>>();
intFields.put(condition.field, termToGroupDeltas);
}
groupDeltas = termToGroupDeltas.get(condition.intTerm);
if (groupDeltas == null) {
groupDeltas = Pair.of(new IntArrayList(), new IntArrayList());
termToGroupDeltas.put(condition.intTerm, groupDeltas);
}
} else {
Map<String, Pair<IntArrayList, IntArrayList>> termToGroupDeltas = stringFields.get(condition.field);
if (termToGroupDeltas == null) {
termToGroupDeltas = Maps.newHashMap();
stringFields.put(condition.field, termToGroupDeltas);
}
groupDeltas = termToGroupDeltas.get(condition.stringTerm);
if (groupDeltas == null) {
groupDeltas = Pair.of(new IntArrayList(), new IntArrayList());
termToGroupDeltas.put(condition.stringTerm, groupDeltas);
}
}
groupDeltas.getFirst().add(groups[i]);
groupDeltas.getSecond().add(deltas[i]);
}
final DocIdStream docIdStream = flamdexReader.getDocIdStream();
IntTermIterator intTermIterator = null;
StringTermIterator stringTermIterator = null;
try {
// Int-typed fields: for each term, mark the groups it applies to, then walk its docs.
for (Map.Entry<String, Long2ObjectMap<Pair<IntArrayList, IntArrayList>>> entry : intFields.entrySet()) {
final String field = entry.getKey();
final Long2ObjectMap<Pair<IntArrayList, IntArrayList>> termToGroupDeltas = entry.getValue();
intTermIterator = flamdexReader.getIntTermIterator(field);
for (Long2ObjectMap.Entry<Pair<IntArrayList, IntArrayList>> entry2 : termToGroupDeltas.long2ObjectEntrySet()) {
// clear only the bits set for the previous term rather than the whole bitset
for (int i = 0; i < groupsSet.size(); i++) {
groupsWithCurrentTerm.clear(groupsSet.getInt(i));
}
groupsSet.clear();
final long term = entry2.getLongKey();
final Pair<IntArrayList, IntArrayList> groupDeltas = entry2.getValue();
final IntArrayList termGroups = groupDeltas.getFirst();
final IntArrayList termDeltas = groupDeltas.getSecond();
for (int i = 0; i < termGroups.size(); i++) {
final int group = termGroups.getInt(i);
groupsWithCurrentTerm.set(group);
groupToDelta[group] = termDeltas.getInt(i);
groupsSet.add(group);
}
// skip terms that do not exist in this shard
intTermIterator.reset(term);
if (!intTermIterator.next()) continue;
if (intTermIterator.term() != term) continue;
docIdStream.reset(intTermIterator);
updateDocsWithTermDynamicMetric(metric, groupsWithCurrentTerm, groupToDelta, docIdStream);
}
intTermIterator.close();
}
// String-typed fields: same scheme with a string term iterator.
for (Map.Entry<String, Map<String, Pair<IntArrayList, IntArrayList>>> entry : stringFields.entrySet()) {
final String field = entry.getKey();
final Map<String, Pair<IntArrayList, IntArrayList>> termToGroupDeltas = entry.getValue();
stringTermIterator = flamdexReader.getStringTermIterator(field);
for (Map.Entry<String, Pair<IntArrayList, IntArrayList>> entry2 : termToGroupDeltas.entrySet()) {
for (int i = 0; i < groupsSet.size(); i++) {
groupsWithCurrentTerm.clear(groupsSet.getInt(i));
}
groupsSet.clear();
final String term = entry2.getKey();
final Pair<IntArrayList, IntArrayList> groupDeltas = entry2.getValue();
final IntArrayList termGroups = groupDeltas.getFirst();
final IntArrayList termDeltas = groupDeltas.getSecond();
for (int i = 0; i < termGroups.size(); i++) {
final int group = termGroups.getInt(i);
groupsWithCurrentTerm.set(group);
groupToDelta[group] = termDeltas.getInt(i);
groupsSet.add(group);
}
stringTermIterator.reset(term);
if (!stringTermIterator.next()) continue;
if (!stringTermIterator.term().equals(term)) continue;
docIdStream.reset(stringTermIterator);
updateDocsWithTermDynamicMetric(metric, groupsWithCurrentTerm, groupToDelta, docIdStream);
}
stringTermIterator.close();
}
// pessimistically recompute all stats -- other metrics may indirectly
// refer to this one
for (int i = 0; i < numStats; i++) {
needToReCalcGroupStats[i] = true;
groupStats[i] = clearAndResize(groupStats[i], docIdToGroup.getNumGroups(), memory);
}
} catch (ImhotepOutOfMemoryException e) {
throw Throwables.propagate(e);
} finally {
// closes whichever iterators are still open (the last field's, on error paths)
Closeables2.closeAll(log, docIdStream, intTermIterator, stringTermIterator);
}
}
/**
 * Drains docIdStream in buffer-sized batches, adding groupToDelta[group] to the metric
 * for every doc whose current group is flagged in groupsWithCurrentTerm.
 */
private void updateDocsWithTermDynamicMetric(DynamicMetric metric, FastBitSet groupsWithCurrentTerm, int[] groupToDelta, DocIdStream docIdStream) {
    int filled;
    do {
        filled = docIdStream.fillDocIdBuffer(docIdBuf);
        docIdToGroup.fillDocGrpBuffer(docIdBuf, docGroupBuffer, filled);
        for (int i = 0; i < filled; ++i) {
            final int group = docGroupBuffer[i];
            if (groupsWithCurrentTerm.get(group)) {
                metric.add(docIdBuf[i], groupToDelta[group]);
            }
        }
        // a partially-filled buffer means the stream is exhausted
    } while (filled == docIdBuf.length);
}
/** Adds {@code delta} to the metric for every doc remaining in {@code docIdStream}. */
private synchronized void adjustDeltas(DynamicMetric metric, DocIdStream docIdStream, int delta) {
    int filled;
    do {
        filled = docIdStream.fillDocIdBuffer(docIdBuf);
        for (int i = 0; i < filled; ++i) {
            metric.add(docIdBuf[i], delta);
        }
        // a short read means the stream has no more docs
    } while (filled == docIdBuf.length);
}
/** Idempotently closes the session; the closed flag is set even if cleanup throws. */
@Override
public synchronized void close() {
    if (closed) {
        return;
    }
    try {
        tryClose();
    } finally {
        closed = true;
    }
}
/**
 * Releases everything the session holds: the flamdex reader reference, pushed stats,
 * the group lookup and its buffers, all dynamic metrics, the optimization log file,
 * and any temporary optimized readers. The memory reserver itself is closed last,
 * in a finally, so it is released even if earlier cleanup throws.
 */
private void tryClose() {
try {
Closeables2.closeQuietly(flamdexReaderRef, log);
// popStat releases each pushed stat's memory as it goes
while (numStats > 0) {
popStat();
}
if (docIdToGroup != null) {
// mirrors the claim made when the lookup and its buffers were allocated:
// lookup + groupDocCount ints + three BUFFER_SIZE int buffers + 12 bytes/group
final long memFreed =
docIdToGroup.memoryUsed() + groupDocCount.length * 4L + BUFFER_SIZE
* (4 + 4 + 4) + 12L * docIdToGroup.getNumGroups();
docIdToGroup = null;
groupDocCount = null;
memory.releaseMemory(memFreed);
}
long dynamicMetricUsage = 0;
for (DynamicMetric metric : getDynamicMetrics().values()) {
dynamicMetricUsage += metric.memoryUsed();
}
getDynamicMetrics().clear();
if (dynamicMetricUsage > 0) {
memory.releaseMemory(dynamicMetricUsage);
}
// anything still reserved at this point was never released by its owner
if (memory.usedMemory() > 0) {
log.error("ImhotepLocalSession is leaking! memory reserved after all memory has been freed: "
+ memory.usedMemory());
}
/* clean up the optimization log */
if (this.optimizationLog.exists()) {
this.optimizationLog.delete();
}
try {
/* close temp readers, if there are any, and restore the originals */
if (this.originalReader != null) {
this.flamdexReaderRef.close();
this.flamdexReader = this.originalReader;
this.flamdexReaderRef = this.originalReaderRef;
}
} catch (IOException e) {
log.error("Could not close optimized reader");
}
} finally {
Closeables2.closeQuietly(memory, log);
}
}
/**
 * Safety net: logs (with the stack trace captured at construction) and closes sessions
 * that were garbage-collected without being closed.
 *
 * BUG FIX: super.finalize() was never invoked; it is now called in a finally block as
 * the Object.finalize() contract requires, so superclass cleanup runs even if close() throws.
 */
@Override
protected void finalize() throws Throwable {
    try {
        if (!closed) {
            log.error("ImhotepLocalSession was not closed!!!!!!! stack trace at construction:",
                      constructorStackTrace);
            close();
        }
    } finally {
        super.finalize();
    }
}
/**
 * Discards any session-local optimized readers, then places every document back
 * into group 1 (the default group).
 */
@Override
public synchronized void resetGroups() throws ImhotepOutOfMemoryException {
    resetOptimizedReaders();
    resetGroupsTo(1);
}
/**
 * Replaces the current group lookup with a constant lookup that maps every doc to
 * {@code group}, then recomputes group counts and stats for the new (group + 1)-sized
 * group space. The old lookup's memory is captured before it is replaced and released
 * only after the swap succeeds.
 */
private void resetGroupsTo(int group) throws ImhotepOutOfMemoryException {
// must read the old lookup's usage before overwriting docIdToGroup below
final long bytesToFree = docIdToGroup.memoryUsed();
final int newNumGroups = group + 1;
clearZeroDocBitsets();
accountForFlamdexFTGSIteratorMemChange(docIdToGroup.getNumGroups(), newNumGroups);
docIdToGroup = new ConstantGroupLookup(this, group, numDocs);
recalcGroupCounts(newNumGroups);
recalcGroupStats(newNumGroups);
memory.releaseMemory(bytesToFree);
}
/**
 * Drops the cached bitsets recording which terms only occur in group zero,
 * releasing their reserved memory. No-op when the cache was never built.
 */
void clearZeroDocBitsets() {
    if (fieldZeroDocBitsets == null) {
        return;
    }
    long freed = 0L;
    for (Pair<String, Boolean> key : fieldZeroDocBitsets.keySet()) {
        freed += fieldZeroDocBitsets.get(key).memoryUsage();
    }
    fieldZeroDocBitsets.clear();
    memory.releaseMemory(freed);
}
/**
 * Returns a zeroed int array of at least {@code newSize} elements, reusing and zeroing
 * {@code a} when it is already large enough. Memory accounting: the new array's bytes are
 * claimed before allocation and the old array's bytes released afterwards.
 *
 * BUG FIX: the byte counts are now computed in long arithmetic -- {@code newSize * 4}
 * previously multiplied as int and could overflow (going negative) for newSize > 2^29.
 *
 * @throws ImhotepOutOfMemoryException if the reserver refuses the new array's bytes
 */
private static int[] clearAndResize(int[] a, int newSize, MemoryReserver memory) throws ImhotepOutOfMemoryException {
    if (a == null || newSize > a.length) {
        if (!memory.claimMemory(4L * newSize)) {
            throw new ImhotepOutOfMemoryException();
        }
        final int[] ret = new int[newSize];
        if (a != null) {
            memory.releaseMemory(4L * a.length);
        }
        return ret;
    }
    Arrays.fill(a, 0);
    return a;
}
/**
 * Returns a zeroed long array of at least {@code newSize} elements, reusing and zeroing
 * {@code a} when it is already large enough. Memory accounting mirrors the int overload:
 * claim the new bytes first, release the old bytes after.
 *
 * BUG FIX: byte counts computed in long arithmetic -- {@code newSize * 8} previously
 * multiplied as int and could overflow for newSize > 2^28.
 *
 * @throws ImhotepOutOfMemoryException if the reserver refuses the new array's bytes
 */
private static long[] clearAndResize(long[] a, int newSize, MemoryReserver memory) throws ImhotepOutOfMemoryException {
    if (a == null || newSize > a.length) {
        if (!memory.claimMemory(8L * newSize)) {
            throw new ImhotepOutOfMemoryException();
        }
        final long[] ret = new long[newSize];
        if (a != null) {
            memory.releaseMemory(8L * a.length);
        }
        return ret;
    }
    Arrays.fill(a, 0);
    return a;
}
/**
 * Accumulates statLookup's value for every document into groupStats, bucketed by the
 * doc's group. Docs are processed in BUFFER_SIZE chunks through the supplied scratch
 * buffers (docIdBuf / docGrpBuffer / valBuf).
 */
private static void updateGroupStatsAllDocs(IntValueLookup statLookup,
                                            long[] groupStats,
                                            GroupLookup docIdToGroup,
                                            int[] docGrpBuffer,
                                            int[] docIdBuf,
                                            long[] valBuf) {
    final int totalDocs = docIdToGroup.size();
    for (int base = 0; base < totalDocs; base += BUFFER_SIZE) {
        final int chunk = Math.min(BUFFER_SIZE, totalDocs - base);
        // materialize the doc ids [base, base + chunk) into the scratch buffer
        for (int offset = 0; offset < chunk; ++offset) {
            docIdBuf[offset] = base + offset;
        }
        docIdToGroup.fillDocGrpBuffer(docIdBuf, docGrpBuffer, chunk);
        updateGroupStatsDocIdBuf(statLookup, groupStats, docGrpBuffer, docIdBuf, valBuf, chunk);
    }
}
/**
 * Looks up the stat value for the first n buffered doc ids and folds each value into
 * the stat bucket of that doc's group.
 */
static void updateGroupStatsDocIdBuf(IntValueLookup statLookup,
                                     long[] groupStats,
                                     int[] docGrpBuffer,
                                     int[] docIdBuf,
                                     long[] valBuf,
                                     int n) {
    statLookup.lookup(docIdBuf, valBuf, n);
    for (int idx = 0; idx < n; ++idx) {
        groupStats[docGrpBuffer[idx]] += valBuf[idx];
    }
}
/**
 * Zeroes {@code array} at each of the first {@code groupsSeenCount} positions listed
 * in {@code groupsSeen}; entries beyond that count are ignored.
 */
static void clear(long[] array, int[] groupsSeen, int groupsSeenCount) {
    for (int idx = groupsSeenCount - 1; idx >= 0; --idx) {
        array[groupsSeen[idx]] = 0;
    }
}
/**
 * Per-field rollup of int regroup conditions: the largest inequality bound seen for the
 * field plus the exact-match terms not already covered by an inequality scan.
 */
private static class IntFieldConditionSummary {
// Long.MIN_VALUE is the sentinel for "no inequality condition seen for this field"
long maxInequalityTerm = Long.MIN_VALUE;
Set<Long> otherTerms = new HashSet<Long>();
}
/**
 * Per-field rollup of string regroup conditions: the lexicographically largest inequality
 * bound plus the exact-match terms not already covered by an inequality scan.
 */
private static class StringFieldConditionSummary {
// null is the sentinel for "no inequality condition seen for this field"
String maxInequalityTerm = null;
Set<String> otherTerms = new HashSet<String>();
}
/**
 * Applies the int-typed regroup conditions in {@code remapRules}: for each int field with
 * conditions, walks the relevant terms and lets the group lookup's callback remap matching,
 * not-yet-remapped docs. When the field has inequality conditions, all terms up to the max
 * bound (plus any listed exact terms) are scanned; otherwise only the exact terms are visited.
 *
 * BUG FIXES:
 * - the "has inequality conditions" test was {@code maxInequalityTerm >= 0}, but the
 *   sentinel for "none" is Long.MIN_VALUE -- inequality conditions whose bound was negative
 *   were silently skipped (the string analog correctly checks against its null sentinel);
 * - the term iterator is now closed in a finally block so it is not leaked when a callback throws.
 */
private void applyIntConditions(GroupRemapRule[] remapRules,
                                DocIdStream docIdStream,
                                ThreadSafeBitSet docRemapped) {
    final Map<String, IntFieldConditionSummary> intFields =
        buildIntFieldConditionSummaryMap(remapRules);
    for (final String intField : intFields.keySet()) {
        final IntFieldConditionSummary summary = intFields.get(intField);
        log.debug("Splitting groups using int field: " + intField);
        final IntTermIterator itr = flamdexReader.getIntTermIterator(intField);
        try {
            if (summary.maxInequalityTerm != Long.MIN_VALUE) {
                // at least one inequality condition: scan terms in order, skipping those
                // above the bound that no exact-match condition cares about
                while (itr.next()) {
                    final long itrTerm = itr.term();
                    if (itrTerm > summary.maxInequalityTerm
                            && !summary.otherTerms.contains(itrTerm)) {
                        continue;
                    }
                    docIdStream.reset(itr);
                    do {
                        final int n = docIdStream.fillDocIdBuffer(docIdBuf);
                        docIdToGroup.applyIntConditionsCallback(n,
                                                                docRemapped,
                                                                remapRules,
                                                                intField,
                                                                itrTerm);
                        if (n != docIdBuf.length) {
                            break;
                        }
                    } while (true);
                }
            } else {
                // only exact-match conditions: seek directly to each term
                for (final long term : summary.otherTerms) {
                    itr.reset(term);
                    if (itr.next() && itr.term() == term) {
                        docIdStream.reset(itr);
                        do {
                            final int n = docIdStream.fillDocIdBuffer(docIdBuf);
                            docIdToGroup.applyIntConditionsCallback(n,
                                                                    docRemapped,
                                                                    remapRules,
                                                                    intField,
                                                                    term);
                            if (n != docIdBuf.length) {
                                break;
                            }
                        } while (true);
                    }
                }
            }
        } finally {
            itr.close();
        }
    }
}
/**
 * Applies the string-typed regroup conditions in {@code remapRules}: for each string field
 * with conditions, walks the relevant terms and lets the group lookup's callback remap
 * matching, not-yet-remapped docs. A non-null maxInequalityTerm means the field has
 * inequality conditions and requires a full ordered term scan up to that bound; otherwise
 * only the exact-match terms are visited.
 *
 * BUG FIX: the term iterator is now closed in a finally block so it is not leaked when a
 * callback throws mid-scan.
 */
private void applyStringConditions(GroupRemapRule[] remapRules,
                                   DocIdStream docIdStream,
                                   ThreadSafeBitSet docRemapped) {
    final Map<String, StringFieldConditionSummary> stringFields =
        buildStringFieldConditionSummaryMap(remapRules);
    for (final String stringField : stringFields.keySet()) {
        final StringFieldConditionSummary summary = stringFields.get(stringField);
        log.debug("Splitting groups using string field: " + stringField);
        final StringTermIterator itr = flamdexReader.getStringTermIterator(stringField);
        try {
            if (summary.maxInequalityTerm != null) {
                // inequality conditions present: ordered scan, skipping terms above the
                // bound that no exact-match condition cares about
                while (itr.next()) {
                    final String itrTerm = itr.term();
                    if ((summary.maxInequalityTerm.compareTo(itrTerm) < 0)
                            && !summary.otherTerms.contains(itrTerm)) {
                        continue;
                    }
                    docIdStream.reset(itr);
                    do {
                        final int n = docIdStream.fillDocIdBuffer(docIdBuf);
                        docIdToGroup.applyStringConditionsCallback(n,
                                                                   docRemapped,
                                                                   remapRules,
                                                                   stringField,
                                                                   itrTerm);
                        if (n != docIdBuf.length) {
                            break;
                        }
                    } while (true);
                }
            } else {
                // only exact-match conditions: seek directly to each term
                for (final String term : summary.otherTerms) {
                    itr.reset(term);
                    if (itr.next() && itr.term().equals(term)) {
                        docIdStream.reset(itr);
                        do {
                            final int n = docIdStream.fillDocIdBuffer(docIdBuf);
                            docIdToGroup.applyStringConditionsCallback(n,
                                                                       docRemapped,
                                                                       remapRules,
                                                                       stringField,
                                                                       term);
                            if (n != docIdBuf.length) {
                                break;
                            }
                        } while (true);
                    }
                }
            }
        } finally {
            itr.close();
        }
    }
}
/**
 * Returns true when the doc should NOT be remapped by this condition: the condition is
 * absent, is int-typed, targets a different field (fields are interned, so identity
 * comparison is deliberate), or the current term fails the condition's test.
 * Returns false only when the condition matches the term.
 */
static boolean checkStringCondition(RegroupCondition condition,
                                    String stringField,
                                    String itrTerm) {
    if (condition == null || condition.intType) {
        return true;
    }
    // field is interned, so != is an intentional identity check
    if (stringField != condition.field) {
        return true;
    }
    return condition.inequality
            ? itrTerm.compareTo(condition.stringTerm) > 0
            : !itrTerm.equals(condition.stringTerm);
}
/**
 * Returns true when the doc should NOT be remapped by this condition: the condition is
 * absent, is string-typed, targets a different field (fields are interned, so identity
 * comparison is deliberate), or the current term fails the condition's test.
 * Returns false only when the condition matches the term.
 */
static boolean checkIntCondition(RegroupCondition condition, String intField, long itrTerm) {
    if (condition == null || !condition.intType) {
        return true;
    }
    // field is interned, so != is an intentional identity check
    if (intField != condition.field) {
        return true;
    }
    return condition.inequality
            ? itrTerm > condition.intTerm
            : itrTerm != condition.intTerm;
}
/**
 * Summarizes the int-typed conditions in {@code rules} per field. Pass 1 records each
 * field's largest inequality bound; pass 2 collects exact-match terms, omitting any
 * already covered by the inequality scan (term <= maxInequalityTerm).
 */
private static Map<String, IntFieldConditionSummary> buildIntFieldConditionSummaryMap(GroupRemapRule[] rules) {
    final Map<String, IntFieldConditionSummary> ret =
        new HashMap<String, IntFieldConditionSummary>();
    // pass 1: inequality bounds
    for (final GroupRemapRule rule : rules) {
        final RegroupCondition condition = (rule == null) ? null : rule.condition;
        if (condition == null || !condition.intType || !condition.inequality) {
            continue;
        }
        IntFieldConditionSummary entry = ret.get(condition.field);
        if (entry == null) {
            entry = new IntFieldConditionSummary();
            ret.put(condition.field, entry);
        }
        entry.maxInequalityTerm = Math.max(entry.maxInequalityTerm, condition.intTerm);
    }
    // pass 2: exact-match terms above the bound
    for (final GroupRemapRule rule : rules) {
        final RegroupCondition condition = (rule == null) ? null : rule.condition;
        if (condition == null || !condition.intType || condition.inequality) {
            continue;
        }
        IntFieldConditionSummary entry = ret.get(condition.field);
        if (entry == null) {
            entry = new IntFieldConditionSummary();
            ret.put(condition.field, entry);
        }
        if (condition.intTerm > entry.maxInequalityTerm) {
            entry.otherTerms.add(condition.intTerm);
        }
    }
    return ret;
}
/**
 * Summarizes the string-typed conditions in {@code rules} per field. Pass 1 records each
 * field's lexicographically largest inequality bound; pass 2 collects exact-match terms,
 * omitting any already covered by the inequality scan (term <= maxInequalityTerm).
 */
private static Map<String, StringFieldConditionSummary> buildStringFieldConditionSummaryMap(GroupRemapRule[] rules) {
    final Map<String, StringFieldConditionSummary> ret =
        new HashMap<String, StringFieldConditionSummary>();
    // pass 1: inequality bounds
    for (final GroupRemapRule rule : rules) {
        final RegroupCondition condition = (rule == null) ? null : rule.condition;
        if (condition == null || condition.intType || !condition.inequality) {
            continue;
        }
        StringFieldConditionSummary entry = ret.get(condition.field);
        if (entry == null) {
            entry = new StringFieldConditionSummary();
            ret.put(condition.field, entry);
        }
        entry.maxInequalityTerm = stringMax(entry.maxInequalityTerm, condition.stringTerm);
    }
    // pass 2: exact-match terms above the bound
    for (final GroupRemapRule rule : rules) {
        final RegroupCondition condition = (rule == null) ? null : rule.condition;
        if (condition == null || condition.intType || condition.inequality) {
            continue;
        }
        StringFieldConditionSummary entry = ret.get(condition.field);
        if (entry == null) {
            entry = new StringFieldConditionSummary();
            ret.put(condition.field, entry);
        }
        if (entry.maxInequalityTerm != null
                && condition.stringTerm.compareTo(entry.maxInequalityTerm) <= 0) {
            continue;
        }
        entry.otherTerms.add(condition.stringTerm);
    }
    return ret;
}
/**
 * Returns the lexicographically larger of {@code a} and {@code b}; a null argument is
 * treated as absent, so the other value wins (both null yields null).
 */
private static String stringMax(String a, String b) {
    if (a == null) {
        return b;
    }
    if (b == null) {
        return a;
    }
    return (a.compareTo(b) >= 0) ? a : b;
}
/**
 * Metric operators recognized by the stat parser, keyed by their textual form
 * (e.g. "+" or "abs()"). Use {@link #getMetric} to resolve a stat token; it returns
 * null for unrecognized names.
 */
private static enum Metric {
// nullary metrics
COUNT("count()"),
CACHED("cached()"),
// binary arithmetic
ADD("+"),
SUBTRACT("-"),
MULTIPLY("*"),
DIVIDE("/"),
MODULUS("%"),
// unary / binary math
ABSOLUTE_VALUE("abs()"),
MIN("min()"),
MAX("max()"),
// comparisons (evaluate to 0/1)
EQ("="),
NE("!="),
LT("<"),
LTE("<="),
GT(">"),
GTE(">=");
// the textual token this metric is parsed from
private final String key;
private Metric(final String key) {
this.key = key;
}
// token -> Metric lookup table, built once
private static final Map<String, Metric> map;
static {
final ImmutableMap.Builder<String, Metric> builder = ImmutableMap.builder();
for (final Metric metric : Metric.values()) {
builder.put(metric.key, metric);
}
map = builder.build();
}
// returns null when statName is not a known metric token
static Metric getMetric(final String statName) {
return map.get(statName);
}
}
/**
 * Builds a 0/1 lookup that is 1 for docs containing {@code term} in int field
 * {@code field}. The bitset's bytes are claimed up front and released when the
 * returned lookup is closed.
 */
private IntValueLookup hasIntTermFilter(final String field, final long term) throws ImhotepOutOfMemoryException {
    final long bitSetBytes = getBitSetMemoryUsage();
    if (!memory.claimMemory(bitSetBytes)) {
        throw new ImhotepOutOfMemoryException();
    }
    return new BitSetIntValueLookup(FlamdexUtils.cacheHasIntTerm(field, term, flamdexReader),
                                    bitSetBytes);
}
/**
 * Builds a 0/1 lookup that is 1 for docs containing {@code term} in string field
 * {@code field}. The bitset's bytes are claimed up front and released when the
 * returned lookup is closed.
 */
private IntValueLookup hasStringTermFilter(final String field, final String term) throws ImhotepOutOfMemoryException {
    final long bitSetBytes = getBitSetMemoryUsage();
    if (!memory.claimMemory(bitSetBytes)) {
        throw new ImhotepOutOfMemoryException();
    }
    return new BitSetIntValueLookup(FlamdexUtils.cacheHasStringTerm(field, term, flamdexReader),
                                    bitSetBytes);
}
/**
 * Builds a 0/1 lookup that is 1 for docs with at least one term in {@code field} matching
 * {@code regex}. The bitset's bytes are claimed up front and released when the returned
 * lookup is closed.
 */
private IntValueLookup hasRegexFilter(String field, String regex) throws ImhotepOutOfMemoryException {
    final long bitSetBytes = getBitSetMemoryUsage();
    if (!memory.claimMemory(bitSetBytes)) {
        throw new ImhotepOutOfMemoryException();
    }
    return new BitSetIntValueLookup(FlamdexUtils.cacheRegex(field, regex, flamdexReader),
                                    bitSetBytes);
}
/**
 * Builds a per-doc lookup counting how many terms of int field {@code field} each doc has,
 * saturating at 255 (one byte per doc). The byte array's memory is claimed before
 * allocation; the returned wrapper releases it on close.
 */
private IntValueLookup intTermCountLookup(final String field) throws ImhotepOutOfMemoryException {
    final long bytesNeeded = flamdexReader.getNumDocs();
    if (!memory.claimMemory(bytesNeeded)) {
        throw new ImhotepOutOfMemoryException();
    }
    final byte[] counts = new byte[flamdexReader.getNumDocs()];
    final IntTermIterator iterator = flamdexReader.getIntTermIterator(field);
    try {
        final DocIdStream docIdStream = flamdexReader.getDocIdStream();
        try {
            while (iterator.next()) {
                docIdStream.reset(iterator);
                int filled;
                do {
                    filled = docIdStream.fillDocIdBuffer(docIdBuf);
                    for (int i = 0; i < filled; ++i) {
                        final int doc = docIdBuf[i];
                        // saturate at 255 instead of wrapping
                        if (counts[doc] != (byte) 255) {
                            ++counts[doc];
                        }
                    }
                } while (filled >= BUFFER_SIZE);
            }
        } finally {
            docIdStream.close();
        }
    } finally {
        iterator.close();
    }
    return new MemoryReservingIntValueLookupWrapper(new ByteArrayIntValueLookup(counts, 0, 255));
}
/**
 * Builds a per-doc lookup counting how many terms of string field {@code field} each doc
 * has, saturating at 255 (one byte per doc). The byte array's memory is claimed before
 * allocation; the returned wrapper releases it on close.
 */
private IntValueLookup stringTermCountLookup(final String field) throws ImhotepOutOfMemoryException {
    final long bytesNeeded = flamdexReader.getNumDocs();
    if (!memory.claimMemory(bytesNeeded)) {
        throw new ImhotepOutOfMemoryException();
    }
    final byte[] counts = new byte[flamdexReader.getNumDocs()];
    final StringTermDocIterator iterator = flamdexReader.getStringTermDocIterator(field);
    try {
        while (iterator.nextTerm()) {
            int filled;
            do {
                filled = iterator.fillDocIdBuffer(docIdBuf);
                for (int i = 0; i < filled; ++i) {
                    final int doc = docIdBuf[i];
                    // saturate at 255 instead of wrapping
                    if (counts[doc] != (byte) 255) {
                        ++counts[doc];
                    }
                }
            } while (filled >= BUFFER_SIZE);
        }
    } finally {
        Closeables2.closeQuietly(iterator, log);
    }
    return new MemoryReservingIntValueLookupWrapper(new ByteArrayIntValueLookup(counts, 0, 255));
}
/**
 * Builds a per-doc int lookup from a string field holding decimal numbers: each doc gets
 * round(value * scale + offset), with unparseable terms mapped to 0. The int array's
 * memory is claimed before allocation; the returned wrapper releases it on close.
 *
 * BUG FIX: the claimed byte count is now computed as {@code 4L * numDocs} -- the previous
 * {@code 4 * numDocs} multiplied as int and could overflow before widening to long.
 */
private IntValueLookup scaledFloatLookup(final String field, double scale, double offset) throws ImhotepOutOfMemoryException {
    final long bytesNeeded = 4L * flamdexReader.getNumDocs();
    if (!memory.claimMemory(bytesNeeded)) {
        throw new ImhotepOutOfMemoryException();
    }
    final int[] values = new int[flamdexReader.getNumDocs()];
    int min = Integer.MAX_VALUE;
    int max = Integer.MIN_VALUE;
    final StringTermDocIterator iterator = flamdexReader.getStringTermDocIterator(field);
    try {
        while (iterator.nextTerm()) {
            final String term = iterator.term();
            int number;
            try {
                final double termFloat = Double.parseDouble(term);
                number = (int) Math.round(termFloat * scale + offset);
            } catch (NumberFormatException e) {
                // non-numeric terms map to 0 by design
                number = 0;
            }
            min = Math.min(min, number);
            max = Math.max(max, number);
            int filled;
            do {
                filled = iterator.fillDocIdBuffer(docIdBuf);
                for (int i = 0; i < filled; i++) {
                    values[docIdBuf[i]] = number;
                }
            } while (filled >= BUFFER_SIZE);
        }
    } finally {
        Closeables2.closeQuietly(iterator, log);
    }
    return new MemoryReservingIntValueLookupWrapper(new IntArrayIntValueLookup(values, min, max));
}
/** Bytes needed for a one-bit-per-document bitset: ceil(numDocs / 8). */
private int getBitSetMemoryUsage() {
    final int numDocs = flamdexReader.getNumDocs();
    return numDocs / 8 + ((numDocs & 7) == 0 ? 0 : 1);
}
/**
 * IntValueLookup view over a bitset: each doc's value is 1 if its bit is set, else 0.
 * memoryUsed() reports 0 because the bytes were claimed by the caller at construction
 * time; close() releases that reservation and drops the bitset reference.
 */
private class BitSetIntValueLookup implements IntValueLookup {
    private ThreadSafeBitSet bitSet;
    private final long memoryUsage;

    private BitSetIntValueLookup(ThreadSafeBitSet bitSet, long memoryUsage) {
        this.bitSet = bitSet;
        this.memoryUsage = memoryUsage;
    }

    @Override
    public long getMin() {
        return 0;
    }

    @Override
    public long getMax() {
        return 1;
    }

    @Override
    public void lookup(int[] docIds, long[] values, int n) {
        for (int i = 0; i < n; ++i) {
            if (bitSet.get(docIds[i])) {
                values[i] = 1;
            } else {
                values[i] = 0;
            }
        }
    }

    @Override
    public long memoryUsed() {
        // memory was already accounted for when the caller claimed it
        return 0;
    }

    @Override
    public void close() {
        bitSet = null;
        memory.releaseMemory(memoryUsage);
    }
}
/**
 * Delegating IntValueLookup that, on close, releases the delegate's reported memory
 * back to the session's reserver (the claim having been made before the delegate was built).
 */
private final class MemoryReservingIntValueLookupWrapper implements IntValueLookup {
    final IntValueLookup lookup;

    private MemoryReservingIntValueLookupWrapper(final IntValueLookup lookup) {
        this.lookup = lookup;
    }

    @Override
    public long getMin() {
        return lookup.getMin();
    }

    @Override
    public long getMax() {
        return lookup.getMax();
    }

    @Override
    public void lookup(final int[] docIds, final long[] values, final int n) {
        lookup.lookup(docIds, values, n);
    }

    @Override
    public long memoryUsed() {
        return lookup.memoryUsed();
    }

    @Override
    public void close() {
        // capture usage before closing; the delegate may report 0 afterwards
        final long bytesToRelease = lookup.memoryUsed();
        lookup.close();
        memory.releaseMemory(bytesToRelease);
    }
}
}