/*
* Copyright (C) 2014 Indeed Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.indeed.flamdex.utils;
import com.indeed.util.core.threads.ThreadSafeBitSet;
import com.indeed.util.core.io.Closeables2;
import com.indeed.flamdex.api.DocIdStream;
import com.indeed.flamdex.api.FlamdexReader;
import com.indeed.flamdex.api.IntTermIterator;
import com.indeed.flamdex.api.StringTermIterator;
import com.indeed.flamdex.datastruct.FastBitSet;
import com.indeed.flamdex.datastruct.MMapFastBitSet;
import com.indeed.flamdex.fieldcache.UnsortedIntTermDocIterator;
import com.indeed.flamdex.fieldcache.UnsortedIntTermDocIteratorImpl;
import com.indeed.util.io.VIntUtils;
import com.indeed.util.mmap.ByteArray;
import com.indeed.util.mmap.CharArray;
import com.indeed.util.mmap.IntArray;
import com.indeed.util.mmap.LongArray;
import com.indeed.util.mmap.MMapBuffer;
import com.indeed.util.mmap.ShortArray;
import dk.brics.automaton.Automaton;
import dk.brics.automaton.RegExp;
import org.apache.log4j.Logger;
import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
/**
* @author jsgroth
*/
public class FlamdexUtils {
private static final Logger LOG = Logger.getLogger(FlamdexUtils.class);
private static final int BUFFER_SIZE = 32;
public static int[] cacheIntField(String field, FlamdexReader reader) {
final UnsortedIntTermDocIterator iterator = UnsortedIntTermDocIteratorImpl.create(reader, field);
try {
return cacheIntField(iterator, reader.getNumDocs());
} finally {
iterator.close();
}
}
public static long[] cacheLongField(String field, FlamdexReader reader) {
final UnsortedIntTermDocIterator iterator = UnsortedIntTermDocIteratorImpl.create(reader, field);
try {
return cacheLongField(iterator, reader.getNumDocs());
} finally {
iterator.close();
}
}
public static long[] cacheLongField(UnsortedIntTermDocIterator iterator, int numDocs) {
final int[] docIdBuf = new int[BUFFER_SIZE];
final long[] cache = new long[numDocs];
while (iterator.nextTerm()) {
final long term = iterator.term();
while (true) {
final int n = iterator.nextDocs(docIdBuf);
for (int i = 0; i < n; ++i) {
cache[docIdBuf[i]] = term;
}
if (n < BUFFER_SIZE) break;
}
}
return cache;
}
public static MMapBuffer cacheLongFieldToFile(UnsortedIntTermDocIterator iterator, int numDocs, File file) throws IOException {
final int[] docIdBuf = new int[BUFFER_SIZE];
final int length = numDocs * 8;
final MMapBuffer buffer = new MMapBuffer(file, 0L, length, FileChannel.MapMode.READ_WRITE, ByteOrder.LITTLE_ENDIAN);
final LongArray longArray = buffer.memory().longArray(0, numDocs);
try {
while (iterator.nextTerm()) {
final long term = iterator.term();
while (true) {
final int n = iterator.nextDocs(docIdBuf);
for (int i = 0; i < n; ++i) {
longArray.set(docIdBuf[i], term);
}
if (n < docIdBuf.length) {
break;
}
}
}
buffer.sync(0, length);
} catch (RuntimeException e) {
Closeables2.closeQuietly(buffer, LOG);
throw e;
} catch (IOException e) {
Closeables2.closeQuietly(buffer, LOG);
throw e;
}
return buffer;
}
public static int[] cacheIntField(UnsortedIntTermDocIterator iterator, int numDocs) {
final int[] docIdBuf = new int[BUFFER_SIZE];
final int[] cache = new int[numDocs];
while (iterator.nextTerm()) {
final long term = iterator.term();
while (true) {
final int n = iterator.nextDocs(docIdBuf);
for (int i = 0; i < n; ++i) {
cache[docIdBuf[i]] = (int)term;
}
if (n < BUFFER_SIZE) break;
}
}
return cache;
}
public static MMapBuffer cacheIntFieldToFile(UnsortedIntTermDocIterator iterator, int numDocs, File file) throws IOException {
final int[] docIdBuf = new int[BUFFER_SIZE];
final int length = numDocs * 4;
final MMapBuffer buffer = new MMapBuffer(file, 0L, length, FileChannel.MapMode.READ_WRITE, ByteOrder.LITTLE_ENDIAN);
final IntArray intArray = buffer.memory().intArray(0, numDocs);
try {
while (iterator.nextTerm()) {
final long term = iterator.term();
while (true) {
final int n = iterator.nextDocs(docIdBuf);
for (int i = 0; i < n; ++i) {
intArray.set(docIdBuf[i], (int)term);
}
if (n < docIdBuf.length) {
break;
}
}
}
buffer.sync(0, length);
} catch (RuntimeException e) {
Closeables2.closeQuietly(buffer, LOG);
throw e;
} catch (IOException e) {
Closeables2.closeQuietly(buffer, LOG);
throw e;
}
return buffer;
}
public static char[] cacheCharField(String field, FlamdexReader reader) {
final UnsortedIntTermDocIterator iterator = UnsortedIntTermDocIteratorImpl.create(reader, field);
try {
return cacheCharField(iterator, reader.getNumDocs());
} finally {
iterator.close();
}
}
public static char[] cacheCharField(UnsortedIntTermDocIterator iterator, int numDocs) {
final int[] docIdBuf = new int[BUFFER_SIZE];
final char[] cache = new char[numDocs];
while (iterator.nextTerm()) {
final long term = iterator.term();
while (true) {
final int n = iterator.nextDocs(docIdBuf);
for (int i = 0; i < n; ++i) {
cache[docIdBuf[i]] = (char)term;
}
if (n < BUFFER_SIZE) break;
}
}
return cache;
}
public static MMapBuffer cacheCharFieldToFile(UnsortedIntTermDocIterator iterator, int numDocs, File file) throws IOException {
final int[] docIdBuf = new int[BUFFER_SIZE];
final int length = numDocs * 2;
final MMapBuffer buffer = new MMapBuffer(file, 0L, length, FileChannel.MapMode.READ_WRITE, ByteOrder.LITTLE_ENDIAN);
final CharArray charArray = buffer.memory().charArray(0, numDocs);
try {
while (iterator.nextTerm()) {
final char term = (char)iterator.term();
while (true) {
final int n = iterator.nextDocs(docIdBuf);
for (int i = 0; i < n; ++i) {
charArray.set(docIdBuf[i], term);
}
if (n < docIdBuf.length) {
break;
}
}
}
buffer.sync(0, length);
} catch (RuntimeException e) {
Closeables2.closeQuietly(buffer, LOG);
throw e;
} catch (IOException e) {
Closeables2.closeQuietly(buffer, LOG);
throw e;
}
return buffer;
}
public static short[] cacheShortField(UnsortedIntTermDocIterator iterator, int numDocs) {
final int[] docIdBuf = new int[BUFFER_SIZE];
final short[] cache = new short[numDocs];
while (iterator.nextTerm()) {
final long term = iterator.term();
while (true) {
final int n = iterator.nextDocs(docIdBuf);
for (int i = 0; i < n; ++i) {
cache[docIdBuf[i]] = (short)term;
}
if (n < BUFFER_SIZE) break;
}
}
return cache;
}
public static MMapBuffer cacheShortFieldToFile(UnsortedIntTermDocIterator iterator, int numDocs, File file) throws IOException {
final int[] docIdBuf = new int[BUFFER_SIZE];
final int length = numDocs * 2;
final MMapBuffer buffer = new MMapBuffer(file, 0L, length, FileChannel.MapMode.READ_WRITE, ByteOrder.LITTLE_ENDIAN);
final ShortArray shortArray = buffer.memory().shortArray(0, numDocs);
try {
while (iterator.nextTerm()) {
final short term = (short)iterator.term();
while (true) {
final int n = iterator.nextDocs(docIdBuf);
for (int i = 0; i < n; ++i) {
shortArray.set(docIdBuf[i], term);
}
if (n < docIdBuf.length) {
break;
}
}
}
buffer.sync(0, length);
} catch (RuntimeException e) {
Closeables2.closeQuietly(buffer, LOG);
throw e;
} catch (IOException e) {
Closeables2.closeQuietly(buffer, LOG);
throw e;
}
return buffer;
}
public static byte[] cacheByteField(String field, FlamdexReader reader) {
final UnsortedIntTermDocIterator iterator = UnsortedIntTermDocIteratorImpl.create(reader, field);
try {
return cacheByteField(iterator, reader.getNumDocs());
} finally {
iterator.close();
}
}
public static byte[] cacheByteField(UnsortedIntTermDocIterator iterator, int numDocs) {
final int[] docIdBuf = new int[BUFFER_SIZE];
final byte[] cache = new byte[numDocs];
while (iterator.nextTerm()) {
final long term = iterator.term();
while (true) {
final int n = iterator.nextDocs(docIdBuf);
for (int i = 0; i < n; ++i) {
cache[docIdBuf[i]] = (byte)term;
}
if (n < BUFFER_SIZE) break;
}
}
return cache;
}
public static MMapBuffer cacheByteFieldToFile(UnsortedIntTermDocIterator iterator, int numDocs, File file) throws IOException {
final int[] docIdBuf = new int[BUFFER_SIZE];
final MMapBuffer buffer = new MMapBuffer(file, 0L, numDocs, FileChannel.MapMode.READ_WRITE, ByteOrder.LITTLE_ENDIAN);
final ByteArray byteArray = buffer.memory().byteArray(0, numDocs);
try {
while (iterator.nextTerm()) {
final byte term = (byte)iterator.term();
while (true) {
final int n = iterator.nextDocs(docIdBuf);
for (int i = 0; i < n; ++i) {
byteArray.set(docIdBuf[i], term);
}
if (n < docIdBuf.length) {
break;
}
}
}
buffer.sync(0, numDocs);
} catch (RuntimeException e) {
Closeables2.closeQuietly(buffer, LOG);
throw e;
} catch (IOException e) {
Closeables2.closeQuietly(buffer, LOG);
throw e;
}
return buffer;
}
public static FastBitSet cacheBitSetField(String field, FlamdexReader reader) {
final UnsortedIntTermDocIterator iterator = UnsortedIntTermDocIteratorImpl.create(reader, field);
try {
return cacheBitSetField(iterator, reader.getNumDocs());
} finally {
iterator.close();
}
}
public static FastBitSet cacheBitSetField(UnsortedIntTermDocIterator iterator, int numDocs) {
final int[] docIdBuf = new int[BUFFER_SIZE];
final FastBitSet cache = new FastBitSet(numDocs);
while (iterator.nextTerm()) {
final long term = iterator.term();
final boolean boolVal = (term == 1);
while (true) {
final int n = iterator.nextDocs(docIdBuf);
for (int i = 0; i < n; ++i) {
cache.set(docIdBuf[i], boolVal);
}
if (n < BUFFER_SIZE) break;
}
}
return cache;
}
public static MMapFastBitSet cacheBitSetFieldToFile(UnsortedIntTermDocIterator iterator, int numDocs, File file) throws IOException {
final int[] docIdBuf = new int[BUFFER_SIZE];
final MMapFastBitSet cache = new MMapFastBitSet(file, numDocs, FileChannel.MapMode.READ_WRITE);
try {
while (iterator.nextTerm()) {
final long term = iterator.term();
final boolean boolVal = (term == 1);
while (true) {
final int n = iterator.nextDocs(docIdBuf);
for (int i = 0; i < n; ++i) {
cache.set(docIdBuf[i], boolVal);
}
if (n < BUFFER_SIZE) break;
}
}
cache.sync();
} catch (RuntimeException e) {
Closeables2.closeQuietly(cache, LOG);
throw e;
} catch (IOException e) {
Closeables2.closeQuietly(cache, LOG);
throw e;
}
return cache;
}
public static String[] cacheStringField(String field, FlamdexReader reader) {
final int[] docIdBuf = new int[BUFFER_SIZE];
final String[] cache = new String[reader.getNumDocs()];
final DocIdStream docIdStream = reader.getDocIdStream();
try {
final StringTermIterator it = reader.getStringTermIterator(field);
try {
while (it.next()) {
docIdStream.reset(it);
final String term = it.term();
while (true) {
final int n = docIdStream.fillDocIdBuffer(docIdBuf);
for (int i = 0; i < n; ++i) {
cache[docIdBuf[i]] = term;
}
if (n < BUFFER_SIZE) break;
}
}
} finally {
it.close();
}
} finally {
docIdStream.close();
}
return cache;
}
public static float[] cacheStringFieldAsFloat(String field, FlamdexReader reader, boolean ignoreNonFloats) {
final int[] docIdBuf = new int[BUFFER_SIZE];
final float[] cache = new float[reader.getNumDocs()];
final DocIdStream docIdStream = reader.getDocIdStream();
try {
final StringTermIterator it = reader.getStringTermIterator(field);
try {
while (it.next()) {
final float term;
try {
term = Float.parseFloat(it.term());
} catch (NumberFormatException e) {
if (!ignoreNonFloats) {
throw e;
}
continue;
}
docIdStream.reset(it);
while (true) {
final int n = docIdStream.fillDocIdBuffer(docIdBuf);
for (int i = 0; i < n; ++i) {
cache[docIdBuf[i]] = term;
}
if (n < BUFFER_SIZE) break;
}
}
} finally {
it.close();
}
} finally {
docIdStream.close();
}
return cache;
}
public static long[] getMinMaxTerm(String field, FlamdexReader r) {
final IntTermIterator iterator = r.getIntTermIterator(field);
long minTerm = Long.MAX_VALUE;
long maxTerm = Long.MIN_VALUE;
try {
while (iterator.next()) {
maxTerm = Math.max(maxTerm, iterator.term());
minTerm = Math.min(minTerm, iterator.term());
}
} finally {
iterator.close();
}
return new long[]{minTerm, maxTerm};
}
public static int writeVLong(long i, OutputStream out) throws IOException {
return VIntUtils.writeVInt64(out, i);
}
public static long readVLong(InputStream in) throws IOException {
long ret = 0L;
int shift = 0;
do {
int b = in.read();
if (b < 0) {
//sorry
if (shift != 0) {
throw new IllegalStateException();
}
throw new EOFException();
}
ret |= ((b & 0x7FL) << shift);
if (b < 0x80) return ret;
shift += 7;
} while (true);
}
public static ThreadSafeBitSet cacheHasIntTerm(final String field, final long term, final FlamdexReader reader) {
final ThreadSafeBitSet ret = new ThreadSafeBitSet(reader.getNumDocs());
final IntTermIterator iter = reader.getIntTermIterator(field);
try {
iter.reset(term);
if (iter.next() && iter.term() == term) {
final DocIdStream dis = reader.getDocIdStream();
dis.reset(iter);
fillBitSet(dis, ret);
dis.close();
}
} finally {
iter.close();
}
return ret;
}
private static void fillBitSet(DocIdStream dis, ThreadSafeBitSet ret) {
final int[] docIdBuffer = new int[64];
while (true) {
final int n = dis.fillDocIdBuffer(docIdBuffer);
for (int i = 0; i < n; ++i) {
ret.set(docIdBuffer[i]);
}
if (n < docIdBuffer.length) break;
}
}
public static ThreadSafeBitSet cacheHasStringTerm(final String field, final String term, final FlamdexReader reader) {
final ThreadSafeBitSet ret = new ThreadSafeBitSet(reader.getNumDocs());
final StringTermIterator iter = reader.getStringTermIterator(field);
try {
iter.reset(term);
if (iter.next() && iter.term().equals(term)) {
final DocIdStream dis = reader.getDocIdStream();
dis.reset(iter);
fillBitSet(dis, ret);
dis.close();
}
} finally {
iter.close();
}
return ret;
}
public static ThreadSafeBitSet cacheRegex(final String field, final String regex, final FlamdexReader reader) {
final Automaton automaton = new RegExp(regex).toAutomaton();
final ThreadSafeBitSet ret = new ThreadSafeBitSet(reader.getNumDocs());
if (reader.getIntFields().contains(field)) {
cacheIntFieldRegex(field, reader, automaton, ret);
} else if (reader.getStringFields().contains(field)) {
cacheStringFieldRegex(field, reader, automaton, ret);
} else {
// No exception on unknown field because fields can be added and queries can legitimately cross boundaries
// where the field isn't defined. Instead, just return an empty bitset.
}
return ret;
}
private static void cacheIntFieldRegex(String field, FlamdexReader reader, Automaton automaton, ThreadSafeBitSet ret) {
try (final IntTermIterator iter = reader.getIntTermIterator(field);
final DocIdStream dis = reader.getDocIdStream()) {
while (iter.next()) {
if (automaton.run(String.valueOf(iter.term()))) {
dis.reset(iter);
fillBitSet(dis, ret);
}
}
}
}
// TODO: Use automaton.getCommonPrefix() to reset to a start point and short circuit after that prefix?
private static void cacheStringFieldRegex(String field, FlamdexReader reader, Automaton automaton, ThreadSafeBitSet ret) {
try (final StringTermIterator iter = reader.getStringTermIterator(field);
final DocIdStream dis = reader.getDocIdStream()) {
while (iter.next()) {
if (automaton.run(iter.term())) {
dis.reset(iter);
fillBitSet(dis, ret);
}
}
}
}
public static long getIntTotalDocFreq(final FlamdexReader r, final String field) {
final IntTermIterator iter = r.getIntTermIterator(field);
long totalDocFreq = 0L;
try {
while (iter.next()) {
totalDocFreq += iter.docFreq();
}
} finally {
iter.close();
}
return totalDocFreq;
}
public static long getStringTotalDocFreq(final FlamdexReader r, final String field) {
final StringTermIterator iter = r.getStringTermIterator(field);
long totalDocFreq = 0L;
try {
while (iter.next()) {
totalDocFreq += iter.docFreq();
}
} finally {
iter.close();
}
return totalDocFreq;
}
}