/*
* Copyright (C) 2014 Indeed Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.indeed.flamdex.fieldcache;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Charsets;
import com.google.common.io.Closer;
import com.google.common.io.CountingOutputStream;
import com.google.common.io.LittleEndianDataOutputStream;
import com.indeed.util.core.Pair;
import com.indeed.util.core.Throwables2;
import com.indeed.util.core.io.Closeables2;
import com.indeed.flamdex.api.FlamdexReader;
import com.indeed.flamdex.api.IntValueLookup;
import com.indeed.flamdex.api.StringTermDocIterator;
import com.indeed.flamdex.api.StringValueLookup;
import com.indeed.flamdex.datastruct.MMapFastBitSet;
import com.indeed.flamdex.utils.FlamdexUtils;
import com.indeed.util.mmap.BufferResource;
import com.indeed.util.mmap.IntArray;
import com.indeed.util.mmap.MMapBuffer;
import com.indeed.util.mmap.NativeBuffer;
import com.indeed.util.mmap.ZeroCopyOutputStream;
import org.apache.log4j.Logger;
import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
import java.util.UUID;
/**
* @author jsgroth
*/
public enum FieldCacher {
LONG {
    /** Reports 8 per document, the width of a {@code long} in bytes. */
    @Override
    public long memoryRequired(int numDocs) {
        return numDocs * 8L;
    }

    /** Wraps the result of {@link FlamdexUtils#cacheLongField} in a long-array lookup. */
    @Override
    public IntValueLookup newFieldCache(UnsortedIntTermDocIterator iterator, int numDocs) {
        return new LongArrayIntValueLookup(FlamdexUtils.cacheLongField(iterator, numDocs));
    }

    /**
     * Returns a lookup over the little-endian cache file for {@code field} in {@code directory},
     * generating that file first when it does not exist yet.
     */
    @Override
    public IntValueLookup newMMapFieldCache(UnsortedIntTermDocIterator iterator, int numDocs, String field, String directory) throws IOException {
        final File target = new File(directory, getMMapFileName(field));
        final MMapBuffer mapped = mapOrBuild(iterator, numDocs, field, directory, target);
        return new MMapLongArrayIntValueLookup(mapped, numDocs);
    }

    /** Maps {@code target} read-only; a missing file is treated as a cache miss and gets built. */
    private MMapBuffer mapOrBuild(UnsortedIntTermDocIterator iterator, int numDocs, String field, String directory, File target) throws IOException {
        try {
            return new MMapBuffer(target, FileChannel.MapMode.READ_ONLY, ByteOrder.LITTLE_ENDIAN);
        } catch (FileNotFoundException missing) {
            final CacheToFileOperation<MMapBuffer> writer = new CacheToFileOperation<MMapBuffer>() {
                @Override
                public MMapBuffer execute(UnsortedIntTermDocIterator source, int docCount, File out) throws IOException {
                    return FlamdexUtils.cacheLongFieldToFile(source, docCount, out);
                }
            };
            return cacheToFileAtomically(iterator, numDocs, field, directory, target, writer);
        }
    }

    /** Cache files for this cacher are named {@code fld-<field>.longcache}. */
    @Override
    public String getMMapFileName(String field) {
        final String suffix = ".longcache";
        return "fld-" + field + suffix;
    }
},
INT {
    /** Reports 4 per document, the width of an {@code int} in bytes. */
    @Override
    public long memoryRequired(int numDocs) {
        return numDocs * 4L;
    }

    /** Wraps the result of {@link FlamdexUtils#cacheIntField} in an int-array lookup. */
    @Override
    public IntValueLookup newFieldCache(UnsortedIntTermDocIterator iterator, int numDocs) {
        return new IntArrayIntValueLookup(FlamdexUtils.cacheIntField(iterator, numDocs));
    }

    /**
     * Returns a lookup over the little-endian cache file for {@code field} in {@code directory},
     * generating that file first when it does not exist yet.
     */
    @Override
    public IntValueLookup newMMapFieldCache(UnsortedIntTermDocIterator iterator, int numDocs, String field, String directory) throws IOException {
        final File target = new File(directory, getMMapFileName(field));
        final MMapBuffer mapped = mapOrBuild(iterator, numDocs, field, directory, target);
        return new MMapIntArrayIntValueLookup(mapped, numDocs);
    }

    /** Maps {@code target} read-only; a missing file is treated as a cache miss and gets built. */
    private MMapBuffer mapOrBuild(UnsortedIntTermDocIterator iterator, int numDocs, String field, String directory, File target) throws IOException {
        try {
            return new MMapBuffer(target, FileChannel.MapMode.READ_ONLY, ByteOrder.LITTLE_ENDIAN);
        } catch (FileNotFoundException missing) {
            final CacheToFileOperation<MMapBuffer> writer = new CacheToFileOperation<MMapBuffer>() {
                @Override
                public MMapBuffer execute(UnsortedIntTermDocIterator source, int docCount, File out) throws IOException {
                    return FlamdexUtils.cacheIntFieldToFile(source, docCount, out);
                }
            };
            return cacheToFileAtomically(iterator, numDocs, field, directory, target, writer);
        }
    }

    /** Cache files for this cacher are named {@code fld-<field>.intcache}. */
    @Override
    public String getMMapFileName(String field) {
        final String suffix = ".intcache";
        return "fld-" + field + suffix;
    }
},
CHAR {
    /** Reports 2 per document, the width of a {@code char} in bytes. */
    @Override
    public long memoryRequired(int numDocs) {
        return numDocs * 2L;
    }

    /** Wraps the result of {@link FlamdexUtils#cacheCharField} in a char-array lookup. */
    @Override
    public IntValueLookup newFieldCache(UnsortedIntTermDocIterator iterator, int numDocs) {
        return new CharArrayIntValueLookup(FlamdexUtils.cacheCharField(iterator, numDocs));
    }

    /**
     * Returns a lookup over the little-endian cache file for {@code field} in {@code directory},
     * generating that file first when it does not exist yet.
     */
    @Override
    public IntValueLookup newMMapFieldCache(UnsortedIntTermDocIterator iterator, int numDocs, String field, String directory) throws IOException {
        final File target = new File(directory, getMMapFileName(field));
        final MMapBuffer mapped = mapOrBuild(iterator, numDocs, field, directory, target);
        return new MMapCharArrayIntValueLookup(mapped, numDocs);
    }

    /** Maps {@code target} read-only; a missing file is treated as a cache miss and gets built. */
    private MMapBuffer mapOrBuild(UnsortedIntTermDocIterator iterator, int numDocs, String field, String directory, File target) throws IOException {
        try {
            return new MMapBuffer(target, FileChannel.MapMode.READ_ONLY, ByteOrder.LITTLE_ENDIAN);
        } catch (FileNotFoundException missing) {
            final CacheToFileOperation<MMapBuffer> writer = new CacheToFileOperation<MMapBuffer>() {
                @Override
                public MMapBuffer execute(UnsortedIntTermDocIterator source, int docCount, File out) throws IOException {
                    return FlamdexUtils.cacheCharFieldToFile(source, docCount, out);
                }
            };
            return cacheToFileAtomically(iterator, numDocs, field, directory, target, writer);
        }
    }

    /** Cache files for this cacher are named {@code fld-<field>.charcache}. */
    @Override
    public String getMMapFileName(String field) {
        final String suffix = ".charcache";
        return "fld-" + field + suffix;
    }
},
SHORT {
    /** Reports 2 per document, the width of a {@code short} in bytes. */
    @Override
    public long memoryRequired(int numDocs) {
        return numDocs * 2L;
    }

    /** Wraps the result of {@link FlamdexUtils#cacheShortField} in a short-array lookup. */
    @Override
    public IntValueLookup newFieldCache(UnsortedIntTermDocIterator iterator, int numDocs) {
        return new ShortArrayIntValueLookup(FlamdexUtils.cacheShortField(iterator, numDocs));
    }

    /**
     * Returns a lookup over the little-endian cache file for {@code field} in {@code directory},
     * generating that file first when it does not exist yet.
     */
    @Override
    public IntValueLookup newMMapFieldCache(UnsortedIntTermDocIterator iterator, int numDocs, String field, String directory) throws IOException {
        final File target = new File(directory, getMMapFileName(field));
        final MMapBuffer mapped = mapOrBuild(iterator, numDocs, field, directory, target);
        return new MMapShortArrayIntValueLookup(mapped, numDocs);
    }

    /** Maps {@code target} read-only; a missing file is treated as a cache miss and gets built. */
    private MMapBuffer mapOrBuild(UnsortedIntTermDocIterator iterator, int numDocs, String field, String directory, File target) throws IOException {
        try {
            return new MMapBuffer(target, FileChannel.MapMode.READ_ONLY, ByteOrder.LITTLE_ENDIAN);
        } catch (FileNotFoundException missing) {
            final CacheToFileOperation<MMapBuffer> writer = new CacheToFileOperation<MMapBuffer>() {
                @Override
                public MMapBuffer execute(UnsortedIntTermDocIterator source, int docCount, File out) throws IOException {
                    return FlamdexUtils.cacheShortFieldToFile(source, docCount, out);
                }
            };
            return cacheToFileAtomically(iterator, numDocs, field, directory, target, writer);
        }
    }

    /** Cache files for this cacher are named {@code fld-<field>.shortcache}. */
    @Override
    public String getMMapFileName(String field) {
        final String suffix = ".shortcache";
        return "fld-" + field + suffix;
    }
},
BYTE {
    /** Reports exactly {@code numDocs}: one unit per document. */
    @Override
    public long memoryRequired(int numDocs) {
        final long perDocTotal = numDocs;
        return perDocTotal;
    }

    /** Wraps the result of {@link FlamdexUtils#cacheByteField} in the (unsigned) byte-array lookup. */
    @Override
    public IntValueLookup newFieldCache(UnsortedIntTermDocIterator iterator, int numDocs) {
        return new ByteArrayIntValueLookup(FlamdexUtils.cacheByteField(iterator, numDocs));
    }

    /**
     * Returns a lookup over the little-endian cache file for {@code field} in {@code directory},
     * generating that file first when it does not exist yet.
     */
    @Override
    public IntValueLookup newMMapFieldCache(UnsortedIntTermDocIterator iterator, int numDocs, String field, String directory) throws IOException {
        final File target = new File(directory, getMMapFileName(field));
        final MMapBuffer mapped = mapOrBuild(iterator, numDocs, field, directory, target);
        return new MMapByteArrayIntValueLookup(mapped, numDocs);
    }

    /** Maps {@code target} read-only; a missing file is treated as a cache miss and gets built. */
    private MMapBuffer mapOrBuild(UnsortedIntTermDocIterator iterator, int numDocs, String field, String directory, File target) throws IOException {
        try {
            return new MMapBuffer(target, FileChannel.MapMode.READ_ONLY, ByteOrder.LITTLE_ENDIAN);
        } catch (FileNotFoundException missing) {
            final CacheToFileOperation<MMapBuffer> writer = new CacheToFileOperation<MMapBuffer>() {
                @Override
                public MMapBuffer execute(UnsortedIntTermDocIterator source, int docCount, File out) throws IOException {
                    return FlamdexUtils.cacheByteFieldToFile(source, docCount, out);
                }
            };
            return cacheToFileAtomically(iterator, numDocs, field, directory, target, writer);
        }
    }

    /** Cache files for this cacher are named {@code fld-<field>.bytecache}. */
    @Override
    public String getMMapFileName(String field) {
        final String suffix = ".bytecache";
        return "fld-" + field + suffix;
    }
},
SIGNED_BYTE {
    /** Reports exactly {@code numDocs}: one unit per document. */
    @Override
    public long memoryRequired(int numDocs) {
        final long perDocTotal = numDocs;
        return perDocTotal;
    }

    /**
     * Uses the same {@link FlamdexUtils#cacheByteField} data as {@code BYTE}, but exposes it
     * through the signed-byte lookup class.
     */
    @Override
    public IntValueLookup newFieldCache(UnsortedIntTermDocIterator iterator, int numDocs) {
        return new SignedByteArrayIntValueLookup(FlamdexUtils.cacheByteField(iterator, numDocs));
    }

    /**
     * Returns a lookup over the little-endian cache file for {@code field} in {@code directory},
     * generating that file first when it does not exist yet.
     */
    @Override
    public IntValueLookup newMMapFieldCache(UnsortedIntTermDocIterator iterator, int numDocs, String field, String directory) throws IOException {
        final File target = new File(directory, getMMapFileName(field));
        final MMapBuffer mapped = mapOrBuild(iterator, numDocs, field, directory, target);
        return new MMapSignedByteArrayIntValueLookup(mapped, numDocs);
    }

    /** Maps {@code target} read-only; a missing file is treated as a cache miss and gets built. */
    private MMapBuffer mapOrBuild(UnsortedIntTermDocIterator iterator, int numDocs, String field, String directory, File target) throws IOException {
        try {
            return new MMapBuffer(target, FileChannel.MapMode.READ_ONLY, ByteOrder.LITTLE_ENDIAN);
        } catch (FileNotFoundException missing) {
            final CacheToFileOperation<MMapBuffer> writer = new CacheToFileOperation<MMapBuffer>() {
                @Override
                public MMapBuffer execute(UnsortedIntTermDocIterator source, int docCount, File out) throws IOException {
                    return FlamdexUtils.cacheByteFieldToFile(source, docCount, out);
                }
            };
            return cacheToFileAtomically(iterator, numDocs, field, directory, target, writer);
        }
    }

    /** Cache files for this cacher are named {@code fld-<field>.sbytecache}. */
    @Override
    public String getMMapFileName(String field) {
        final String suffix = ".sbytecache";
        return "fld-" + field + suffix;
    }
},
BITSET {
    /** Reports 8 per 64-bit word, using {@code (numDocs + 64) >> 6} words. */
    @Override
    public long memoryRequired(int numDocs) {
        final long words = ((long) numDocs + 64) >> 6;
        return words * 8L;
    }

    /** Wraps the result of {@link FlamdexUtils#cacheBitSetField} in a bit-set lookup. */
    @Override
    public IntValueLookup newFieldCache(UnsortedIntTermDocIterator iterator, int numDocs) {
        return new BitSetIntValueLookup(FlamdexUtils.cacheBitSetField(iterator, numDocs));
    }

    /**
     * Opens the existing bit-set cache file for {@code field} in {@code directory}; when the
     * file is missing, writes it via {@code cacheToFileAtomically} and wraps the resulting bit set.
     */
    @Override
    public IntValueLookup newMMapFieldCache(UnsortedIntTermDocIterator iterator, int numDocs, String field, String directory) throws IOException {
        final File target = new File(directory, getMMapFileName(field));
        try {
            return new MMapBitSetIntValueLookup(target, numDocs);
        } catch (FileNotFoundException missing) {
            // A missing file simply means the cache has not been built yet.
            final CacheToFileOperation<MMapFastBitSet> writer = new CacheToFileOperation<MMapFastBitSet>() {
                @Override
                public MMapFastBitSet execute(UnsortedIntTermDocIterator source, int docCount, File out) throws IOException {
                    return FlamdexUtils.cacheBitSetFieldToFile(source, docCount, out);
                }
            };
            final MMapFastBitSet bits = cacheToFileAtomically(iterator, numDocs, field, directory, target, writer);
            return new MMapBitSetIntValueLookup(bits);
        }
    }

    /** Cache files for this cacher are named {@code fld-<field>.bitsetcache}. */
    @Override
    public String getMMapFileName(String field) {
        final String suffix = ".bitsetcache";
        return "fld-" + field + suffix;
    }
};
/** Logger used by the static helpers of this enum (failed deletes and quiet closes). */
private static final Logger log = Logger.getLogger(FieldCacher.class);
/**
 * Returns the amount of memory this cacher needs to hold one value for each of
 * {@code numDocs} documents. The constants return the element width times {@code numDocs}
 * (8 for LONG, 4 for INT, 2 for CHAR/SHORT, 1 for BYTE/SIGNED_BYTE) and one 8-unit word per
 * 64 documents for BITSET, so the unit is presumably bytes.
 *
 * @param numDocs number of documents to be cached
 * @return required memory as a {@code long}
 */
public abstract long memoryRequired(int numDocs);
/**
 * Creates a field cache for {@code field} by opening an unsorted int term/doc iterator on
 * {@code r} and delegating to {@link #newFieldCache(UnsortedIntTermDocIterator, int)}.
 * The iterator is always closed before this method returns or throws.
 *
 * @param field name of the int field to cache
 * @param r     reader supplying the terms and the document count
 * @return the lookup produced by this cacher
 */
public final IntValueLookup newFieldCache(String field, FlamdexReader r) {
    final UnsortedIntTermDocIterator termDocs = UnsortedIntTermDocIteratorImpl.create(r, field);
    final IntValueLookup lookup;
    try {
        lookup = newFieldCache(termDocs, r.getNumDocs());
    } finally {
        termDocs.close();
    }
    return lookup;
}
/**
 * Builds a lookup for this cacher's value width from the given iterator. Every constant
 * implements this by calling the matching {@code FlamdexUtils.cacheXxxField(iterator, numDocs)}
 * helper and wrapping the result; no cache file is involved.
 *
 * @param iterator source of the field's term/doc data; not closed by the implementations shown here
 * @param numDocs  number of documents in the index
 * @return a lookup over the cached values
 */
public abstract IntValueLookup newFieldCache(UnsortedIntTermDocIterator iterator, int numDocs);
/**
 * Creates a file-backed field cache for {@code field}: opens an unsorted int term/doc
 * iterator on {@code r}, delegates to
 * {@link #newMMapFieldCache(UnsortedIntTermDocIterator, int, String, String)}, and closes
 * the iterator whether or not the delegate succeeds.
 *
 * @param field     name of the int field to cache
 * @param r         reader supplying the terms and the document count
 * @param directory directory in which the cache file lives or will be created
 * @return the lookup produced by this cacher
 * @throws IOException if the delegate fails
 */
public final IntValueLookup newMMapFieldCache(String field, FlamdexReader r, String directory) throws IOException {
    final UnsortedIntTermDocIterator termDocs = UnsortedIntTermDocIteratorImpl.create(r, field);
    final IntValueLookup lookup;
    try {
        lookup = newMMapFieldCache(termDocs, r.getNumDocs(), field, directory);
    } finally {
        termDocs.close();
    }
    return lookup;
}
/**
 * Builds a string value lookup for {@code field}: the first buffer returned by
 * {@code buildStringValueLookup} (per-document offsets) and the second (encoded term values)
 * are handed to {@link MMapStringValueLookup}.
 *
 * @param field     name of the string field
 * @param r         reader supplying the terms and the document count
 * @param directory passed through to {@code buildStringValueLookup}
 * @return a lookup backed by the two buffers
 * @throws IOException if building the buffers fails
 */
public static StringValueLookup newStringValueLookup(String field, FlamdexReader r, String directory) throws IOException {
    final Pair<? extends BufferResource, ? extends BufferResource> buffers = buildStringValueLookup(field, r, directory);
    final BufferResource docOffsets = buffers.getFirst();
    final BufferResource termValues = buffers.getSecond();
    return new MMapStringValueLookup(docOffsets, termValues);
}
/**
 * Builds the two native buffers behind a string value lookup for {@code field}.
 *
 * <p>The first buffer holds one little-endian int per document: the offset, within the second
 * buffer, of the term assigned to that document. The second buffer holds a leading zero byte
 * followed by every term as a length prefix plus UTF-8 bytes. Lengths below 0xFF are stored in
 * a single byte; longer terms are stored as the marker byte 0xFF followed by a 4-byte length.
 *
 * <p>On failure every buffer and stream registered with the closer is released. On success
 * nothing is closed here except the term iterator, because the returned buffers must stay
 * alive for the caller.
 *
 * @param field     string field to read terms from
 * @param r         reader supplying the terms and the document count
 * @param directory currently unused by this method
 * @return pair of (per-document offsets buffer, encoded values buffer)
 * @throws IOException if reading terms or writing the values fails
 */
private static Pair<? extends BufferResource, ? extends BufferResource> buildStringValueLookup(final String field,
        final FlamdexReader r,
        final String directory) throws IOException {
    final Closer closer = Closer.create();
    // Deliberately NOT registered with the closer: the finally block below is the single
    // place that closes the iterator, so it is never closed twice on the error path.
    StringTermDocIterator stringTermDocIterator = null;
    try {
        final int numDocs = r.getNumDocs();
        // 4L forces long arithmetic; a plain int product overflows once numDocs exceeds
        // Integer.MAX_VALUE / 4 and would request a bogus (negative or tiny) buffer.
        final NativeBuffer offsets = closer.register(new NativeBuffer(4L * numDocs, ByteOrder.LITTLE_ENDIAN));
        final IntArray intArray = offsets.memory().intArray(0, numDocs);
        final ZeroCopyOutputStream valuesFileOut = new ZeroCopyOutputStream();
        final CountingOutputStream counter = new CountingOutputStream(new BufferedOutputStream(valuesFileOut));
        final LittleEndianDataOutputStream valuesOut = closer.register(new LittleEndianDataOutputStream(counter));
        // Occupies offset 0 with a zero length byte, so no real term ever starts at offset 0.
        // NOTE(review): presumably offset 0 then reads as "empty string" for docs without a
        // term; that relies on the offsets buffer starting out zeroed - confirm.
        valuesOut.writeByte(0);
        stringTermDocIterator = r.getStringTermDocIterator(field);
        final int[] docIdBuffer = new int[1024];
        while (stringTermDocIterator.nextTerm()) {
            // counter sits above the BufferedOutputStream, so this is the logical write position.
            // NOTE(review): the cast silently wraps once more than 2 GiB of values are written.
            final int offset = (int) counter.getCount();
            final String term = stringTermDocIterator.term();
            final byte[] bytes = term.getBytes(Charsets.UTF_8);
            if (bytes.length < 0xFF) {
                valuesOut.writeByte(bytes.length);
            } else {
                valuesOut.writeByte(0xFF);
                valuesOut.writeInt(bytes.length);
            }
            valuesOut.write(bytes);
            // Point every document carrying this term at the term's offset.
            while (true) {
                final int n = stringTermDocIterator.fillDocIdBuffer(docIdBuffer);
                for (int i = 0; i < n; i++) {
                    intArray.set(docIdBuffer[i], offset);
                }
                // A short read means the doc list for this term is exhausted.
                if (n < docIdBuffer.length) {
                    break;
                }
            }
        }
        // Push buffered bytes down before asking the underlying stream for its position.
        valuesOut.flush();
        final NativeBuffer buffer = valuesFileOut.getBuffer().realloc(valuesFileOut.position());
        return Pair.of(offsets, buffer);
    } catch (Throwable t) {
        // Release quietly so that a failure while closing cannot replace the original cause.
        Closeables2.closeQuietly(closer, log);
        throw Throwables2.propagate(t, IOException.class);
    } finally {
        Closeables2.closeQuietly(stringTermDocIterator, log);
    }
}
/**
 * Builds a lookup backed by a cache file in {@code directory}. Every constant first tries to
 * open the file named by {@link #getMMapFileName(String)}; if that raises
 * {@link java.io.FileNotFoundException} the file is generated from {@code iterator} through
 * {@code cacheToFileAtomically} and the freshly written data is used instead.
 *
 * @param iterator  source of the field's term/doc data, consulted only when the file is missing
 * @param numDocs   number of documents in the index
 * @param field     field name, used to derive the cache file name
 * @param directory directory containing (or receiving) the cache file
 * @return a lookup over the file-backed values
 * @throws IOException if the file cannot be opened or written
 */
public abstract IntValueLookup newMMapFieldCache(UnsortedIntTermDocIterator iterator, int numDocs, String field, String directory) throws IOException;
/**
 * Returns the cache file name for {@code field}; each constant uses
 * {@code "fld-" + field + <type-specific suffix>}, e.g. {@code .longcache} or {@code .bitsetcache}.
 */
@VisibleForTesting
abstract String getMMapFileName(String field);
/**
 * Chooses a cacher for {@code field} from the minimum and maximum term reported by
 * {@link FlamdexUtils#getMinMaxTerm}. Candidates are tried in this order and the first whose
 * range contains both bounds wins: BITSET [0, 1], BYTE [0, 255], SIGNED_BYTE [-128, 127],
 * CHAR [0, 65535], SHORT, INT, and finally LONG as the catch-all.
 *
 * @param field int field to inspect
 * @param r     reader the field belongs to
 * @return the selected cacher
 */
public static FieldCacher getCacherForField(String field, FlamdexReader r) {
    final long[] bounds = FlamdexUtils.getMinMaxTerm(field, r);
    final long lo = bounds[0];
    final long hi = bounds[1];
    final boolean nonNegative = lo >= 0;

    if (nonNegative && hi <= 1) {
        return BITSET;
    }
    if (nonNegative && hi <= 255) {
        return BYTE;
    }
    if (lo >= Byte.MIN_VALUE && hi <= Byte.MAX_VALUE) {
        return SIGNED_BYTE;
    }
    if (nonNegative && hi <= 65535) {
        return CHAR;
    }
    if (lo >= Short.MIN_VALUE && hi <= Short.MAX_VALUE) {
        return SHORT;
    }
    if (lo >= Integer.MIN_VALUE && hi <= Integer.MAX_VALUE) {
        return INT;
    }
    return LONG;
}
/** Best-effort delete of {@code f}; a failed delete is logged as an error, never thrown. */
private static void delete(File f) {
    final boolean removed = f.delete();
    if (removed) {
        return;
    }
    log.error("unable to delete file " + f);
}
/**
 * Writes a cache file under a unique temporary name in {@code directory} and then renames it
 * to {@code cacheFile}, so readers never observe a partially written cache file.
 *
 * <p>If the operation throws anything (including an {@link Error}) or the rename fails, the
 * resource returned by the operation is closed and the temporary file is removed when it
 * exists.
 *
 * @param iterator  passed through to {@code op}
 * @param numDocs   passed through to {@code op}
 * @param field     field name, used in the temporary file name
 * @param directory directory in which the temporary file is created
 * @param cacheFile final location of the cache file
 * @param op        writes the cache to the temporary file and returns a resource over it
 * @return the resource produced by {@code op}
 * @throws IOException if {@code op} fails or the temporary file cannot be renamed
 */
private static <T extends Closeable> T cacheToFileAtomically(UnsortedIntTermDocIterator iterator, int numDocs, String field, String directory, File cacheFile, CacheToFileOperation<T> op) throws IOException {
    // The ".intcache." infix is used for every cacher type; only the UUID must be unique.
    final File tmp = new File(directory, "fld-" + field + ".intcache." + UUID.randomUUID());
    T ret = null;
    boolean success = false;
    try {
        ret = op.execute(iterator, numDocs, tmp);
        if (!tmp.renameTo(cacheFile)) {
            throw new IOException("unable to rename " + tmp + " to " + cacheFile);
        }
        success = true;
        return ret;
    } finally {
        if (!success) {
            // Close before deleting: a file that is still open/mapped may refuse deletion
            // on some platforms.
            if (ret != null) {
                Closeables2.closeQuietly(ret, log);
            }
            // The operation may have failed before creating the file; skipping the delete
            // in that case avoids a misleading "unable to delete" error in the log.
            if (tmp.exists()) {
                delete(tmp);
            }
        }
    }
}
/**
 * Callback handed to {@code cacheToFileAtomically}: writes a field cache to the given file and
 * returns a resource over what was written.
 *
 * @param <T> type of the returned resource ({@code MMapBuffer} or {@code MMapFastBitSet} in this file)
 */
private static interface CacheToFileOperation<T> {
/**
 * @param iterator source of the field's term/doc data
 * @param numDocs  number of documents in the index
 * @param f        file to write to; callers in this file pass a temporary file
 * @return resource over the written cache
 * @throws IOException if writing fails
 */
T execute(UnsortedIntTermDocIterator iterator, int numDocs, File f) throws IOException;
}
}