/*
* Copyright (C) 2014 Indeed Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.indeed.flamdex.simple;
import com.google.common.base.Throwables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.indeed.util.io.Files;
import com.indeed.flamdex.AbstractFlamdexReader;
import com.indeed.flamdex.api.DocIdStream;
import com.indeed.flamdex.api.GenericIntTermDocIterator;
import com.indeed.flamdex.api.GenericRawStringTermDocIterator;
import com.indeed.flamdex.api.IntTermDocIterator;
import com.indeed.flamdex.api.RawFlamdexReader;
import com.indeed.flamdex.api.RawStringTermDocIterator;
import com.indeed.flamdex.fieldcache.UnsortedIntTermDocIterator;
import com.indeed.flamdex.reader.FlamdexMetadata;
import com.indeed.flamdex.utils.FlamdexUtils;
import com.indeed.imhotep.io.caching.CachedFile;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Set;
/**
* @author jsgroth
*/
public class SimpleFlamdexReader extends AbstractFlamdexReader implements RawFlamdexReader {
private final Collection<String> intFields;
private final Collection<String> stringFields;
private final MapCache mapCache = new MapCache();
private static final boolean useNativeDocIdStream;
static {
final String useNative = System.getProperties().getProperty("com.indeed.flamdex.simple.useNative");
useNativeDocIdStream = "true".equalsIgnoreCase(useNative);
}
protected SimpleFlamdexReader(String directory,
int numDocs,
Collection<String> intFields,
Collection<String> stringFields,
boolean useMMapMetrics) {
super(directory, numDocs, useMMapMetrics);
this.intFields = intFields;
this.stringFields = stringFields;
}
public static SimpleFlamdexReader open(String directory) throws IOException {
return open(directory, new Config());
}
public static SimpleFlamdexReader open(String directory, Config config) throws IOException {
final FlamdexMetadata metadata = FlamdexMetadata.readMetadata(directory);
final Collection<String> intFields = scan(directory, ".intterms");
final Collection<String> stringFields = scan(directory, ".strterms");
if (config.writeBTreesIfNotExisting) {
buildIntBTrees(directory, Lists.newArrayList(intFields));
buildStringBTrees(directory, Lists.newArrayList(stringFields));
}
return new SimpleFlamdexReader(directory, metadata.numDocs, intFields, stringFields, config.useMMapMetrics);
}
protected static Collection<String> scan(final String directory, final String ending) throws IOException {
final Set<String> fields = Sets.newTreeSet();
final CachedFile dir = CachedFile.create(directory);
for (final String name : dir.list()) {
if (name.startsWith("fld-") && name.endsWith(ending)) {
fields.add(name.substring(4, name.length() - ending.length()));
}
}
return fields;
}
@Override
public Collection<String> getIntFields() {
return intFields;
}
@Override
public Collection<String> getStringFields() {
return stringFields;
}
@Override
public DocIdStream getDocIdStream() {
return useNativeDocIdStream ? new NativeDocIdStream(mapCache) : new SimpleDocIdStream(mapCache);
}
@Override
public SimpleIntTermIterator getIntTermIterator(String field) {
final String termsFilename = CachedFile.buildPath(directory, SimpleIntFieldWriter.getTermsFilename(field));
final String docsFilename = CachedFile.buildPath(directory, SimpleIntFieldWriter.getDocsFilename(field));
if (CachedFile.create(termsFilename).length() == 0L) {
// try to read it as a String field and convert
final SimpleStringTermIterator stringTermIterator = getStringTermIterator(field);
if(!(stringTermIterator instanceof NullStringTermIterator)) {
return new StringToIntTermIterator(stringTermIterator);
}
// string field not found. return a null iterator
return new NullIntTermIterator(docsFilename);
}
final String indexFilename = CachedFile.buildPath(directory, "fld-"+field);
try {
return new SimpleIntTermIteratorImpl(mapCache, termsFilename, docsFilename, indexFilename);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
@Override
public SimpleStringTermIterator getStringTermIterator(String field) {
final String termsFilename = CachedFile.buildPath(directory, SimpleStringFieldWriter.getTermsFilename(field));
final String docsFilename = CachedFile.buildPath(directory, SimpleStringFieldWriter.getDocsFilename(field));
if (CachedFile.create(termsFilename).length() == 0L) {
return new NullStringTermIterator(docsFilename);
}
final String indexFilename = CachedFile.buildPath(directory, "fld-"+field+".strindex");
try {
return new SimpleStringTermIteratorImpl(mapCache, termsFilename, docsFilename, indexFilename);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
@Override
protected UnsortedIntTermDocIterator createUnsortedIntTermDocIterator(final String field) {
return getIntTermDocIterator(field);
}
@Override
public IntTermDocIterator getIntTermDocIterator(final String field) {
final SimpleIntTermIterator termIterator = getIntTermIterator(field);
if (useNativeDocIdStream && CachedFile.create(termIterator.getFilename()).length() > 0) {
try {
return new NativeIntTermDocIterator(termIterator, mapCache);
} catch (IOException e) {
throw Throwables.propagate(e);
}
} else {
return new GenericIntTermDocIterator(termIterator, getDocIdStream());
}
}
@Override
public RawStringTermDocIterator getStringTermDocIterator(final String field) {
final SimpleStringTermIterator termIterator = getStringTermIterator(field);
if (useNativeDocIdStream && CachedFile.create(termIterator.getFilename()).length() > 0) {
try {
return new NativeStringTermDocIterator(termIterator, mapCache);
} catch (IOException e) {
throw Throwables.propagate(e);
}
} else {
return new GenericRawStringTermDocIterator(termIterator, getDocIdStream());
}
}
@Override
public long getIntTotalDocFreq(String field) {
return FlamdexUtils.getIntTotalDocFreq(this, field);
}
@Override
public long getStringTotalDocFreq(String field) {
return FlamdexUtils.getStringTotalDocFreq(this, field);
}
@Override
public Collection<String> getAvailableMetrics() {
return intFields;
}
@Override
public void close() throws IOException {
mapCache.close();
}
protected static void buildIntBTrees(final String directory, final List<String> intFields) throws IOException {
for (final String intField : intFields) {
final File btreeDir = new File(Files.buildPath(directory, "fld-" + intField + ".intindex"));
final File btreeDir64 = new File(Files.buildPath(directory, "fld-" + intField + ".intindex64"));
if (!btreeDir.exists() && !btreeDir64.exists()) {
SimpleFlamdexWriter.writeIntBTree(directory, intField, btreeDir64);
}
}
}
protected static void buildStringBTrees(final String directory, final List<String> stringFields) throws IOException {
for (final String stringField : stringFields) {
final File btreeDir = new File(Files.buildPath(directory, "fld-" + stringField + ".strindex"));
if (!btreeDir.exists()) {
SimpleFlamdexWriter.writeStringBTree(directory, stringField, btreeDir);
}
}
}
public static final class Config {
private boolean writeBTreesIfNotExisting = true;
private boolean useMMapMetrics = System.getProperty("flamdex.mmap.fieldcache") != null;
public boolean isWriteBTreesIfNotExisting() {
return writeBTreesIfNotExisting;
}
public Config setWriteBTreesIfNotExisting(boolean writeBTreesIfNotExisting) {
this.writeBTreesIfNotExisting = writeBTreesIfNotExisting;
return this;
}
public boolean isUseMMapMetrics() {
return useMMapMetrics;
}
public Config setUseMMapMetrics(boolean useMMapMetrics) {
this.useMMapMetrics = useMMapMetrics;
return this;
}
}
}