/* * Copyright (C) 2014 Indeed Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either * express or implied. See the License for the specific language governing permissions and * limitations under the License. */ package com.indeed.flamdex.reader; import com.google.common.base.Throwables; import com.indeed.util.io.Files; import com.indeed.flamdex.api.DocIdStream; import com.indeed.flamdex.api.FlamdexOutOfMemoryException; import com.indeed.flamdex.api.FlamdexReader; import com.indeed.flamdex.api.GenericFlamdexFactory; import com.indeed.flamdex.api.GenericIntTermDocIterator; import com.indeed.flamdex.api.GenericStringTermDocIterator; import com.indeed.flamdex.api.IntTermDocIterator; import com.indeed.flamdex.api.IntTermIterator; import com.indeed.flamdex.api.IntValueLookup; import com.indeed.flamdex.api.StringTermDocIterator; import com.indeed.flamdex.api.StringTermIterator; import com.indeed.flamdex.api.StringValueLookup; import com.indeed.flamdex.fieldcache.FieldCacher; import com.indeed.flamdex.fieldcache.IntArrayIntValueLookup; import com.indeed.flamdex.lucene.LuceneFlamdexReader; import com.indeed.flamdex.ramses.RamsesFlamdexWrapper; import com.indeed.flamdex.simple.SimpleFlamdexReader; import com.indeed.flamdex.utils.FlamdexUtils; import com.indeed.imhotep.io.caching.CachedFile; import org.apache.log4j.Logger; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.ParallelReader; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.util.Collection; /** * @author jplaisance */ public final class GenericFlamdexReader implements FlamdexReader { private static final Logger log = Logger.getLogger(GenericFlamdexReader.class); private final String directory; private final GenericFlamdexFactory factory; private final int numDocs; private final Collection<String> intFields; private final Collection<String> stringFields; private GenericFlamdexReader( String directory, GenericFlamdexFactory factory, int numDocs, Collection<String> intFields, Collection<String> stringFields ) { this.directory = directory; this.factory = factory; this.numDocs = numDocs; this.intFields = intFields; this.stringFields = stringFields; } public static FlamdexReader open( String directory ) throws IOException { final FlamdexReader r = internalOpen(directory); if (RamsesFlamdexWrapper.ramsesFilesExist(directory)) { return new RamsesFlamdexWrapper(r, directory); } return r; } private static FlamdexReader internalOpen(String directory) throws IOException { final CachedFile dir = CachedFile.create(directory); final String metadataPath = CachedFile.buildPath(directory, "metadata.txt"); final CachedFile metadataFile = CachedFile.create(metadataPath); if (! dir.exists()) { throw new FileNotFoundException(directory + " does not exist"); } if (! dir.isDirectory()) { throw new FileNotFoundException(directory + " is not a directory"); } if (! metadataFile.exists()) { final IndexReader luceneIndex; final File indexDir = dir.loadDirectory(); if (IndexReader.indexExists(indexDir)) { luceneIndex = IndexReader.open(indexDir); } else { throw new IOException("directory " + directory + " does not have a metadata.txt and is not a lucene index"); } // try finding and loading subindexes final ParallelReader pReader = new ParallelReader(); pReader.add(luceneIndex); final int maxDoc = luceneIndex.maxDoc(); final File[] files = indexDir.listFiles(); if (files != null) { for (final File file : files) { if (!file.isDirectory() || !IndexReader.indexExists(file)) { continue; // only interested in Lucene indexes in subdirectories } try { final IndexReader subIndexReader = IndexReader.open(file); final int siMaxDoc = subIndexReader.maxDoc(); if (siMaxDoc != maxDoc) { log.warn("unable to load subindex. (maxDoc) do not match index (" + siMaxDoc + ") != (" + maxDoc + ") for " + file.getAbsolutePath()); continue; } pReader.add(subIndexReader, true); } catch (IOException e) { log.warn("unable to open subindex: " + file.getAbsolutePath()); } } } return new LuceneFlamdexReader(pReader, directory); } final FlamdexMetadata metadata = FlamdexMetadata.readMetadata(directory); switch (metadata.getFormatVersion()) { case 0 : return SimpleFlamdexReader.open(directory); case 1 : throw new UnsupportedOperationException("pfordelta is no longer supported"); case 2 : final File indexDir = dir.loadDirectory(); return new LuceneFlamdexReader(IndexReader.open(indexDir), metadata.getIntFields(), metadata.getStringFields()); } throw new IllegalArgumentException("index format version "+metadata.getFormatVersion()+" not supported"); } public static GenericFlamdexReader open( String directory, GenericFlamdexFactory factory ) throws IOException { final FlamdexMetadata metadata = FlamdexMetadata.readMetadata(directory); return new GenericFlamdexReader(directory, factory, metadata.numDocs, metadata.intFields, metadata.stringFields); } @Override public Collection<String> getIntFields() { return intFields; } @Override public Collection<String> getStringFields() { return stringFields; } @Override public int getNumDocs() { return numDocs; } @Override public String getDirectory() { return directory; } @Override public DocIdStream getDocIdStream() { return factory.createDocIdStream(); } @Override public IntTermIterator getIntTermIterator(String field) { final String termsFilename = Files.buildPath(directory, factory.getIntTermsFilename(field)); final String docsFilename = Files.buildPath(directory, factory.getIntDocsFilename(field)); try { return factory.createIntTermIterator(termsFilename, docsFilename); } catch (IOException e) { throw new RuntimeException(e); } } @Override public StringTermIterator getStringTermIterator(String field) { final String termsFilename = Files.buildPath(directory, factory.getStringTermsFilename(field)); final String docsFilename = Files.buildPath(directory, factory.getStringDocsFilename(field)); try { return factory.createStringTermIterator(termsFilename, docsFilename); } catch (IOException e) { throw new RuntimeException(e); } } @Override public IntTermDocIterator getIntTermDocIterator(final String field) { return new GenericIntTermDocIterator(getIntTermIterator(field), getDocIdStream()); } @Override public StringTermDocIterator getStringTermDocIterator(final String field) { return new GenericStringTermDocIterator(getStringTermIterator(field), getDocIdStream()); } @Override public long getIntTotalDocFreq(String field) { return FlamdexUtils.getIntTotalDocFreq(this, field); } @Override public long getStringTotalDocFreq(String field) { return FlamdexUtils.getStringTotalDocFreq(this, field); } @Override public Collection<String> getAvailableMetrics() { return intFields; } @Override public IntValueLookup getMetric(String metric) throws FlamdexOutOfMemoryException { return new IntArrayIntValueLookup(FlamdexUtils.cacheIntField(metric, this)); } public StringValueLookup getStringLookup(final String field) throws FlamdexOutOfMemoryException { try { return FieldCacher.newStringValueLookup(field, this, directory); } catch (IOException e) { throw Throwables.propagate(e); } } @Override public long memoryRequired(String metric) { return 4L * getNumDocs(); } @Override public void close() throws IOException { } }