package com.yahoo.glimmer.query;

/*
 * Copyright (c) 2012 Yahoo! Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software distributed under the License is
 * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 * See accompanying LICENSE file.
 */

import it.unimi.di.big.mg4j.document.DocumentCollection;
import it.unimi.di.big.mg4j.document.IdentityDocumentFactory;
import it.unimi.di.big.mg4j.index.BitStreamIndex;
import it.unimi.di.big.mg4j.index.DiskBasedIndex;
import it.unimi.di.big.mg4j.index.Index;
import it.unimi.di.big.mg4j.index.Index.UriKeys;
import it.unimi.di.big.mg4j.index.IndexIterator;
import it.unimi.di.big.mg4j.index.QuasiSuccinctIndex;
import it.unimi.di.big.mg4j.index.TermProcessor;
import it.unimi.di.big.mg4j.query.QueryEngine;
import it.unimi.di.big.mg4j.query.SelectedInterval;
import it.unimi.di.big.mg4j.query.nodes.Query;
import it.unimi.di.big.mg4j.query.nodes.QueryBuilderVisitorException;
import it.unimi.di.big.mg4j.query.nodes.Select;
import it.unimi.di.big.mg4j.search.DocumentIteratorBuilderVisitor;
import it.unimi.di.big.mg4j.search.score.CountScorer;
import it.unimi.di.big.mg4j.search.score.DocumentScoreInfo;
import it.unimi.di.big.mg4j.search.score.Scorer;
import it.unimi.dsi.big.util.ImmutableExternalPrefixMap;
import it.unimi.dsi.big.util.LongBigListSignedStringMap;
import it.unimi.dsi.big.util.SemiExternalGammaBigList;
import it.unimi.dsi.big.util.StringMap;
import it.unimi.dsi.fastutil.BigList;
import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2ReferenceLinkedOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2ReferenceMap;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectSet;
import it.unimi.dsi.fastutil.objects.Reference2DoubleMap;
import it.unimi.dsi.fastutil.objects.Reference2DoubleOpenHashMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceOpenHashMap;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.sux4j.io.FileLinesBigList;
import it.unimi.dsi.sux4j.io.FileLinesList;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.log4j.Logger;
import org.semanticweb.yars.nx.namespace.RDF;

import com.yahoo.glimmer.indexing.TitleListDocumentCollection;
import com.yahoo.glimmer.util.BlockCompressedDocumentCollection;
import com.yahoo.glimmer.util.Util;
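/**
 * Facade over the MG4J indexes, document collection and resource maps that
 * make up a Glimmer knowledge-base index.
 *
 * <p>A minimal usage sketch (hypothetical: it assumes an already-populated
 * {@link Context} and that {@link RDFQueryParser} exposes a
 * {@code parse(String)} method returning a parsed {@code Query}):
 *
 * <pre>{@code
 * RDFIndex index = new RDFIndex("myIndex", context);
 * Query query = index.getParser().parse("dog");
 * ObjectArrayList<DocumentScoreInfo<Reference2ObjectMap<Index, SelectedInterval[]>>> results =
 *         new ObjectArrayList<DocumentScoreInfo<Reference2ObjectMap<Index, SelectedInterval[]>>>();
 * int hits = index.process(0, 10, results, query); // first page of at most 10 hits
 * index.destroy();
 * }</pre>
 */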
public class RDFIndex {
    private final static Logger LOGGER = Logger.getLogger(RDFIndex.class);

    public final static int MAX_STEMMING = 1024;

    private final static String TYPE_FIELD_NAME = Util.encodeFieldName(RDF.TYPE.toString());
    private final static String BASENAME_INDEX_PROPERTY_KEY = "basename";
    private final static String ALIGNMENT_INDEX_NAME = "alignment";
    private final static String SUBJECT_INDEX_KEY = "subject";
    private final static String SUBJECT_TEXT_INDEX_KEY = "subjectText";
    private final static String PREDICATE_INDEX_KEY = "predicate";
    private final static String OBJECT_INDEX_KEY = "object";
    private final static String CONTEXT_INDEX_KEY = "context";
    private static final String[] HORIZONTAL_INDICES = new String[] { SUBJECT_INDEX_KEY, SUBJECT_TEXT_INDEX_KEY, PREDICATE_INDEX_KEY, OBJECT_INDEX_KEY,
            CONTEXT_INDEX_KEY };
    private static final String[] MANDATORY_HORIZONTAL_INDICES = new String[] { PREDICATE_INDEX_KEY, OBJECT_INDEX_KEY };

    private final String indexName;
    /** The query engine. */
    private QueryEngine queryEngine;
    /** The document collection. */
    private DocumentCollection documentCollection = null;
    /** Term counts in the token index. */
    protected SemiExternalGammaBigList frequencies = null;
    /** Document priors. */
    protected HashMap<Integer, Integer> documentPriors = null;
    /** Map used to encode URIs for retrieval from the collection. */
    protected Object2LongFunction<CharSequence> allResourcesToIds;
    /** Map used to decode URIs. */
    protected FileLinesList allIdsToResources;
    /** The alignment index. */
    protected Index alignmentIndex;

    private String resourceIdPrefix = "@";
    private Map<String, Integer> predicateDistribution;
    private Map<String, Integer> typeTermDistribution;
    private Set<String> verticalPredicates;

    protected RDFIndexStatistics stats;
    protected RDFQueryParser parser;

    @SuppressWarnings("unchecked")
    private static <T> T loadObjectOfType(File file) throws RDFIndexException {
        if (file == null) {
            return null;
        }
        try {
            return (T) BinIO.loadObject(file);
        } catch (Exception e) {
            throw new RDFIndexException("While loading from " + file.getPath(), e);
        }
    }

    public RDFIndex(String indexName, Context context) throws RDFIndexException {
        this.indexName = indexName;

        File kbRootPath = context.getKbRootPath();
        if (kbRootPath == null) {
            throw new IllegalArgumentException("path to knowledge base root is not set.");
        }
        if (!kbRootPath.isDirectory()) {
            throw new IllegalArgumentException("path to knowledge base root is not a directory.");
        }
        File verticalIndexDir = context.getVerticalIndexDir();
        if (verticalIndexDir == null) {
            throw new IllegalArgumentException("path to vertical indexes is not set.");
        }
        if (!verticalIndexDir.isDirectory()) {
            throw new IllegalArgumentException("path to vertical indexes is not a directory.");
        }
        File horizontalIndexDir = context.getHorizontalIndexDir();
        if (horizontalIndexDir == null) {
            throw new IllegalArgumentException("path to horizontal indexes is not set.");
        }
        if (!horizontalIndexDir.isDirectory()) {
            throw new IllegalArgumentException("path to horizontal indexes is not a directory.");
        }

        // Load the collection or title list.
        String indexBasename = new File(kbRootPath, "bySubject").getAbsolutePath();
        try {
            BlockCompressedDocumentCollection collection = new BlockCompressedDocumentCollection("bySubject", new IdentityDocumentFactory(), 100000);
            collection.filename(indexBasename);
            documentCollection = collection;
        } catch (IOException e) {
            LOGGER.info("Couldn't open BlockCompressedDocumentCollection from " + indexBasename, e);
        }

        if (documentCollection == null) {
            LOGGER.info("No collection specified, we will try to use a title list...");
            File titleListFile = context.getTitleListFile();
            if (titleListFile != null) {
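                // Fallback: serve documents from just their titles when the full
                // bySubject collection is unavailable.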
                LOGGER.info("Loading title list from " + titleListFile.getPath());
                BigList<MutableString> titleList;
                try {
                    titleList = new FileLinesBigList(titleListFile.getPath(), "ASCII");
                    LOGGER.info("Loaded title list of size " + titleList.size() + ".");
                    documentCollection = new TitleListDocumentCollection(titleList);
                } catch (Exception e) {
                    throw new IllegalArgumentException("Failed to open TitleListDocumentCollection.", e);
                }
            }
        }

        resourceIdPrefix = context.getResourceIdPrefix();

        // Load the hash function over all resources.
        allResourcesToIds = loadObjectOfType(context.getAllResourcesMapFile());
        if (allResourcesToIds == null) {
            LOGGER.warn("No resources map specified!");
        } else {
            LOGGER.info("Loaded resources map " + context.getAllResourcesMapFile().getPath() + " with " + allResourcesToIds.size() + " entries.");
        }
        try {
            allResourcesToIds = new LongBigListSignedStringMap(allResourcesToIds, context.getAllResourcesSignatureFile().getPath());
        } catch (Exception e) {
            throw new RDFIndexException("Exception while creating 'all' resources signed map", e);
        }

        // Load the reverse all-resources function.
        File allResourcesFile = context.getAllResourcesFile();
        if (!allResourcesFile.exists()) {
            throw new RDFIndexException("All resources file " + allResourcesFile.getPath() + " does not exist.");
        }
        try {
            allIdsToResources = new FileLinesList(allResourcesFile.getPath(), "UTF-8");
        } catch (IOException e) {
            throw new RDFIndexException("Couldn't open all resources file " + allResourcesFile.getPath() + " as a FileLinesList.", e);
        }

        // Load vertical indexes.
        Object2ReferenceMap<String, Index> indexMap = loadIndexesFromDir(verticalIndexDir, context.getLoadDocumentSizes(), context.getLoadIndexesInMemory());
        LOGGER.info("Loaded " + indexMap.size() + " vertical indices.");
        verticalPredicates = Collections.unmodifiableSet(new HashSet<String>(indexMap.keySet()));

        try {
            LOGGER.info("Loading alignment index..");
            String alignmentBasename = new File(verticalIndexDir, ALIGNMENT_INDEX_NAME).getPath();
            alignmentIndex = Index.getInstance(alignmentBasename + "?mapped=1");
            setTermMapDumpFile(alignmentIndex, alignmentBasename);
        } catch (Exception e) {
            LOGGER.error("Failed to load alignment index", e);
        }

        // Load horizontal indexes.
        indexMap.putAll(loadIndexesFromDir(horizontalIndexDir, true, context.getLoadIndexesInMemory()));
        for (String indexKey : MANDATORY_HORIZONTAL_INDICES) {
            if (!indexMap.containsKey(indexKey)) {
                throw new IllegalStateException("No " + indexKey + " index found.");
            }
        }
        if (!indexMap.containsKey(CONTEXT_INDEX_KEY)) {
            LOGGER.info("No context index found.");
        }

        // Load term frequencies for the object index.
        Index objectIndex = indexMap.get(OBJECT_INDEX_KEY);
        String filename = (String) objectIndex.properties.getProperty(BASENAME_INDEX_PROPERTY_KEY);
        filename += DiskBasedIndex.FREQUENCIES_EXTENSION;
        try {
            LOGGER.info("Loading frequencies from " + filename);
            frequencies = new SemiExternalGammaBigList(new InputBitStream(filename), 1, objectIndex.numberOfTerms);
            if (frequencies.size64() != objectIndex.numberOfDocuments) {
                LOGGER.warn("Loaded " + frequencies.size64() + " frequency values but objectIndex.numberOfDocuments is " + objectIndex.numberOfDocuments);
            }
        } catch (Exception e) {
            throw new IllegalArgumentException("Failed to load frequencies for the object index from " + filename, e);
        }
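        // These histograms presumably back the statistics page: for each predicate
        // (and each rdf:type value) they record how many documents contain it.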
        try {
            predicateDistribution = Collections.unmodifiableMap(getTermDistribution(indexMap.get(PREDICATE_INDEX_KEY), true));
            Index typeField = indexMap.get(TYPE_FIELD_NAME);
            if (typeField == null) {
                typeTermDistribution = Collections.emptyMap();
            } else {
                typeTermDistribution = Collections.unmodifiableMap(getTermDistribution(typeField, true));
            }
        } catch (IOException e) {
            throw new RDFIndexException(e);
        }

        List<String> indexedPredicatesOrdered = new ArrayList<String>();
        try {
            LOGGER.info("Loading indexed predicates list..");
            for (MutableString line : new FileLinesList(context.getIndexedPredicatesFile().getPath(), "UTF-8")) {
                indexedPredicatesOrdered.add(Util.encodeFieldName(line.toString()));
            }
        } catch (IOException e) {
            throw new RDFIndexException("Failed to load indexed predicates list from file: " + context.getIndexedPredicatesFile().getPath(), e);
        }

        // We need to maintain insertion order and test inclusion.
        Map<String, String> fieldNameSuffixToFieldNameOrderedMap = new LinkedHashMap<String, String>();
        for (String indexKey : HORIZONTAL_INDICES) {
            if (indexMap.containsKey(indexKey)) {
                fieldNameSuffixToFieldNameOrderedMap.put(indexKey, indexKey);
            }
        }
        List<String> shortNames = Util.generateShortNames(indexedPredicatesOrdered, fieldNameSuffixToFieldNameOrderedMap.keySet(), '_');
        for (int i = 0; i < shortNames.size(); i++) {
            fieldNameSuffixToFieldNameOrderedMap.put(shortNames.get(i), indexedPredicatesOrdered.get(i));
            LOGGER.info("Predicate short name: " + shortNames.get(i) + " -> " + indexedPredicatesOrdered.get(i));
        }

        RDFIndexStatisticsBuilder statsBuilder = new RDFIndexStatisticsBuilder();
        statsBuilder.setSortedPredicates(fieldNameSuffixToFieldNameOrderedMap);
        statsBuilder.setTypeTermDistribution(typeTermDistribution);

        // Load the ontology if provided.
        if (context.getOntoPath() != null) {
            try {
                InputStream owlOntologyInputStream = RDFIndexStatisticsBuilder.class.getClassLoader().getResourceAsStream(context.getOntoPath().getPath());
                if (owlOntologyInputStream == null) {
                    throw new FileNotFoundException("Can't open ontology file " + context.getOntoPath());
                }
                statsBuilder.setOwlOntologyInputStream(owlOntologyInputStream);
                statsBuilder.setPredicateTermDistribution(predicateDistribution);
            } catch (FileNotFoundException e) {
                throw new RDFIndexException("Ontology file not found: " + context.getOntoPath());
            } catch (IOException e) {
                throw new RDFIndexException("Reading file " + context.getOntoPath(), e);
            }
        }

        stats = statsBuilder.build();

        // This is empty for non-payload indices.
        Reference2ReferenceMap<Index, Object> index2Parser = new Reference2ReferenceOpenHashMap<Index, Object>();
        DocumentIteratorBuilderVisitor builderVisitor = new DocumentIteratorBuilderVisitor(indexMap, index2Parser, objectIndex, MAX_STEMMING);
        // The QueryParser argument is null as we will only pass in parsed queries.
        queryEngine = new QueryEngine(null, builderVisitor, indexMap);
        // We would set up an interval selector only if there were a collection for snippeting:
        // queryEngine.intervalSelector = documentCollection != null ? new IntervalSelector(4, 40) : new IntervalSelector();
        queryEngine.multiplex = false;
        queryEngine.intervalSelector = null;

        // Load priors.
        documentPriors = loadObjectOfType(context.getDocumentPriorsFile());
        if (documentPriors != null) {
            LOGGER.info("Loaded priors from " + context.getDocumentPriorsFile());
        } else {
            LOGGER.info("Path to priors is null. None loaded.");
        }
        // Set field weights and the scorer.
        reconfigure(context);

        // Init query parser.
        final Object2ObjectOpenHashMap<String, TermProcessor> termProcessors = new Object2ObjectOpenHashMap<String, TermProcessor>(getIndexedFields().size());
        for (String alias : getIndexedFields()) {
            termProcessors.put(alias, getField(alias).termProcessor);
        }
        parser = new RDFQueryParser(getAlignmentIndex(), indexedPredicatesOrdered, fieldNameSuffixToFieldNameOrderedMap, OBJECT_INDEX_KEY, termProcessors,
                allResourcesToIds);
    }

    public String getIndexName() {
        return indexName;
    }

    private Object2ReferenceMap<String, Index> loadIndexesFromDir(File indexDir, boolean loadDocSizes, boolean inMemory) throws RDFIndexException {
        EnumMap<UriKeys, String> indexOptionsMap = new EnumMap<UriKeys, String>(UriKeys.class);
        if (inMemory) {
            indexOptionsMap.put(UriKeys.INMEMORY, "true");
        } else {
            indexOptionsMap.put(UriKeys.MAPPED, "true");
        }

        // List .properties files in the index directory.
        File[] propertiesFiles = indexDir.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return name.endsWith(".properties");
            }
        });

        List<String> indexBasenames = new ArrayList<String>();
        for (int i = 0; i < propertiesFiles.length; i++) {
            String baseName = propertiesFiles[i].getName();
            baseName = baseName.substring(0, baseName.lastIndexOf('.'));
            if (ALIGNMENT_INDEX_NAME.equals(baseName)) {
                continue;
            }
            LOGGER.info("Loading index: '" + baseName + "'");
            indexBasenames.add(new File(indexDir, baseName).getPath());
        }

        Reference2DoubleOpenHashMap<Index> index2Weight = new Reference2DoubleOpenHashMap<Index>();
        return loadIndicesFromSpec(indexBasenames, documentCollection.size(), index2Weight, loadDocSizes, indexOptionsMap);
    }

    /**
     * Parses a given list of index specs, loading the corresponding indices
     * and writing the index weights into the given map.
     *
     * @param indexBasenames
     *            a list of index specs of the form
     *            <samp><var>basename</var>[:<var>weight</var>]</samp>, specifying
     *            the basename of an index and the weight for the index (1, if
     *            missing).
     * @param documentCollectionSize
     *            the size of the document collection, used to sanity-check the
     *            document count of each index.
     * @param index2Weight
     *            an empty, writable map that will be filled with a map from
     *            indices to their respective weights.
     * @param documentSizes
     *            forces size loading.
     * @param map
     *            extra index-loading options, e.g. in-memory or memory-mapped.
     * @return a map from field names (or index basenames, if no field name is
     *         available) to the loaded {@link Index} instances.
     */
    protected Object2ReferenceMap<String, Index> loadIndicesFromSpec(final List<String> indexBasenames, final long documentCollectionSize,
            final Reference2DoubleMap<Index> index2Weight, boolean documentSizes, EnumMap<UriKeys, String> map) throws RDFIndexException {
        Object2ReferenceLinkedOpenHashMap<String, Index> name2Index = new Object2ReferenceLinkedOpenHashMap<String, Index>(Hash.DEFAULT_INITIAL_SIZE, .5f);

        for (String indexBasename : indexBasenames) {
            // We must be careful, as ":" is used by Windows to separate the
            // device from the path.
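            // A hypothetical spec "vertical/foaf_name:2.0" yields the basename
            // "vertical/foaf_name" with weight 2.0; a bare "vertical/foaf_name"
            // defaults to weight 1.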
            final int split = indexBasename.lastIndexOf(':');
            double weight = 1;
            if (split != -1) {
                try {
                    weight = Double.parseDouble(indexBasename.substring(split + 1));
                } catch (NumberFormatException e) {
                    // No numeric weight suffix; keep the default weight of 1.
                }
            }

            final Index index;
            if (split == -1 || indexBasename.startsWith("mg4j://")) {
                try {
                    index = DiskBasedIndex.getInstance(indexBasename, true, documentSizes, true, map);
                    index2Weight.put(index, 1);
                } catch (ArrayIndexOutOfBoundsException e) {
                    // Empty index.
                    LOGGER.warn("Failed to open index: " + indexBasename);
                    continue;
                } catch (Exception e) {
                    throw new RDFIndexException(e);
                }
                index.properties.setProperty(BASENAME_INDEX_PROPERTY_KEY, indexBasename);
            } else {
                try {
                    index = DiskBasedIndex.getInstance(indexBasename, true, documentSizes, true, map);
                } catch (Exception e) {
                    throw new RDFIndexException(e);
                }
                index2Weight.put(index, weight);
            }

            if (index.numberOfDocuments != documentCollectionSize) {
                LOGGER.warn("Index " + index + " has " + index.numberOfDocuments + " documents, but the document collection has size "
                        + documentCollectionSize + ". This shouldn't happen if the .blockOffsets file was produced by the MR job. With the"
                        + " BZip2BlockOffsetsTool the document collection will be slightly smaller.");
            }
            setTermMapDumpFile(index, indexBasename);
            name2Index.put(index.field != null ? index.field : indexBasename, index);
        }

        return name2Index;
    }

    private void setTermMapDumpFile(final Index index, final String indexBasename) throws RDFIndexException {
        // See the section of the MG4J manual entitled 'Setup Time'.
        if (index.termMap instanceof ImmutableExternalPrefixMap) {
            ImmutableExternalPrefixMap termMap = (ImmutableExternalPrefixMap) index.termMap;
            try {
                termMap.setDumpStream(indexBasename + DiskBasedIndex.TERMMAP_EXTENSION + ".dump");
            } catch (FileNotFoundException e) {
                throw new RDFIndexException("Failed to set dump file for index " + indexBasename, e);
            }
        }
    }

    private Reference2DoubleOpenHashMap<Index> loadB(Context context) {
        Reference2DoubleOpenHashMap<Index> b = new Reference2DoubleOpenHashMap<Index>();
        double db = context.getB();
        for (String indexName : getIndexedFields()) {
            // TODO load from file if needed
            b.put(getField(indexName), db);
        }
        b.put(queryEngine.indexMap.get(OBJECT_INDEX_KEY), db);
        return b;
    }

    /**
     * Computes index weights from the context.
     *
     * @param context the index configuration.
     * @return a map from each loaded index to its weight.
     */
    private Reference2DoubleOpenHashMap<Index> loadWeights(Context context) {
        Reference2DoubleOpenHashMap<Index> index2Weight = new Reference2DoubleOpenHashMap<Index>();
        ObjectSet<String> indexNames = queryEngine.indexMap.keySet();
        for (String indexName : indexNames) {
            Index index = queryEngine.indexMap.get(indexName);
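            // The weight class for each field comes from the context property
            // "w.<field>"; fields without such a property fall back to the
            // "unimportant" weight.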
            String w = context.getString("w." + indexName);
            if (w == null) {
                // Unimportant by default.
                index2Weight.put(index, context.getWfUnimportant() * indexNames.size());
            } else {
                if (w.equals(SetDocumentPriors.IMPORTANT))
                    index2Weight.put(index, context.getWfImportant() * indexNames.size());
                else if (w.equals(SetDocumentPriors.UNIMPORTANT))
                    index2Weight.put(index, context.getWfUnimportant() * indexNames.size());
                else if (w.equals(SetDocumentPriors.NEUTRAL))
                    index2Weight.put(index, context.getWfNeutral() * indexNames.size());
            }
        }
        return index2Weight;
    }

    protected Scorer configureScorer(Context context) throws FileNotFoundException, IOException {
        Reference2DoubleOpenHashMap<Index> bByIndex = loadB(context);

        double[] documentWeights = new double[3];
        documentWeights[Integer.parseInt(SetDocumentPriors.IMPORTANT)] = context.getWsImportant();
        documentWeights[Integer.parseInt(SetDocumentPriors.UNIMPORTANT)] = context.getWsUnimportant();
        documentWeights[Integer.parseInt(SetDocumentPriors.NEUTRAL)] = context.getWsNeutral();

        StringMap<? extends CharSequence> objectTermMap;
        Index objectIndex = getObjectIndex();
        if (objectIndex instanceof BitStreamIndex) {
            objectTermMap = ((BitStreamIndex) objectIndex).termMap;
        } else if (objectIndex instanceof QuasiSuccinctIndex) {
            objectTermMap = ((QuasiSuccinctIndex) objectIndex).termMap;
        } else {
            throw new IllegalStateException("Object index is neither a BitStreamIndex nor a QuasiSuccinctIndex. Don't know how to get its termMap.");
        }

        return new WOOScorer(context.getK1(), bByIndex, objectTermMap, frequencies, objectIndex.sizes,
                (double) objectIndex.numberOfOccurrences / objectIndex.numberOfDocuments, objectIndex.numberOfDocuments, context.getWMatches(),
                documentWeights, context.getDlCutoff(), documentPriors, context.getMaxNumberOfDieldsNorm());
    }

    /**
     * Partially reinitializes the index: reloads the weights and the scorer.
     *
     * @param context the index configuration.
     */
    public void reconfigure(Context context) {
        // Recompute index weights.
        queryEngine.setWeights(loadWeights(context));

        try {
            // Configure the scorer.
            Scorer scorer = configureScorer(context);
            queryEngine.score(scorer);
            // Only valid if we have a scorer:
            // queryEngine.equalize(context.SIZE_TOP_K);
        } catch (Exception e) {
            LOGGER.error("WOOScorer failed to configure, falling back to the default CountScorer.", e);
            queryEngine.score(new CountScorer());
        }
    }

    private Index getObjectIndex() {
        return queryEngine.indexMap.get(OBJECT_INDEX_KEY);
    }

    /**
     * The indexed fields, including the token and uri fields of the horizontal
     * index.
     *
     * @return the set of field names known to the query engine.
     */
    public Set<String> getIndexedFields() {
        return queryEngine.indexMap.keySet();
    }

    public Index getField(String alias) {
        return queryEngine.indexMap.get(alias);
    }

    /**
     * @param uri
     *            a Resource or BNode.
     * @return the doc id if the given uri is a valid doc uri, otherwise null.
     * @throws IOException
     */
    public Long getSubjectId(String uri) throws IOException {
        Long id = allResourcesToIds.get(uri);
        if (id != null) {
            // Check that the doc is a valid doc (has contents)...
            // TODO could use the subjects signed hash here.
            InputStream docStream = documentCollection.stream(id);
            if (docStream.read() == -1) {
                id = null;
            }
            docStream.close();
        }
        return id;
    }

    public DocumentCollection getCollection() {
        return documentCollection;
    }

    public Index getAlignmentIndex() {
        return alignmentIndex;
    }

    public String lookupIdByResourceId(String key) {
        if (key.startsWith("_:")) {
            key = key.substring(2);
        }
        Long id = allResourcesToIds.get(key);
        // Render the id without truncating it to an int.
        return id == null ? null : resourceIdPrefix + id.longValue();
    }
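    /**
     * Reverse of {@link #lookupIdByResourceId(String)}: resolves a numeric
     * resource id back to its URI or BNode label. For example (hypothetical
     * values), if lookupIdByResourceId("http://example.org/x") returns "@123",
     * then lookupResourceById(123) returns "http://example.org/x".
     * Synchronized, presumably because the backing FileLinesList is not safe
     * for concurrent reads.
     */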
    public synchronized String lookupResourceById(long id) {
        MutableString value = allIdsToResources.get((int) id);
        if (value != null) {
            return value.toString();
        }
        return null;
    }

    public String getDefaultField() {
        return OBJECT_INDEX_KEY;
    }

    public RDFIndexStatistics getStatistics() {
        return stats;
    }

    public RDFQueryParser getParser() {
        return parser;
    }

    public int process(final int offset, final int length, final ObjectArrayList<DocumentScoreInfo<Reference2ObjectMap<Index, SelectedInterval[]>>> results,
            final Query... queries) throws QueryBuilderVisitorException, IOException {
        QueryEngine engine = queryEngine.copy();
        if (queries.length == 1 && queries[0] instanceof Select) {
            // If it is only a query by type, disable the scorer for this query.
            Select select = (Select) queries[0];
            if (TYPE_FIELD_NAME.equals(select.index)) {
                engine.score(new Scorer[0], new double[0]);
            }
        }
        return engine.process(queries, offset, length, results);
    }

    public void destroy() {
        try {
            if (documentCollection != null)
                documentCollection.close();
        } catch (IOException e) {
            LOGGER.error("Failed to close the document collection.", e);
        }
    }

    private Map<String, Integer> getTermDistribution(Index index, boolean termsAreResourceIds) throws IOException {
        StringMap<? extends CharSequence> termMap = null;
        if (index instanceof BitStreamIndex) {
            termMap = ((BitStreamIndex) index).termMap;
        } else if (index instanceof QuasiSuccinctIndex) {
            termMap = ((QuasiSuccinctIndex) index).termMap;
        }
        if (termMap == null) {
            throw new IllegalArgumentException("termMap is null. Index is for field:" + index.field + ". Index class is:" + index.getClass().getSimpleName());
        }

        Map<String, Integer> histogram = new HashMap<String, Integer>();
        for (CharSequence term : termMap.list()) {
            long termId = termMap.get(term);
            IndexIterator it = index.documents((int) termId);
            int frequency = it.frequency() > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) it.frequency();
            if (termsAreResourceIds) {
                String termString = term.toString();
                if (!termString.startsWith(resourceIdPrefix)) {
                    throw new RuntimeException("Expected resource id " + termString + " to be prefixed with " + resourceIdPrefix);
                }
                int termAsId = Integer.parseInt(termString.substring(resourceIdPrefix.length()));
                histogram.put(lookupResourceById(termAsId), frequency);
            } else {
                histogram.put(term.toString(), frequency);
            }
            it.dispose();
        }
        return histogram;
    }

    public static class RDFIndexException extends Exception {
        private static final long serialVersionUID = -6825941506094477867L;

        public RDFIndexException(Exception e) {
            super(e);
        }

        public RDFIndexException(String message) {
            super(message);
        }

        public RDFIndexException(String message, Exception e) {
            super(message, e);
        }
    }

    public InputStream getDocumentInputStream(long docId) throws IOException {
        return documentCollection.stream(docId);
    }

    public Integer getDocumentSize(int docId) {
        return getObjectIndex().sizes.get(docId);
    }

    public Set<String> getIndexedPredicates() {
        return verticalPredicates;
    }
}