/* * Copyright (c) 2011 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package com.flaptor.indextank.index.rti.inverted; import java.util.BitSet; import java.util.Iterator; import java.util.Map; import java.util.NavigableMap; import java.util.Set; import java.util.SortedSet; import java.util.TreeMap; import java.util.Map.Entry; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ConcurrentNavigableMap; import java.util.concurrent.ConcurrentSkipListMap; import java.util.concurrent.atomic.AtomicInteger; import com.flaptor.indextank.Indexer; import com.flaptor.indextank.index.DocId; import com.flaptor.indextank.index.Document; import com.flaptor.indextank.index.QueryMatcher; import com.flaptor.indextank.index.ScoredMatch; import com.flaptor.indextank.index.TopMatches; import com.flaptor.indextank.index.scorer.FacetingManager; import com.flaptor.indextank.index.scorer.Scorer; import com.flaptor.indextank.index.term.DocTermMatch; import com.flaptor.indextank.index.term.TermMatcher; import com.flaptor.indextank.index.term.query.RawMatch; import com.flaptor.indextank.index.term.query.TermBasedQueryMatcher; import com.flaptor.indextank.query.AToken; import com.flaptor.indextank.query.IndexEngineParser; import com.flaptor.indextank.query.Query; import com.flaptor.indextank.util.AbstractSkippableIterable; import com.flaptor.indextank.util.AbstractSkippableIterator; import com.flaptor.indextank.util.SkippableIterable; import com.flaptor.indextank.util.SkippableIterator; import com.flaptor.indextank.util.Skippables; import com.google.common.base.Function; import com.google.common.base.Preconditions; import com.google.common.base.Predicate; import com.google.common.collect.Iterables; import com.google.common.collect.MapMaker; import com.google.common.collect.Maps; import com.google.common.collect.SortedSetMultimap; import com.google.common.collect.TreeMultimap; public class InvertedIndex implements Indexer, QueryMatcher, TermMatcher { private final int maxDocCount; private final DocId[] docids; private final BitSet internalDeletes; private final AtomicInteger docCount; private final ConcurrentMap<DocId, Integer> docidsIndexes = new MapMaker().makeMap(); private final ConcurrentNavigableMap<Key, DocTermMatchList> invertedIndex = new ConcurrentSkipListMap<Key, DocTermMatchList>(); private final ConcurrentMap<DocId, DocId> deletes = new MapMaker().makeMap(); //private final ConcurrentHashMap<DocId, DocId> deletes = Maps.newConcurrentHashMap(); private final QueryMatcher matcher; private final IndexEngineParser parser; public InvertedIndex(Scorer scorer, IndexEngineParser parser, int maxDocCount, FacetingManager facetingManager) { Preconditions.checkArgument(maxDocCount > 0); this.maxDocCount = maxDocCount; this.docids = new DocId[maxDocCount]; this.internalDeletes = new BitSet(maxDocCount); this.docCount = new AtomicInteger(0); this.matcher = new TermBasedQueryMatcher(scorer, this, facetingManager); this.parser = parser; } public void add(String sdocid, final Document document) { int idx = docCount.getAndIncrement(); if (idx < maxDocCount) { DocId docid = new DocId(sdocid); docids[idx] = docid; Integer oldIdx = docidsIndexes.put(docid, idx); if (oldIdx != null) { internalDel(oldIdx); } internalAdd(idx, document); } else { throw new IllegalStateException("MaxDocCount (" + maxDocCount + ") reached. Cannot add more documents."); } } public void del(String sdocid) { DocId docid = new DocId(sdocid); Integer idx = docidsIndexes.get(docid); if (idx != null) { internalDel(idx); } else { deletes.put(docid, docid); } } private void internalAdd(int idx, final Document document) { for (String field : document.getFieldNames()) { Iterator<AToken> tokens = parser.parseDocumentField(field, document.getField(field)); SortedSetMultimap<String, Integer> termPositions = TreeMultimap.create(); int tokenCount = 0; while (tokens.hasNext()) { tokenCount++; AToken token = tokens.next(); termPositions.put(token.getText(), token.getPosition()); } for (String term : termPositions.keySet()) { Key key = new Key(field, term); SortedSet<Integer> positionsSet = termPositions.get(term); int[] positions = new int[positionsSet.size()]; int p = 0; for (Integer i : positionsSet) { positions[p++] = i; } DocTermMatchList original = invertedIndex.putIfAbsent(key, new DocTermMatchList(idx, positions, tokenCount)); if (original != null) { original.add(idx, positions, tokenCount); } } } } private void internalDel(int idx) { internalDeletes.set(idx); } public SkippableIterable<DocTermMatch> getMatches(String field, String term) { DocTermMatchList docList = invertedIndex.get(new Key(field, term)); if (docList == null) { return Skippables.emptyIterable(); } else { return Skippables.filter(docList, notDeletedPredicate()); } } @Override public NavigableMap<String, SkippableIterable<DocTermMatch>> getMatches(String field, String termFrom, String termTo) { Key leftBoundary = new Key(field, termFrom); Key rightBoundary = new Key(field, termTo); ConcurrentNavigableMap<Key, DocTermMatchList> range = invertedIndex.subMap(leftBoundary, rightBoundary); NavigableMap<String, SkippableIterable<DocTermMatch>> result = new TreeMap<String, SkippableIterable<DocTermMatch>>(); int numberOfTerms = 0; for (Entry<Key, DocTermMatchList> entry : range.entrySet()) { result.put(entry.getKey().term, Skippables.filter(entry.getValue(), notDeletedPredicate())); numberOfTerms++; if (numberOfTerms >= 1000) { break; } } return result; } private Predicate<DocTermMatch> notDeletedPredicate() { return new Predicate<DocTermMatch>() { @Override public boolean apply(DocTermMatch item) { return !internalDeletes.get(item.getRawId()); } }; } public boolean hasChanges(DocId docid) { return docidsIndexes.containsKey(docid) || deletes.containsKey(docid); } @Override public Iterable<ScoredMatch> decode(Iterable<RawMatch> rawMatches, final double boostedNorm) { return Iterables.transform(rawMatches, new Function<RawMatch, ScoredMatch>() { @Override public ScoredMatch apply(RawMatch rawMatch) { //System.out.println("RESULT: "+rawMatch.getNormalizedScore()); return new ScoredMatch(rawMatch.getBoostedScore() / boostedNorm, docids[rawMatch.getRawId()]); } }); } /* QueryMatcher interface - delegates in internal matcher instance */ public TopMatches findMatches(Query query, int limit, int scoringFunctionIndex) throws InterruptedException { return matcher.findMatches(query, limit, scoringFunctionIndex); } public TopMatches findMatches(Query query, Predicate<DocId> idFilter, int limit, int scoringFunctionIndex) throws InterruptedException { return matcher.findMatches(query, idFilter, limit, scoringFunctionIndex); } @Override public SkippableIterable<Integer> getAllDocs() { return new AbstractSkippableIterable<Integer>() { @Override public SkippableIterator<Integer> iterator() { return new AbstractSkippableIterator<Integer>() { int current = -1; @Override public void skipTo(int i) { current = i-1; } @Override protected Integer computeNext() { while (++current < docCount.get()) { if (!internalDeletes.get(current)) { return current; } } return endOfData(); } }; } }; } @Override public int countMatches(Query query) throws InterruptedException { return matcher.countMatches(query); } @Override public int countMatches(Query query, Predicate<DocId> idFilter) throws InterruptedException { return matcher.countMatches(query, idFilter); } public Map<String, String> getStats(String prefix) { Map<String, String> stats = Maps.newHashMap(); stats.put(prefix + "size", String.valueOf(docCount.get())); stats.put(prefix + "terms", String.valueOf(invertedIndex.size())); stats.put(prefix + "deletes", String.valueOf(deletes.size())); stats.put(prefix + "internal_deletes", String.valueOf(internalDeletes.cardinality())); return stats; } }