/* * Copyright (c) 2011 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package com.flaptor.indextank.index.lsi.term; import java.io.IOException; import java.util.NavigableMap; import java.util.TreeMap; import org.apache.log4j.Logger; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermEnum; import org.apache.lucene.index.TermPositions; import org.apache.lucene.search.Similarity; import com.flaptor.indextank.index.DocId; import com.flaptor.indextank.index.ScoredMatch; import com.flaptor.indextank.index.term.DocTermMatch; import com.flaptor.indextank.index.term.TermMatcher; import com.flaptor.indextank.index.term.query.RawMatch; import com.flaptor.indextank.util.AbstractSkippableIterable; import com.flaptor.indextank.util.AbstractSkippableIterator; import com.flaptor.indextank.util.SkippableIterable; import com.flaptor.indextank.util.SkippableIterator; import com.flaptor.util.Execute; import com.google.common.base.Function; import com.google.common.base.Preconditions; import com.google.common.collect.Iterables; public class IndexReaderTermMatcher implements TermMatcher { private static final Logger logger = Logger.getLogger(Execute.whoAmI()); private final IndexReader reader; private Term payloadTerm; public IndexReaderTermMatcher(IndexReader reader, Term payloadTerm) { Preconditions.checkNotNull(reader); Preconditions.checkNotNull(payloadTerm); this.reader = reader; this.payloadTerm = payloadTerm; } @Override public SkippableIterable<DocTermMatch> getMatches(final String field, String termText) { final Term term = new Term(field, termText); return getDocTermIterator(term); } @Override public NavigableMap<String, SkippableIterable<DocTermMatch>> getMatches(String field, String termFrom, String termTo) { TermEnum terms = null; NavigableMap<String, SkippableIterable<DocTermMatch>> result = new TreeMap<String, SkippableIterable<DocTermMatch>>(); try { terms = reader.terms(new Term(field, termFrom)); Term rightBoundary = new Term(field, termTo); int numberOfTerms = 0; if (terms.term() != null) { do { Term term = terms.term(); if (term.compareTo(rightBoundary) >= 0) { break; } SkippableIterable<DocTermMatch> docTermIterator = getDocTermIterator(term); result.put(term.text(), docTermIterator); numberOfTerms++; if (numberOfTerms >= 1000) { break; } } while (terms.next()); } return result; } catch (IOException e) { throw new RuntimeException(e); } finally { if (terms != null) { Execute.close(terms); } } } private SkippableIterable<DocTermMatch> getDocTermIterator(final Term term) { return new AbstractSkippableIterable<DocTermMatch>() { @Override public SkippableIterator<DocTermMatch> iterator() { try { return new AbstractSkippableIterator<DocTermMatch>() { final TermPositions tp = reader.termPositions(term); private Integer nextId = null; DocTermMatch m = null; private DocTermMatch match(int rawId, int freq, float norm) throws IOException { if (m == null) { m = new DocTermMatch(rawId, new int[freq], freq, norm); } else { m.setRawId(rawId); m.setPositionsLength(freq); m.setNormalization(norm); } int[] positions = m.getPositions(); if (freq > positions.length) { positions = new int[freq]; m.setPositions(positions); } for (int i = 0; i < freq; i++) { m.getPositions()[i] = tp.nextPosition(); } return m; } @Override protected DocTermMatch computeNext() { try { if (nextId == null ? tp.next() : tp.skipTo(nextId)) { //if (tp.next()) { int rawId = tp.doc(); nextId = rawId + 1; int freq = tp.freq(); float norm = Similarity.decodeNorm(reader.norms(term.field())[rawId]); return match(rawId, freq, norm); } else { return endOfData(); } } catch (IOException e) { throw new RuntimeException(e); } } @Override public void skipTo(int i) { nextId = i; } }; } catch (IOException e) { throw new RuntimeException(e); } } }; } @Override public boolean hasChanges(DocId docid) { throw new UnsupportedOperationException(); } @Override public Iterable<ScoredMatch> decode(Iterable<RawMatch> rawMatches, final double boostedNorm) { try { final TermPositions payloads = reader.termPositions(payloadTerm); return Iterables.transform(rawMatches, new Function<RawMatch, ScoredMatch>() { private byte[] data = new byte[256]; private ScoredMatch match = new ScoredMatch(0, new DocId(data, 0, 0)); @Override public ScoredMatch apply(RawMatch rawMatch) { int rawId = rawMatch.getRawId(); try { if (payloads.skipTo(rawId) && payloads.doc() == rawId) { payloads.nextPosition(); int size = payloads.getPayloadLength(); if (size > data.length) { data = new byte[size]; } payloads.getPayload(data, 0); match.getDocId().update(data, 0, size); match.setScore(rawMatch.getBoostedScore() / boostedNorm); return match; } else { throw new IllegalArgumentException("rawId:" + rawId + " doesn't exist. Payloads.doc():" + payloads.doc()); } } catch (IOException e) { try { org.apache.lucene.document.Document d = reader.document(rawId); logger.error("Document without payload: " + d.toString()); } catch (Exception ee) { logger.error(ee); } throw new RuntimeException(e); } } }); } catch (IOException e) { throw new RuntimeException(e); } } @Override public SkippableIterable<Integer> getAllDocs() { return new AbstractSkippableIterable<Integer>() { @Override public SkippableIterator<Integer> iterator() { return new AbstractSkippableIterator<Integer>() { int current = -1; @Override public void skipTo(int i) { current = i-1; } @Override protected Integer computeNext() { while (++current < reader.maxDoc()) { if (!reader.isDeleted(current)) { return current; } } return endOfData(); } }; } }; } }