package com.browseengine.bobo.sort;

import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.BytesRef;

import com.browseengine.bobo.api.BoboSegmentReader;
import com.browseengine.bobo.api.Browsable;
import com.browseengine.bobo.api.BrowseFacet;
import com.browseengine.bobo.api.BrowseHit;
import com.browseengine.bobo.api.BrowseHit.BoboTerm;
import com.browseengine.bobo.api.FacetAccessible;
import com.browseengine.bobo.api.FacetSpec;
import com.browseengine.bobo.facets.CombinedFacetAccessible;
import com.browseengine.bobo.facets.FacetCountCollector;
import com.browseengine.bobo.facets.FacetHandler;
import com.browseengine.bobo.facets.data.FacetDataCache;
import com.browseengine.bobo.facets.data.PrimitiveLongArrayWrapper;
import com.browseengine.bobo.util.ListMerger;

/**
 * {@link SortCollector} that keeps one {@link DocIDPriorityQueue} per index segment and merges
 * the per-segment queues into the final hit list in {@link #topDocs()}.
 *
 * <p>
 * When a group-by facet handler is configured, at most one document per group value is kept in
 * each segment queue (single group-by field), or all matching doc ids are cached and facet counts
 * are collected per group-by field (multiple group-by fields).
 */
public class SortCollectorImpl extends SortCollector {

  /**
   * Orders hits from different segment queues: by sort value ({@code null} sorts first), then by
   * global doc id (segment-local doc id plus the queue's doc base).
   */
  private static final Comparator<MyScoreDoc> MERGE_COMPARATOR = new Comparator<MyScoreDoc>() {
    @SuppressWarnings({ "rawtypes", "unchecked" })
    @Override
    public int compare(MyScoreDoc o1, MyScoreDoc o2) {
      Comparable s1 = o1.getValue();
      Comparable s2 = o2.getValue();
      if (s1 == null) {
        return (s2 == null) ? 0 : -1;
      }
      if (s2 == null) {
        return 1;
      }
      int v = s1.compareTo(s2);
      if (v != 0) {
        return v;
      }
      // Tie-break on the global doc id (local id + segment base).
      return o1.doc + o1.queue.base - o2.doc - o2.queue.base;
    }
  };

  private final LinkedList<DocIDPriorityQueue> _pqList;
  private final int _numHits;
  private int _totalHits;
  private ScoreDoc _bottom;
  private ScoreDoc _tmpScoreDoc;
  private boolean _queueFull;
  private DocComparator _currentComparator;
  private final DocComparatorSource _compSource;
  private DocIDPriorityQueue _currentQueue;
  private BoboSegmentReader _currentReader = null;
  private FacetCountCollector[] _facetCountCollectorMulti = null;

  private final boolean _doScoring;
  private Scorer _scorer;
  private final int _offset;
  private final int _count;

  private final Browsable _boboBrowser;
  private final boolean _collectDocIdCache;
  private CombinedFacetAccessible[] _groupAccessibles;
  private final List<FacetAccessible>[] _facetAccessibleLists;
  private final Int2ObjectOpenHashMap<ScoreDoc> _currentValueDocMaps;

  /**
   * A {@link ScoreDoc} that remembers the segment queue and reader it was collected from, and
   * lazily caches its sort value.
   */
  static class MyScoreDoc extends ScoreDoc {
    DocIDPriorityQueue queue;
    BoboSegmentReader reader;
    Comparable<?> sortValue;

    public MyScoreDoc() {
      this(0, 0.0f, null, null);
    }

    public MyScoreDoc(int docid, float score, DocIDPriorityQueue queue, BoboSegmentReader reader) {
      super(docid, score);
      this.queue = queue;
      this.reader = reader;
      this.sortValue = null;
    }

    /** Returns the sort value, asking the owning queue for it on first use. */
    Comparable<?> getValue() {
      if (sortValue == null) {
        sortValue = queue.sortValue(this);
      }
      return sortValue;
    }
  }

  private CollectorContext _currentContext;
  private int[] _currentDocIdArray;
  private float[] _currentScoreArray;
  private int _docIdArrayCursor = 0;
  private int _docIdCacheCapacity = 0;
  private final Set<String> _termVectorsToFetch;

  /**
   * Creates a collector that retains the top {@code offset + count} hits per segment.
   *
   * @param compSource source of the per-segment document comparators
   * @param sortFields sort fields, passed to the superclass
   * @param boboBrowser browser used to resolve facet handlers
   * @param offset number of leading hits (or groups) to skip; must be non-negative
   * @param count number of hits (or groups) to return; must be non-negative
   * @param doScoring whether to read the score from the scorer for each collected document
   * @param fetchStoredFields whether stored fields are loaded for returned hits
   * @param termVectorsToFetch names of fields whose term vectors are loaded for returned hits;
   *        may be {@code null}
   * @param groupBy names of the group-by facet handlers; may be {@code null} or empty. Names that
   *        do not resolve to a facet handler are ignored.
   * @param maxPerGroup not used by this constructor
   * @param collectDocIdCache whether to cache collected doc ids (and scores); forced on when
   *        {@code groupBy} is non-null
   */
  @SuppressWarnings("unchecked")
  public SortCollectorImpl(DocComparatorSource compSource, SortField[] sortFields,
      Browsable boboBrowser, int offset, int count, boolean doScoring, boolean fetchStoredFields,
      Set<String> termVectorsToFetch, String[] groupBy, int maxPerGroup,
      boolean collectDocIdCache) {
    super(sortFields, fetchStoredFields);
    assert (offset >= 0 && count >= 0);
    _boboBrowser = boboBrowser;
    _compSource = compSource;
    _pqList = new LinkedList<DocIDPriorityQueue>();
    _numHits = offset + count;
    _offset = offset;
    _count = count;
    _totalHits = 0;
    _queueFull = false;
    _doScoring = doScoring;
    _tmpScoreDoc = new MyScoreDoc();
    _termVectorsToFetch = termVectorsToFetch;
    _collectDocIdCache = collectDocIdCache || groupBy != null;

    // Built in locals so that the final fields are assigned exactly once below.
    Int2ObjectOpenHashMap<ScoreDoc> valueDocMaps = null;
    List<FacetAccessible>[] accessibleLists = null;

    if (groupBy != null && groupBy.length != 0) {
      List<FacetHandler<?>> groupByList = new ArrayList<FacetHandler<?>>(groupBy.length);
      for (String field : groupBy) {
        FacetHandler<?> handler = boboBrowser.getFacetHandler(field);
        if (handler != null) {
          groupByList.add(handler);
        }
      }
      if (groupByList.size() > 0) {
        this.groupByMulti = groupByList.toArray(new FacetHandler<?>[0]);
        this.groupBy = groupByMulti[0];
      }

      if (this.groupBy != null && _count > 0) {
        if (groupByMulti.length == 1) {
          valueDocMaps = new Int2ObjectOpenHashMap<ScoreDoc>(_count);
        } else {
          _facetCountCollectorMulti = new FacetCountCollector[groupByList.size() - 1];
          accessibleLists = new List[_facetCountCollectorMulti.length];
          for (int i = 0; i < _facetCountCollectorMulti.length; ++i) {
            accessibleLists[i] = new LinkedList<FacetAccessible>();
          }
        }

        if (_collectDocIdCache) {
          contextList = new LinkedList<CollectorContext>();
          docidarraylist = new LinkedList<int[]>();
          if (doScoring) {
            scorearraylist = new LinkedList<float[]>();
          }
        }
      }
    }

    _currentValueDocMaps = valueDocMaps;
    _facetAccessibleLists = accessibleLists;
  }

  /** Accepts out-of-order docs unless a delegate collector is set and refuses them. */
  @Override
  public boolean acceptsDocsOutOfOrder() {
    return _collector == null || _collector.acceptsDocsOutOfOrder();
  }

  /**
   * Collects one matching document of the current segment.
   *
   * @param doc segment-local doc id
   * @throws IOException if reading the score or a delegate collector fails
   */
  @Override
  public void collect(int doc) throws IOException {
    ++_totalHits;

    if (groupBy != null) {
      if (_facetCountCollectorMulti != null) {
        // Multiple group-by fields: count facets and cache doc ids; no queue is maintained here.
        for (int i = 0; i < _facetCountCollectorMulti.length; ++i) {
          if (_facetCountCollectorMulti[i] != null) {
            _facetCountCollectorMulti[i].collect(doc);
          }
        }
        if (_count > 0) {
          final float score = (_doScoring ? _scorer.score() : 0.0f);
          if (_collectDocIdCache) {
            cacheDoc(doc, score);
          }
        }
        // NOTE(review): this path returns without forwarding to _collector - confirm intended.
        return;
      }

      if (_count > 0) {
        final float score = (_doScoring ? _scorer.score() : 0.0f);
        if (_collectDocIdCache) {
          cacheDoc(doc, score);
        }

        _tmpScoreDoc.doc = doc;
        _tmpScoreDoc.score = score;
        if (!_queueFull || _currentComparator.compare(_bottom, _tmpScoreDoc) > 0) {
          final int order = ((FacetDataCache<?>) groupBy.getFacetData(_currentReader)).orderArray
              .get(doc);
          ScoreDoc pre = _currentValueDocMaps.get(order);
          if (pre != null) {
            // The group is already represented: swap in the candidate only if it compares better.
            if (_currentComparator.compare(pre, _tmpScoreDoc) > 0) {
              ScoreDoc tmp = pre;
              _bottom = _currentQueue.replace(_tmpScoreDoc, pre);
              _currentValueDocMaps.put(order, _tmpScoreDoc);
              _tmpScoreDoc = tmp; // recycle the evicted instance as the next scratch doc
            }
          } else if (_queueFull) {
            // New group and full queue: evict the bottom entry together with its group mapping.
            MyScoreDoc tmp = (MyScoreDoc) _bottom;
            _currentValueDocMaps
                .remove(((FacetDataCache<?>) groupBy.getFacetData(tmp.reader)).orderArray
                    .get(tmp.doc));
            _bottom = _currentQueue.replace(_tmpScoreDoc);
            _currentValueDocMaps.put(order, _tmpScoreDoc);
            _tmpScoreDoc = tmp;
          } else {
            ScoreDoc tmp = new MyScoreDoc(doc, score, _currentQueue, _currentReader);
            _bottom = _currentQueue.add(tmp);
            _currentValueDocMaps.put(order, tmp);
            _queueFull = (_currentQueue.size >= _numHits);
          }
        }
      }
    } else if (_count > 0) {
      final float score = (_doScoring ? _scorer.score() : 0.0f);
      if (_queueFull) {
        _tmpScoreDoc.doc = doc;
        _tmpScoreDoc.score = score;
        if (_currentComparator.compare(_bottom, _tmpScoreDoc) > 0) {
          ScoreDoc tmp = _bottom;
          _bottom = _currentQueue.replace(_tmpScoreDoc);
          _tmpScoreDoc = tmp;
        }
      } else {
        _bottom = _currentQueue.add(new MyScoreDoc(doc, score, _currentQueue, _currentReader));
        _queueFull = (_currentQueue.size >= _numHits);
      }
    }

    if (_collector != null) {
      _collector.collect(doc);
    }
  }

  /**
   * Appends a doc id (and its score when scoring is enabled) to the block-allocated doc id cache
   * and bumps the length of the current collector context. A new block is fetched whenever the
   * running hit count exceeds the capacity allocated so far.
   */
  private void cacheDoc(int doc, float score) throws IOException {
    if (_totalHits > _docIdCacheCapacity) {
      _currentDocIdArray = intarraymgr.get(BLOCK_SIZE);
      docidarraylist.add(_currentDocIdArray);
      if (_doScoring) {
        _currentScoreArray = floatarraymgr.get(BLOCK_SIZE);
        scorearraylist.add(_currentScoreArray);
      }
      _docIdCacheCapacity += BLOCK_SIZE;
      _docIdArrayCursor = 0;
    }
    _currentDocIdArray[_docIdArrayCursor] = doc;
    if (_doScoring) {
      _currentScoreArray[_docIdArrayCursor] = score;
    }
    ++_docIdArrayCursor;
    ++_currentContext.length;
  }

  /**
   * Switches to the next segment: creates a fresh comparator and priority queue for it and resets
   * the per-segment grouping state.
   *
   * @throws IllegalStateException if the segment reader is not a {@link BoboSegmentReader}
   */
  @Override
  public void setNextReader(AtomicReaderContext context) throws IOException {
    AtomicReader reader = context.reader();
    if (!(reader instanceof BoboSegmentReader)) {
      throw new IllegalStateException("The reader is not instance of " + BoboSegmentReader.class);
    }
    _currentReader = (BoboSegmentReader) reader;
    int docBase = context.docBase;
    _currentComparator = _compSource.getComparator(reader, docBase);
    _currentQueue = new DocIDPriorityQueue(_currentComparator, _numHits, docBase);

    if (groupBy != null) {
      if (_facetCountCollectorMulti != null) { // _facetCountCollectorMulti.length >= 1
        for (int i = 0; i < _facetCountCollectorMulti.length; ++i) {
          _facetCountCollectorMulti[i] = groupByMulti[i].getFacetCountCollectorSource(null, null,
            true).getFacetCountCollector(_currentReader, docBase);
        }
        if (_facetAccessibleLists != null) {
          for (int i = 0; i < _facetCountCollectorMulti.length; ++i) {
            _facetAccessibleLists[i].add(_facetCountCollectorMulti[i]);
          }
        }
      }
      // Group orders are looked up per segment reader, so the order->doc map starts empty.
      if (_currentValueDocMaps != null) {
        _currentValueDocMaps.clear();
      }

      if (contextList != null) {
        _currentContext = new CollectorContext(_currentReader, docBase, _currentComparator);
        contextList.add(_currentContext);
      }
    }

    // Re-point the reusable scratch doc at the new segment.
    MyScoreDoc myScoreDoc = (MyScoreDoc) _tmpScoreDoc;
    myScoreDoc.queue = _currentQueue;
    myScoreDoc.reader = _currentReader;
    myScoreDoc.sortValue = null;
    _pqList.add(_currentQueue);
    _queueFull = false;
  }

  @Override
  public void setScorer(Scorer scorer) throws IOException {
    _scorer = scorer;
    _currentComparator.setScorer(scorer);
  }

  /** Returns the number of documents passed to {@link #collect(int)}. */
  @Override
  public int getTotalHits() {
    return _totalHits;
  }

  /**
   * Returns the same counter as {@link #getTotalHits()}.
   * NOTE(review): no separate group count is tracked in this class - confirm callers expect this.
   */
  @Override
  public int getTotalGroups() {
    return _totalHits;
  }

  /** Returns the combined group facet accessibles; populated by {@link #topDocs()}. */
  @Override
  public CombinedFacetAccessible[] getGroupAccessibles() {
    return _groupAccessibles;
  }

  /**
   * Drains every per-segment queue, merges the results and builds the hits for the requested
   * {@code offset}/{@code count} window. With group-by enabled, only the first hit of each group
   * value is kept and the offset counts groups.
   *
   * @return the hits in merged sort order; empty when {@code count} is zero
   */
  @Override
  public BrowseHit[] topDocs() throws IOException {
    ArrayList<Iterator<MyScoreDoc>> iterList = new ArrayList<Iterator<MyScoreDoc>>(
        _pqList.size());
    for (DocIDPriorityQueue pq : _pqList) {
      int count = pq.size();
      MyScoreDoc[] drained = new MyScoreDoc[count];
      // Entries are popped into the array back to front.
      for (int i = count - 1; i >= 0; i--) {
        drained[i] = (MyScoreDoc) pq.pop();
      }
      iterList.add(Arrays.asList(drained).iterator());
    }

    List<MyScoreDoc> resList;
    if (_count <= 0) {
      resList = Collections.emptyList();
    } else if (groupBy == null) {
      resList = ListMerger.mergeLists(_offset, _count, iterList, MERGE_COMPARATOR);
    } else {
      resList = mergeGrouped(iterList);
    }

    Map<String, FacetHandler<?>> facetHandlerMap = _boboBrowser.getFacetHandlerMap();
    return buildHits(resList.toArray(new MyScoreDoc[resList.size()]), _sortFields,
      facetHandlerMap, _fetchStoredFields, _termVectorsToFetch, groupBy, _groupAccessibles);
  }

  /**
   * Merges the per-segment iterators and keeps the first hit of every distinct group value,
   * skipping the first {@code _offset} groups and stopping after {@code _count} hits. Also
   * builds {@code _groupAccessibles} when per-field facet accessible lists were collected.
   */
  private List<MyScoreDoc> mergeGrouped(ArrayList<Iterator<MyScoreDoc>> iterList) {
    int rawGroupValueType = 0; // 0: unknown, 1: normal, 2: long[]
    // Reused lookup key so that long[] values are compared through the wrapper.
    PrimitiveLongArrayWrapper primitiveLongArrayWrapperTmp = new PrimitiveLongArrayWrapper(null);

    if (_facetAccessibleLists != null) {
      _groupAccessibles = new CombinedFacetAccessible[_facetAccessibleLists.length];
      for (int i = 0; i < _facetAccessibleLists.length; ++i) {
        _groupAccessibles[i] = new CombinedFacetAccessible(new FacetSpec(),
            _facetAccessibleLists[i]);
      }
    }

    List<MyScoreDoc> resList = new ArrayList<MyScoreDoc>(_count);
    Iterator<MyScoreDoc> mergedIter = ListMerger.mergeLists(iterList, MERGE_COMPARATOR);
    Set<Object> groupSet = new HashSet<Object>(_offset + _count);
    int offsetLeft = _offset;

    while (mergedIter.hasNext()) {
      MyScoreDoc scoreDoc = mergedIter.next();
      Object[] vals = groupBy.getRawFieldValues(scoreDoc.reader, scoreDoc.doc);
      Object rawGroupValue = null;
      if (vals != null && vals.length > 0) {
        rawGroupValue = vals[0];
      }

      if (rawGroupValueType == 0 && rawGroupValue != null) {
        rawGroupValueType = (rawGroupValue instanceof long[]) ? 2 : 1;
      }
      if (rawGroupValueType == 2) {
        primitiveLongArrayWrapperTmp.data = (long[]) rawGroupValue;
        rawGroupValue = primitiveLongArrayWrapperTmp;
      }

      if (!groupSet.contains(rawGroupValue)) {
        if (offsetLeft > 0) {
          --offsetLeft;
        } else {
          resList.add(scoreDoc);
          if (resList.size() >= _count) {
            break;
          }
        }
        // Remember the group under the same kind of key used for the lookup above. Previously a
        // long[] wrapper was stored unconditionally, so non-long[] group values never matched and
        // the same group could be returned more than once.
        if (rawGroupValueType == 2) {
          // The scratch wrapper is mutated on the next iteration, so store a separate wrapper.
          groupSet.add(new PrimitiveLongArrayWrapper(primitiveLongArrayWrapperTmp.data));
        } else {
          groupSet.add(rawGroupValue);
        }
      }
    }
    return resList;
  }

  /**
   * Converts merged score docs into {@link BrowseHit}s, filling in stored fields, term vectors,
   * facet field values, global doc id, score, sort value and group information as requested.
   *
   * @param scoreDocs hits to convert; the result has the same length and order
   * @param sortFields sort fields (not read by this method)
   * @param facetHandlerMap facet handlers whose field values are attached to every hit
   * @param fetchStoredFields whether to load the stored document for each hit
   * @param termVectorsToFetch fields whose term vectors are attached; {@code null} or empty to
   *        skip
   * @param groupBy group-by facet handler, or {@code null} when not grouping
   * @param groupAccessibles combined group facets used to look up the group hit count; may be
   *        {@code null}
   * @return the built hits
   * @throws IOException if reading stored fields or term vectors fails
   */
  protected static BrowseHit[] buildHits(MyScoreDoc[] scoreDocs, SortField[] sortFields,
      Map<String, FacetHandler<?>> facetHandlerMap, boolean fetchStoredFields,
      Set<String> termVectorsToFetch, FacetHandler<?> groupBy,
      CombinedFacetAccessible[] groupAccessibles) throws IOException {
    BrowseHit[] hits = new BrowseHit[scoreDocs.length];
    Collection<FacetHandler<?>> facetHandlers = facetHandlerMap.values();
    for (int i = scoreDocs.length - 1; i >= 0; i--) {
      MyScoreDoc fdoc = scoreDocs[i];
      BoboSegmentReader reader = fdoc.reader;
      BrowseHit hit = new BrowseHit();
      if (fetchStoredFields) {
        hit.setStoredFields(reader.document(fdoc.doc));
      }
      if (termVectorsToFetch != null && termVectorsToFetch.size() > 0) {
        hit.setTermVectorMap(loadTermVectors(reader, fdoc.doc, termVectorsToFetch));
      }

      Map<String, String[]> map = new HashMap<String, String[]>();
      Map<String, Object[]> rawMap = new HashMap<String, Object[]>();
      for (FacetHandler<?> facetHandler : facetHandlers) {
        map.put(facetHandler.getName(), facetHandler.getFieldValues(reader, fdoc.doc));
        rawMap.put(facetHandler.getName(), facetHandler.getRawFieldValues(reader, fdoc.doc));
      }
      hit.setFieldValues(map);
      hit.setRawFieldValues(rawMap);
      hit.setDocid(fdoc.doc + fdoc.queue.base); // segment-local id -> global id
      hit.setScore(fdoc.score);
      hit.setComparable(fdoc.getValue());

      if (groupBy != null) {
        hit.setGroupField(groupBy.getName());
        hit.setGroupValue(hit.getField(groupBy.getName()));
        hit.setRawGroupValue(hit.getRawField(groupBy.getName()));
        if (groupAccessibles != null && groupAccessibles.length > 0
            && hit.getGroupValue() != null) {
          BrowseFacet facet = groupAccessibles[0].getFacet(hit.getGroupValue());
          // No facet entry for this value: leave the group hit count untouched.
          if (facet != null) {
            hit.setGroupHitsCount(facet.getFacetValueHitCount());
          }
        }
      }
      hits[i] = hit;
    }
    return hits;
  }

  /**
   * Reads the term vectors of one document for the requested fields. Fields without a term vector
   * are omitted; the map is empty when the document has no term vectors at all.
   */
  private static Map<String, List<BoboTerm>> loadTermVectors(BoboSegmentReader reader, int docid,
      Set<String> termVectorsToFetch) throws IOException {
    Map<String, List<BoboTerm>> tvMap = new HashMap<String, List<BoboTerm>>();
    Fields fds = reader.getTermVectors(docid);
    if (fds == null) {
      // Guard against a document for which no term vectors are available.
      return tvMap;
    }
    for (String field : termVectorsToFetch) {
      Terms terms = fds.terms(field);
      if (terms == null) {
        continue;
      }

      TermsEnum termsEnum = terms.iterator(null);
      BytesRef text;
      DocsAndPositionsEnum docsAndPositions = null;
      List<BoboTerm> boboTermList = new ArrayList<BoboTerm>();

      while ((text = termsEnum.next()) != null) {
        BoboTerm boboTerm = new BoboTerm();
        boboTerm.term = text.utf8ToString();
        boboTerm.freq = (int) termsEnum.totalTermFreq();
        docsAndPositions = termsEnum.docsAndPositions(null, docsAndPositions);
        if (docsAndPositions != null) {
          docsAndPositions.nextDoc();
          boboTerm.positions = new ArrayList<Integer>();
          boboTerm.startOffsets = new ArrayList<Integer>();
          boboTerm.endOffsets = new ArrayList<Integer>();
          for (int t = 0; t < boboTerm.freq; ++t) {
            boboTerm.positions.add(docsAndPositions.nextPosition());
            boboTerm.startOffsets.add(docsAndPositions.startOffset());
            boboTerm.endOffsets.add(docsAndPositions.endOffset());
          }
        }
        boboTermList.add(boboTerm);
      }
      tvMap.put(field, boboTermList);
    }
    return tvMap;
  }
}