package org.fastcatsearch.ir.search;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.BytesRef;
import org.fastcatsearch.error.CoreErrorCode;
import org.fastcatsearch.error.SearchError;
import org.fastcatsearch.ir.analysis.AnalyzerPool;
import org.fastcatsearch.ir.common.IRException;
import org.fastcatsearch.ir.common.SettingException;
import org.fastcatsearch.ir.document.Document;
import org.fastcatsearch.ir.field.DocNoField;
import org.fastcatsearch.ir.field.Field;
import org.fastcatsearch.ir.field.ScoreField;
import org.fastcatsearch.ir.field.UnknownField;
import org.fastcatsearch.ir.filter.FilterException;
import org.fastcatsearch.ir.group.GroupDataMerger;
import org.fastcatsearch.ir.group.GroupHit;
import org.fastcatsearch.ir.group.GroupsData;
import org.fastcatsearch.ir.io.*;
import org.fastcatsearch.ir.query.*;
import org.fastcatsearch.ir.query.Term.Option;
import org.fastcatsearch.ir.search.clause.Clause;
import org.fastcatsearch.ir.search.clause.ClauseException;
import org.fastcatsearch.ir.settings.FieldSetting;
import org.fastcatsearch.ir.settings.FieldSetting.Type;
import org.fastcatsearch.ir.settings.Schema;
import org.fastcatsearch.ir.summary.BasicHighlightAndSummary;
import org.fastcatsearch.ir.util.Formatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
public class CollectionSearcher {
private static final Logger logger = LoggerFactory.getLogger(CollectionSearcher.class);

/** Id of the collection this searcher serves, taken from the handler. */
private final String collectionId;
/** Gives access to the collection's segments, schema, analyzers and query counter. */
private final CollectionHandler collectionHandler;
/** Highlighter/summarizer applied to field text when rows are built. */
private final HighlightAndSummary has;
/** First argument handed to the HybridHashSet used for bundle keys (system property "bundleMemMaxCount"). */
private final int bundleMemMaxCountLimit;
/** Second argument handed to the HybridHashSet used for bundle keys (system property "bundleHashBucket"). */
private final int bundleHashBucketSize;

/**
 * Creates a searcher bound to the given collection.
 *
 * <p>The bundle key-set settings are read from the system properties {@code bundleMemMaxCount}
 * and {@code bundleHashBucket}; each falls back to its built-in default when the property is
 * absent or is not a valid integer.
 *
 * @param collectionHandler handler of the collection to search
 */
public CollectionSearcher(CollectionHandler collectionHandler) {
    this.collectionId = collectionHandler.collectionId();
    this.collectionHandler = collectionHandler;
    this.has = new BasicHighlightAndSummary();
    // Settings for the file-backed hash set used by bundle search.
    this.bundleMemMaxCountLimit = intSystemProperty("bundleMemMaxCount", 10 * 10000);
    this.bundleHashBucketSize = intSystemProperty("bundleHashBucket", 100 * 10000);
}

/**
 * Reads an integer system property, returning {@code defaultValue} when it is unset or malformed.
 * A malformed value is logged instead of aborting construction with a NumberFormatException.
 */
private static int intSystemProperty(String name, int defaultValue) {
    String raw = System.getProperty(name);
    if (raw == null) {
        return defaultValue;
    }
    try {
        return Integer.parseInt(raw.trim());
    } catch (NumberFormatException e) {
        logger.warn("Invalid integer value for system property {}={}, using default {}",
                new Object[] { name, raw, Integer.valueOf(defaultValue) });
        return defaultValue;
    }
}
/**
 * Computes the group data of a query over every segment of the collection.
 *
 * @param q the query whose groups are evaluated
 * @return {@code null} when the query has no groups; the single segment's group data when there
 *         is exactly one segment; otherwise the merged group data of all segments
 * @throws SearchError when the collection has no segments
 * @throws Exception   anything raised by the segment searchers or the merger
 */
public GroupsData doGrouping(Query q) throws Exception {
    int segmentSize = collectionHandler.segmentSize();
    if (segmentSize == 0) {
        throw new SearchError(CoreErrorCode.COLLECTION_NOT_INDEXED, collectionId);
    }

    Groups groups = q.getGroups();
    if (groups == null) {
        return null;
    }

    if (segmentSize == 1) {
        // A single segment needs no merging.
        return collectionHandler.segmentSearcher(0).searchGroupHit(q).groupData();
    }

    // groups is known to be non-null here, so the merger always exists.
    GroupDataMerger dataMerger = new GroupDataMerger(groups, segmentSize);
    for (int i = 0; i < segmentSize; i++) {
        GroupHit groupHit = collectionHandler.segmentSearcher(i).searchGroupHit(q);
        dataMerger.put(groupHit.groupData());
    }
    return dataMerger.merge();
}
/**
 * Reads the documents for a list of document numbers and returns them in request order.
 *
 * <p>For each number the segments are scanned from the last to the first, and the document is
 * fetched from the first segment whose base number does not exceed it. A number for which no
 * such segment exists contributes nothing to the result.
 *
 * @param docIdList document numbers to load
 * @return the loaded documents
 * @throws IOException if reading a document fails
 */
@Deprecated
public List<Document> requestDocument(int[] docIdList) throws IOException {
    List<Document> loaded = new ArrayList<Document>(docIdList.length);
    int segmentCount = collectionHandler.segmentSize();
    for (int docNo : docIdList) {
        int seg = segmentCount;
        while (--seg >= 0) {
            if (collectionHandler.segmentReader(seg).segmentInfo().getBaseNumber() <= docNo) {
                loaded.add(collectionHandler.segmentReader(seg).segmentSearcher().getDocument(docNo));
                break;
            }
        }
    }
    return loaded;
}
/**
 * Reads a single document by its document number.
 *
 * <p>Segments are scanned from the last to the first; the document is fetched from the first
 * segment whose base number does not exceed {@code docNo}.
 *
 * @param docNo document number to load
 * @return the document, or {@code null} when no segment has a base number at or below {@code docNo}
 * @throws IOException if reading the document fails
 */
public Document requestDocument(int docNo) throws IOException {
    int seg = collectionHandler.segmentSize();
    while (seg-- > 0) {
        if (collectionHandler.segmentReader(seg).segmentInfo().getBaseNumber() > docNo) {
            // This segment starts after docNo; try the previous one.
            continue;
        }
        return collectionHandler.segmentReader(seg).segmentSearcher().getDocument(docNo);
    }
    return null;
}
/**
 * Runs the query with {@code forMerging} set to {@code false} and no boost list.
 * Delegates to {@link #searchInternal(Query, boolean, PkScoreList)}.
 *
 * @param q the query to execute
 * @return the result produced by the three-argument overload
 */
public InternalSearchResult searchInternal(Query q) throws IRException, IOException, SettingException {
return searchInternal(q, false, null);
}
/**
 * Runs the query without a boost list.
 * Delegates to {@link #searchInternal(Query, boolean, PkScoreList)} with a {@code null} boost list.
 *
 * @param q the query to execute
 * @param forMerging when the result is meant for merging, all {@code start + length} hits are fetched from the front.
 * @return the result produced by the three-argument overload
 * */
public InternalSearchResult searchInternal(Query q, boolean forMerging) throws IRException, IOException, SettingException {
return searchInternal(q, forMerging, null);
}
/**
 * Searches every segment of the collection, ranks the hits and assembles the internal result.
 *
 * <p>Steps, in order: the query counter is incremented; a ranker is created with
 * {@code start + rows - 1} (a DefaultRanker when the query has no sorts, otherwise the one built by
 * the sorts); every hit of every segment is pushed into that ranker; the ranker is drained through
 * a FixedHitStack into a FixedHitQueue created with {@code rows} (or {@code start + rows - 1} when
 * {@code forMerging} is set); group data is merged when the query has groups; and bundle documents
 * are attached when the query has a bundle.
 *
 * <p>The reported total count is incremented once for each hit without a bundle key, and once for
 * each hit whose bundle key {@code bundleKeySet.add} accepts.
 *
 * @param q          the query; its meta, groups, sorts and bundle are read
 * @param forMerging when {@code true} every ranked hit is kept, otherwise only hits from rank {@code start} on
 * @param boostList  passed unchanged to each segment's {@code searchHitReader}
 * @return hits, their count, the total count, merged group data, highlight info and explanations
 * @throws SearchError when the collection has no segments
 * @throws IRException wrapping an IOException, FilterException or ClauseException raised while reading segment hits
 */
public InternalSearchResult searchInternal(Query q, boolean forMerging, PkScoreList boostList) throws IRException, IOException, SettingException {
int segmentSize = collectionHandler.segmentSize();
if (segmentSize == 0) {
throw new SearchError(CoreErrorCode.COLLECTION_NOT_INDEXED, collectionId);
}
// logger.debug("searchInternal incrementCount > {} ", q);
collectionHandler.queryCounter().incrementCount();
Schema schema = collectionHandler.schema();
Metadata meta = q.getMeta();
int start = meta.start();
int rows = meta.rows();
int sortMaxSize = start + rows - 1;
int resultRows = rows;
if (forMerging) {// take everything from the front.
resultRows = sortMaxSize;
}
// if(collectionId == null){
// collectionId = meta.collectionId();
// }
Groups groups = q.getGroups();
Sorts sorts = q.getSorts();
// FixedMinHeap<FixedHitReader> hitMerger = null;
FixedMaxPriorityQueue<HitElement> ranker = null;
if (sorts == null) {
// hitMerger = sorts.createMerger(schema, segmentSize);
//TODO
//BundleDefaultRanker (fieldIndexesReader, bundle)
ranker = new DefaultRanker(sortMaxSize);
} else {
// hitMerger = new FixedMinHeap<FixedHitReader>(segmentSize);
// The ranker carries the sort logic.
// Inside the ranker, the field type and sort option are checked to pick the suitable byte[] comparison.
ranker = sorts.createRanker(schema, sortMaxSize);
}
GroupDataMerger dataMerger = null;
if (groups != null) {
dataMerger = new GroupDataMerger(groups, segmentSize);
}
HighlightInfo highlightInfo = null;
int totalSize = 0;
// bundleKeySet is used to count the number of identical groups.
// In a bundle search the total result count is the number of bundles rather than the number of documents, so group duplicates must be removed when counting.
// Putting 32-byte keys into a HashSet costs roughly 100MB for 1 million keys and 1GB for 10 million.
// Most cases are expected to stay below 1 million, so this is done in memory.
// Set<BytesRef> bundleKeySet = new HashSet<BytesRef>();
int keySize = 0;
HybridHashSet bundleKeySet = new HybridHashSet(bundleMemMaxCountLimit, bundleHashBucketSize, keySize);
List<Explanation> explanationList = null;
BitSet[] segmentDocHitSetList = null;
try {
segmentDocHitSetList = new BitSet[segmentSize];
for (int i = 0; i < segmentSize; i++) {
// Every result of the segment has to be seen for the duplicate check, so a reader is obtained.
HitReader hitReader = collectionHandler.segmentSearcher(i).searchHitReader(q, boostList);
//
//
//FIXME is highlightInfo overwritten repeatedly?
// (As written, only the first non-null highlightInfo is kept.)
//
if (highlightInfo == null) {
highlightInfo = hitReader.highlightInfo();
}
segmentDocHitSetList[i] = new BitSet();
// posting data
HitElement e = null;
while ((e = hitReader.next()) != null) {
if (e.getBundleKey() != null) {
// The key size is taken from the first bundle key encountered.
if(keySize == 0) {
keySize = e.getBundleKey().length();
bundleKeySet.setKeySize(keySize);
}
// Remember this hit's docNo; the per-segment bit sets are later passed to fillBundleResult.
segmentDocHitSetList[i].set(e.docNo());
if(bundleKeySet.add(e.getBundleKey())) {
totalSize++;
}
} else {
totalSize++;
}
ranker.push(e);
// logger.debug("heap insert hit > {}", e.docNo());
}
// Put GroupResult
if (dataMerger != null) {
dataMerger.put(hitReader.makeGroupData());
}
if(hitReader.explanation() != null){
if(explanationList == null){
explanationList = new ArrayList<Explanation>();
}
hitReader.explanation().setSegmentId(i);
hitReader.explanation().setCollectionId(collectionId);
explanationList.add(hitReader.explanation());
}
}
} catch (IOException e) {
throw new IRException(e);
} catch (FilterException e) {
throw new IRException(e);
} catch (ClauseException e) {
throw new IRException(e);
} finally {
// Always clean the key set, whether or not the segment scan succeeded.
if(bundleKeySet != null) {
bundleKeySet.clean();
}
}
int rankerSize = ranker.size();
// logger.debug("PAGE start={}, size={}", start, rankerSize);
// Drain the ranker into a stack, then read the stack back to walk the hits by rank.
FixedHitStack hitStack = new FixedHitStack(rankerSize);
for (int i = 1; i <= rankerSize; i++) {
HitElement el = ranker.pop();
hitStack.push(el);
}
// c is the 1-based rank of the hit currently being read.
int c = 1;
FixedHitReader fixedHitReader = hitStack.getReader();
FixedHitQueue totalHit = new FixedHitQueue(resultRows);
while(fixedHitReader.next()) {
HitElement el = fixedHitReader.read();
// logger.debug("{} rank hit seg#{} {}", c, el.segmentSequence(), el.docNo(), el.score(), el.rowExplanations());
if (forMerging) {
// For merging, everything is inserted from the beginning.
// Original note: add only when there is no bundle key or it is not yet in totalHit, and drop the rest.
// NOTE(review): the code below pushes unconditionally; no such bundle-key check is performed here.
totalHit.push(el);
} else if (c >= start) {
// When the result is not for later merging, only hits from start onwards are taken.
// logger.debug("insert#{} > {}", c, el.docNo());
totalHit.push(el);
}
c++;
}
GroupsData groupData = null;
if (dataMerger != null) {
groupData = dataMerger.merge();
}
HitElement[] hitElementList = totalHit.getHitElementList();
int realSize = totalHit.size();
/*
* When a bundle is requested, look up the bundled child documents.
* */
Bundle bundle = q.getBundle();
if(bundle != null) {
// The lookup must stay within the hits of this search result, so it is filtered with the bit sets.
fillBundleResult(schema, segmentSize, hitElementList, realSize, bundle, segmentDocHitSetList);
}
return new InternalSearchResult(collectionId, hitElementList, realSize, totalSize, groupData, highlightInfo, explanationList);
}
/**
 * Looks up the bundled documents of each hit and attaches them to the hit.
 *
 * <p>For every one of the first {@code size} hits that has a bundle key, a term clause on the
 * bundle's field index is searched in every segment, restricted by that segment's entry in
 * {@code segmentDocFilterList}. When the segments report more than one match in total, the
 * per-segment results are merged and up to {@code bundle.getRows()} entries are stored on the hit
 * via {@code setBundleDocIdList}, together with the total via {@code setTotalBundleSize}.
 *
 * @param schema               schema used to resolve the bundle field type and to build the merger
 * @param segmentSize          number of segments to search
 * @param hitElementList       hits to enrich; modified in place
 * @param size                 number of leading entries of {@code hitElementList} to process
 * @param bundle               bundle request (field index id, sorts, rows, option)
 * @param segmentDocFilterList per-segment bit sets passed to {@code searchIndex}
 * @throws IRException wrapping a ClauseException or IOException raised during the lookup
 */
private void fillBundleResult(Schema schema, int segmentSize, HitElement[] hitElementList, int size, Bundle bundle, BitSet[] segmentDocFilterList) throws IRException{
/*
* Look at the bundle key of each element and check how many bundled child documents exist.
* They are stored only when there are 2 or more; otherwise they are dropped.
*/
String fieldIndexId = bundle.getFieldIndexId();
Sorts bundleSorts = bundle.getSorts();
int bundleRows = bundle.getRows();
int bundleStart = 1;
int bundleOption = bundle.getOption();
boolean isParentInclude = ( bundleOption == Bundle.OPT_PARENT_INCLUDE );
FieldSetting bundleFieldSetting = schema.getFieldSetting(bundle.getFieldIndexId());
Type bundleFieldType = bundleFieldSetting.getType();
try {
for (int k = 0; k < size; k++) {
int totalSize = 0;
// Build a clause from the bundle key.
int mainDocNo = hitElementList[k].docNo();
BytesRef bundleKey = hitElementList[k].getBundleKey();
if(bundleKey == null) {
continue;
}
String bundleStringKey = Formatter.getContentString(bundleKey, bundleFieldType);
if(bundleStringKey == null) {
continue;
}
Clause bundleClause = new Clause(new Term(fieldIndexId, bundleStringKey));
Hit[] segmentHitList = new Hit[segmentSize];
for (int i = 0; i < segmentSize; i++) {
// Collect the results per bundle key.
segmentHitList[i] = collectionHandler.segmentSearcher(i).searchIndex(bundleClause, bundleSorts, bundleStart, bundleRows, segmentDocFilterList[i]);
totalSize += segmentHitList[i].totalCount();
}
// A bundle is only valid with 2 or more documents.
if(totalSize > 1) {
FixedMinHeap<FixedHitReader> hitMerger = null;
if (bundleSorts != null) {
hitMerger = bundleSorts.createMerger(schema, segmentSize);
} else {
hitMerger = new FixedMinHeap<FixedHitReader>(segmentSize);
}
for (int i = 0; i < segmentSize; i++) {
FixedHitReader hitReader = segmentHitList[i].hitStack().getReader();
// // posting data
// Only readers that have at least one hit take part in the merge.
if (hitReader.next()) {
hitMerger.push(hitReader);
}
}
int realSize = Math.min(bundleRows, totalSize);
DocIdList bundleDocIdList = new DocIdList(realSize);
// c is the 1-based position among accepted hits; n counts the entries actually stored.
int c = 1, n = 0;
while (hitMerger.size() > 0) {
FixedHitReader r = hitMerger.peek();
HitElement el = r.read();
// The document identical to mainDocNo is excluded.
// Among the grouped documents, the one equal to the group's representative is not shown.
// However, it is added when the include-parent option is set.
if(isParentInclude || el.docNo() != mainDocNo) {
if (c >= bundleStart) {
bundleDocIdList.add(el.segmentSequence(), el.docNo());
n++;
//logger.debug("[{}] {}", el.segmentSequence() ,el.docNo());
}
c++;
}
// Finish early once the result is complete.
if (n == bundleRows) {
break;
}
if (!r.next()) {
// A reader that has been read to the end is discarded.
hitMerger.pop();
}
hitMerger.heapify();
}
// When the representative is not included, reduce the count by one.
if(!isParentInclude) {
totalSize--;
}
hitElementList[k].setBundleDocIdList(bundleDocIdList);
hitElementList[k].setTotalBundleSize(totalSize);
}
}
} catch (ClauseException e) {
throw new IRException(e);
} catch (IOException e) {
throw new IRException(e);
}
}
/**
 * Loads the documents named by {@code list} and renders them into rows for the requested views.
 *
 * <p>Only the schema fields referenced by a view are selected when documents are read. When an
 * entry of {@code list} carries a bundle doc-id list, those bundled documents are loaded and
 * rendered as well. Segment searchers are obtained lazily and reused for both the main and the
 * bundled documents.
 *
 * @param list          documents to load (segment sequence, doc number, optional bundle list)
 * @param views         fields to output, in output order
 * @param tags          highlight tags, passed through to the highlighter
 * @param highlightInfo highlight settings, passed through to row creation
 * @return the rows, the bundle rows ({@code null} when no entry had bundled documents) and the view field ids
 * @throws IOException if reading a document or highlighting fails
 */
public DocumentResult searchDocument(DocIdList list, ViewContainer views, String[] tags, HighlightInfo highlightInfo) throws IOException {
    int realSize = list.size();
    Row[] row = new Row[realSize];
    Row[][] bundleRow = null;

    int fieldSize = collectionHandler.schema().getFieldSize();
    int viewSize = views.size();
    int[] fieldSequenceList = new int[viewSize];
    String[] fieldIdList = new String[viewSize];
    // Only fields whose index is true get their value filled in.
    boolean[] fieldSelectOption = new boolean[fieldSize];

    for (int i = 0; i < viewSize; i++) {
        String fieldId = views.get(i).fieldId();
        fieldIdList[i] = fieldId;
        int sequence;
        if (fieldId.equalsIgnoreCase(ScoreField.fieldName)) {
            sequence = ScoreField.fieldNumber;
        } else if (fieldId.equalsIgnoreCase(DocNoField.fieldName)) {
            sequence = DocNoField.fieldNumber;
        } else {
            sequence = collectionHandler.schema().getFieldSequence(fieldId);
            if (sequence != -1) {
                fieldSelectOption[sequence] = true;
            }
        }
        fieldSequenceList[i] = sequence;
    }

    Document[] eachDocList = new Document[realSize];
    Document[][] eachBundleDocList = null;

    // Cache of SegmentSearchers indexed by segment sequence; filled lazily.
    // Starts with length 5 and grows when a higher segment sequence is requested.
    SegmentSearcher[] searcherCache = new SegmentSearcher[5];

    for (int i = 0; i < realSize; i++) {
        int segmentSequence = list.segmentSequence(i);
        int docNo = list.docNo(i);
        DocIdList bundleDocIdList = list.bundleDocIdList(i);

        searcherCache = loadSearcher(searcherCache, segmentSequence);
        eachDocList[i] = searcherCache[segmentSequence].getDocument(docNo, fieldSelectOption);

        if (bundleDocIdList == null) {
            continue;
        }
        // Allocated only when at least one entry has bundled documents.
        if (eachBundleDocList == null) {
            eachBundleDocList = new Document[realSize][];
        }
        int bundleCount = bundleDocIdList.size();
        Document[] bundleDocs = new Document[bundleCount];
        for (int j = 0; j < bundleCount; j++) {
            int bundleSegmentSequence = bundleDocIdList.segmentSequence(j);
            int bundleDocNo = bundleDocIdList.docNo(j);
            // Reuse the cached searcher instead of requesting a new one per bundled document.
            searcherCache = loadSearcher(searcherCache, bundleSegmentSequence);
            bundleDocs[j] = searcherCache[bundleSegmentSequence].getDocument(bundleDocNo, fieldSelectOption);
        }
        eachBundleDocList[i] = bundleDocs;
    }

    for (int i = 0; i < realSize; i++) {
        row[i] = makeRowFromDocument(eachDocList[i], views, fieldSequenceList, tags, highlightInfo);
        if (eachBundleDocList == null || eachBundleDocList[i] == null) {
            continue;
        }
        Document[] bundleDocList = eachBundleDocList[i];
        // bundleRow is created only when bundled documents exist.
        if (bundleRow == null) {
            bundleRow = new Row[realSize][];
        }
        bundleRow[i] = new Row[bundleDocList.length];
        for (int j = 0; j < bundleDocList.length; j++) {
            bundleRow[i][j] = makeRowFromDocument(bundleDocList[j], views, fieldSequenceList, tags, highlightInfo);
        }
    }
    return new DocumentResult(row, bundleRow, fieldIdList);
}

/**
 * Ensures {@code cache[segmentSequence]} holds a searcher, growing the array in steps of 5 when
 * the sequence is out of range. Returns the array to use from now on (the same or a grown copy).
 */
private SegmentSearcher[] loadSearcher(SegmentSearcher[] cache, int segmentSequence) throws IOException {
    if (segmentSequence >= cache.length) {
        int size = cache.length;
        while (segmentSequence >= size) {
            size += 5;
        }
        SegmentSearcher[] grown = new SegmentSearcher[size];
        System.arraycopy(cache, 0, grown, 0, cache.length);
        cache = grown;
    }
    if (cache[segmentSequence] == null) {
        cache[segmentSequence] = collectionHandler.segmentReader(segmentSequence).segmentSearcher();
    }
    return cache;
}
/**
 * Renders one document into a row with one cell per view.
 *
 * <p>Score cells are left {@code null} (the score is not available here), doc-number cells hold
 * the document id, and unknown-field cells hold {@code UnknownField.value()}. Any other cell holds
 * the field text: highlighted when the highlight info enables highlighting for the field and
 * supplies both analyzer ids and a query term, otherwise truncated to the view's snippet size
 * when the view asks for a summary.
 *
 * @param document          source document
 * @param views             output views, one per cell
 * @param fieldSequenceList field sequence of each view, parallel to {@code views}
 * @param tags              highlight tags passed to the highlighter
 * @param highlightInfo     highlight settings; may be {@code null}, which disables highlighting
 * @return the populated row
 * @throws IOException if highlighting fails
 */
private Row makeRowFromDocument(Document document, ViewContainer views, int[] fieldSequenceList, String[] tags, HighlightInfo highlightInfo) throws IOException {
    Row rows = new Row(views.size());
    for (int j = 0; j < views.size(); j++) {
        View view = views.get(j);
        int fieldSequence = fieldSequenceList[j];
        if (fieldSequence == ScoreField.fieldNumber) {
            // The score cannot be known at this point, so the cell is left empty.
            rows.put(j, null);
        } else if (fieldSequence == DocNoField.fieldNumber) {
            rows.put(j, Integer.toString(document.getDocId()).toCharArray());
        } else if (fieldSequence == UnknownField.fieldNumber) {
            rows.put(j, UnknownField.value().toCharArray());
        } else {
            Field field = document.get(fieldSequence);
            String text = null;
            if (field != null) {
                text = field.toString();
            }

            boolean isHighlightSummary = false;
            if (has != null && text != null && highlightInfo != null) {
                // Either highlighting alone is performed, or a summary as well when the view has a snippet size.
                String fieldId = view.fieldId();
                Option searchOption = highlightInfo.getOption(fieldId);
                // The option is checked for null like the analyzer ids and query term below, so a
                // field without highlight settings falls through to the plain summary path.
                if (searchOption != null && searchOption.useHighlight()) {
                    String indexAnalyzerId = highlightInfo.getIndexAnalyzerId(fieldId);
                    String queryAnalyzerId = highlightInfo.getQueryAnalyzerId(fieldId);
                    String queryTerm = highlightInfo.getQueryTerm(fieldId);
                    if (indexAnalyzerId != null && queryAnalyzerId != null && queryTerm != null) {
                        text = getHighlightedSnippet(fieldId, text, indexAnalyzerId, queryAnalyzerId, queryTerm, tags, view, searchOption);
                        isHighlightSummary = true;
                    }
                }
            }

            if (!isHighlightSummary && view.isSummarize()) {
                // Highlighting was not possible (e.g. not a search field), so summarize by cutting from the front.
                if (text != null && text.length() > view.snippetSize()) {
                    text = text.substring(0, view.snippetSize());
                }
            }

            if (text != null) {
                rows.put(j, text.toCharArray());
            } else {
                rows.put(j, null);
            }
        }
    }
    return rows;
}
/**
 * Highlights (and possibly summarizes) {@code text} using analyzers borrowed from the collection's pools.
 *
 * <p>When the query and index analyzer ids are equal, one analyzer instance serves both roles.
 * Every analyzer that was borrowed is returned to its pool, including when highlighting throws or
 * when only one of the two analyzers could be obtained. If a required pool or analyzer is not
 * available the text is returned unchanged.
 *
 * @param fieldId         field being rendered
 * @param text            field text to highlight
 * @param indexAnalyzerId id of the index-time analyzer pool
 * @param queryAnalyzerId id of the query-time analyzer pool
 * @param queryString     query term to highlight
 * @param tags            highlight tags
 * @param view            view supplying snippet and fragment sizes
 * @param searchOption    search option passed to the highlighter
 * @return the highlighted text, or the original text when highlighting could not be performed
 * @throws IOException if highlighting fails
 */
private String getHighlightedSnippet(String fieldId, String text, String indexAnalyzerId, String queryAnalyzerId, String queryString, String[] tags, View view, Option searchOption) throws IOException {
    AnalyzerPool queryAnalyzerPool = collectionHandler.analyzerPoolManager().getPool(queryAnalyzerId);
    // When both analyzer ids are the same, a single pool (and a single analyzer) is shared.
    boolean isSamePool = queryAnalyzerId.equals(indexAnalyzerId);
    AnalyzerPool indexAnalyzerPool;
    if (isSamePool) {
        indexAnalyzerPool = queryAnalyzerPool;
    } else {
        indexAnalyzerPool = collectionHandler.analyzerPoolManager().getPool(indexAnalyzerId);
    }
    if (queryAnalyzerPool == null || indexAnalyzerPool == null) {
        // Without both pools nothing can be borrowed; leave the text as it is.
        return text;
    }

    Analyzer queryAnalyzer = null;
    Analyzer indexAnalyzer = null;
    try {
        // Borrowing happens inside the try so that a failure on the second borrow still releases the first.
        queryAnalyzer = queryAnalyzerPool.getFromPool();
        if (isSamePool) {
            indexAnalyzer = queryAnalyzer;
        } else {
            indexAnalyzer = indexAnalyzerPool.getFromPool();
        }
        if (indexAnalyzer != null && queryAnalyzer != null) {
            text = has.highlight(fieldId, indexAnalyzer, queryAnalyzer, text, queryString, tags, view.snippetSize(), view.fragmentSize(), searchOption);
        }
    } finally {
        // With a shared pool the index analyzer is the query analyzer, so it is released only once.
        if (!isSamePool && indexAnalyzer != null) {
            indexAnalyzerPool.releaseToPool(indexAnalyzer);
        }
        if (queryAnalyzer != null) {
            queryAnalyzerPool.releaseToPool(queryAnalyzer);
        }
    }
    return text;
}
}