package org.fastcatsearch.ir.search;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.BytesRef;
import org.fastcatsearch.error.CoreErrorCode;
import org.fastcatsearch.error.SearchError;
import org.fastcatsearch.ir.analysis.AnalyzerPool;
import org.fastcatsearch.ir.common.IRException;
import org.fastcatsearch.ir.common.SettingException;
import org.fastcatsearch.ir.document.Document;
import org.fastcatsearch.ir.field.DocNoField;
import org.fastcatsearch.ir.field.Field;
import org.fastcatsearch.ir.field.ScoreField;
import org.fastcatsearch.ir.field.UnknownField;
import org.fastcatsearch.ir.filter.FilterException;
import org.fastcatsearch.ir.group.GroupDataMerger;
import org.fastcatsearch.ir.group.GroupHit;
import org.fastcatsearch.ir.group.GroupsData;
import org.fastcatsearch.ir.io.*;
import org.fastcatsearch.ir.query.*;
import org.fastcatsearch.ir.query.Term.Option;
import org.fastcatsearch.ir.search.clause.Clause;
import org.fastcatsearch.ir.search.clause.ClauseException;
import org.fastcatsearch.ir.settings.FieldSetting;
import org.fastcatsearch.ir.settings.FieldSetting.Type;
import org.fastcatsearch.ir.settings.Schema;
import org.fastcatsearch.ir.summary.BasicHighlightAndSummary;
import org.fastcatsearch.ir.util.Formatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Searches a single collection by querying each of its segments and merging the
 * per-segment hits, group data and bundle (grouped-document) results.
 */
public class CollectionSearcher {

    private static Logger logger = LoggerFactory.getLogger(CollectionSearcher.class);

    private String collectionId;
    private CollectionHandler collectionHandler;
    private HighlightAndSummary has;

    private int bundleMemMaxCountLimit = 10 * 10000;
    private int bundleHashBucketSize = 100 * 10000;

    public CollectionSearcher(CollectionHandler collectionHandler) {
        this.collectionId = collectionHandler.collectionId();
        this.collectionHandler = collectionHandler;
        has = new BasicHighlightAndSummary();

        // Settings for the file-based hash set used by bundle (grouped) search.
        String bundleMemMaxCount = System.getProperty("bundleMemMaxCount");
        String bundleHashBucket = System.getProperty("bundleHashBucket");
        if (bundleMemMaxCount != null) {
            bundleMemMaxCountLimit = Integer.parseInt(bundleMemMaxCount);
        }
        if (bundleHashBucket != null) {
            bundleHashBucketSize = Integer.parseInt(bundleHashBucket);
        }
    }

    public GroupsData doGrouping(Query q) throws Exception {
        int segmentSize = collectionHandler.segmentSize();
        if (segmentSize == 0) {
            throw new SearchError(CoreErrorCode.COLLECTION_NOT_INDEXED, collectionId);
        }

        Groups groups = q.getGroups();
        if (groups == null) {
            return null;
        }

        if (segmentSize == 1) {
            // Only one segment, so no merging is needed.
            GroupHit groupHit = collectionHandler.segmentSearcher(0).searchGroupHit(q);
            return groupHit.groupData();
        } else {
            GroupDataMerger dataMerger = null;
            if (groups != null) {
                dataMerger = new GroupDataMerger(groups, segmentSize);
            }
            for (int i = 0; i < segmentSize; i++) {
                GroupHit groupHit = collectionHandler.segmentSearcher(i).searchGroupHit(q);
                if (dataMerger != null) {
                    dataMerger.put(groupHit.groupData());
                }
            }
            GroupsData groupData = null;
            if (dataMerger != null) {
                groupData = dataMerger.merge();
            }
            return groupData;
        }
    }

    // Reads the actual documents for the given doc-id list and returns them as a list.
    @Deprecated
    public List<Document> requestDocument(int[] docIdList) throws IOException {
        // Returns the document list corresponding to the requested ids.
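        // Each segment owns a contiguous range of global document numbers starting at its
        // base number, so scanning segments from newest to oldest finds the owner of a docNo.
        // Worked example (hypothetical base numbers): with bases 0, 1000 and 5000, docNo 1234
        // falls into the second segment, because the segment with base 5000 is checked first
        // and rejected, and 1234 >= 1000.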
        List<Document> documentList = new ArrayList<Document>(docIdList.length);
        int segmentSize = collectionHandler.segmentSize();
        for (int i = 0; i < docIdList.length; i++) {
            int docNo = docIdList[i];
            // Find the segment whose base number covers this doc number.
            for (int m = segmentSize - 1; m >= 0; m--) {
                if (docNo >= collectionHandler.segmentReader(m).segmentInfo().getBaseNumber()) {
                    documentList.add(collectionHandler.segmentReader(m).segmentSearcher().getDocument(docNo));
                    break;
                }
            }
        }
        return documentList;
    }

    public Document requestDocument(int docNo) throws IOException {
        int segmentSize = collectionHandler.segmentSize();
        // Find the segment whose base number covers this doc number.
        for (int m = segmentSize - 1; m >= 0; m--) {
            if (docNo >= collectionHandler.segmentReader(m).segmentInfo().getBaseNumber()) {
                return collectionHandler.segmentReader(m).segmentSearcher().getDocument(docNo);
            }
        }
        return null;
    }

    public InternalSearchResult searchInternal(Query q) throws IRException, IOException, SettingException {
        return searchInternal(q, false, null);
    }

    /**
     * @param forMerging if true (the result is used for merging), fetch all start + length hits from the beginning.
     */
    public InternalSearchResult searchInternal(Query q, boolean forMerging) throws IRException, IOException, SettingException {
        return searchInternal(q, forMerging, null);
    }

    public InternalSearchResult searchInternal(Query q, boolean forMerging, PkScoreList boostList) throws IRException, IOException, SettingException {
        int segmentSize = collectionHandler.segmentSize();
        if (segmentSize == 0) {
            throw new SearchError(CoreErrorCode.COLLECTION_NOT_INDEXED, collectionId);
        }

        // logger.debug("searchInternal incrementCount > {} ", q);
        collectionHandler.queryCounter().incrementCount();

        Schema schema = collectionHandler.schema();
        Metadata meta = q.getMeta();
        int start = meta.start();
        int rows = meta.rows();
        int sortMaxSize = start + rows - 1;
        int resultRows = rows;
        if (forMerging) {
            // For merging, keep everything from the beginning.
            resultRows = sortMaxSize;
        }
        // if(collectionId == null){
        //     collectionId = meta.collectionId();
        // }
        Groups groups = q.getGroups();
        Sorts sorts = q.getSorts();

        // FixedMinHeap<FixedHitReader> hitMerger = null;
        FixedMaxPriorityQueue<HitElement> ranker = null;
        if (sorts == null) {
            // hitMerger = sorts.createMerger(schema, segmentSize);
            // TODO BundleDefaultRanker(fieldIndexesReader, bundle)
            ranker = new DefaultRanker(sortMaxSize);
        } else {
            // hitMerger = new FixedMinHeap<FixedHitReader>(segmentSize);
            // The ranker holds the sort logic: it inspects the field types and sort options
            // and performs the appropriate byte[] comparison.
            ranker = sorts.createRanker(schema, sortMaxSize);
        }

        GroupDataMerger dataMerger = null;
        if (groups != null) {
            dataMerger = new GroupDataMerger(groups, segmentSize);
        }

        HighlightInfo highlightInfo = null;
        int totalSize = 0;

        // bundleKeySet is used to count identical groups (bundles).
        // In bundle search the total result count is the number of bundles, not the number of
        // documents, so duplicate group keys must be removed before counting.
        // Putting 32-byte keys into a HashSet costs roughly 100MB for 1 million keys and about
        // 1GB for 10 million; most collections stay below 1 million, so this is kept in memory.
        // Set<BytesRef> bundleKeySet = new HashSet<BytesRef>();
        int keySize = 0;
        HybridHashSet bundleKeySet = new HybridHashSet(bundleMemMaxCountLimit, bundleHashBucketSize, keySize);
        List<Explanation> explanationList = null;
        BitSet[] segmentDocHitSetList = null;
        try {
            segmentDocHitSetList = new BitSet[segmentSize];
            for (int i = 0; i < segmentSize; i++) {
                // Duplicate checking needs every hit of the segment, so obtain a reader
                // rather than a fixed-size result.
                HitReader hitReader = collectionHandler.segmentSearcher(i).searchHitReader(q, boostList);

                // FIXME: should highlightInfo be overwritten for every segment?
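                // highlightInfo carries, per field, the analyzer ids and query terms that
                // makeRowFromDocument later uses for highlighting; it appears to be derived
                // from the query and schema rather than from per-segment data, so capturing
                // it once should be sufficient.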
                if (highlightInfo == null) {
                    highlightInfo = hitReader.highlightInfo();
                }

                segmentDocHitSetList[i] = new BitSet();

                // posting data
                HitElement e = null;
                while ((e = hitReader.next()) != null) {
                    if (e.getBundleKey() != null) {
                        if (keySize == 0) {
                            keySize = e.getBundleKey().length();
                            bundleKeySet.setKeySize(keySize);
                        }
                        segmentDocHitSetList[i].set(e.docNo());
                        if (bundleKeySet.add(e.getBundleKey())) {
                            totalSize++;
                        }
                    } else {
                        totalSize++;
                    }
                    ranker.push(e);
                    // logger.debug("heap insert hit > {}", e.docNo());
                }

                // Put GroupResult
                if (dataMerger != null) {
                    dataMerger.put(hitReader.makeGroupData());
                }

                if (hitReader.explanation() != null) {
                    if (explanationList == null) {
                        explanationList = new ArrayList<Explanation>();
                    }
                    hitReader.explanation().setSegmentId(i);
                    hitReader.explanation().setCollectionId(collectionId);
                    explanationList.add(hitReader.explanation());
                }
            }
        } catch (IOException e) {
            throw new IRException(e);
        } catch (FilterException e) {
            throw new IRException(e);
        } catch (ClauseException e) {
            throw new IRException(e);
        } finally {
            if (bundleKeySet != null) {
                bundleKeySet.clean();
            }
        }

        int rankerSize = ranker.size();
        // logger.debug("PAGE start={}, size={}", start, rankerSize);
        FixedHitStack hitStack = new FixedHitStack(rankerSize);
        for (int i = 1; i <= rankerSize; i++) {
            HitElement el = ranker.pop();
            hitStack.push(el);
        }

        int c = 1;
        FixedHitReader fixedHitReader = hitStack.getReader();
        FixedHitQueue totalHit = new FixedHitQueue(resultRows);
        while (fixedHitReader.next()) {
            HitElement el = fixedHitReader.read();
            // logger.debug("{} rank hit seg#{} {}", c, el.segmentSequence(), el.docNo(), el.score(), el.rowExplanations());
            if (forMerging) {
                // For merging, push everything from the first hit onwards.
                // Add only when there is no bundle key or it is not yet in totalHit; discard the rest.
                totalHit.push(el);
            } else if (c >= start) {
                // When not merging, keep hits from position start onwards only.
                // logger.debug("insert#{} > {}", c, el.docNo());
                totalHit.push(el);
            }
            c++;
        }

        GroupsData groupData = null;
        if (dataMerger != null) {
            groupData = dataMerger.merge();
        }

        HitElement[] hitElementList = totalHit.getHitElementList();
        int realSize = totalHit.size();

        /*
         * If a bundle is requested, fetch the bundled (grouped) sub-documents.
         */
        Bundle bundle = q.getBundle();
        if (bundle != null) {
            // Filter with the per-segment BitSets so that bundled documents are looked up
            // only among the hits of this search result.
            fillBundleResult(schema, segmentSize, hitElementList, realSize, bundle, segmentDocHitSetList);
        }

        return new InternalSearchResult(collectionId, hitElementList, realSize, totalSize, groupData, highlightInfo, explanationList);
    }

    /*
     * Fetches the bundled documents for each hit.
     */
    private void fillBundleResult(Schema schema, int segmentSize, HitElement[] hitElementList, int size, Bundle bundle, BitSet[] segmentDocFilterList) throws IRException {
        /*
         * For each element's bundle key, count how many bundled documents exist.
         * Only bundles with two or more documents are kept; the rest are discarded.
         */
        String fieldIndexId = bundle.getFieldIndexId();
        Sorts bundleSorts = bundle.getSorts();
        int bundleRows = bundle.getRows();
        int bundleStart = 1;
        int bundleOption = bundle.getOption();
        boolean isParentInclude = (bundleOption == Bundle.OPT_PARENT_INCLUDE);
        FieldSetting bundleFieldSetting = schema.getFieldSetting(bundle.getFieldIndexId());
        Type bundleFieldType = bundleFieldSetting.getType();
        try {
            for (int k = 0; k < size; k++) {
                int totalSize = 0;
                // Build a clause from the bundle key.
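                // For illustration (hypothetical field and value): if a hit's bundle key decodes
                // to "P-1001" on the field index "productGroup", the clause built below is
                // effectively new Clause(new Term("productGroup", "P-1001")), and it is searched
                // in every segment restricted to the documents already present in this result.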
                int mainDocNo = hitElementList[k].docNo();
                BytesRef bundleKey = hitElementList[k].getBundleKey();
                if (bundleKey == null) {
                    continue;
                }
                String bundleStringKey = Formatter.getContentString(bundleKey, bundleFieldType);
                if (bundleStringKey == null) {
                    continue;
                }
                Clause bundleClause = new Clause(new Term(fieldIndexId, bundleStringKey));
                Hit[] segmentHitList = new Hit[segmentSize];
                for (int i = 0; i < segmentSize; i++) {
                    // Collect results per bundle key.
                    segmentHitList[i] = collectionHandler.segmentSearcher(i).searchIndex(bundleClause, bundleSorts, bundleStart, bundleRows, segmentDocFilterList[i]);
                    totalSize += segmentHitList[i].totalCount();
                }
                // A bundle is only valid when it has two or more documents.
                if (totalSize > 1) {
                    FixedMinHeap<FixedHitReader> hitMerger = null;
                    if (bundleSorts != null) {
                        hitMerger = bundleSorts.createMerger(schema, segmentSize);
                    } else {
                        hitMerger = new FixedMinHeap<FixedHitReader>(segmentSize);
                    }
                    for (int i = 0; i < segmentSize; i++) {
                        FixedHitReader hitReader = segmentHitList[i].hitStack().getReader();
                        // posting data
                        if (hitReader.next()) {
                            hitMerger.push(hitReader);
                        }
                    }
                    int realSize = Math.min(bundleRows, totalSize);
                    DocIdList bundleDocIdList = new DocIdList(realSize);
                    int c = 1, n = 0;
                    while (hitMerger.size() > 0) {
                        FixedHitReader r = hitMerger.peek();
                        HitElement el = r.read();
                        // Skip the document equal to mainDocNo: the bundle should not show the
                        // group representative again, unless the parent-include option is set.
                        if (isParentInclude || el.docNo() != mainDocNo) {
                            if (c >= bundleStart) {
                                bundleDocIdList.add(el.segmentSequence(), el.docNo());
                                n++;
                                // logger.debug("[{}] {}", el.segmentSequence(), el.docNo());
                            }
                            c++;
                        }
                        // Stop early once enough bundled documents have been collected.
                        if (n == bundleRows) {
                            break;
                        }
                        if (!r.next()) {
                            // Drop readers that are exhausted.
                            hitMerger.pop();
                        }
                        hitMerger.heapify();
                    }
                    // If the representative is not included, reduce the count by one.
                    if (!isParentInclude) {
                        totalSize--;
                    }
                    hitElementList[k].setBundleDocIdList(bundleDocIdList);
                    hitElementList[k].setTotalBundleSize(totalSize);
                }
            }
        } catch (ClauseException e) {
            throw new IRException(e);
        } catch (IOException e) {
            throw new IRException(e);
        }
    }

    public DocumentResult searchDocument(DocIdList list, ViewContainer views, String[] tags, HighlightInfo highlightInfo) throws IOException {
        int realSize = list.size();
        Row[] row = new Row[realSize];
        Row[][] bundleRow = null;
        int fieldSize = collectionHandler.schema().getFieldSize();
        int viewSize = views.size();
        int[] fieldSequenceList = new int[viewSize];
        String[] fieldIdList = new String[viewSize];
        // Only fields whose index is set to true here are actually loaded.
        boolean[] fieldSelectOption = new boolean[fieldSize];
        for (int i = 0; i < views.size(); i++) {
            View v = views.get(i);
            String fieldId = v.fieldId();
            fieldIdList[i] = fieldId;
            int sequence = -1;
            if (fieldId.equalsIgnoreCase(ScoreField.fieldName)) {
                sequence = ScoreField.fieldNumber;
            } else if (fieldId.equalsIgnoreCase(DocNoField.fieldName)) {
                sequence = DocNoField.fieldNumber;
            } else {
                sequence = collectionHandler.schema().getFieldSequence(fieldId);
                if (sequence != -1) {
                    fieldSelectOption[sequence] = true;
                }
            }
            fieldSequenceList[i] = sequence;
        }

        Document[] eachDocList = new Document[realSize];
        Document[][] eachBundleDocList = null;
        // Array for reusing SegmentSearchers; lazily filled, indexed by segmentSequence.
        // Starts at length 5 and grows when more segments are needed.
        SegmentSearcher[] segmentSearcherList = new SegmentSearcher[5];
        int idx = 0;
        for (int i = 0; i < list.size(); i++) {
            int segmentSequence = list.segmentSequence(i);
            int docNo = list.docNo(i);
            DocIdList bundleDocIdList = list.bundleDocIdList(i);
            int size = segmentSearcherList.length;
            // Grow when a segment beyond the current range is requested.
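            // Growth happens in steps of 5. As a worked example (hypothetical numbers): a
            // request for segmentSequence 12 while the array length is 5 raises size to 15
            // before the existing entries are copied into the larger array below.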
            if (segmentSequence >= size) {
                while (segmentSequence >= size) {
                    size += 5;
                }
                SegmentSearcher[] newSegmentSearcherList = new SegmentSearcher[size];
                System.arraycopy(segmentSearcherList, 0, newSegmentSearcherList, 0, segmentSearcherList.length);
                segmentSearcherList = newSegmentSearcherList;
            }
            if (segmentSearcherList[segmentSequence] == null) {
                segmentSearcherList[segmentSequence] = collectionHandler.segmentReader(segmentSequence).segmentSearcher();
            }
            Document doc = segmentSearcherList[segmentSequence].getDocument(docNo, fieldSelectOption);
            eachDocList[idx] = doc;

            if (bundleDocIdList != null) {
                // Created only when bundled documents exist.
                if (eachBundleDocList == null) {
                    eachBundleDocList = new Document[realSize][];
                }
                Document[] bundleDoclist = new Document[bundleDocIdList.size()];
                for (int j = 0; j < bundleDocIdList.size(); j++) {
                    int bundleSegmentSequence = bundleDocIdList.segmentSequence(j);
                    int bundleDocNo = bundleDocIdList.docNo(j);
                    Document bundleDoc = collectionHandler.segmentReader(bundleSegmentSequence).segmentSearcher().getDocument(bundleDocNo, fieldSelectOption);
                    bundleDoclist[j] = bundleDoc;
                }
                eachBundleDocList[idx] = bundleDoclist;
            }
            idx++;
        }

        for (int i = 0; i < realSize; i++) {
            row[i] = makeRowFromDocument(eachDocList[i], views, fieldSequenceList, tags, highlightInfo);
            // bundle document
            if (eachBundleDocList != null) {
                Document[] bundleDocList = eachBundleDocList[i];
                if (bundleDocList != null) {
                    // Create bundleRow only when bundled documents exist.
                    if (bundleRow == null) {
                        bundleRow = new Row[realSize][];
                    }
                    bundleRow[i] = new Row[bundleDocList.length];
                    for (int j = 0; j < bundleDocList.length; j++) {
                        bundleRow[i][j] = makeRowFromDocument(bundleDocList[j], views, fieldSequenceList, tags, highlightInfo);
                    }
                }
            }
        }
        return new DocumentResult(row, bundleRow, fieldIdList);
    }

    private Row makeRowFromDocument(Document document, ViewContainer views, int[] fieldSequenceList, String[] tags, HighlightInfo highlightInfo) throws IOException {
        Row rows = new Row(views.size());
        for (int j = 0; j < views.size(); j++) {
            View view = views.get(j);
            int fieldSequence = fieldSequenceList[j];
            if (fieldSequence == ScoreField.fieldNumber) {
                // The score is not available here, so leave it blank.
                // float score = document.getScore();
                rows.put(j, null);
            } else if (fieldSequence == DocNoField.fieldNumber) {
                rows.put(j, Integer.toString(document.getDocId()).toCharArray());
            } else if (fieldSequence == UnknownField.fieldNumber) {
                rows.put(j, UnknownField.value().toCharArray());
            } else {
                Field field = document.get(fieldSequence);
                // logger.debug("field#{} >> {}", j, field);
                String text = null;
                if (field != null) {
                    text = field.toString();
                }
                boolean isHighlightSummary = false;
                if (has != null && text != null && highlightInfo != null) {
                    // Highlighting only, or, when view.snippetSize is set, summarizing as well.
                    String fieldId = view.fieldId();
                    Option searchOption = highlightInfo.getOption(fieldId);
                    if (searchOption.useHighlight()) {
                        String indexAnalyzerId = highlightInfo.getIndexAnalyzerId(fieldId);
                        String queryAnalyzerId = highlightInfo.getQueryAnalyzerId(fieldId);
                        String queryTerm = highlightInfo.getQueryTerm(fieldId);
                        if (indexAnalyzerId != null && queryAnalyzerId != null && queryTerm != null) {
                            // a = System.nanoTime();
                            text = getHighlightedSnippet(fieldId, text, indexAnalyzerId, queryAnalyzerId, queryTerm, tags, view, searchOption);
                            // b += (System.nanoTime() - a);
                            isHighlightSummary = true;
                        }
                    }
                }
                if (!isHighlightSummary && view.isSummarize()) {
                    // If the field is not a search field and cannot be highlighted,
                    // summarize by truncating from the front.
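                    // For example (hypothetical sizes): with view.snippetSize() == 100, a
                    // 500-character description is cut down to its first 100 characters below.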
                    if (text != null) {
                        if (text.length() > view.snippetSize()) {
                            text = text.substring(0, view.snippetSize());
                        }
                    }
                }
                if (text != null) {
                    rows.put(j, text.toCharArray());
                } else {
                    rows.put(j, null);
                }
            }
        }
        return rows;
    }

    private String getHighlightedSnippet(String fieldId, String text, String indexAnalyzerId, String queryAnalyzerId, String queryString, String[] tags, View view, Option searchOption) throws IOException {
        AnalyzerPool queryAnalyzerPool = collectionHandler.analyzerPoolManager().getPool(queryAnalyzerId);
        // If the analyzer ids are identical, share a single pool.
        boolean isSamePool = queryAnalyzerId.equals(indexAnalyzerId);
        AnalyzerPool indexAnalyzerPool = null;
        if (isSamePool) {
            indexAnalyzerPool = queryAnalyzerPool;
        } else {
            indexAnalyzerPool = collectionHandler.analyzerPoolManager().getPool(indexAnalyzerId);
        }
        if (queryAnalyzerPool != null) {
            Analyzer queryAnalyzer = queryAnalyzerPool.getFromPool();
            Analyzer indexAnalyzer = null;
            if (isSamePool) {
                indexAnalyzer = queryAnalyzer;
            } else {
                indexAnalyzer = indexAnalyzerPool.getFromPool();
            }
            if (indexAnalyzer != null && queryAnalyzer != null) {
                try {
                    text = has.highlight(fieldId, indexAnalyzer, queryAnalyzer, text, queryString, tags, view.snippetSize(), view.fragmentSize(), searchOption);
                } finally {
                    if (!isSamePool) {
                        indexAnalyzerPool.releaseToPool(indexAnalyzer);
                    }
                    queryAnalyzerPool.releaseToPool(queryAnalyzer);
                }
            }
        }
        return text;
    }
}