package org.fastcatsearch.job.management;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.AnalyzerOption;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharsRefTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.fastcatsearch.common.io.Streamable;
import org.fastcatsearch.exception.FastcatSearchException;
import org.fastcatsearch.ir.IRService;
import org.fastcatsearch.ir.analysis.AnalyzerPool;
import org.fastcatsearch.ir.config.DataInfo.SegmentInfo;
import org.fastcatsearch.ir.document.Document;
import org.fastcatsearch.ir.document.DocumentReader;
import org.fastcatsearch.ir.field.Field;
import org.fastcatsearch.ir.io.BytesDataOutput;
import org.fastcatsearch.ir.io.DataInput;
import org.fastcatsearch.ir.io.DataOutput;
import org.fastcatsearch.ir.search.CollectionHandler;
import org.fastcatsearch.ir.search.SegmentReader;
import org.fastcatsearch.ir.search.SegmentSearcher;
import org.fastcatsearch.ir.settings.IndexRefSetting;
import org.fastcatsearch.ir.settings.IndexSetting;
import org.fastcatsearch.ir.settings.PrimaryKeySetting;
import org.fastcatsearch.ir.settings.RefSetting;
import org.fastcatsearch.ir.settings.Schema;
import org.fastcatsearch.ir.settings.SchemaSetting;
import org.fastcatsearch.job.Job;
import org.fastcatsearch.service.ServiceManager;
import org.fastcatsearch.vo.CollectionAnalyzedIndexData;
import org.fastcatsearch.vo.CollectionIndexData.RowData;
/**
 * Management job that gathers, for a set of documents in one collection, the
 * primary-key field values, the raw text fed to every search index, the token
 * string produced by the index analyzers, and the per-document delete flag.
 *
 * <p>Documents are selected in one of two ways:
 * <ul>
 *   <li>when {@code pkValue} is non-empty, it is split on non-word characters and
 *       each distinct key is looked up in every segment; {@code start}/{@code end}
 *       then select a window over the ordinal of the matches;</li>
 *   <li>otherwise {@code start}/{@code end} select a window of document numbers
 *       that is mapped onto segments by {@link #matchSegment(int[], int, int)}.</li>
 * </ul>
 *
 * <p>The job is {@link Streamable}: only {@code collectionId}, {@code start},
 * {@code end} and {@code pkValue} are written to / read from the stream.
 */
public class GetCollectionAnalyzedIndexDataJob extends Job implements Streamable {

    private static final long serialVersionUID = 5821814699500442825L;

    /** Id of the collection to inspect. */
    private String collectionId;

    /** First position (inclusive) of the requested window. */
    private int start;

    /** Last position (inclusive) of the requested window. */
    private int end;

    /** Optional primary-key values; null or empty selects the range mode. */
    private String pkValue;

    /** Analyzer option used while tokenizing; re-created at the start of every {@link #doRun()}. */
    private AnalyzerOption indexingAnalyzerOption;

    /** No-arg constructor; fields are expected to be filled by {@link #readFrom(DataInput)}. */
    public GetCollectionAnalyzedIndexDataJob() {}

    /**
     * @param collectionId id of the collection to inspect
     * @param start        first position (inclusive) of the window
     * @param end          last position (inclusive) of the window
     * @param pkValue      primary-key values to look up, or null/empty for range mode
     */
    public GetCollectionAnalyzedIndexDataJob(String collectionId, int start, int end, String pkValue) {
        this.collectionId = collectionId;
        this.start = start;
        this.end = end;
        this.pkValue = pkValue;
    }

    /**
     * Collects the analyzed index data and wraps it in a {@code JobResult}.
     *
     * <p>If the collection handler is missing or not loaded, an empty result
     * (document size 0, all lists {@code null}) is returned. Any failure while
     * reading is logged and whatever was collected so far is returned; the
     * per-document lists are only appended to once a document has been fully
     * processed, so they stay the same length even after a failure.
     *
     * @return a {@code JobResult} holding a {@link CollectionAnalyzedIndexData}
     */
    @Override
    public JobResult doRun() throws FastcatSearchException {
        // At indexing time only the stopword option is applied.
        indexingAnalyzerOption = new AnalyzerOption();
        indexingAnalyzerOption.useStopword(true);

        IRService irService = ServiceManager.getInstance().getService(IRService.class);
        CollectionHandler collectionHandler = irService.collectionHandler(collectionId);
        if (collectionHandler == null || !collectionHandler.isLoaded()) {
            CollectionAnalyzedIndexData data = new CollectionAnalyzedIndexData(collectionId, 0, null, null, null, null, null);
            return new JobResult(data);
        }

        int segmentSize = collectionHandler.segmentSize();
        List<String> fieldList = new ArrayList<String>();
        List<RowData> pkDataList = new ArrayList<RowData>();
        List<RowData> indexDataList = new ArrayList<RowData>();
        List<RowData> analyzedDataList = new ArrayList<RowData>();
        List<Boolean> isDeletedList = new ArrayList<Boolean>();
        int documentSize = 0;

        try {
            Schema schema = collectionHandler.schema();
            SchemaSetting schemaSetting = schema.schemaSetting();
            PrimaryKeySetting primaryKeySetting = schemaSetting.getPrimaryKeySetting();
            List<RefSetting> primaryKeyIdList = primaryKeySetting.getFieldList();
            List<IndexSetting> indexSettingList = schemaSetting.getIndexSettingList();

            // The reported field list is the list of search index ids.
            for (IndexSetting indexSetting : indexSettingList) {
                fieldList.add(indexSetting.getId());
            }

            if (pkValue != null && pkValue.length() > 0) {
                if (primaryKeyIdList != null && primaryKeyIdList.size() > 0) {
                    String[] pkList = pkValue.split("\\W");
                    BytesDataOutput tempOutput = new BytesDataOutput();
                    int count = 0;
                    Set<String> seenKeys = new HashSet<String>();
                    for (String pk : pkList) {
                        pk = pk.trim();
                        if (pk.length() == 0) {
                            continue;
                        }
                        // Set.add returns false for a key that was already looked up.
                        if (!seenKeys.add(pk)) {
                            continue;
                        }
                        // Newest segment first; every segment holding the key contributes a match.
                        for (int segmentNumber = segmentSize - 1; segmentNumber >= 0; segmentNumber--) {
                            SegmentReader segmentReader = collectionHandler.segmentReader(segmentNumber);
                            if (segmentReader == null) {
                                // Same handling as the range branch: skip instead of aborting the whole lookup.
                                logger.debug("segmentReader is NULL");
                                continue;
                            }
                            // NOTE(review): the reader returned by newSearchIndexesReader() is never released here
                            // - confirm whether it needs an explicit close.
                            int docNo = segmentReader.newSearchIndexesReader().getPrimaryKeyIndexesReader().getDocNo(pk, tempOutput);
                            if (docNo != -1) {
                                if (count >= start && count <= end) {
                                    Document document = segmentReader.segmentSearcher().getDocument(docNo);
                                    if (document != null) {
                                        boolean deleted = segmentReader.deleteSet().isSet(docNo);
                                        // NOTE(review): this branch labels rows with the segment number while the
                                        // range branch uses segmentInfo.getId() - confirm whether they should agree.
                                        add(document, primaryKeyIdList, schema, collectionHandler, String.valueOf(segmentNumber), indexSettingList, pkDataList, indexDataList, analyzedDataList);
                                        // Appended only after add() succeeded so all result lists stay aligned.
                                        isDeletedList.add(deleted);
                                    }
                                }
                                documentSize++;
                                count++;
                            }
                        }
                    }
                }
            } else {
                // The index into this array is the segment number.
                int[] segmentEndNumbers = new int[segmentSize];
                for (int segmentNumber = 0; segmentNumber < segmentSize; segmentNumber++) {
                    SegmentReader reader = collectionHandler.segmentReader(segmentNumber);
                    // NOTE(review): the DocumentReader created here is never released - confirm whether it needs a close.
                    DocumentReader documentReader = reader.newDocumentReader();
                    int count = documentReader.getDocumentCount();
                    documentSize += count;
                    segmentEndNumbers[segmentNumber] = documentReader.getBaseNumber() + count - 1;
                    logger.debug("segmentEndNumbers[{}]={}", segmentNumber, segmentEndNumbers[segmentNumber]);
                }

                // The requested window may span several segments.
                int[][] matchSegmentList = matchSegment(segmentEndNumbers, start, end - start + 1);
                for (int i = matchSegmentList.length - 1; i >= 0; i--) {
                    int segmentNumber = matchSegmentList[i][0];
                    int startNo = matchSegmentList[i][1];
                    int endNo = matchSegmentList[i][2];
                    SegmentReader segmentReader = collectionHandler.segmentReader(segmentNumber);
                    if (segmentReader != null) {
                        SegmentInfo segmentInfo = segmentReader.segmentInfo();
                        String segmentId = segmentInfo.getId();
                        SegmentSearcher segmentSearcher = segmentReader.segmentSearcher();
                        for (int docNo = startNo; docNo <= endNo; docNo++) {
                            Document document = segmentSearcher.getDocument(docNo);
                            if (document == null) {
                                // Reached the end of the documents.
                                break;
                            }
                            boolean deleted = segmentReader.deleteSet().isSet(docNo);
                            add(document, primaryKeyIdList, schema, collectionHandler, segmentId, indexSettingList, pkDataList, indexDataList, analyzedDataList);
                            // Appended only after add() succeeded so all result lists stay aligned.
                            isDeletedList.add(deleted);
                        }
                    } else {
                        logger.debug("segmentReader is NULL");
                    }
                }
            }
        } catch (Throwable e) {
            // Best effort: log and fall through to return whatever was collected so far.
            logger.error("Failed to collect analyzed index data for collection " + collectionId, e);
        }

        CollectionAnalyzedIndexData data = new CollectionAnalyzedIndexData(collectionId, documentSize, fieldList, pkDataList, indexDataList, analyzedDataList, isDeletedList);
        return new JobResult(data);
    }

    /**
     * Builds the primary-key row, the raw index-text row and the analyzed-token
     * row for one document and appends them to the three output lists.
     *
     * <p>All three rows are built first and appended together at the end, so an
     * exception thrown while processing the document leaves the output lists
     * untouched.
     *
     * @param document          document to describe
     * @param primaryKeyIdList  primary-key field references; may be null or empty
     * @param schema            schema used to resolve field ids to field sequences
     * @param collectionHandler source of the analyzer pools
     * @param segmentId         label stored in every produced row
     * @param indexSettingList  search index definitions to evaluate
     * @param pkDataList        receives the {@code {fieldId, value}} pairs of the primary key
     * @param indexDataList     receives, per index, {@code {indexId, space-joined field text}}
     * @param analyzedDataList  receives, per index, {@code {indexId, ", "-joined tokens}}
     */
    private void add(Document document, List<RefSetting> primaryKeyIdList, Schema schema, CollectionHandler collectionHandler, String segmentId, List<IndexSetting> indexSettingList, List<RowData> pkDataList, List<RowData> indexDataList, List<RowData> analyzedDataList) {
        int pkSize = (primaryKeyIdList != null) ? primaryKeyIdList.size() : 0;
        String[][] pkData = new String[pkSize][];
        for (int index = 0; index < pkSize; index++) {
            RefSetting refSetting = primaryKeyIdList.get(index);
            String fieldId = refSetting.getRef();
            int pkFieldSequence = schema.getFieldSequence(fieldId);
            Field field = document.get(pkFieldSequence);
            pkData[index] = new String[] { fieldId, field.toString() };
        }

        int indexCount = indexSettingList.size();
        String[][] indexData = new String[indexCount][];
        String[][] analyzedData = new String[indexCount][];
        for (int k = 0; k < indexCount; k++) {
            IndexSetting indexSetting = indexSettingList.get(k);
            String indexId = indexSetting.getId();
            List<IndexRefSetting> refList = indexSetting.getFieldList();
            boolean isIgnoreCase = indexSetting.isIgnoreCase();
            boolean isStorePosition = indexSetting.isStorePosition();
            int positionIncrementGap = indexSetting.getPositionIncrementGap();
            int gapOffset = 0;

            // Local, single-threaded accumulators: StringBuilder is sufficient.
            StringBuilder analyzedBuffer = new StringBuilder();
            StringBuilder allFieldData = new StringBuilder();

            for (int m = 0; m < refList.size(); m++) {
                IndexRefSetting refSetting = refList.get(m);
                String fieldId = refSetting.getRef();
                String indexAnalyzerId = refSetting.getIndexAnalyzer();
                int fieldSequence = schema.getFieldSequence(fieldId);
                Field field = document.get(fieldSequence);
                String data = field.toString();
                if (isIgnoreCase) {
                    // Uses the JVM default locale, exactly as before.
                    data = data.toUpperCase();
                }
                if (allFieldData.length() > 0) {
                    allFieldData.append(" ");
                }
                allFieldData.append(data);

                AnalyzerPool analyzerPool = collectionHandler.getAnalyzerPool(indexAnalyzerId);
                Analyzer analyzer = analyzerPool.getFromPool();
                try {
                    // NOTE(review): the stream is neither end()ed nor closed here; stock Lucene's consumer
                    // workflow expects both - confirm whether this project's analyzers require it.
                    TokenStream tokenStream = analyzer.tokenStream(fieldId, new StringReader(data), indexingAnalyzerOption);
                    tokenStream.reset();

                    CharsRefTermAttribute refTermAttribute = null;
                    PositionIncrementAttribute positionAttribute = null;
                    CharTermAttribute termAttribute = null;
                    if (tokenStream.hasAttribute(CharsRefTermAttribute.class)) {
                        refTermAttribute = tokenStream.getAttribute(CharsRefTermAttribute.class);
                    }
                    if (tokenStream.hasAttribute(PositionIncrementAttribute.class)) {
                        positionAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
                    }
                    if (tokenStream.hasAttribute(CharTermAttribute.class)) {
                        termAttribute = tokenStream.getAttribute(CharTermAttribute.class);
                    }

                    if (refTermAttribute == null && termAttribute == null) {
                        // No attribute to read term text from; avoid a NullPointerException on the first token.
                        logger.warn("TokenStream for field {} exposes no term attribute; skipping analysis", fieldId);
                    } else {
                        while (tokenStream.incrementToken()) {
                            // CharsRefTermAttribute takes precedence when the stream provides it.
                            String value = (refTermAttribute != null)
                                    ? refTermAttribute.charsRef().toString()
                                    : termAttribute.toString();

                            int position = -1;
                            if (isStorePosition && positionAttribute != null) {
                                position = positionAttribute.getPositionIncrement() + gapOffset;
                            }

                            if (analyzedBuffer.length() > 0) {
                                analyzedBuffer.append(", ");
                            }
                            analyzedBuffer.append(value);
                            if (position != -1) {
                                analyzedBuffer.append(" [");
                                analyzedBuffer.append(position);
                                analyzedBuffer.append("]");
                            }
                        }
                    }
                } catch (IOException e) {
                    // Keep the tokens gathered so far and carry on with the next field.
                    logger.error("Failed to analyze field " + fieldId + " of index " + indexId, e);
                } finally {
                    analyzerPool.releaseToPool(analyzer);
                }
                // Each time the field changes, the position is shifted by positionIncrementGap.
                gapOffset += positionIncrementGap;
            }

            indexData[k] = new String[] { indexId, allFieldData.toString() };
            analyzedData[k] = new String[] { indexId, analyzedBuffer.toString() };
        }

        // Append all three rows together so the output lists never get out of step.
        pkDataList.add(new RowData(segmentId, pkData));
        indexDataList.add(new RowData(segmentId, indexData));
        analyzedDataList.add(new RowData(segmentId, analyzedData));
    }

    /**
     * Maps a window of {@code rows} documents beginning at {@code start} onto the
     * segments described by {@code segEndNums}.
     *
     * <p>Segments are visited in index order. While {@code start} is greater than
     * a segment's end number, that segment is skipped and {@code start} is reduced
     * by {@code segEndNums[i] + 1}. From the first segment that contains
     * {@code start}, entries are emitted until {@code rows} is exhausted; every
     * segment after the first emitted one starts at 0.
     *
     * @param segEndNums end number of each segment, indexed by segment number
     * @param start      first requested position
     * @param rows       number of requested rows
     * @return one {@code {segmentNumber, startNo, endNo}} triple per touched segment
     */
    private int[][] matchSegment(int[] segEndNums, int start, int rows) {
        // Each entry: {segment number, start number, end number}
        List<int[]> list = new ArrayList<int[]>();
        for (int i = 0; i < segEndNums.length; i++) {
            if (start > segEndNums[i]) {
                start = start - segEndNums[i] - 1;
            } else {
                int[] res = new int[3];
                int emptyCount = segEndNums[i] - start + 1;
                res[0] = i; // segment number
                res[1] = start; // start number
                if (emptyCount < rows) {
                    // Window continues into the next segment.
                    res[2] = segEndNums[i];
                    start = 0;
                    rows = rows - emptyCount;
                    list.add(res);
                } else {
                    // Window ends inside this segment.
                    res[2] = start + rows - 1; // end number
                    list.add(res);
                    break;
                }
            }
        }
        return list.toArray(new int[list.size()][]);
    }

    /** Reads {@code collectionId}, {@code start}, {@code end} and {@code pkValue}, in that order. */
    @Override
    public void readFrom(DataInput input) throws IOException {
        collectionId = input.readString();
        start = input.readInt();
        end = input.readInt();
        pkValue = input.readString();
    }

    /** Writes {@code collectionId}, {@code start}, {@code end} and {@code pkValue}, in that order. */
    @Override
    public void writeTo(DataOutput output) throws IOException {
        output.writeString(collectionId);
        output.writeInt(start);
        output.writeInt(end);
        output.writeString(pkValue);
    }
}