package org.fastcatsearch.job.management;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.AnalyzerOption;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharsRefTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.fastcatsearch.common.io.Streamable;
import org.fastcatsearch.exception.FastcatSearchException;
import org.fastcatsearch.ir.IRService;
import org.fastcatsearch.ir.analysis.AnalyzerPool;
import org.fastcatsearch.ir.config.DataInfo.SegmentInfo;
import org.fastcatsearch.ir.document.Document;
import org.fastcatsearch.ir.document.DocumentReader;
import org.fastcatsearch.ir.field.Field;
import org.fastcatsearch.ir.io.BytesDataOutput;
import org.fastcatsearch.ir.io.DataInput;
import org.fastcatsearch.ir.io.DataOutput;
import org.fastcatsearch.ir.search.CollectionHandler;
import org.fastcatsearch.ir.search.SegmentReader;
import org.fastcatsearch.ir.search.SegmentSearcher;
import org.fastcatsearch.ir.settings.IndexRefSetting;
import org.fastcatsearch.ir.settings.IndexSetting;
import org.fastcatsearch.ir.settings.PrimaryKeySetting;
import org.fastcatsearch.ir.settings.RefSetting;
import org.fastcatsearch.ir.settings.Schema;
import org.fastcatsearch.ir.settings.SchemaSetting;
import org.fastcatsearch.job.Job;
import org.fastcatsearch.service.ServiceManager;
import org.fastcatsearch.vo.CollectionAnalyzedIndexData;
import org.fastcatsearch.vo.CollectionIndexData.RowData;

/**
 * Reads documents from a collection, either by a document range [start, end] or by primary key
 * values, runs each index field through its indexing analyzer, and returns the raw field data,
 * the analyzed terms, primary key values, and deletion flags for each document.
 */
public class GetCollectionAnalyzedIndexDataJob extends Job implements Streamable {

    private static final long serialVersionUID = 5821814699500442825L;

    private String collectionId;
    private int start;
    private int end;
    private String pkValue;
    private AnalyzerOption indexingAnalyzerOption;

    public GetCollectionAnalyzedIndexDataJob() {
    }

    public GetCollectionAnalyzedIndexDataJob(String collectionId, int start, int end, String pkValue) {
        this.collectionId = collectionId;
        this.start = start;
        this.end = end;
        this.pkValue = pkValue;
    }

    @Override
    public JobResult doRun() throws FastcatSearchException {
        // At indexing time, only the stopword option is applied.
        indexingAnalyzerOption = new AnalyzerOption();
        indexingAnalyzerOption.useStopword(true);

        IRService irService = ServiceManager.getInstance().getService(IRService.class);
        CollectionHandler collectionHandler = irService.collectionHandler(collectionId);

        if (collectionHandler == null || !collectionHandler.isLoaded()) {
            CollectionAnalyzedIndexData data = new CollectionAnalyzedIndexData(collectionId, 0, null, null, null, null, null);
            return new JobResult(data);
        }

        int segmentSize = collectionHandler.segmentSize();

        List<String> fieldList = new ArrayList<String>();
        List<RowData> pkDataList = new ArrayList<RowData>();
        List<RowData> indexDataList = new ArrayList<RowData>();
        List<RowData> analyzedDataList = new ArrayList<RowData>();
        List<Boolean> isDeletedList = new ArrayList<Boolean>();
        int documentSize = 0;

        try {
            Schema schema = collectionHandler.schema();
            SchemaSetting schemaSetting = collectionHandler.schema().schemaSetting();
            PrimaryKeySetting primaryKeySetting = schemaSetting.getPrimaryKeySetting();
            List<RefSetting> primaryKeyIdList = primaryKeySetting.getFieldList();
            List<IndexSetting> indexSettingList = schemaSetting.getIndexSettingList();

            for (int i = 0; i < indexSettingList.size(); i++) {
                IndexSetting indexSetting = indexSettingList.get(i);
                String indexId = indexSetting.getId();
                fieldList.add(indexId);
            }

            if (pkValue != null && pkValue.length() > 0) {
                if (primaryKeyIdList != null && primaryKeyIdList.size() > 0) {
                    String[] pkList = pkValue.split("\\W");
                    BytesDataOutput tempOutput = new BytesDataOutput();
                    int count = 0;
                    Set<String> dupSet = new HashSet<String>();
                    for (String pk : pkList) {
                        pk = pk.trim();
                        if (pk.length() == 0) {
                            continue;
                        }
                        if (dupSet.contains(pk)) {
                            continue;
                        } else {
                            dupSet.add(pk);
                        }
                        // Look up the primary key in each segment, newest segment first.
                        for (int segmentNumber = segmentSize - 1; segmentNumber >= 0; segmentNumber--) {
                            SegmentReader segmentReader = collectionHandler.segmentReader(segmentNumber);
                            int docNo = segmentReader.newSearchIndexesReader().getPrimaryKeyIndexesReader().getDocNo(pk, tempOutput);
                            // logger.debug(">>>docNo = {}", docNo);
                            if (docNo != -1) {
                                // logger.debug(">>> {} , doc={}~ {}", count, start, end);
                                if (count >= start && count <= end) {
                                    Document document = collectionHandler.segmentReader(segmentNumber).segmentSearcher().getDocument(docNo);
                                    if (document != null) {
                                        isDeletedList.add(segmentReader.deleteSet().isSet(docNo));
                                        add(document, primaryKeyIdList, schema, collectionHandler, String.valueOf(segmentNumber), indexSettingList, pkDataList, indexDataList, analyzedDataList);
                                    }
                                }
                                documentSize++;
                                count++;
                            }
                        }
                    }
                }
            } else {
                // The index of this array is the segment number.
                int[] segmentEndNumbers = new int[segmentSize];
                for (int segmentNumber = 0; segmentNumber < segmentSize; segmentNumber++) {
                    SegmentReader reader = collectionHandler.segmentReader(segmentNumber);
                    DocumentReader documentReader = reader.newDocumentReader();
                    int count = documentReader.getDocumentCount();
                    documentSize += count;
                    segmentEndNumbers[segmentNumber] = documentReader.getBaseNumber() + documentReader.getDocumentCount() - 1;
                    logger.debug("segmentEndNumbers[{}]={}", segmentNumber, segmentEndNumbers[segmentNumber]);
                }

                // Handle the case where the requested range spans multiple segments.
                int[][] matchSegmentList = matchSegment(segmentEndNumbers, start, end - start + 1);
                for (int i = matchSegmentList.length - 1; i >= 0; i--) {
                    int segmentNumber = matchSegmentList[i][0];
                    int startNo = matchSegmentList[i][1];
                    int endNo = matchSegmentList[i][2];
                    SegmentReader segmentReader = collectionHandler.segmentReader(segmentNumber);
                    if (segmentReader != null) {
                        SegmentInfo segmentInfo = segmentReader.segmentInfo();
                        String segmentId = segmentInfo.getId();
                        SegmentSearcher segmentSearcher = segmentReader.segmentSearcher();
                        for (int docNo = startNo; docNo <= endNo; docNo++) {
                            Document document = segmentSearcher.getDocument(docNo);
                            if (document == null) {
                                // Reached the end of the documents in this segment.
                                break;
                            }
                            isDeletedList.add(segmentReader.deleteSet().isSet(docNo));
                            add(document, primaryKeyIdList, schema, collectionHandler, segmentId, indexSettingList, pkDataList, indexDataList, analyzedDataList);
                        }
                    } else {
                        logger.debug("segmentReader is NULL");
                    }
                }
            }
        } catch (Throwable e) {
            logger.error("", e);
        }

        CollectionAnalyzedIndexData data = new CollectionAnalyzedIndexData(collectionId, documentSize, fieldList, pkDataList, indexDataList, analyzedDataList, isDeletedList);
        return new JobResult(data);
    }

    private void add(Document document, List<RefSetting> primaryKeyIdList, Schema schema, CollectionHandler collectionHandler, String segmentId,
            List<IndexSetting> indexSettingList, List<RowData> pkDataList, List<RowData> indexDataList, List<RowData> analyzedDataList) {

        // Collect primary key field values for this document.
        int pkSize = (primaryKeyIdList != null && primaryKeyIdList.size() > 0) ? primaryKeyIdList.size() : 0;
        String[][] pkData = new String[pkSize][];
        for (int index = 0; index < pkSize; index++) {
            RefSetting refSetting = primaryKeyIdList.get(index);
            String fieldId = refSetting.getRef();
            int pkFieldSequence = schema.getFieldSequence(fieldId);
            Field field = document.get(pkFieldSequence);
            String fieldData = field.toString();
            // logger.debug("PK {} > {} > {}", refSetting, pkFieldSequence, field);
            pkData[index] = new String[] { fieldId, fieldData };
        }
        RowData pkRowData = new RowData(segmentId, pkData);
        pkDataList.add(pkRowData);

        String[][] indexData = new String[indexSettingList.size()][];
        String[][] analyzedData = new String[indexSettingList.size()][];

        for (int k = 0; k < indexSettingList.size(); k++) {
            StringBuffer analyzedBuffer = new StringBuffer();
            IndexSetting indexSetting = indexSettingList.get(k);
            String indexId = indexSetting.getId();
            List<IndexRefSetting> refList = indexSetting.getFieldList();
            boolean isIgnoreCase = indexSetting.isIgnoreCase();
            boolean isStorePosition = indexSetting.isStorePosition();
            int positionIncrementGap = indexSetting.getPositionIncrementGap();
            int gapOffset = 0;

            StringBuffer allFieldData = new StringBuffer();

            for (int m = 0; m < refList.size(); m++) {
                IndexRefSetting refSetting = refList.get(m);
                String fieldId = refSetting.getRef();
                String indexAnalyzerId = refSetting.getIndexAnalyzer();

                int fieldSequence = schema.getFieldSequence(fieldId);
                Field field = document.get(fieldSequence);
                String data = field.toString();
                if (isIgnoreCase) {
                    data = data.toUpperCase();
                }
                if (allFieldData.length() > 0) {
                    allFieldData.append(" ");
                }
                allFieldData.append(data);

                AnalyzerPool analyzerPool = collectionHandler.getAnalyzerPool(indexAnalyzerId);
                Analyzer analyzer = analyzerPool.getFromPool();
                try {
                    TokenStream tokenStream = analyzer.tokenStream(fieldId, new StringReader(data), indexingAnalyzerOption);
                    tokenStream.reset();
                    CharsRefTermAttribute refTermAttribute = null;
                    PositionIncrementAttribute positionAttribute = null;
                    CharTermAttribute termAttribute = null;
                    if (tokenStream.hasAttribute(CharsRefTermAttribute.class)) {
                        refTermAttribute = tokenStream.getAttribute(CharsRefTermAttribute.class);
                    }
                    if (tokenStream.hasAttribute(PositionIncrementAttribute.class)) {
                        positionAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
                    }
                    if (tokenStream.hasAttribute(CharTermAttribute.class)) {
                        termAttribute = tokenStream.getAttribute(CharTermAttribute.class);
                    }

                    while (tokenStream.incrementToken()) {
                        String value = null;
                        if (refTermAttribute != null) {
                            value = refTermAttribute.charsRef().toString();
                        } else {
                            value = termAttribute.toString();
                        }

                        int position = -1;
                        if (isStorePosition && positionAttribute != null) {
                            position = positionAttribute.getPositionIncrement() + gapOffset;
                        }

                        if (analyzedBuffer.length() > 0) {
                            analyzedBuffer.append(", ");
                        }
                        analyzedBuffer.append(value);
                        if (position != -1) {
                            analyzedBuffer.append(" [");
                            analyzedBuffer.append(position);
                            analyzedBuffer.append("]");
                        }
                    }
                } catch (IOException e) {
                    logger.error("", e);
                } finally {
                    analyzerPool.releaseToPool(analyzer);
                }
                // When the field changes, the position advances by positionIncrementGap.
                gapOffset += positionIncrementGap;
            }

            indexData[k] = new String[] { indexId, allFieldData.toString() };
            analyzedData[k] = new String[] { indexId, analyzedBuffer.toString() };
        }

        RowData indexRowData = new RowData(segmentId, indexData);
        indexDataList.add(indexRowData);
        RowData analyzedRowData = new RowData(segmentId, analyzedData);
        analyzedDataList.add(analyzedRowData);
    }

    // Splits the requested range of 'rows' documents beginning at 'start' across segments.
    // Each result entry is [segmentNumber, startNumber, endNumber].
    private int[][] matchSegment(int[] segEndNums, int start, int rows) {
        ArrayList<int[]> list = new ArrayList<int[]>();
        for (int i = 0; i < segEndNums.length; i++) {
            if (start > segEndNums[i]) {
                start = start - segEndNums[i] - 1;
            } else {
                int[] res = new int[3];
                int emptyCount = segEndNums[i] - start + 1;
                res[0] = i; // segment number
                if (emptyCount < rows) {
                    res[1] = start; // start number
                    res[2] = segEndNums[i];
                    start = 0;
                    rows = rows - emptyCount;
                    list.add(res);
                } else {
                    res[1] = start; // start number
                    res[2] = start + rows - 1; // end number
                    list.add(res);
                    break;
                }
            }
        }
        int[][] result = new int[list.size()][3];
        for (int i = 0; i < list.size(); i++) {
            int[] tmp = list.get(i);
            result[i][0] = tmp[0];
            result[i][1] = tmp[1];
            result[i][2] = tmp[2];
        }
        return result;
    }

    @Override
    public void readFrom(DataInput input) throws IOException {
        collectionId = input.readString();
        start = input.readInt();
        end = input.readInt();
        pkValue = input.readString();
    }

    @Override
    public void writeTo(DataOutput output) throws IOException {
        output.writeString(collectionId);
        output.writeInt(start);
        output.writeInt(end);
        output.writeString(pkValue);
    }
}