/*
 * Copyright 2013 Websquared, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.fastcatsearch.ir.index.temp;

import java.io.File;
import java.io.IOException;
import java.util.List;

import org.apache.lucene.util.BytesRef;
import org.fastcatsearch.ir.common.IndexFileNames;
import org.fastcatsearch.ir.index.IndexFieldOption;
import org.fastcatsearch.ir.io.BufferedFileOutput;
import org.fastcatsearch.ir.io.BytesDataOutput;
import org.fastcatsearch.ir.io.CharVector;
import org.fastcatsearch.ir.io.IOUtil;
import org.fastcatsearch.ir.io.IndexOutput;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Merges multiple flushed temporary search-field posting runs into the final
 * lexicon / posting / index files for one index field.
 *
 * <p>Each flush produced a sorted run inside the shared temp file; a k-way merge
 * over {@code flushCount} readers (driven by a min-heap) emits terms in sorted
 * order and concatenates the per-run postings of equal terms.
 *
 * <p>Not thread-safe: merge state (current term, buffers, counters) is kept in
 * instance fields.
 */
public class TempSearchFieldMerger {
	protected static Logger logger = LoggerFactory.getLogger(TempSearchFieldMerger.class);

	// Min-heap of reader indexes; heap[1] is the reader holding the smallest term.
	protected int[] heap;
	// One reader per flushed run.
	protected TempSearchFieldReader[] reader;
	protected String indexId;
	// Number of flushed runs to merge.
	protected int flushCount;
	// Scratch buffer into which the postings of the current term are merged.
	protected BytesDataOutput tempPostingOutput;
	// Number of per-run posting buffers collected for the current term.
	private int bufferCount;
	// Term of the reader currently on top of the heap.
	private CharVector cv;
	// Previously seen term; when cv differs, the postings gathered so far are emitted.
	private CharVector cvOld;
	// Total posting count of the term just merged.
	protected int totalCount;
	// Last (largest) document number of the term just merged.
	protected int prevDocNo;
	// Per-run posting buffers for the current term; at most one per flushed run.
	private BytesRef[] buffers;

	/**
	 * Opens one {@link TempSearchFieldReader} per flushed run and primes each
	 * with its first term.
	 *
	 * @param indexId       index field id being merged
	 * @param flushPosition start offset of each flushed run inside {@code tempFile}
	 * @param tempFile      temporary posting file containing all runs
	 * @throws IOException if a reader cannot be opened or primed
	 */
	public TempSearchFieldMerger(String indexId, List<Long> flushPosition, File tempFile) throws IOException {
		this.indexId = indexId;
		this.flushCount = flushPosition.size();
		reader = new TempSearchFieldReader[flushCount];
		for (int m = 0; m < flushCount; m++) {
			reader[m] = new TempSearchFieldReader(m, indexId, tempFile, flushPosition.get(m));
			reader[m].next();
		}
		tempPostingOutput = new BytesDataOutput(1024 * 1024);
		// An identical term can accumulate at most one buffer per flushed run,
		// so flushCount slots suffice.
		buffers = new BytesRef[flushCount];
	}

	/**
	 * Runs the k-way merge and writes the posting, lexicon and index files
	 * under revision 0 of {@code baseDir}.
	 *
	 * <p>Posting record layout per term: vInt total length, int count,
	 * int lastDocNo, vInt firstDocNo, then the delta-encoded remainder.
	 * Lexicon record: UString term + long posting position. Every
	 * {@code indexInterval}-th term is additionally written to the index file.
	 *
	 * @param baseDir          target index directory
	 * @param indexInterval    write every n-th term to the index file (0 disables)
	 * @param fieldIndexOption field option flags written at the posting file head
	 * @throws IOException on any write failure, or on a corrupt posting record
	 */
	public void mergeAndMakeIndex(File baseDir, int indexInterval, IndexFieldOption fieldIndexOption) throws IOException {
		logger.debug("**** mergeAndMakeIndex ****");
		logger.debug("flushCount={}", flushCount);
		if (flushCount <= 0) {
			return;
		}
		// Open the outputs inside the try so that a failure opening a later
		// file still closes the earlier ones in the finally block.
		IndexOutput postingOutput = null;
		IndexOutput lexiconOutput = null;
		IndexOutput indexOutput = null;
		try {
			postingOutput = new BufferedFileOutput(IndexFileNames.getRevisionDir(baseDir, 0), IndexFileNames.getSearchPostingFileName(indexId));
			lexiconOutput = new BufferedFileOutput(IndexFileNames.getRevisionDir(baseDir, 0), IndexFileNames.getSearchLexiconFileName(indexId));
			indexOutput = new BufferedFileOutput(IndexFileNames.getRevisionDir(baseDir, 0), IndexFileNames.getSearchIndexFileName(indexId));

			postingOutput.writeInt(fieldIndexOption.value());

			logger.debug("## MERGE field = {}", indexId);
			makeHeap(flushCount);

			int termCount = 0;
			int indexTermCount = 0;
			// Placeholders; rewritten at position 0 once the real counts are known.
			lexiconOutput.writeInt(termCount);
			indexOutput.writeInt(indexTermCount);

			CharVector term = new CharVector();
			while (readNextTempIndex(term)) {
				int len = (int) tempPostingOutput.position();
				int count = totalCount;
				int lastDocNo = prevDocNo;
				// The merged buffer starts with the first doc number as a vInt;
				// it is re-written explicitly below, so skip it in the bulk copy.
				int firstDocNo = IOUtil.readVInt(tempPostingOutput.array(), 0);
				int sz = IOUtil.lenVariableByte(firstDocNo);
				len -= sz;

				long postingPosition = postingOutput.position();
				// count + lastDocNo (two ints) + vInt(firstDocNo) + remaining payload.
				int len2 = IOUtil.SIZE_OF_INT * 2 + sz + len;
				// A record can never be smaller than the two fixed ints; guard
				// against a corrupt merge buffer.
				if (len2 < 8)
					throw new IOException("Terrible Error!! " + len2);

				// 1. Write posting.
				postingOutput.writeVInt(len2);
				postingOutput.writeInt(count);
				postingOutput.writeInt(lastDocNo);
				postingOutput.writeVInt(firstDocNo);
				postingOutput.writeBytes(tempPostingOutput.array(), sz, len);

				// 2. Write lexicon.
				long lexiconPosition = lexiconOutput.position();
				lexiconOutput.writeUString(term.array(), term.start(), term.length());
				lexiconOutput.writeLong(postingPosition);

				// 3. Write index entry for every indexInterval-th term.
				if (indexInterval > 0 && (termCount % indexInterval) == 0) {
					indexOutput.writeUString(term.array(), term.start(), term.length());
					indexOutput.writeLong(lexiconPosition);
					indexOutput.writeLong(postingPosition);
					indexTermCount++;
				}
				termCount++;
			}

			// Back-patch the term counts at the head of each file.
			if (termCount > 0) {
				lexiconOutput.seek(0);
				lexiconOutput.writeInt(termCount);
				indexOutput.seek(0);
				indexOutput.writeInt(indexTermCount);
			} else {
				// Counts were already written as 0 above; nothing to patch.
			}
			logger.debug("## write index [{}] termCount[{}] indexTermCount[{}] indexInterval[{}]", indexId, termCount, indexTermCount, indexInterval);

			lexiconOutput.flush();
			indexOutput.flush();
			postingOutput.flush();
		} finally {
			// Close all three outputs; keep the first close failure as the
			// primary exception and attach later ones as suppressed instead of
			// overwriting it.
			IOException exception = null;
			try {
				if (postingOutput != null) {
					postingOutput.close();
				}
			} catch (IOException e) {
				exception = e;
			}
			try {
				if (lexiconOutput != null) {
					lexiconOutput.close();
				}
			} catch (IOException e) {
				if (exception == null) {
					exception = e;
				} else {
					exception.addSuppressed(e);
				}
			}
			try {
				if (indexOutput != null) {
					indexOutput.close();
				}
			} catch (IOException e) {
				if (exception == null) {
					exception = e;
				} else {
					exception.addSuppressed(e);
				}
			}
			if (exception != null) {
				throw exception;
			}
		}
	}

	/**
	 * Reads the next (smallest) term from the flushed runs and merges all of its
	 * per-run postings into {@link #tempPostingOutput}.
	 *
	 * <p>Posting buffers of the current term are collected from the heap top until
	 * the term changes; at that point the buffers are concatenated (doc numbers
	 * delta-relinked across run boundaries), {@code term}, {@link #totalCount} and
	 * {@link #prevDocNo} are set, and the method returns.
	 *
	 * @param term out-parameter; receives the merged term
	 * @return true if a term was produced, false when every reader is exhausted
	 * @throws IOException on read failure
	 */
	protected boolean readNextTempIndex(CharVector term) throws IOException {
		tempPostingOutput.reset();
		boolean termMade = false;
		while (true) {
			int idx = heap[1];
			cv = reader[idx].term();
			if (cv == null && cvOld == null) {
				// All readers exhausted and nothing pending: merge is done.
				return false;
			}

			// cv == null means every reader has hit EOS; cv differing from cvOld
			// means the term changed. Either way the postings collected for
			// cvOld must be emitted now.
			if ((cv == null || !cv.equals(cvOld)) && cvOld != null) {
				// Merge the collected per-run buffers for cvOld.
				prevDocNo = -1;
				totalCount = 0;
				for (int k = 0; k < bufferCount; k++) {
					BytesRef buf = buffers[k];
					// Read count and lastDocNo header of this run's postings.
					int count = IOUtil.readInt(buf);
					int lastDocNo = IOUtil.readInt(buf);
					totalCount += count;
					if (k == 0) {
						// First run: copy from the first doc number to the end verbatim.
						tempPostingOutput.writeBytes(buf.array(), buf.pos(), buf.remaining());
					} else {
						// Later runs: re-base the first doc number as a delta
						// against the previous run's last doc number.
						int firstNo = IOUtil.readVInt(buf);
						int newDocNo = firstNo - prevDocNo - 1;
						IOUtil.writeVInt(tempPostingOutput, newDocNo);
						tempPostingOutput.writeBytes(buf.array(), buf.pos(), buf.remaining());
					}
					prevDocNo = lastDocNo;
				}

				termMade = true;
				term.init(cvOld.array(), cvOld.start(), cvOld.length());
				bufferCount = 0;
			}

			if (bufferCount < buffers.length) {
				try {
					buffers[bufferCount++] = reader[idx].buffer();
				} catch (ArrayIndexOutOfBoundsException e) {
					// Defensive: should be unreachable given the guard above;
					// kept to diagnose duplicate-term anomalies in readers.
					logger.info("### bufferCount= {}, buffers.len={}, idx={}, reader={}", bufferCount, buffers.length, idx, reader.length);
					logger.error("dup terms", e);
				}
			} else {
				// More buffers for one term than flushed runs — indicates a
				// duplicated term within a single run; skip the extra buffer.
				logger.warn("wrong! {}", cv);
				logger.info("### bufferCount= {}, buffers.len={}, idx={}, reader={}", bufferCount, buffers.length, idx, reader.length);
			}

			// Remember the current term, advance this reader and restore heap order.
			cvOld = cv;
			reader[idx].next();
			heapify(1, flushCount);

			if (termMade) {
				return true;
			}
		}
	}

	/**
	 * Closes every run reader. The first close failure is rethrown after all
	 * readers have been attempted; later failures are attached as suppressed.
	 *
	 * @throws IOException the first reader close failure, if any
	 */
	public void close() throws IOException {
		IOException exception = null;
		for (int i = 0; i < flushCount; i++) {
			if (reader[i] != null) {
				try {
					reader[i].close();
				} catch (IOException e) {
					if (exception == null) {
						exception = e;
					} else {
						exception.addSuppressed(e);
					}
				}
			}
		}
		if (exception != null) {
			throw exception;
		}
	}

	/**
	 * Builds the initial min-heap of reader indexes. Heap storage is 1-based:
	 * heap[1] is the root; children of i are 2i and 2i+1.
	 *
	 * @param heapSize number of readers to arrange
	 */
	protected void makeHeap(int heapSize) {
		heap = new int[heapSize + 1]; // index starts from 1
		for (int i = 0; i < heapSize; i++) {
			heap[i + 1] = i;
		}
		int n = heapSize >> 1; // last inner node index
		for (int i = n; i > 0; i--) {
			heapify(i, heapSize);
		}
	}

	/**
	 * Sifts the element at {@code idx} down until the min-heap property holds.
	 *
	 * <p>When terms compare equal, the reader flushed earlier (smaller sequence)
	 * wins, so doc numbers of a shared term are merged in ascending order.
	 * (Fixes a post-flush merge ordering bug; 2013-5-21 swsong.)
	 *
	 * @param idx      1-based node index to sift down from
	 * @param heapSize current heap size
	 */
	protected void heapify(int idx, int heapSize) {
		int temp = -1;
		int child = -1;

		while (idx <= heapSize) {
			int left = idx << 1; // *= 2
			int right = left + 1;

			if (left <= heapSize) {
				if (right <= heapSize) {
					int c = compareKey(left, right);
					if (c < 0) {
						child = left;
					} else if (c > 0) {
						child = right;
					} else {
						// Children hold equal terms: break the tie by flush
						// sequence, smaller sequence first.
						int a = heap[left];
						int b = heap[right];
						if (reader[a].sequence() < reader[b].sequence()) {
							child = left;
						} else {
							child = right;
						}
					}
				} else {
					// No right child.
					child = left;
				}
			} else {
				// No children: position found.
				break;
			}

			// Compare parent with the chosen child and swap if out of order.
			int c = compareKey(child, idx);
			if (c < 0) {
				temp = heap[child];
				heap[child] = heap[idx];
				heap[idx] = temp;
				idx = child;
			} else if (c == 0) {
				// Parent and child hold equal terms: tie-break by flush sequence.
				int a = heap[idx];
				int b = heap[child];
				if (reader[a].sequence() > reader[b].sequence()) {
					// Child has the smaller sequence; promote it.
					temp = heap[child];
					heap[child] = heap[idx];
					heap[idx] = temp;
					idx = child;
				} else {
					// Parent already wins the tie; heap is ordered here.
					break;
				}
			} else {
				// Parent smaller than child: subtree already ordered.
				break;
			}
		}
	}

	/**
	 * Compares the current terms of the readers at two heap positions.
	 *
	 * @param one     1-based heap index
	 * @param another 1-based heap index
	 * @return negative/zero/positive per the usual comparator contract
	 */
	protected int compareKey(int one, int another) {
		int a = heap[one];
		int b = heap[another];
		return compareKey(reader[a].term(), reader[b].term());
	}

	/**
	 * Lexicographic char-wise comparison of two terms. A null term means the
	 * reader hit EOS and sorts after everything, so exhausted readers sink to
	 * the bottom of the heap.
	 *
	 * @param term1 first term, may be null (EOS)
	 * @param term2 second term, may be null (EOS)
	 * @return negative/zero/positive per the usual comparator contract
	 */
	protected int compareKey(CharVector term1, CharVector term2) {
		if (term1 == null && term2 == null) {
			return 0;
		} else if (term1 == null)
			return 1;
		else if (term2 == null)
			return -1;

		int len = (term1.length() < term2.length()) ? term1.length() : term2.length();
		for (int i = 0; i < len; i++) {
			// char values are non-negative, so plain subtraction cannot overflow.
			if (term1.charAt(i) != term2.charAt(i))
				return term1.charAt(i) - term2.charAt(i);
		}
		return term1.length() - term2.length();
	}
}