/*
* Copyright 2013 Websquared, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.fastcatsearch.ir.index.temp;
import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.lucene.util.BytesRef;
import org.fastcatsearch.ir.common.IndexFileNames;
import org.fastcatsearch.ir.index.IndexFieldOption;
import org.fastcatsearch.ir.io.BufferedFileOutput;
import org.fastcatsearch.ir.io.BytesDataOutput;
import org.fastcatsearch.ir.io.CharVector;
import org.fastcatsearch.ir.io.IOUtil;
import org.fastcatsearch.ir.io.IndexOutput;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class TempSearchFieldMerger {
protected static Logger logger = LoggerFactory.getLogger(TempSearchFieldMerger.class);
protected int[] heap;
protected TempSearchFieldReader[] reader;
protected String indexId;
protected int flushCount;
protected BytesDataOutput tempPostingOutput;
private int bufferCount;
private CharVector cv;
private CharVector cvOld;
protected int totalCount;
protected int prevDocNo;
private BytesRef[] buffers;
public TempSearchFieldMerger(String indexId, List<Long> flushPosition, File tempFile) throws IOException {
this.indexId = indexId;
this.flushCount = flushPosition.size();
reader = new TempSearchFieldReader[flushCount];
for (int m = 0; m < flushCount; m++) {
reader[m] = new TempSearchFieldReader(m, indexId, tempFile, flushPosition.get(m));
reader[m].next();
}
tempPostingOutput = new BytesDataOutput(1024 * 1024);
// 동일한 단어는 최대 flush갯수 만큼 buffer 배열에 쌓이게 된다.
buffers = new BytesRef[flushCount];
}
public void mergeAndMakeIndex(File baseDir, int indexInterval, IndexFieldOption fieldIndexOption) throws IOException {
logger.debug("**** mergeAndMakeIndex ****");
logger.debug("flushCount={}", flushCount);
if (flushCount <= 0) {
return;
}
IndexOutput postingOutput = new BufferedFileOutput(IndexFileNames.getRevisionDir(baseDir, 0),
IndexFileNames.getSearchPostingFileName(indexId));
IndexOutput lexiconOutput = new BufferedFileOutput(IndexFileNames.getRevisionDir(baseDir, 0),
IndexFileNames.getSearchLexiconFileName(indexId));
IndexOutput indexOutput = new BufferedFileOutput(IndexFileNames.getRevisionDir(baseDir, 0), IndexFileNames.getSearchIndexFileName(indexId));
try {
postingOutput.writeInt(fieldIndexOption.value());
// to each field
logger.debug("## MERGE field = {}", indexId);
makeHeap(flushCount);
int termCount = 0;
int indexTermCount = 0;
lexiconOutput.writeInt(termCount);// termCount
indexOutput.writeInt(indexTermCount);// indexTermCount
CharVector term = new CharVector();
while (readNextTempIndex(term)) {
int len = (int) tempPostingOutput.position();
int count = totalCount;
int lastDocNo = prevDocNo;
int firstDocNo = IOUtil.readVInt(tempPostingOutput.array(), 0);
int sz = IOUtil.lenVariableByte(firstDocNo);
len -= sz;
int sz2 = IOUtil.lenVariableByte(firstDocNo);
long postingPosition = postingOutput.position();
int len2 = IOUtil.SIZE_OF_INT * 2 + sz2 + len;
if (len2 < 8)
throw new IOException("Terrible Error!! " + len2);
//1. Write Posting
postingOutput.writeVInt(len2);
postingOutput.writeInt(count);
postingOutput.writeInt(lastDocNo);
postingOutput.writeVInt(firstDocNo);
postingOutput.writeBytes(tempPostingOutput.array(), sz, len);
//2. Write Lexicon
long lexiconPosition = lexiconOutput.position();
lexiconOutput.writeUString(term.array(), term.start(), term.length());
lexiconOutput.writeLong(postingPosition);
//3. Write Index
if (indexInterval > 0 && (termCount % indexInterval) == 0) {
indexOutput.writeUString(term.array(), term.start(), term.length());
indexOutput.writeLong(lexiconPosition);
indexOutput.writeLong(postingPosition);
indexTermCount++;
}
termCount++;
}
// Write term count on head position
// long prevPos = lexiconOutput.position();
// lexiconOutput.seek(prevPos);
if (termCount > 0) {
lexiconOutput.seek(0);
lexiconOutput.writeInt(termCount);
indexOutput.seek(0);
indexOutput.writeInt(indexTermCount);
} else {
// 이미 indexTermCount는 0으로 셋팅되어 있으므로 기록할 필요없음.
}
logger.debug("## write index [{}] termCount[{}] indexTermCount[{}] indexInterval[{}]", indexId, termCount, indexTermCount, indexInterval);
lexiconOutput.flush();
indexOutput.flush();
postingOutput.flush();
} finally {
IOException exception = null;
try {
if (postingOutput != null) {
postingOutput.close();
}
} catch (IOException e) {
exception = e;
}
try {
if (lexiconOutput != null) {
lexiconOutput.close();
}
} catch (IOException e) {
exception = e;
}
try {
if (indexOutput != null) {
indexOutput.close();
}
} catch (IOException e) {
exception = e;
}
if (exception != null) {
throw exception;
}
}
}
// 여러번 flush된 임시 posing 파일에서 정렬된 단어들을 읽어들여 posting을 tempPostingOutput 하나로 머징한다.
protected boolean readNextTempIndex(CharVector term) throws IOException {
tempPostingOutput.reset();
boolean termMade = false;
// int kk = 0;
while (true) {
int idx = heap[1];
cv = reader[idx].term();
if (cv == null && cvOld == null) {
// if cv and cvOld are null, it's done
return false;
}
// cv == null일경우는 모든 reader가 종료되어 null이 된경우이며
// cvOld 와 cv 가 다른 경우는 머징시 텀이 바뀐경우. cvOld를 기록해야한다.
if ((cv == null || !cv.equals(cvOld)) && cvOld != null) {
// merge buffers
prevDocNo = -1;
totalCount = 0;
for (int k = 0; k < bufferCount; k++) {
BytesRef buf = buffers[k];
// buf.reset();
// count 와 lastNo를 읽어둔다.
int count = IOUtil.readInt(buf);
int lastDocNo = IOUtil.readInt(buf);
totalCount += count;
// logger.debug("count="+count);
if (k == 0) {
// 첫번째 문서번호부터 끝까지 기록한다.
tempPostingOutput.writeBytes(buf.array(), buf.pos(), buf.remaining());
} else {
int firstNo = IOUtil.readVInt(buf);
int newDocNo = firstNo - prevDocNo - 1;
// logger.debug("newDocNo={}, firstNo={}, prevDocNo={}", newDocNo, firstNo, prevDocNo);
IOUtil.writeVInt(tempPostingOutput, newDocNo);
tempPostingOutput.writeBytes(buf.array(), buf.pos(), buf.remaining());
}
prevDocNo = lastDocNo;
}
termMade = true;
term.init(cvOld.array(), cvOld.start(), cvOld.length());
bufferCount = 0;
}
if(bufferCount < buffers.length){
try {
buffers[bufferCount++] = reader[idx].buffer();
} catch (ArrayIndexOutOfBoundsException e) {
logger.info("### bufferCount= {}, buffers.len={}, idx={}, reader={}", bufferCount, buffers.length, idx, reader.length);
logger.error("dup terms", e);
}
}else{
logger.warn("wrong! {}", cv);
logger.info("### bufferCount= {}, buffers.len={}, idx={}, reader={}", bufferCount, buffers.length, idx, reader.length);
}
// backup cv to old
cvOld = cv;
reader[idx].next();
heapify(1, flushCount);
if (termMade) {
return true;
}
} // while(true)
}
public void close() throws IOException {
IOException exception = null;
for (int i = 0; i < flushCount; i++) {
if (reader[i] != null) {
try {
reader[i].close();
} catch (IOException e) {
exception = e;
}
}
}
if (exception != null) {
throw exception;
}
}
protected void makeHeap(int heapSize) {
heap = new int[heapSize + 1];
// index starts from 1
for (int i = 0; i < heapSize; i++) {
heap[i + 1] = i;
}
int n = heapSize >> 1; // last inner node index
for (int i = n; i > 0; i--) {
heapify(i, heapSize);
}
}
protected void heapify(int idx, int heapSize) {
int temp = -1;
int child = -1;
while (idx <= heapSize) {
int left = idx << 1;// *=2
int right = left + 1;
if (left <= heapSize) {
if (right <= heapSize) {
// 키워드가 동일할 경우 먼저 flush된 reader가 우선해야, docNo가 오름차순 정렬순서대로 올바로 기록됨.
// flush후 머징시 문제가 생기는 버그 해결됨 2013-5-21 swsong
int c = compareKey(left, right);
if (c < 0) {
child = left;
} else if (c > 0) {
child = right;
} else {
// 하위 value 둘이 같아서 seq확인.
// 같다면 id가 작은게 우선.
int a = heap[left];
int b = heap[right];
if (reader[a].sequence() < reader[b].sequence()) {
child = left;
} else {
child = right;
}
}
} else {
// if there is no right el.
child = left;
}
} else {
// no children
break;
}
// compare and swap
int c = compareKey(child, idx);
if (c < 0) {
temp = heap[child];
heap[child] = heap[idx];
heap[idx] = temp;
idx = child;
// System.out.println("idx1="+idx);
} else if (c == 0) {
// 하위와 자신의 value가 같아서 seq확인
// 같다면 seq가 작은게 우선.
int a = heap[idx];
int b = heap[child];
if (reader[a].sequence() > reader[b].sequence()) {
// 하위의 seq가 작아서 child채택!
temp = heap[child];
heap[child] = heap[idx];
heap[idx] = temp;
idx = child;
} else {
// 내것을 그대로 사용.
// sorted
break;
}
} else {
// sorted, then do not check child
break;
}
}
}
protected int compareKey(int one, int another) {
int a = heap[one];
int b = heap[another];
return compareKey(reader[a].term(), reader[b].term());
}
protected int compareKey(CharVector term1, CharVector term2) {
// reader gets EOS, returns null
if (term1 == null && term2 == null) {
return 0;
} else if (term1 == null)
return 1;
else if (term2 == null)
return -1;
int len = (term1.length() < term2.length()) ? term1.length() : term2.length();
for (int i = 0; i < len; i++) {
if (term1.charAt(i) != term2.charAt(i))
return term1.charAt(i) - term2.charAt(i);
}
return term1.length() - term2.length();
}
}