/*
* Copyright 2013 Websquared, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.fastcatsearch.ir.document.merge;
import java.io.IOException;
import org.fastcatsearch.ir.document.TempPrimaryKeyIndexReader;
import org.fastcatsearch.ir.io.BytesBuffer;
import org.fastcatsearch.ir.io.IndexOutput;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Merges the temporary primary-key (PK) files that were flushed during PK indexing.
* */
public class PrimaryKeyIndexMultipleMerger {
    protected static Logger logger = LoggerFactory.getLogger(PrimaryKeyIndexMultipleMerger.class);

    /** 1-based binary min-heap; each slot holds an index into {@link #reader}. */
    protected int[] heap;
    /** Temporary PK readers being merged, one per flush. */
    protected TempPrimaryKeyIndexReader[] reader;
    /** Number of readers, i.e. {@code reader.length}. */
    protected int flushCount;
    /** Entry currently at the top of the heap. */
    private KeyValue kv;
    /** Most recently consumed entry that has not been emitted yet. */
    private KeyValue kvOld;
    protected int totalCount;

    /**
     * Creates a merger over the given readers and advances each of them to its first entry.
     *
     * @param reader the temporary PK readers to merge
     * @throws IOException if advancing a reader fails
     */
    public PrimaryKeyIndexMultipleMerger(TempPrimaryKeyIndexReader[] reader) throws IOException {
        this.flushCount = reader.length;
        this.reader = reader;
        for (TempPrimaryKeyIndexReader r : reader) {
            r.next();
        }
        kv = new KeyValue();
        kvOld = new KeyValue();
    }

    /**
     * Merges all readers into {@code output} and writes a sparse lookup index to {@code indexOutput}.
     *
     * <p>{@code output} layout: an int term count, then per key a VInt length, the key bytes and an
     * int value. {@code indexOutput} layout: an int index-term count, then for every
     * {@code indexInterval}-th key a VInt length, the key bytes and the long position in
     * {@code output} at which that key's record starts.
     *
     * @param output        destination of the merged PK entries
     * @param indexOutput   destination of the sparse index over {@code output}
     * @param indexInterval sampling interval of the sparse index; no index entry is written when
     *                      it is not positive
     * @throws IOException if reading or writing fails
     */
    public void mergeAndMakeIndex(IndexOutput output, IndexOutput indexOutput, int indexInterval) throws IOException {
        logger.debug("**** mergeAndMakeIndex ****");
        logger.debug("flushCount={}", flushCount);

        if (flushCount <= 0) {
            output.writeInt(0);
            indexOutput.writeInt(0);
            return;
        }

        logger.debug("## MERGE PK field");

        makeHeap(flushCount);

        int termCount = 0;
        int indexTermCount = 0;

        // Placeholders for the counts; they are overwritten once the real counts are known.
        output.writeInt(termCount);
        indexOutput.writeInt(indexTermCount);

        KeyValue keyValue = new KeyValue();
        while (readNextTempIndex(keyValue)) {
            BytesBuffer key = keyValue.key();

            // Sparse index entry: key plus the position where its record starts in output.
            if (indexInterval > 0 && termCount % indexInterval == 0) {
                indexOutput.writeVInt(key.length);
                indexOutput.writeBytes(key.bytes, key.offset, key.length);
                indexOutput.writeLong(output.position());
                indexTermCount++;
            }

            output.writeVInt(key.length);
            output.writeBytes(key.bytes, key.offset, key.length);
            output.writeInt(keyValue.value());

            termCount++;
        }

        logger.debug("pk index count = {}", indexTermCount);
        logger.debug("filesize = {} bytes", output.position());

        // The placeholders already hold 0, so they only need rewriting when something was merged.
        if (termCount > 0) {
            output.seek(0);
            output.writeInt(termCount);
            indexOutput.seek(0);
            indexOutput.writeInt(indexTermCount);
        }

        logger.debug("## write PK termCount[{}] indexTermCount[{}] indexInterval[{}]", termCount, indexTermCount, indexInterval);

        output.flush();
        indexOutput.flush();
    }

    /** Mutable holder for a key and the int value (document number) associated with it. */
    class KeyValue {
        private BytesBuffer key;
        private int value;

        public KeyValue() {
        }

        public void init(BytesBuffer key, int value) {
            this.key = key;
            this.value = value;
        }

        public BytesBuffer key() {
            return key;
        }

        public int value() {
            return value;
        }

        public boolean isNull() {
            return key == null;
        }

        /**
         * Two holders are equal when both carry a key and the keys are equal; the value is ignored.
         * A holder without a key is equal to nothing.
         */
        @Override
        public boolean equals(Object obj) {
            if (!(obj instanceof KeyValue)) {
                return false;
            }
            KeyValue other = (KeyValue) obj;
            if (key == null || other.key == null) {
                return false;
            }
            return key.equals(other.key);
        }

        /**
         * Hash derived from the key only, mirroring {@link #equals(Object)}.
         * NOTE(review): consistency with equals relies on BytesBuffer overriding hashCode alongside
         * equals - confirm.
         */
        @Override
        public int hashCode() {
            return key == null ? 0 : key.hashCode();
        }

        @Override
        public String toString() {
            return (key != null ? key.toAlphaString() : key) + " >> " + value;
        }
    }

    /**
     * Produces the next distinct key of the merged stream.
     *
     * <p>Entries are taken from the heap top in ascending order. Consecutive entries with an equal
     * key collapse into one, keeping the value of the entry that was consumed last. Once every
     * reader is exhausted the entry still pending in {@code kvOld} is emitted, so the final key of
     * the stream is not lost.
     *
     * @param keyValue receives the key and value of the next entry when {@code true} is returned
     * @return {@code true} if an entry was stored in {@code keyValue}, {@code false} when the
     *         merged stream is finished
     * @throws IOException if advancing a reader fails
     */
    protected boolean readNextTempIndex(KeyValue keyValue) throws IOException {
        while (true) {
            int idx = heap[1];
            kv.init(reader[idx].key(), reader[idx].docNo());

            if (kv.isNull()) {
                // A null key at the heap top means every reader reached its end, because
                // compareKey orders null keys after all real keys.
                if (kvOld.isNull()) {
                    return false;
                }
                // Flush the last pending entry exactly once.
                keyValue.init(kvOld.key(), kvOld.value());
                kvOld.init(null, 0);
                return true;
            }

            // The key changed, so the previously consumed entry is complete and can be emitted.
            boolean isMade = false;
            if (!kvOld.isNull() && !kv.equals(kvOld)) {
                keyValue.init(kvOld.key(), kvOld.value());
                isMade = true;
            }

            // Remember the current entry, then advance its reader and restore the heap order.
            kvOld.init(kv.key(), kv.value());
            reader[idx].next();
            heapify(1, flushCount);

            if (isMade) {
                return true;
            }
        }
    }

    /**
     * Closes every reader. All readers are attempted even when one of them fails; the first
     * failure is rethrown afterwards and any later failures are logged.
     *
     * @throws IOException the first failure raised while closing a reader
     */
    public void close() throws IOException {
        IOException firstFailure = null;
        for (int i = 0; i < flushCount; i++) {
            if (reader[i] == null) {
                continue;
            }
            try {
                reader[i].close();
            } catch (IOException e) {
                if (firstFailure == null) {
                    firstFailure = e;
                } else {
                    logger.error("failed to close temp pk reader[" + i + "]", e);
                }
            }
        }
        if (firstFailure != null) {
            throw firstFailure;
        }
    }

    /**
     * Builds the heap over reader indexes {@code 0..heapSize-1}.
     *
     * @param heapSize number of readers to place in the heap
     */
    protected void makeHeap(int heapSize) {
        heap = new int[heapSize + 1];
        // Slot 0 is unused; the heap is 1-based.
        for (int i = 0; i < heapSize; i++) {
            heap[i + 1] = i;
        }

        int n = heapSize >> 1; // last inner node index
        for (int i = n; i > 0; i--) {
            heapify(i, heapSize);
        }
    }

    /**
     * Sifts the entry at heap position {@code idx} down until neither child orders before it.
     *
     * <p>Entries are ordered by key; for equal keys the reader whose current docNo is smaller comes
     * first, so that duplicates of one key are consumed in ascending docNo order.
     *
     * @param idx      1-based heap position to sift down from
     * @param heapSize number of entries in the heap
     */
    protected void heapify(int idx, int heapSize) {
        while (idx <= heapSize) {
            int left = idx << 1;
            int right = left + 1;
            if (left > heapSize) {
                // no children
                break;
            }

            // Pick the child that orders first; on a complete tie the right child is chosen.
            int child = left;
            if (right <= heapSize && compareEntry(left, right) >= 0) {
                child = right;
            }

            if (compareEntry(child, idx) >= 0) {
                // The parent already orders first (or ties), so the subtree is sorted.
                break;
            }

            int temp = heap[child];
            heap[child] = heap[idx];
            heap[idx] = temp;
            idx = child;
        }
    }

    /**
     * Orders two heap positions by key and, for equal keys, by the current docNo of their readers.
     */
    private int compareEntry(int one, int another) {
        int c = compareKey(one, another);
        if (c != 0) {
            return c;
        }
        int docNoOne = reader[heap[one]].docNo();
        int docNoAnother = reader[heap[another]].docNo();
        if (docNoOne < docNoAnother) {
            return -1;
        }
        return docNoOne > docNoAnother ? 1 : 0;
    }

    /**
     * Compares the current keys of the readers stored at two heap positions.
     */
    protected int compareKey(int one, int another) {
        int a = heap[one];
        int b = heap[another];
        return compareKey(reader[a].key(), reader[b].key());
    }

    /**
     * Compares two keys. A {@code null} key (a reader that reached its end) orders after every
     * non-null key; two {@code null} keys are equal.
     */
    protected int compareKey(BytesBuffer key1, BytesBuffer key2) {
        if (key1 == null && key2 == null) {
            return 0;
        } else if (key1 == null) {
            return 1;
        } else if (key2 == null) {
            return -1;
        }
        return BytesBuffer.compareBuffer(key1, key2);
    }
}