/*
* Copyright 2013 Websquared, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.fastcatsearch.ir.document;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import org.fastcatsearch.ir.common.IndexFileNames;
import org.fastcatsearch.ir.io.BufferedFileOutput;
import org.fastcatsearch.ir.io.BytesBuffer;
import org.fastcatsearch.ir.io.IndexOutput;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* 해시구조로 키들을 모은후 정렬하여 파일에 쓴다.
* @author sangwook.song
*
*/
public class PrimaryKeyIndexWriter {
private static Logger logger = LoggerFactory.getLogger(PrimaryKeyIndexWriter.class);
private IndexOutput output;
private IndexOutput indexOutput;
private int[] bucket;
private byte[] array;
private int[] keyPos;
private int[] nextIdx;
private int[] intValueArray;
private int bucketSize;
private int length;
private int count;
private int keySize;
private int keyUseSize;
private int indexInterval;
private boolean hasOutOfMemory;
public PrimaryKeyIndexWriter(int bucketSize) throws IOException{
this(null, null, 0, bucketSize);
}
public PrimaryKeyIndexWriter(int indexInterval, int bucketSize) throws IOException{
this(null, null, indexInterval, bucketSize);
}
public PrimaryKeyIndexWriter(File dir, String filename, int indexInterval, int bucketSize) throws IOException{
if(dir != null && filename != null){
String indexFilename = IndexFileNames.getIndexFileName(filename);
output = new BufferedFileOutput(dir, filename);
indexOutput = new BufferedFileOutput(dir, indexFilename);
}
this.indexInterval = indexInterval;
this.bucketSize = bucketSize;
length = bucketSize;
count = 0;
keySize = bucketSize * 5;
keyUseSize = 0;
// logger.debug("length="+length);
bucket = new int[bucketSize];
array = new byte[keySize];
keyPos = new int[length];
nextIdx = new int[length];
intValueArray = new int[length];
Arrays.fill(bucket, -1);
Arrays.fill(intValueArray, -1);
}
public void setDestination(IndexOutput output, IndexOutput indexOutput){
this.output = output;
this.indexOutput = indexOutput;
}
public int count(){
return count;
}
public void write() throws IOException{
if(hasOutOfMemory)
return;
if(count == 0){
output.writeInt(0);
indexOutput.writeInt(0);
return;
}
//sort
int[] sortedIdx = new int[count];
for (int i = 0; i < count; i++)
sortedIdx[i] = i;
long st = System.currentTimeMillis();
quickSort(sortedIdx, 0, count - 1);
logger.debug("sort time = "+(System.currentTimeMillis() - st)+"ms");
//term count
output.writeInt(count);
// long indexPos = indexOutput.position();
indexOutput.writeInt(0);//write later again.
logger.debug("pk count = {}", count);
int idxCount = 0;
for (int i = 0; i < count; i++) {
int id = sortedIdx[i];
int pos = keyPos[id];
int len = -1;
//last elt
if(id == count - 1)
len = keyUseSize - pos;
else
len = keyPos[id+1] - pos;
//write pkmap index
if(indexInterval > 0 && i % indexInterval == 0){
indexOutput.writeVInt(len);
indexOutput.writeBytes(array, pos, len);
indexOutput.writeLong(output.position());
idxCount++;
}
output.writeVInt(len);
output.writeBytes(array, pos, len);
output.writeInt(intValueArray[id]);
}
logger.debug("{} pk index count = {}, filesize = {} bytes", output.toString(), idxCount, output.position());
//write idxCount
// long p = indexOutput.position();
indexOutput.seek(0);
indexOutput.writeInt(idxCount);
// indexOutput.seek(p);
}
public void close() throws IOException{
indexOutput.close();
output.close();
}
private void quickSort(int[] ids, int first, int last) {
if(last <= 0)
return;
int stackMaxSize = (int) ((Math.log(last - first + 1) + 3) * 2);
int[][] stack = new int[stackMaxSize][2];
int pivotId = 0, sp = 0;
int left = 0, right = 0;
while(true){
while(first < last){
left = first;
right = last;
int pivot = (left + right)/2;
//move pivot to left most.
int tmp = ids[left];
ids[left] = ids[pivot];
ids[pivot] = tmp;
pivotId = ids[left];
while (left < right) {
while (compareKey(ids[right], pivotId) >= 0 && (left < right))
right --;
if (left != right){
ids[left] = ids[right];
left++;
}
while (compareKey(ids[left], pivotId) <= 0 && (left < right))
left ++;
if (left != right) {
ids[right] = ids[left];
right --;
}
}
ids[left] = pivotId;
if(left - first < last - left){
if (left + 1 < last) {
sp++;
stack[sp][0] = left + 1;
stack[sp][1] = last;
}
last = left - 1;
}else{
if (first < left - 1) {
sp++;
stack[sp][0] = first;
stack[sp][1] = left -1;
}
first = left + 1;
}
}
if (sp == 0) {
return;
}else {
first = stack[sp][0];
last = stack[sp][1];
sp--;
}
}
}
private int compareKey(int id, int id2){
int pos = keyPos[id];
int len = -1;
//last elt
if(id == count - 1)
len = keyUseSize - pos;
else
len = keyPos[id+1] - pos;
int pos2 = keyPos[id2];
int len2 = -1;
//last elt
if(id2 == count - 1)
len2 = keyUseSize - pos2;
else
len2 = keyPos[id2+1] - pos2;
int length = (len < len2) ? len : len2;
for (int i = 0; i < length; i++) {
if(array[pos+i] != array[pos2+i])
return array[pos+i] - array[pos2+i];
}
return len - len2;
}
private boolean isTheSame(byte[] data, int offset, int dataLength, int idx) {
int pos = keyPos[idx];
int len = -1;
if(idx == count - 1)
len = keyUseSize - pos;
else
len = keyPos[idx+1] - pos;
if(dataLength == len){
for (int i = 0; i < len; i++) {
if(data[offset + i] != array[pos+i])
return false;
}
return true;
}
return false;
}
public int put(BytesBuffer buffer, int docNo) throws IOException {
return put(buffer.bytes, buffer.offset, buffer.length, docNo);
}
public int put(byte[] data, int offset, int dataLength, int docNo) throws IOException {
int hashValue = rsHash(data, offset, dataLength);
int prev = -1;
int idx = bucket[hashValue];
while(idx >= 0){
if(isTheSame(data, offset, dataLength, idx)){
break;
}
prev = idx;
idx = nextIdx[idx];
}
if(idx >= 0){
if(prev != -1){
nextIdx[prev] = nextIdx[idx];
nextIdx[idx] = bucket[hashValue];
bucket[hashValue] = idx;
}
}
else{
//new term
idx = getNextIdx();
// logger.debug("new term next idx = "+idx+"/ "+dataLength+" / "+keyUseSize);
try{
if (keyUseSize + dataLength >= keySize) {
keySize *= 1.2;
// logger.debug(this+" ## grow keysize = "+keySize+", "+Runtime.getRuntime().totalMemory()+", count="+count+", dl = "+dataLength);
byte[] newArray = new byte[keySize];
System.arraycopy(array, 0, newArray, 0, keyUseSize);
array = newArray;
}
}catch(OutOfMemoryError e){
hasOutOfMemory = true;
logger.error("PK writing OOM! size = "+keySize+" msg = "+e.getMessage(),e);
throw new IOException(e.toString());
}
keyPos[idx] = keyUseSize;
System.arraycopy(data, offset, array, keyUseSize, dataLength);
keyUseSize += dataLength;
nextIdx[idx] = -1;
if(prev != -1)
nextIdx[prev] = idx;
else
bucket[hashValue] = idx;
}
int old = intValueArray[idx];
intValueArray[idx] = docNo;
return old;
}
public int get(BytesBuffer buffer) {
return get(buffer.bytes, buffer.offset, buffer.length);
}
public int get(byte[] data, int offset, int dataLength) {
int hashValue = rsHash(data, offset, dataLength);
int idx = bucket[hashValue];
while(idx >= 0){
if(isTheSame(data, offset, dataLength, idx))
break;
idx = nextIdx[idx];
}
if(idx >= 0)
return intValueArray[idx];
else{
return -1;
}
}
private int getNextIdx() {
if(count >= length){
int newLength = (int) (length * 1.2);
int [] newKeyPos = new int[newLength];
int [] newNext = new int[newLength];
int [] newIntValueArray = new int[newLength];
System.arraycopy(keyPos, 0, newKeyPos, 0, count);
System.arraycopy(nextIdx, 0, newNext, 0, count);
System.arraycopy(intValueArray, 0, newIntValueArray, 0, count);
Arrays.fill(newIntValueArray, count, newIntValueArray.length, -1);
keyPos = newKeyPos;
nextIdx = newNext;
intValueArray = newIntValueArray;
length = newLength;
}
return count++;
}
private int rsHash(byte[] data, int offset, int length) {
int b = 378551;
int a = 63689;
int hashValue = 0;
for (int i = 0; i < length; i++) {
hashValue = hashValue * a + (data[offset + i] & 0xff);
a = a * b;
}
return hashValue & (bucketSize - 1);
}
}