/* * Copyright 2013 Websquared, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fastcatsearch.ir.index; import java.io.IOException; import java.util.Arrays; import org.fastcatsearch.al.HashFunctions; import org.fastcatsearch.ir.common.IRException; import org.fastcatsearch.ir.io.BytesBuffer; import org.fastcatsearch.ir.io.CharVector; import org.fastcatsearch.ir.io.IndexOutput; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** */ public class MemoryPosting { protected static Logger logger = LoggerFactory.getLogger(MemoryPosting.class); protected static final HashFunctions hfunc = HashFunctions.RSHash; protected int[] bucket; protected char[] keyArray; protected int[] keyPos; protected int[] nextIdx; protected PostingBuffer[] postingArray; protected int bucketSize; protected int length; protected int count; protected int keyArrayLength; protected int keyUseLength; protected boolean isIgnoreCase; public MemoryPosting(int size) { this(size, false); } public MemoryPosting(int size, boolean isIgnoreCase) { bucketSize = size; length = bucketSize; count = 0; keyArrayLength = bucketSize * 5; keyUseLength = 0; this.isIgnoreCase = isIgnoreCase; bucket = new int[bucketSize]; keyArray = new char[keyArrayLength]; keyPos = new int[length]; nextIdx = new int[length]; postingArray = new PostingBuffer[length]; Arrays.fill(bucket, -1); } protected PostingBuffer newPostingBuffer() { return new PostingBuffer(); } public long save(IndexOutput output) throws IOException { // 하나의 파일에 블럭단위로 write한다. 맨앞에 데이터 길이필요 logger.debug("MemoryPosting term-count = {}", count); // sort int[] sortedID = new int[count]; for (int i = 0; i < count; i++) { sortedID[i] = i; } if (count > 0) { long st = System.currentTimeMillis(); logger.debug("MemoryPosting term sort..."); quickSort(sortedID, 0, count - 1); logger.debug("Sort Done. time = {}ms", System.currentTimeMillis() - st); } // 기록위치 long outPos = output.position(); // 텀갯수 output.writeInt(count); // logger.debug("term count = {}", count); for (int i = 0; i < count; i++) { int id = sortedID[i]; int pos = keyPos[id]; int len = -1; // 마지막 원소이면 if (id == count - 1) { len = keyUseLength - pos; } else { len = keyPos[id + 1] - pos; } output.writeUString(keyArray, pos, len); // logger.debug("key>> {}", new String(keyArray, pos, len)); if (postingArray[id] == null) { logger.error("id={}, len={}, term={}", id, len, new String(keyArray, pos, len)); } postingArray[id].finish(); BytesBuffer buf = postingArray[id].buffer(); // 데이터길이 output.writeVInt(buf.length()); // logger.debug("term = {} >> {}", new String(keyArray, pos, len), buf.length()); if (buf.length() > 0) { output.writeBytes(buf); } else { logger.error("buffer empty >> {} = {} ,data=0 ", id, new String(keyArray, pos, len)); // 버퍼가 비어있도록 진행하도록 수정. // throw new IOException("buf is empty"); } } logger.debug("=================="); logger.debug("outPos = {}, count = {}, end = {}", outPos, count, output.position()); return outPos; } private void quickSort(int[] ids, int first, int last) { if (last <= 0) return; int stackMaxSize = (int) ((Math.log(last - first + 1) + 3) * 2); int[][] stack = new int[stackMaxSize][2]; int pivotId = 0, sp = 0; int left = 0, right = 0; while (true) { while (first < last) { left = first; right = last; int median = (left + right) / 2; // move pivot to left most. int tmp = ids[left]; ids[left] = ids[median]; ids[median] = tmp; pivotId = ids[left]; while (left < right) { while (compareKey(ids[right], pivotId) >= 0 && (left < right)) right--; if (left != right) { ids[left] = ids[right]; left++; } while (compareKey(ids[left], pivotId) <= 0 && (left < right)) left++; if (left != right) { ids[right] = ids[left]; right--; } } ids[left] = pivotId; if (left - first < last - left) { if (left + 1 < last) { sp++; stack[sp][0] = left + 1; stack[sp][1] = last; } last = left - 1; } else { if (first < left - 1) { sp++; stack[sp][0] = first; stack[sp][1] = left - 1; } first = left + 1; } } if (sp == 0) { return; } else { first = stack[sp][0]; last = stack[sp][1]; sp--; } } } private int compareKey(int id, int id2) { int pos = keyPos[id]; int len = -1; if (id == count - 1) len = keyUseLength - pos; else len = keyPos[id + 1] - pos; int pos2 = keyPos[id2]; int len2 = -1; if (id2 == count - 1) len2 = keyUseLength - pos2; else len2 = keyPos[id2 + 1] - pos2; int length = (len < len2) ? len : len2; for (int i = 0; i < length; i++) { if (keyArray[pos + i] != keyArray[pos2 + i]) return keyArray[pos + i] - keyArray[pos2 + i]; } return len - len2; } public void add(CharVector term, int docNo) throws IRException { add(term, docNo, 0); } public void add(CharVector term, int docNo, int position) throws IRException { if (term == null || term.length() == 0) { return; } PostingBuffer p = get(term); if (p == null) { p = newPostingBuffer(); put0(term, p); } // logger.debug("term >> {}", term); p.addOne(docNo, position); } private boolean isTheSame(CharVector term, int id) { int pos = keyPos[id]; int len = -1; // last el? if (id == count - 1) len = keyUseLength - pos; else len = keyPos[id + 1] - pos; // logger.debug(term+" , term.length="+term.length+", len="+len); if (term.length() == len) { if (isIgnoreCase) { for (int i = 0; i < len; i++) { if (toUpperChar(term.charAt(i)) != keyArray[pos + i]){ return false; } } }else{ for (int i = 0; i < len; i++) { if (term.charAt(i) != keyArray[pos + i]){ return false; } } } return true; } return false; } private char toUpperChar(int ch) { if ((ch <= 'z' && ch >= 'a')) { // 소문자이면.. ch -= 32; } return (char) ch; } private PostingBuffer put0(CharVector term, PostingBuffer p) { int hashValue = hfunc.hash(term, bucketSize, isIgnoreCase); int prev = -1; int idx = bucket[hashValue]; while (idx >= 0) { if (isTheSame(term, idx)) break; prev = idx; idx = nextIdx[idx]; } if (idx >= 0) { // duplicated term if (prev != -1) { // put a link to the front nextIdx[prev] = nextIdx[idx]; nextIdx[idx] = bucket[hashValue]; bucket[hashValue] = idx; }// else let it be } else { // new term idx = getNextIdx(); if (keyUseLength + term.length() >= keyArrayLength) { keyArrayLength *= 1.2; char[] newArray = new char[keyArrayLength]; System.arraycopy(keyArray, 0, newArray, 0, keyUseLength); keyArray = newArray; } keyPos[idx] = keyUseLength; if (isIgnoreCase) { for (int i = 0; i < term.length(); i++) { keyArray[keyUseLength++] = toUpperChar(term.charAt(i)); } } else { for (int i = 0; i < term.length(); i++) { keyArray[keyUseLength++] = term.charAt(i); } } nextIdx[idx] = -1; if (prev != -1) nextIdx[prev] = idx; else bucket[hashValue] = idx; } PostingBuffer old = postingArray[idx]; postingArray[idx] = p; return old; } public PostingBuffer get(CharVector term) { int hashValue = hfunc.hash(term, bucketSize, isIgnoreCase); int idx = bucket[hashValue]; // logger.debug(term+" = "+hashValue+", idx="+idx); while (idx >= 0) { if (isTheSame(term, idx)) { break; } idx = nextIdx[idx]; } if (idx < 0) return null; // 검색실패 else { return postingArray[idx]; } } private int getNextIdx() { if (count >= length) { int newLength = (int) (length * 1.2); // logger.debug("Grow length = "+length+" => "+newLength+", new int * 2, new PostingBuffer[], arraycopy * 3"); int[] newKeyPos = new int[newLength]; int[] newNext = new int[newLength]; PostingBuffer[] newTermPosting = new PostingBuffer[newLength]; System.arraycopy(keyPos, 0, newKeyPos, 0, count); System.arraycopy(nextIdx, 0, newNext, 0, count); System.arraycopy(postingArray, 0, newTermPosting, 0, count); keyPos = newKeyPos; nextIdx = newNext; postingArray = newTermPosting; length = newLength; } return count++; } public int workingMemorySize() { int size = 0; for (int i = 0; i < postingArray.length; i++) if (postingArray[i] != null) size += (postingArray[i].size() + 80); size += keyUseLength * 2; size += bucket.length * 4; size += count * 8; // keyPos(4), nextIdx(4) return size; } public int staticMemorySize() { int size = 0; for (int i = 0; i < postingArray.length; i++) if (postingArray[i] != null) size += (postingArray[i].size() + 80); size += keyArrayLength * 2; size += bucket.length * 4; size += keyPos.length * 4; size += nextIdx.length * 4; return size; } public void clear() { Arrays.fill(bucket, -1); Arrays.fill(nextIdx, -1); // posting array는 지워준다. for (int i = 0; i < postingArray.length; i++) { postingArray[i] = null; } count = 0; keyUseLength = 0; // keyarray배열은 그대로 재사용한다. } // entry count public int count() { return count; } }