/*
* Copyright 2013 Websquared, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.fastcatsearch.ir.search.clause;
import java.io.IOException;
import java.io.PrintStream;
import java.io.Writer;
import java.util.List;
import org.fastcatsearch.ir.query.RankInfo;
import org.fastcatsearch.ir.search.CollectedEntry;
import org.fastcatsearch.ir.search.PostingDoc;
import org.fastcatsearch.ir.search.PostingReader;
import org.fastcatsearch.ir.search.TermDocCollector;
import org.fastcatsearch.ir.search.TermDocTreeReader;
import org.fastcatsearch.ir.search.PostingDocs;
import org.fastcatsearch.ir.search.posting.PostingDocsTreeNode;
public class MultiTermOperatedClause extends OperatedClause {
private static final int SCORE_BASE = 10000;
private TermDocTreeReader termDocTreeReader;
private int termCount;
private TermDocCollector termDocCollector;
private boolean storePosition;
public MultiTermOperatedClause() {
this(null, false);
}
public MultiTermOperatedClause(String indexId, boolean storePosition) {
super(indexId);
this.storePosition = storePosition;
termDocTreeReader = new TermDocTreeReader();
}
public void addTerm(PostingReader postingReader) {
addTerm(postingReader, null);
}
public void addTerm(PostingReader postingReader, List<PostingDocs> synonymList) {
if (postingReader != null) {
termDocTreeReader.addNode(new PostingDocsTreeNode(postingReader));
termCount++;
}
// PostingDocs sysnonymTermDocs = null;
// if(synonymList != null){
// if(synonymList.size() == 1){
// sysnonymTermDocs = synonymList.get(0);
// }else{
// sysnonymTermDocs = new
// PostingDocsMerger(synonymList).merge(termDocs.term(), 1024);
// }
// termDocTreeReader.addNode(new PostingDocsTreeNode(sysnonymTermDocs,
// queryPosition, true));
// termCount++;
// }
}
protected boolean nextDoc(RankInfo docInfo) {
if (termDocCollector == null) {
termDocCollector = new TermDocCollector(termCount);
}
// 동일문서번호에 대한 TermDoc list를 리턴받는다.
int docNo = -1;
while (true) {
while (true) {
termDocCollector.clear();
docNo = termDocTreeReader.next(termDocCollector);
if (docNo == -1 || (((float) termDocCollector.size()) / ((float) termDocCollector.capasity()) > 0.7f)) {
break;
}
}
if (docNo < 0) {
return false;
} else {
// logger.debug(">> phrase doc={}, term size={}", docNo,
// totalTermDocList.size());
// 쿼리와 비교한 여러단어에 대한 점수계산.
if (storePosition) {
float proximityScore = 0f;
int documentScore = 0;
float sumOfTPI = 0f;
int windowSize = 2;
// logger.debug("----------------------------{}", termDocCollector.size());
int continuosWords = 1;
for (int i = 0; i < termDocCollector.size(); i++) {
CollectedEntry entry = termDocCollector.get(i);
PostingDoc termDoc = entry.termDoc();
// okapi점수에 tf만 사용. 쿼리점수무시. idf무시.
// documentScore += (2.2f * (float)termDoc.tf() / (2.0f+ (float)termDoc.tf()));
documentScore += 1;
// logger.debug("[{}]doc >> {} : {} >> score {}", entry.term(), termDoc.docNo(), termDoc.tf(), documentScore);
// score += 1000; //tf 는 무시. 한단어가 여러번 나오는 것은 그다지 중요하지
// 않음.
int[] positions = termDoc.positions();
// 인접해있는 windowSize 까지만 봄.
int j = i + 1;
if (j < termDocCollector.size()) {
// TODO doc1과 doc2가 유사어관계면 점수계산하지 않는다. 근데 누구의 유사어인지
// 확인할 방법이없네..
// TODO 같은 단어를 두번이상 쿼리에 입려했을 경우 무시하는 로직 필요.?
CollectedEntry entry2 = termDocCollector.get(j);
PostingDoc termDoc2 = entry2.termDoc();
// logger.debug("### {}[{}] : {}[{}]", entry.term(), termDoc.tf(), entry2.term(), termDoc2.tf());
int origianlPositionGap = entry2.queryPosition() - entry.queryPosition();
int[] positions2 = termDoc2.positions();
// logger.debug("pos1= {}", positions);
// logger.debug("pos2= {}", positions2);
int minGap = -1;
for (int i2 = 0; i2 < positions.length; i2++) {
for (int j2 = 0; j2 < positions2.length; j2++) {
int actualPositionGap = positions2[j2] - positions[i2];
if(actualPositionGap > 10 || actualPositionGap < -10){
continue;
}
// logger.debug("------- {} - {} = {} : {}", positions[i2], positions2[j2], actualPositionGap, origianlPositionGap);
int gap = 0;
if(origianlPositionGap > 0){
if (actualPositionGap > 0) {
if (actualPositionGap <= origianlPositionGap + windowSize) {
// logger.debug("OK");
gap = Math.abs(origianlPositionGap - actualPositionGap);
} else {
continue;
}
} else {
if (-actualPositionGap <= origianlPositionGap) {
// logger.debug("OK2");
gap = Math.abs(origianlPositionGap + actualPositionGap);
} else {
continue;
}
}
}else{
if (actualPositionGap > 0) {
if (actualPositionGap <= -origianlPositionGap) {
// logger.debug("OK3");
gap = Math.abs(-origianlPositionGap - actualPositionGap);
} else {
continue;
}
} else {
if (-actualPositionGap <= -origianlPositionGap + windowSize) {
// logger.debug("OK4");
gap = Math.abs(-origianlPositionGap + actualPositionGap);
} else {
continue;
}
}
}
if(minGap == -1 || gap < minGap){
minGap = gap;
}
}
}
if(minGap != -1){
continuosWords++;
}
// int positionGapDiff = origianlPositionGap - actualPositionGap;
// d(ti, tj) = 원래차이 - 실제차이 => 원래차이와 실제차이가
// 같을수록 즉 0에 가까울수록 tpi가 높은 점수가 된다.
// tpi(ti,tj) = 1.0 / d(t1, tj)^2
// 완벽한 TPRSV 점수를구하려면 문서길이/문서평균길이를 이용한 K 값을
// 계산해야 하나 여기서는 생략한다.
float tpi = (float) (1.0 / (1.0 + Math.pow(minGap, 2.0)));
// logger.debug("------- {} - {} = {} - {} = {} >> {}",
// positions[i2], positions2[j2],
// actualPositionGap, origianlPositionGap,
// positionGapDiff, tpi);
sumOfTPI += tpi;
// logger.debug("{}: {} = {}",positions[i2], positions2[j2], tpi);
}//if
}
/*
* 인접을 보면서 제외시킨다.
*/
// logger.debug("####continuosWords {} >= {} ({})", continuosWords, (termCount/2) + 1, termCount);
if (continuosWords >= (termCount/2) + 1) {
float nomalizedDocScore = ((float)documentScore / (float)termCount) * 2.0f;
// Okapi점수를 계산하여 tpi점수와 더해야 최종 점수가 계산됨.
proximityScore = 2.2f * sumOfTPI / (2.0f + sumOfTPI);
int score = (int) ((nomalizedDocScore + proximityScore) * SCORE_BASE);
// logger.debug("nomalize {} = {} / {} * 2 => {} + {}", score, documentScore, termCount, nomalizedDocScore, proximityScore);
docInfo.init(docNo, score);
break;
}
} else {
// 위치정보가 없으면 tf를 점수로 만든다.
// Okapi 점수 계산
float s = 0.0f;
for (int i = 0; i < termDocCollector.size(); i++) {
PostingDoc termDoc = termDocCollector.get(i).termDoc();
s += (2.2f * (float) termDoc.tf() / (2.0f + (float) termDoc.tf()));
}
int score = (int) (s * SCORE_BASE);
logger.debug("추가2 >> docNo={} : {}", docNo, score);
docInfo.init(docNo, score);
break;
}
}
}
// TODO점수가 일정치이하이면 버린다.
return true;
}
@Override
public void close() {
termDocTreeReader.close();
}
@Override
public void printTrace(Writer writer, int indent, int depth) throws IOException {
}
@Override
protected void initClause(boolean explain) {
}
// @Override
// protected void initExplanation() {
// }
}