/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.suggest.phrase;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.elasticsearch.common.lucene.index.FreqTermsEnum;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet;
import java.io.IOException;
//TODO public for tests
public abstract class WordScorer {
protected final IndexReader reader;
protected final String field;
protected final Terms terms;
protected final long vocabluarySize;
protected final double realWordLikelyhood;
protected final BytesRefBuilder spare = new BytesRefBuilder();
protected final BytesRef separator;
private final TermsEnum termsEnum;
private final long numTerms;
private final boolean useTotalTermFreq;
public WordScorer(IndexReader reader, String field, double realWordLikelyHood, BytesRef separator) throws IOException {
this(reader, MultiFields.getTerms(reader, field), field, realWordLikelyHood, separator);
}
public WordScorer(IndexReader reader, Terms terms, String field, double realWordLikelyHood, BytesRef separator) throws IOException {
this.field = field;
if (terms == null) {
throw new IllegalArgumentException("Field: [" + field + "] does not exist");
}
this.terms = terms;
final long vocSize = terms.getSumTotalTermFreq();
this.vocabluarySize = vocSize == -1 ? reader.maxDoc() : vocSize;
this.useTotalTermFreq = vocSize != -1;
this.numTerms = terms.size();
this.termsEnum = new FreqTermsEnum(reader, field, !useTotalTermFreq, useTotalTermFreq, null, BigArrays.NON_RECYCLING_INSTANCE); // non recycling for now
this.reader = reader;
this.realWordLikelyhood = realWordLikelyHood;
this.separator = separator;
}
public long frequency(BytesRef term) throws IOException {
if (termsEnum.seekExact(term)) {
return useTotalTermFreq ? termsEnum.totalTermFreq() : termsEnum.docFreq();
}
return 0;
}
protected double channelScore(Candidate candidate, Candidate original) throws IOException {
if (candidate.stringDistance == 1.0d) {
return realWordLikelyhood;
}
return candidate.stringDistance;
}
public double score(Candidate[] path, CandidateSet[] candidateSet, int at, int gramSize) throws IOException {
if (at == 0 || gramSize == 1) {
return Math.log10(channelScore(path[at], candidateSet[at].originalTerm) * scoreUnigram(path[at]));
} else if (at == 1 || gramSize == 2) {
return Math.log10(channelScore(path[at], candidateSet[at].originalTerm) * scoreBigram(path[at], path[at - 1]));
} else {
return Math.log10(channelScore(path[at], candidateSet[at].originalTerm) * scoreTrigram(path[at], path[at - 1], path[at - 2]));
}
}
protected double scoreUnigram(Candidate word) throws IOException {
return (1.0 + frequency(word.term)) / (vocabluarySize + numTerms);
}
protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
return scoreUnigram(word);
}
protected double scoreTrigram(Candidate word, Candidate w_1, Candidate w_2) throws IOException {
return scoreBigram(word, w_1);
}
public static BytesRef join(BytesRef separator, BytesRefBuilder result, BytesRef... toJoin) {
result.clear();
for (int i = 0; i < toJoin.length - 1; i++) {
result.append(toJoin[i]);
result.append(separator);
}
result.append(toJoin[toJoin.length-1]);
return result.get();
}
public interface WordScorerFactory {
WordScorer newScorer(IndexReader reader, Terms terms,
String field, double realWordLikelyhood, BytesRef separator) throws IOException;
}
}