/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package contrast;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.corpus.stats.IDFIndexCalc;
import org.apache.lucene.corpus.stats.TFIDFPriorityQueue;
import org.apache.lucene.corpus.stats.TermIDF;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.mutable.MutableValueInt;
class QueryToCorpusContraster {
private final int maxDocs;
private final IndexSearcher searcher;
private final Version version;
private final boolean ignoreCase = true;
private Analyzer analyzer = null;
private int maxTokens = 10000;
//if the term doesn't show up in this many docs, ignore!
private final int minTermFreq = 10;
public QueryToCorpusContraster(Version version, IndexSearcher searcher, int maxDocs) {
this.searcher = searcher;
this.maxDocs = maxDocs;
this.version = version;
}
public List<TermIDF> contrast(Query query, String fieldName, int numResults)
throws IOException {
TopScoreDocCollector results = TopScoreDocCollector.create(maxDocs);
searcher.search(query, results);
ScoreDoc[] scoreDocs = results.topDocs().scoreDocs;
//if there are fewer documents than minTermFreq
//return empty list now
if (scoreDocs.length < minTermFreq) {
return new ArrayList<TermIDF>();
}
//total hack
int initialSize = scoreDocs.length * 100;
CharArrayMap<MutableValueInt> map = new CharArrayMap<>(initialSize, ignoreCase);
CharArraySet tmpSet = new CharArraySet(100, ignoreCase);
Set<String> selector = new HashSet<>();
selector.add(fieldName);
for (ScoreDoc scoreDoc : scoreDocs) {
//get terms from doc
processDoc(scoreDoc.doc, fieldName, selector, tmpSet);
//now update global doc freqs
Iterator<Object> it = tmpSet.iterator();
while (it.hasNext()) {
char[] token = (char[]) it.next();
MutableValueInt docCount = map.get(token, 0, token.length);
if (docCount == null) {
docCount = new MutableValueInt();
docCount.value = 1;
} else {
docCount.value++;
}
map.put(token, docCount);
}
tmpSet.clear();
}
return getResults(fieldName, map, numResults);
}
private List<TermIDF> getResults(String fieldName,
CharArrayMap<MutableValueInt> map, int numResults) {
TFIDFPriorityQueue queue = new TFIDFPriorityQueue(numResults);
IDFIndexCalc idfCalc = new IDFIndexCalc(searcher.getIndexReader());
int tf = -1;
double idf = -1.0;
int minTf = minTermFreq;
String text = null;
//make more efficient
// Term reusableTerm = new Term(fieldName, "");
for (Map.Entry<Object, MutableValueInt> entry : map.entrySet()) {
tf = entry.getValue().value;
if (tf < minTf)
continue;
text = new String((char[]) entry.getKey());
// calculate idf for potential phrase
try {
idf = idfCalc.singleTermIDF(new Term(fieldName, text));
} catch (IOException e) {
throw new RuntimeException("Error trying to calculate IDF: " + e.getMessage());
}
int estimatedDF = (int) Math.max(1, Math.round(idfCalc.unIDF(idf)));
TermIDF r = new TermIDF(text, estimatedDF, tf, idf);
queue.insertWithOverflow(r);
}
List<TermIDF> results = new LinkedList<>();
while (queue.size() > 0) {
results.add(0, queue.pop());
}
return results;
}
private void processDoc(int docid, String fieldName, Set<String> selector,
CharArraySet set) throws IOException {
Terms terms = searcher.getIndexReader().getTermVector(docid, fieldName);
if (terms != null) {
TermsEnum te = terms.iterator();
BytesRef bytes = te.next();
while (bytes != null) {
set.add(bytes);
bytes = te.next();
}
} else if (analyzer != null) {
Document document = searcher.doc(docid, selector);
IndexableField[] fields = document.getFields(fieldName);
if (fields == null) {
return;
}
for (IndexableField field : fields) {
String s = field.stringValue();
//is this possible
if (s == null) {
continue;
}
processFieldEntry(fieldName, s, set);
}
} else {
throw new IllegalArgumentException("The field must have a term vector or the analyzer must" +
" not be null.");
}
}
private void processFieldEntry(String fieldName, String s, CharArraySet set) throws IOException {
TokenStream ts = analyzer.tokenStream(fieldName, s);
CharTermAttribute cattr = ts.getAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
set.add(cattr.toString());
}
ts.end();
ts.close();
}
/**
* Sets the analyzer to be used if term vectors are not stored.
*
* @param analyzer analyzer to be used if term vectors are not stored
* @param maxTokens maximum number of tokens to analyze. If < 0,
* all tokens will be analyzed.
*/
public void setAnalyzer(Analyzer analyzer, int maxTokens) {
this.analyzer = analyzer;
this.maxTokens = maxTokens;
}
}