package org.apache.lucene.search.concordance.windowvisitor;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.corpus.stats.IDFIndexCalc;
import org.apache.lucene.corpus.stats.TFIDFPriorityQueue;
import org.apache.lucene.corpus.stats.TermIDF;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.mutable.MutableValueInt;
/**
 * Window visitor that counts how often each gram (as produced by a
 * {@link Grammer}) occurs in the tokens before and after a target, and
 * reports the counted grams together with their IDF statistics.
 * <p>
 * Counts accumulate across calls to {@link #visit}; {@link #getResults()}
 * turns the accumulated counts into a list of {@link TermIDF}.
 */
public class CooccurVisitor extends ArrayWindowVisitor<List<TermIDF>> {

  /** gram -&gt; number of times the gram was seen in visited windows */
  private final Map<String, MutableValueInt> tfs = new HashMap<>();

  /** calculator of inverse document frequency */
  private final IDFIndexCalc idfCalc;

  /** string forms of windows already counted; only used if duplicates are not allowed */
  private final Set<String> alreadySeen = new HashSet<>();

  /** if false, a window whose string form was already seen is skipped */
  private final boolean allowDuplicates;

  /** builds the grams to count from a window's raw token lists */
  private final Grammer grammer;

  /**
   * minimum term frequency to include in calculations.
   * If the term doesn't show up this often in the context of the target,
   * ignore it.
   */
  private int minTermFreq = 5;

  /**
   * number of results to return
   */
  private int numResults = 20;

  /**
   * @param fieldName       field to search
   * @param tokensBefore    number of tokens before
   * @param tokensAfter     number of tokens after
   * @param grammer         grammer to use to combine tokens
   * @param idfCalc         calculator of inverse document frequency
   * @param maxWindows      maximum number of windows to collect
   * @param allowDuplicates collect stats on duplicate windows?
   */
  public CooccurVisitor(String fieldName,
                        int tokensBefore, int tokensAfter, Grammer grammer,
                        IDFIndexCalc idfCalc, int maxWindows, boolean allowDuplicates) {
    // the two boolean arguments are always false here; their meaning is
    // defined by ArrayWindowVisitor
    super(fieldName, tokensBefore, tokensAfter, false, false, maxWindows);
    this.grammer = grammer;
    this.idfCalc = idfCalc;
    this.allowDuplicates = allowDuplicates;
  }

  /**
   * Adds the grams of the window's pre and post token lists to the running
   * counts.
   * <p>
   * Nothing is counted if the maximum number of windows has already been
   * visited (the hit-max flag is set instead), or if duplicates are not
   * allowed and a window with the same string form was already counted.
   *
   * @param docId  id of the document the window came from
   * @param window window around the target
   * @throws IOException declared by the overridden method
   */
  @Override
  public void visit(String docId, ConcordanceArrayWindow window)
      throws IOException {
    if (getNumWindowsVisited() >= getMaxWindows()) {
      setHitMax(true);
      return;
    }

    // Set.add returns false when the key is already present, so this both
    // records the window and detects a duplicate in a single lookup
    if (!allowDuplicates && !alreadySeen.add(window.toString())) {
      return;
    }

    // fetch both gram lists up front, then count each one on its own so the
    // lists returned by the grammer are never modified
    List<String> preGrams = grammer.getGrams(window.getRawPreList(), SPACE);
    List<String> postGrams = grammer.getGrams(window.getRawPostList(), SPACE);
    countGrams(preGrams);
    countGrams(postGrams);

    finishedVisit(docId);
  }

  /**
   * Increments the running count of every gram in the list.
   */
  private void countGrams(List<String> grams) {
    for (String gram : grams) {
      MutableValueInt cnt = tfs.get(gram);
      if (cnt == null) {
        cnt = new MutableValueInt();
        cnt.value = 0;
        // the counter is mutable, so it only has to be stored once
        tfs.put(gram, cnt);
      }
      cnt.value++;
    }
  }

  /**
   * Builds the result list from the counts collected so far.
   * <p>
   * Grams seen fewer than {@link #getMinTermFreq()} times are ignored. Every
   * remaining gram is offered to a {@link TFIDFPriorityQueue} sized to the
   * configured number of results; the returned list holds the entries the
   * queue kept, in the reverse of the order in which the queue pops them.
   *
   * @return entries retained by the queue
   * @throws RuntimeException if there is an IOException while calculating
   *                          the IDFs; the IOException is set as the cause
   */
  public List<TermIDF> getResults() {
    TFIDFPriorityQueue queue = new TFIDFPriorityQueue(numResults);
    Term reusableTerm = new Term(getFieldName(), "");

    for (Map.Entry<String, MutableValueInt> entry : tfs.entrySet()) {
      int tf = entry.getValue().value;
      if (tf < minTermFreq) {
        continue;
      }
      String text = entry.getKey();

      // calculate idf for potential phrase
      double[] stats;
      try {
        stats = idfCalc.multiTermIDF(text, reusableTerm);
      } catch (IOException e) {
        // keep the original exception as the cause so its stack trace survives
        throw new RuntimeException("Error trying to calculate IDF: " + e.getMessage(), e);
      }
      double idf = stats[0];
      // document frequency is derived back from the idf and floored at 1
      int estimatedDF = (int) Math.max(1, Math.round(idfCalc.unIDF(idf)));
      queue.insertWithOverflow(new TermIDF(text, estimatedDF, tf, idf));
    }

    // each popped entry goes to the head of the list, reversing the pop order
    List<TermIDF> results = new LinkedList<>();
    while (queue.size() > 0) {
      results.add(0, queue.pop());
    }
    return results;
  }

  /**
   * @return minimum term frequency a gram needs to be included in the results
   */
  public int getMinTermFreq() {
    return minTermFreq;
  }

  /**
   * @param minTermFreq minimum term frequency a gram needs to be included
   *                    in the results
   */
  public void setMinTermFreq(int minTermFreq) {
    this.minTermFreq = minTermFreq;
  }

  /**
   * @param numResults number of results to return; must be &gt;= 0
   * @throws IllegalArgumentException if numResults is negative
   */
  public void setNumResults(int numResults) {
    if (numResults < 0) {
      throw new IllegalArgumentException("Number of results must be >= 0:" +numResults);
    }
    this.numResults = numResults;
  }
}