package focusedCrawler.seedfinder;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import focusedCrawler.seedfinder.QueryProcessor.QueryResult;
import focusedCrawler.target.model.Page;
/**
 * Generates the next search-engine query from the results of the previous one,
 * following an iterative query-refinement scheme: terms from relevant (positive)
 * and irrelevant (negative) result pages feed a {@link RelevanceModel}, and the
 * next query is assembled from the best-scored terms plus one not-yet-used term.
 */
public class QueryGenerator {

    /** Precision threshold that controls both model updates and query growth/shrink. */
    private final double minimumPrecision;

    /** All terms ever issued in a query, so new queries can include a fresh term. */
    private final Set<String> queryTermsUsed = new HashSet<>();

    /** Accumulates per-term relevance scores from positive/negative pages. */
    private final RelevanceModel relevanceModel = new RelevanceModel();

    /**
     * @param minimumPrecision precision threshold in [0,1]; below it the query is
     *        grown by one term, above it the model is updated with the new pages
     */
    public QueryGenerator(double minimumPrecision) {
        this.minimumPrecision = minimumPrecision;
    }

    /**
     * Builds the next query based on the previous query and its results.
     *
     * @param query       the previously issued query
     * @param queryResult the pages and metrics obtained for {@code query}
     * @return a new query composed of the best-scored terms plus, when available,
     *         one high-scoring term that has never been used before
     */
    public Query buildNextQuery(Query query, QueryResult queryResult) {
        // keep track of all terms used
        queryTermsUsed.addAll(query.termsSet());

        //
        // 1. compute term scores for terms contained in all positive and negative documents
        //
        // (this 'if' exists in the original implementation, but is not described in the paper:
        // the model is only updated when the previous query was precise enough)
        if (queryResult.precision() > minimumPrecision) {
            for (Page page : queryResult.positivePages) {
                relevanceModel.addPage(true, page);
            }
            for (Page page : queryResult.negativePages) {
                relevanceModel.addPage(false, page);
            }
        }

        //
        // 2. re-weight term scores of all terms of the query
        //
        for (QueryTerm t : query.getTerms()) {
            relevanceModel.reweightScore(t.term, queryResult.precision());
        }

        //
        // 3. create new query: grow it when precision was too low, shrink it when
        //    precision was fine but few new results were found
        //
        int querySize = query.getTerms().size();
        if (queryResult.precision() < minimumPrecision) {
            querySize++;
        } else if (queryResult.percentNewResults < minimumPrecision) {
            querySize--;
        }

        Query newQuery = new Query();
        // one slot is reserved for a never-used term; guard against a negative
        // count when querySize was decremented down to zero
        List<QueryTerm> bestTerms = relevanceModel.getTermsWithBestScores(Math.max(0, querySize - 1));
        newQuery.addTerms(bestTerms);
        queryTermsUsed.addAll(newQuery.termsSet());

        // add the best-scored term not used in any previous query, if one exists
        // (NOTE(review): assumes getTermWithBestScoreExcept may return null when
        // all terms are exhausted — guarded to avoid an NPE)
        QueryTerm unusedTerm = relevanceModel.getTermWithBestScoreExcept(queryTermsUsed);
        if (unusedTerm != null) {
            newQuery.addTerm(unusedTerm);
        }
        return newQuery;
    }

}