package models;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import models.helpers.IFilter;
import models.helpers.SetOperations;
import models.helpers.StopWords;
/**
* SearchFilter can be used as a Filter-Visitor that classifies a list of
* questions of how well the question's content, its answers' contents, its tags
* and the name of their owners fit given search terms. Questions not matching
* <em>all</em> of the search terms are given a value of <code>null</code> so
* that they can be filtered out.
**/
public class SearchFilter implements IFilter<Question, Double> {
/**
* A set of terms to search for. All these terms must be found (AND search).
* Set to <code>null</code> for filtering out questions with partially
* overlapping tag sets.
*/
private final Set<String> queryFulltext;
/**
* A set of tags to match. The more tags overlap, the higher a question will
* be rated.
*/
private final Set<Tag> queryTags;
/**
* Instantiates a new search filter.
*
* @param query
* all the search terms a question must match (resp. its answers
* or their owner's usernames). In order to force a term to only
* match tags, prepend it with "tag:". If <code>query</code> is
* <code>null</code>, all questions matching at least one of the
* tags will be kept, ordered by the tag matching ratio.
*
* Pass in <code>null</code> in order to find all questions with
* at least partially overlapping tag sets.
* @param tags
* a list of tags. The more of these a question matches, the
* higher it's rated.
*/
public SearchFilter(Set<String> query, Set<Tag> tags) {
this.queryFulltext = query != null ? SetOperations.difference(query,
StopWords.get())
: null;
this.queryTags = tags;
}
/**
* Rates a question as to how well it matches the given search terms (if at
* all) and as to how well its tags overlap with the set of given tags.
*
* @param question
* the question to rate
* @return a value between -1 (complete match) and 0 (failed to match),
* where 0 is replaced with <code>null</code> in order to allow
* filtering out non-matching questions
*/
public Double visit(Question question) {
Set<String> mustHave = new HashSet<String>(
this.queryFulltext == null ? Collections.EMPTY_SET
: this.queryFulltext);
double tagRating = rateTags(question, mustHave);
double textRating = rateText(question, mustHave);
double answerRating = rateAnswers(question, mustHave);
double rating = tagRating + textRating + answerRating;
// all search terms must appear at least once (AND search)
if (mustHave.size() != 0)
return null;
// best matching questions should appear first in an ascending sort
return rating > 0 ? -rating : null;
}
/**
* Rates how well the tags of a question match the searched for tags
* yielding values between 0 (no tags match) to 1 (the tags are identical).
*
* @param question
* the question to rate
* @param mustHave
* the list of search terms that have to occur in some part of a
* question (else the overall rating is 0)
* @return the matching ratio of the question's tags with the searched for
* tags
*/
private double rateTags(Question question, Set<String> mustHave) {
Set<Tag> tags = new HashSet<Tag>(question.getTags());
if (this.queryTags == null || this.queryTags.isEmpty()
|| tags.isEmpty())
return 0;
for (Tag tag : tags) {
mustHave.remove(tag.getName());
// search terms prepended with "tag:" won't match any content
// and are thus guaranteed to only match tags (and maybe very odd
// usernames)
mustHave.remove("tag:" + tag.getName());
}
// rate highest questions that share most of the tags and don't have
// hardly any additional tags
return Math.pow(
SetOperations.intersection(tags, this.queryTags).size(), 2)
/ this.queryTags.size() / tags.size();
}
/**
* Rates how well the content of an entry (and its owner's name) matches the
* searched for terms. This is a ratio between 0 (no term appears) and 1
* (the content consists exclusively of the searched for terms).
*
* @param entry
* the entry whose content is to be rated
* @param mustHave
* the list of search terms that have to occur in some part of a
* question (else the overall rating is 0)
* @return the matching ratio of the entry's content with the searched for
* terms
*/
private double rateText(Entry entry, Set<String> mustHave) {
String content = entry.getContentText();
if (entry.owner() != null)
content += " " + entry.owner().getName();
if (this.queryFulltext == null)
return 0;
Set<String> words = getWords(content);
if (words.isEmpty())
return 0;
mustHave.removeAll(words);
return 1.0
* SetOperations.intersection(words, this.queryFulltext).size()
/ words.size();
}
/**
* Rate how well a question's answers matches the searched for terms.
*
* @param question
* the question whose answers are to be classified
* @param mustHave
* the list of search terms that have to occur in some part of a
* question (else the overall rating is 0)
* @return the matching ratio of the question's answers with the searched
* for terms
*/
private double rateAnswers(Question question, Set<String> mustHave) {
double rating = 0;
for (Answer ans : question.answers()) {
rating += rateText(ans, mustHave);
}
int answerCount = question.answers().size();
return rating / (answerCount == 0 ? 1 : answerCount);
}
/**
* Splits the content of a question, answer or the search terms up into a
* set of words not containing HTML tags nor words occurring very often in
* the English language (StopWords).
*
* @param string
* an (HTML-)string to split up and clean
* @return a set of words for easier rating through intersections
*/
private Set<String> getWords(String string) {
Set<String> words = new HashSet<String>();
for (String word : string.split("\\W+")) {
words.add(word.toLowerCase());
}
words.remove(""); // remove splitting artifact
return SetOperations.difference(words, StopWords.get());
}
}