package edu.uncc.cs.watsonsim.researchers;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import edu.uncc.cs.watsonsim.Answer;
import edu.uncc.cs.watsonsim.Question;
public class StrictFilters extends Researcher {
    /**
     * Maximum accepted answer length, in characters.
     * The longest real answer in our sample of about 40,000 is:
     * "How much wood would a woodchuck chuck if a woodchuck could chuck wood?"
     * and it's 70 characters long. So cut there.
     */
    private static final int MAX_ANSWER_LENGTH = 70;

    /**
     * Matches every character that is NOT an ASCII letter, digit or space.
     * Compiled once instead of once per answer.
     */
    private static final Pattern NON_LATIN = Pattern.compile("[^A-Za-z0-9 ]");

    /**
     * Matches text that consists solely of a bare web address such as
     * "example.com", "www.example.org" or "https://example.co.uk".
     * Both the http and https schemes are accepted.
     * Compiled once instead of once per answer.
     */
    private static final Pattern WEB_ADDRESS = Pattern.compile(
            "^(https?://)?([A-Za-z]+\\.)?[A-Za-z]+\\.(com|net|org|co\\.[A-Za-z]{2})$");

    /**
     * Perform several strict filters relating mostly to game rules.
     *
     * 1: Remove J! Archive since it has actual answers.
     * 2: Remove "List of *" because that's not the format of an answer.
     * 3: Remove any answer inside the question because they don't give the
     *    answers away in the questions (at least not in a string-match way)
     * 4: Remove answers that have no tokens at all
     * 5: Remove ultra-long answers because J! never wants a 3-minute speech
     * 6: Remove answers where over half of the text is not Latin
     *    letters, digits or spaces
     * 7: Remove answers that look like a web address
     *
     * The input list is not modified. The number of eliminated answers is
     * logged at info level.
     *
     * @param q the question the answers were generated for
     * @param answers the candidate answers to filter
     * @return a new list holding the answers that passed every filter, in
     *         their original order
     */
    public List<Answer> question(Question q, List<Answer> answers) {
        List<Answer> new_answers = new ArrayList<>();
        for (Answer a : answers) {
            if (!isRejected(q, a)) {
                new_answers.add(a);
            }
        }
        log.info("Eliminated " + (answers.size() - new_answers.size()) + " invalid answers");
        return new_answers;
    }

    /**
     * Decide whether a single answer violates any of the strict filters.
     * The checks short-circuit in the order listed, so later (and possibly
     * costlier) checks only run when the earlier ones pass.
     */
    private boolean isRejected(Question q, Answer a) {
        String text = a.text;
        // J! Archive has answers
        return text.contains("J! Archive")
                // "List of" is a bad sign
                || text.contains("List of")
                // Is the answer in the question?
                || almostContains(q.text, text)
                // Is it empty or too long?
                || a.getTokens().isEmpty()
                || text.length() > MAX_ANSWER_LENGTH
                // Is over half of it non-Latin text?
                || NON_LATIN.matcher(text).replaceAll("").length() * 2 < text.length()
                // Does it look like a web address?
                || WEB_ADDRESS.matcher(text).matches();
    }

    /**
     * Check if the question text (left) almost contains the answer text
     * (right). The comparison ignores case and is independent of the JVM's
     * default locale, so results do not change on e.g. Turkish systems
     * where "I" would otherwise lowercase to a dotless "ı".
     *
     * @param left the text to search in
     * @param right the text to search for
     * @return true if right occurs in left, ignoring case
     */
    public boolean almostContains(String left, String right) {
        // TODO: more stopword removal, etc.
        return left.toLowerCase(Locale.ROOT).contains(right.toLowerCase(Locale.ROOT));
    }
}