/*
* This file is part of ALOE.
*
* ALOE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* ALOE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with ALOE. If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (c) 2012 SCCL, University of Washington (http://depts.washington.edu/sccl)
*/
package etc.aloe.filters;
import java.util.regex.Pattern;
/**
 * Filter that searches for occurrences of special strings: negations,
 * swearing, names, hashtags, mentions, and URLs.
 *
 * Each kind of string is described by one named regular expression in
 * {@code regexFeatures}; the patterns are handed to the superclass through
 * {@link #getRegexFeatures()}.
 *
 * @author Michael Brooks <mjbrooks@uw.edu>
 */
public class SpecialRegexFilter extends AbstractRegexFilter {

    /**
     * Regex fragments for contracted negative verb forms. The apostrophe is
     * optional in every entry, so both "can't" and "cant" are matched.
     */
    private final String[] contractedNegationForms = new String[]{
        "aren'?t",
        "can'?t",
        "couldn'?t",
        "daren'?t",
        "didn'?t",
        "doesn'?t",
        "don'?t",
        "hasn'?t",
        "haven'?t",
        "hadn'?t",
        "isn'?t",
        "mayn'?t",
        "mightn'?t",
        "mustn'?t",
        "needn'?t",
        "oughtn'?t",
        "shan'?t",
        "shouldn'?t",
        "wasn'?t",
        "weren'?t",
        "won'?t",
        "wouldn'?t"
    };

    /**
     * First names counted by the "names" feature.
     */
    private final String[] namesList = new String[]{
        "Ray",
        "Gary",
        "Pascal",
        "Paul",
        "Derek",
        "Ben",
        "Stef",
        "Rene",
        "Gabriel",
        "Maurice",
        "Emile",
        "Matt",
        "Sam",
        "Kevin",
        "Rick",
        "Naomi",
        "Christophe",
        "Dennis",
        "Rob"
    };

    /**
     * The named patterns exposed by this filter. Most are wrapped in
     * {@code (?<!\w) ... (?!\w)} so that they only match when not embedded in
     * a longer run of word characters.
     *
     * NOTE(review): toRegex(...) is inherited and not visible here; it is
     * presumably combining the array entries into a single alternation, and
     * the boolean argument presumably controls escaping of the entries
     * (the negation entries contain regex syntax) -- confirm against
     * AbstractRegexFilter.
     */
    private NamedRegex[] regexFeatures = new NamedRegex[]{
        // Negation: no, not, cannot, can't/won't/whatever, cant/wont/whatever
        // See http://www.englishclub.com/vocabulary/contractions-negative.htm
        new NamedRegex("negation", "(?<!\\w)(not?|cannot|" + toRegex(contractedNegationForms, false) + ")(?!\\w)", Pattern.CASE_INSENSITIVE),
        // Matches many swearwords. The first alternative targets symbol-censored
        // swearing such as "#$%&!": four or more punctuation characters other
        // than '.', guarded by a lookahead that requires two of @#$%^&* within
        // the punctuation starting at the same position.
        new NamedRegex("swear", "(?<!\\w)("
        + "((?=\\p{Punct}*[@#$%^&*]\\p{Punct}*[@#$%^&*])([\\p{Punct}&&[^.]]{4,}))"
        + "|crap(p?ed|s|p?ing|p?y)?"
        + "|shit(s|t?ing|t?y)?"
        + "|(god?)?dam(n|mit)?"
        + "|(mother)?fuck(ed|ing|er)?"
        + "|ass(hole)?"
        + "|suck(y|s|ed)?"
        + ")(?!\\w)", Pattern.CASE_INSENSITIVE),
        // Matches any of the listed names as a whole word. The trailing
        // (?!\w) keeps short names from matching inside longer words
        // (e.g. "Sam" in "same", "Ben" in "benefit", "Rob" in "robot").
        new NamedRegex("names", "(?<!\\w)(" + toRegex(namesList) + ")(?!\\w)", Pattern.CASE_INSENSITIVE),
        // Matches hashtags: '#' followed by a letter and then any word characters
        new NamedRegex("hashtags", "(?<!\\w)(#[a-z]\\w*)(?!\\w)", Pattern.CASE_INSENSITIVE),
        // Matches mentions: '@' followed by a letter and then any word characters
        new NamedRegex("mentions", "(?<!\\w)(@[a-z]\\w*)(?!\\w)", Pattern.CASE_INSENSITIVE),
        // Matches urls: an optional http:// or https:// scheme, two or more
        // dot-separated labels, then an optional port and an optional path.
        // The match must begin at '^' or directly after whitespace. No
        // CASE_INSENSITIVE flag is passed for this pattern.
        new NamedRegex("urls", "(^|\\s)((https?:\\/\\/)?[\\w-]+(\\.[\\w-]+)+\\.?(:\\d+)?(\\/\\S*)?)")
    };

    /**
     * Creates the filter without choosing a string attribute.
     */
    public SpecialRegexFilter() {
    }

    /**
     * Creates the filter and sets the string attribute name.
     *
     * @param attributeName value passed to {@code setStringAttributeName}
     */
    public SpecialRegexFilter(String attributeName) {
        this.setStringAttributeName(attributeName);
    }

    /**
     * Returns the named patterns defined by this filter.
     *
     * @return the array of negation, swear, names, hashtags, mentions and
     * urls patterns (the internal array itself, not a copy)
     */
    @Override
    protected NamedRegex[] getRegexFeatures() {
        return regexFeatures;
    }
}