package ch.unibe.scg.cc;
import java.io.Serializable;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.google.common.collect.Sets;
/**
* A detector that can detect spammy clone results. Give it two strings, and the
* detector will tell you whether they are the kind of result a user would want
* to look at. If not, the result is considered <em>spammy</em>.
*
* @author nes
*/
// TODO(niko): Use all features from doi://10.1109/CSMR.2012.37
public final class SpamDetector implements Serializable {
	private static final long serialVersionUID = 1L;

	/** Matches identifiers: an ASCII letter followed by letters or digits. */
	private static final Pattern IDENTIFIER = Pattern.compile("\\p{Alpha}\\p{Alnum}*");

	/** Feature vector for classifying a cloning result as spam or not */
	public static final class FeatureVector {
		/** Jaccard similarity of the two documents' identifier vocabularies, in [0, 1]. */
		final double vocabularySimilarity;

		FeatureVector(double vocabularySimilarity) {
			this.vocabularySimilarity = vocabularySimilarity;
		}

		@Override
		public String toString() {
			return "FeatureVector[vocabularySimilarity=" + vocabularySimilarity + "]";
		}
	}

	/**
	 * Computes the features that decide whether the clone represented by
	 * {@code doc1} and {@code doc2} is spammy.
	 *
	 * @return the feature vector for the document pair. If neither document
	 *         contains any identifiers, the vocabularies are trivially
	 *         identical and the similarity is 1.0.
	 */
	public FeatureVector extractFeatureVector(String doc1, String doc2) {
		Set<String> v1 = extractVocabulary(doc1);
		Set<String> v2 = extractVocabulary(doc2);

		Set<String> union = new HashSet<>(v1);
		union.addAll(v2);
		// Guard against 0/0: two identifier-free documents used to yield NaN,
		// which silently failed the < threshold in isSpamByParameters.
		if (union.isEmpty()) {
			return new FeatureVector(1.0);
		}

		Set<String> intersection = new HashSet<>(v1);
		intersection.retainAll(v2);
		return new FeatureVector(intersection.size() / (double) union.size());
	}

	/** @return whether {@code v} represents a cloning result that is spammy. */
	public boolean isSpamByParameters(FeatureVector v) {
		// A low vocabulary similarity suggests that everything was renamed.
		// That's unlikely, so probably we're looking at something that wasn't
		// cloned at all.
		return v.vocabularySimilarity < 0.8;
	}

	/**
	 * Get all identifiers from the document. To speed up the operation, the set
	 * may include more than the identifiers.
	 */
	Set<String> extractVocabulary(String doc) {
		Set<String> ret = new HashSet<>();
		Matcher match = IDENTIFIER.matcher(doc);
		while (match.find()) {
			ret.add(match.group());
		}
		return ret;
	}
}