/**
*
*/
package com.maalaang.omtwitter.text;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
import org.apache.commons.math3.exception.MathArithmeticException;
import org.apache.commons.math3.linear.OpenMapRealVector;
import org.apache.commons.math3.linear.RealVector;
import com.maalaang.omtwitter.model.OMTweet;
/**
 * Tweet filter that flags a tweet as filtered out when its token-count vector
 * is too similar, by cosine similarity, to one of the most recently seen tweets.
 *
 * <p>Each tweet passed to {@link #next(OMTweet, OMTweetToken[])} is converted
 * into a sparse vector of token counts (keyed by the tokens' normalized text)
 * and compared against the vectors kept in a sliding window of at most
 * {@code windowSize} previous tweets. The result of the latest comparison is
 * available through {@link #isFilteredOut()}.
 *
 * <p>The class keeps mutable state and performs no synchronization.
 *
 * @author Sangwon Park
 */
public class FilterCosineSimilarity implements TweetFilter {

    /** Feature vectors of the most recent tweets, oldest first. */
    private final LinkedList<RealVector> fvList = new LinkedList<RealVector>();

    /** Maps a normalized token text to its index in the feature vectors. */
    private final Map<String,Integer> tokenIdMap = new HashMap<String,Integer>();

    /** Maximum number of previous feature vectors kept for comparison. */
    private final int windowSize;

    /** A cosine similarity strictly greater than this value filters the tweet out. */
    private final double threshold;

    /** Result of the most recent call to {@code next}. */
    private boolean filtered = false;

    /** Next unused vector index to hand out to a previously unseen token. */
    private int tokenIndex = 0;

    /**
     * Creates a filter.
     *
     * @param windowSize maximum number of previous tweets to compare against
     * @param threshold  cosine similarity above which a tweet is filtered out
     */
    public FilterCosineSimilarity(int windowSize, double threshold) {
        this.windowSize = windowSize;
        this.threshold = threshold;
    }

    /**
     * Puts the filter into its empty starting state. Calling this is no longer
     * required before the first {@code next}, but it remains safe and discards
     * everything accumulated so far.
     */
    public void initialize() {
        reset();
    }

    /**
     * Compares the given tweet against the window of recent tweets, records
     * whether it is filtered out, and then adds it to the window (evicting the
     * oldest entry when the window exceeds {@code windowSize}).
     *
     * @param tweet     the tweet being processed
     * @param tokenList the tokens of the tweet
     */
    public void next(OMTweet tweet, OMTweetToken[] tokenList) {
        RealVector fv = tweetToFeatureVector(tweet, tokenList);

        // A tweet without tokens yields an all-zero vector, for which cosine
        // similarity is undefined; it can never match, so skip the comparisons.
        filtered = tokenList.length > 0 && isSimilarToRecent(fv);

        fvList.add(fv);
        if (fvList.size() > windowSize) {
            fvList.remove();
        }
    }

    /**
     * @return {@code true} if the tweet given to the latest {@code next} call
     *         was more similar than the threshold to a tweet in the window
     */
    public boolean isFilteredOut() {
        return filtered;
    }

    /**
     * Releases all accumulated state. Safe to call even if the filter was
     * never initialized or used.
     */
    public void close() {
        reset();
    }

    /** Clears the window, the token dictionary, and the derived counters. */
    private void reset() {
        fvList.clear();
        tokenIdMap.clear();
        // The dictionary is empty again, so indices can restart from zero.
        tokenIndex = 0;
        filtered = false;
    }

    /** Returns whether any vector in the window exceeds the similarity threshold. */
    private boolean isSimilarToRecent(RealVector fv) {
        for (RealVector previous : fvList) {
            try {
                if (fv.cosine(previous) > threshold) {
                    return true;
                }
            } catch (MathArithmeticException ignored) {
                // Deliberately ignored: cosine() rejects zero-norm vectors
                // (e.g. a stored tweet that had no tokens). Such a pair is
                // simply treated as "not similar".
            }
        }
        return false;
    }

    /**
     * Builds the token-count vector of a tweet, assigning a fresh index to
     * every normalized token text that has not been seen before.
     */
    private RealVector tweetToFeatureVector(OMTweet tweet, OMTweetToken[] tokenList) {
        // NOTE(review): the vector is declared with dimension Integer.MAX_VALUE so
        // that any token index fits. Depending on the commons-math3 version, norm
        // and dot-product computations may iterate over the full dimension rather
        // than only the stored entries - worth confirming/profiling.
        RealVector fv = new OpenMapRealVector(Integer.MAX_VALUE);
        for (OMTweetToken tok : tokenList) {
            String text = tok.getNormalizedText();
            Integer tokenId = tokenIdMap.get(text);
            if (tokenId == null) {
                tokenId = tokenIndex++;
                tokenIdMap.put(text, tokenId);
            }
            fv.addToEntry(tokenId, 1.0);
        }
        return fv;
    }
}