package com.ebottabi.bolt;
/**
*
* @author ebot
*/
import java.util.LinkedList;
import java.util.List;
import org.apache.http.client.utils.URLEncodedUtils;
import org.json.simple.JSONObject;
import com.gravity.goose.Article;
import com.gravity.goose.Configuration;
import com.gravity.goose.Goose;
import com.gravity.goose.network.NotHtmlException;
import java.util.Map;
import twitter4j.Status;
import twitter4j.URLEntity;
public class RedisGooseExtractor extends RedisBolt {
public static final String CHANNEL = "articles";
private Goose goose;
public RedisGooseExtractor() {
super(CHANNEL);
}
@Override
protected void setupNonSerializableAttributes() {
super.setupNonSerializableAttributes();
Configuration config = new Configuration();
config.enableImageFetching_$eq(false); //No images right now, it requires imagemagik installed in a specific path
goose = new Goose(config);
}
@Override
public List<Object> filter(Status status) {
URLEntity urls[] = status.getURLEntities();
if (urls == null || urls.length == 0) {
return null;
}
if (urls[0].getExpandedURL() == null) {
return null;
}
String articleUrl = urls[0].getExpandedURL().toString();
Article article = null;
try {
article = goose.extractContent(articleUrl);
} catch (Exception e) {
System.out.println(articleUrl);
System.out.println(e.getMessage());
return null;
}
if (article == null
|| article.title() == null
|| article.title().length() == 0
|| article.cleanedArticleText() == null
|| article.cleanedArticleText().length() == 0) {
return null;
}
JSONObject json = new JSONObject();
json.put("url", articleUrl);
json.put("title", article.title());
json.put("text", article.cleanedArticleText());
json.put("description", article.metaDescription());
if (article.topImage() != null && article.topImage().imageSrc().length() != 0) {
json.put("image", article.topImage().imageSrc());
}
publish(json.toJSONString());
return null;
}
@Override
public Map<String, Object> getComponentConfiguration() {
return null;
}
}