package edu.usc.cssl.tacit.crawlers.reddit.services;

import static com.github.jreddit.utils.restclient.JsonUtils.safeJsonToString;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;

import org.json.simple.JSONArray;
import org.json.simple.JSONObject;

import com.github.jreddit.entity.Kind;
import com.github.jreddit.utils.restclient.RestClient;

/**
 * Crawls Reddit listings through a {@link RestClient} and stores simplified
 * JSON for the links found, plus one file of top-level comments per link,
 * under {@link #outputPath}.
 */
public class RedditPlugin {
    private RestClient restClient;
    private String outputPath;
    private int limit; // maximum number of links saved by one crawl
    private String sortType;
    private ArrayList<String> subReddits;
    private boolean limitToBestComments;
    private String timeFrame;
    HashMap<String, String> redditCategories;

    /**
     * Creates a crawler with hard-coded defaults: a fixed output directory, a
     * limit of 100 links, "relevance" sorting, the "television" subreddit,
     * best-comments-only mode and the "all" time frame.
     *
     * @param restClient REST client used for every request
     */
    public RedditPlugin(RestClient restClient) {
        this.restClient = restClient;
        this.outputPath = "F:\\NLP\\TEMP_OUTPUT\\Reddit";
        this.limit = 100;
        this.sortType = "relevance";
        this.subReddits = new ArrayList<String>();
        subReddits.add("television");
        this.limitToBestComments = true; // "more comments" stubs are not expanded
        this.timeFrame = "all";
    }

    /**
     * Fetches subreddit display names and URLs, following the "after" cursor
     * from page to page, stores them in {@link #redditCategories} and prints
     * the collected entries to standard output.
     *
     * @param limit number of subreddits after which fetching stops
     * @return map from subreddit display name to subreddit URL
     */
    protected HashMap<String, String> fetchRedditCategories(int limit) {
        redditCategories = new HashMap<String, String>();
        Object response = restClient.get("/subreddits/.json?limit=1000&sort=".concat(sortType), null).getResponseObject();
        int count = 0;

        // Stop as soon as the response is not a JSON object; retrying the same
        // unchanged response could never make progress.
        paging:
        while (response instanceof JSONObject) {
            JSONObject listing = (JSONObject) response;
            if (!listing.containsKey("data")) {
                break;
            }
            JSONObject listingData = (JSONObject) listing.get("data");
            JSONArray subscriptions = (JSONArray) listingData.get("children");
            for (Object subscription : subscriptions) {
                JSONObject data = (JSONObject) ((JSONObject) subscription).get("data");
                String subscriptionUrl = (String) data.get("url");
                String subscriptionName = (String) data.get("display_name");
                redditCategories.put(subscriptionName, subscriptionUrl);
                count++;
                if (count == limit) {
                    break paging;
                }
            }

            // Crawl consecutive pages until there is no further cursor.
            Object after = listingData.get("after");
            if (after == null) {
                break;
            }
            response = restClient.get("/subreddits/.json?limit=1000&after=".concat((String) after), null).getResponseObject();
        }

        System.out.println(redditCategories.keySet().size());
        for (String name : redditCategories.keySet()) {
            System.out.println(name + ":" + redditCategories.get(name));
        }
        return redditCategories;
    }

    /**
     * Crawls trending posts (e.g. hot, new, rising) and writes the simplified
     * link data to {@code <outputPath>/<trendType>.txt}.
     *
     * @param trendType listing name; used in both the request path and the file name
     * @throws IOException if an output file cannot be written
     * @throws URISyntaxException if a link's permalink cannot be parsed
     */
    public void crawlTrendingPosts(String trendType) throws IOException, URISyntaxException {
        String filePath = this.outputPath + File.separator + trendType + ".txt";
        JSONArray resultData = new JSONArray(); // to store the results
        getSimplifiedLinkData(resultData, "/".concat(trendType).concat("/").concat(".json"));
        writeJsonToFile(filePath, resultData);
    }

    /**
     * Crawls the posts of the given user (links only, for now) and writes the
     * simplified link data to {@code <outputPath>/UserPosts.txt}.
     *
     * @param username user whose posts are requested
     * @throws IOException if an output file cannot be written
     * @throws URISyntaxException if a link's permalink cannot be parsed
     */
    public void crawlUsersPosts(String username) throws IOException, URISyntaxException {
        String filePath = this.outputPath + File.separator + "UserPosts.txt";
        JSONArray resultData = new JSONArray(); // to store the results
        getSimplifiedLinkData(resultData, "/user/".concat(username).concat("/.json?sort=").concat(sortType));
        writeJsonToFile(filePath, resultData);
    }

    /**
     * Crawls labeled posts (e.g. controversial, top) restricted to the
     * configured time frame (links only, for now) and writes the simplified
     * link data to {@code <outputPath>/<label>.txt}.
     *
     * @param label listing name; used in both the request path and the file name
     * @throws IOException if an output file cannot be written
     * @throws URISyntaxException if a link's permalink cannot be parsed
     */
    public void crawlLabeledPosts(String label) throws IOException, URISyntaxException {
        String filePath = this.outputPath + File.separator + label + ".txt";
        JSONArray resultData = new JSONArray(); // to store the results
        getSimplifiedLinkData(resultData, "/".concat(label).concat("/").concat(".json?t=").concat(timeFrame));
        writeJsonToFile(filePath, resultData);
    }

    /**
     * Crawls the results of a search query such as
     * {@code title:cats subreddit:movies} (links only, for now) and writes the
     * simplified link data to {@code <outputPath>/<query>.txt}.
     *
     * NOTE(review): the query is placed into the URL and the file name
     * verbatim; characters such as spaces, ':' or '/' are neither URL-encoded
     * nor sanitized here - confirm whether callers pre-encode.
     *
     * @param query search query
     * @throws IOException if an output file cannot be written
     * @throws URISyntaxException if a link's permalink cannot be parsed
     */
    public void crawlQueryResults(String query) throws IOException, URISyntaxException {
        String filePath = this.outputPath + File.separator + query + ".txt";
        JSONArray resultData = new JSONArray(); // to store the results
        getSimplifiedLinkData(resultData, "/search/.json?sort=".concat(sortType).concat("&q=").concat(query));
        writeJsonToFile(filePath, resultData);
    }

    /**
     * Writes the JSON text of {@code content} to {@code filePath}, closing the
     * writer even when the write fails.
     */
    private static void writeJsonToFile(String filePath, JSONArray content) throws IOException {
        FileWriter writer = new FileWriter(filePath);
        try {
            writer.write(content.toJSONString());
            writer.flush();
        } finally {
            writer.close();
        }
    }

    /**
     * Pages through the listing at {@code url}, appending the simplified form
     * of every link to {@code resultData} and saving each link's comments,
     * until {@link #limit} links were collected or no further page exists.
     *
     * @param resultData array receiving the simplified link objects
     * @param url listing path requested through the REST client
     * @throws IllegalArgumentException if a response is not a JSON object
     */
    @SuppressWarnings("unchecked")
    private void getSimplifiedLinkData(JSONArray resultData, String url) throws IOException, URISyntaxException {
        Object response = restClient.get(url, null).getResponseObject();
        int count = 0;

        paging:
        while (true) {
            if (!(response instanceof JSONObject)) {
                throw new IllegalArgumentException("Parsing failed because JSON input is not from a submission.");
            }
            JSONObject dataObject = (JSONObject) ((JSONObject) response).get("data");
            JSONArray userPosts = (JSONArray) dataObject.get("children");
            for (Object post : userPosts) {
                JSONObject entry = (JSONObject) post;
                String kind = safeJsonToString(entry.get("kind"));
                // Only links are saved here; other kinds (comments, etc.) are skipped.
                if (kind == null || !kind.equals(Kind.LINK.value())) {
                    continue;
                }
                JSONObject linkData = (JSONObject) entry.get("data");
                resultData.add(getSimplifiedLinkData(linkData));
                saveLinkComments(linkData);
                count++;
                if (this.limit == count) {
                    break paging;
                }
            }

            // Without a cursor there is no next page. Looping again would only
            // reprocess the same response, so stop here.
            Object after = dataObject.get("after");
            if (after == null) {
                break;
            }
            String separator = url.contains("?") ? "&after=" : "?after=";
            response = restClient.get(url.concat(separator).concat(String.valueOf(after)), null).getResponseObject();
        }
    }

    /**
     * Fetches the comments of a link and writes them to
     * {@code <outputPath>/<last permalink segment>.txt}.
     * 1. The link's permalink is used as the direct link to its comments.
     * 2. As of now, only the first page of comments is stored.
     * 3. Only top-level comments are collected; "more" stubs are expanded
     *    only when {@link #limitToBestComments} is false.
     *
     * @param obj the link's "data" object
     * @throws IllegalArgumentException if the response is not a JSON array
     */
    @SuppressWarnings("unchecked")
    private void saveLinkComments(JSONObject obj) throws IOException, URISyntaxException {
        String permalink = String.valueOf(obj.get("permalink")); // direct link to comments
        int queryStart = permalink.indexOf("?");
        if (queryStart != -1) {
            permalink = permalink.substring(0, queryStart); // drop the query string
        }
        System.out.println("Crawling comments :" + permalink);
        String filePath = this.outputPath + File.separator + getLastURLComponent(permalink) + ".txt";

        JSONArray linkComments = new JSONArray();
        Object response = restClient.get(permalink.concat("/.json?sort=best"), null).getResponseObject(); // sorts by best
        if (!(response instanceof JSONArray)) {
            throw new IllegalArgumentException("Parsing failed because JSON input is not from a submission.");
        }

        // The element at index 1 is read as the comment listing.
        JSONObject commentListing = (JSONObject) ((JSONArray) response).get(1);
        JSONObject listingData = (JSONObject) commentListing.get("data");
        JSONArray userComments = (JSONArray) listingData.get("children");
        for (Object post : userComments) {
            JSONObject entry = (JSONObject) post;
            String kind = safeJsonToString(entry.get("kind"));
            if (kind == null) {
                continue;
            }
            if (kind.equals(Kind.COMMENT.value())) {
                linkComments.add(getSimplifiedCommentData((JSONObject) entry.get("data")));
            } else if (kind.equals(Kind.MORE.value()) && !limitToBestComments) {
                // A "more" stub lists further comments; fetch each one individually.
                JSONObject moreData = (JSONObject) entry.get("data");
                JSONArray moreChildren = (JSONArray) moreData.get("children");
                for (Object morePost : moreChildren) {
                    JSONObject result = fetchThisComment(morePost, permalink);
                    if (null != result) {
                        linkComments.add(result);
                    }
                }
            }
        }

        writeJsonToFile(filePath, linkComments);
    }

    /**
     * Requests a single comment by appending {@code morePost} to the link's
     * permalink.
     *
     * @param morePost entry taken from a "more" stub's children
     * @param permalink permalink of the link the comment belongs to
     * @return the simplified comment, or {@code null} when the response holds
     *         no children or the first child is not a comment
     */
    private JSONObject fetchThisComment(Object morePost, String permalink) {
        Object response = restClient.get((permalink + morePost).concat("/.json?sort=best"), null).getResponseObject();
        JSONObject commentListing = (JSONObject) ((JSONArray) response).get(1);
        JSONObject listingData = (JSONObject) commentListing.get("data");
        JSONArray children = (JSONArray) listingData.get("children");
        if (children.size() == 0) {
            return null;
        }
        JSONObject comment = (JSONObject) children.get(0);
        String kind = safeJsonToString(comment.get("kind"));
        if (kind != null && kind.equals(Kind.COMMENT.value())) {
            return getSimplifiedCommentData((JSONObject) comment.get("data"));
        }
        return null;
    }

    /**
     * Returns the last path segment of the given URL.
     *
     * @param permalink URL to inspect
     * @return the text after the last non-trailing '/' of the URL's path
     * @throws URISyntaxException if the URL cannot be parsed, has no path, or
     *         its path contains no segments (for example just "/")
     */
    private String getLastURLComponent(String permalink) throws URISyntaxException {
        URI uri = new URI(permalink);
        String path = uri.getPath();
        if (path == null) {
            throw new URISyntaxException(permalink, "URL has no path component");
        }
        String[] segments = path.split("/");
        if (segments.length == 0) {
            // split() drops trailing empty strings, so "/" yields no segments.
            throw new URISyntaxException(permalink, "URL path has no segments");
        }
        return segments[segments.length - 1];
    }

    /**
     * Builds a new JSON object holding only the link fields that are stored.
     *
     * @param data the link's "data" object
     * @return the newly constructed JSON object
     */
    @SuppressWarnings("unchecked")
    private JSONObject getSimplifiedLinkData(JSONObject data) {
        JSONObject simplifiedData = new JSONObject();
        simplifiedData.put("gilded", data.get("gilded"));
        simplifiedData.put("title", data.get("title"));
        simplifiedData.put("score", data.get("score"));
        simplifiedData.put("num_comments", data.get("num_comments"));
        simplifiedData.put("created_utc", data.get("created_utc"));
        simplifiedData.put("selftext", data.get("selftext"));
        simplifiedData.put("thumbnail", data.get("thumbnail"));
        simplifiedData.put("author", data.get("author"));
        simplifiedData.put("url", data.get("url"));
        return simplifiedData;
    }

    /**
     * Builds a new JSON object holding only the comment fields that are stored.
     *
     * @param data the comment's "data" object
     * @return the newly constructed JSON object
     */
    @SuppressWarnings("unchecked")
    private JSONObject getSimplifiedCommentData(JSONObject data) {
        JSONObject simplifiedData = new JSONObject();
        simplifiedData.put("gilded", data.get("gilded"));
        simplifiedData.put("score", data.get("score"));
        simplifiedData.put("created_utc", data.get("created_utc"));
        simplifiedData.put("author", data.get("author"));
        simplifiedData.put("body", data.get("body"));
        simplifiedData.put("replies", data.get("replies"));
        simplifiedData.put("ups", data.get("ups"));
        simplifiedData.put("downs", data.get("downs"));
        return simplifiedData;
    }
}