package com.airlocksoftware.hackernews.parser; import android.content.Context; import android.util.Log; import com.airlocksoftware.hackernews.data.ConnectionManager; import com.airlocksoftware.hackernews.data.UserPrefs; import com.airlocksoftware.hackernews.model.*; import org.apache.commons.lang3.StringUtils; import org.jsoup.Connection; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.ListIterator; import java.util.regex.Matcher; import java.util.regex.Pattern; public class StoryParser { private static final String TAG = StoryParser.class.getSimpleName(); private static final int NO_POSITION = -1; // num comments / points private static final Pattern NUM_COMMENTS_PATTERN = Pattern.compile("\\d+"); /** Parse stories from Front Page, Ask, Best, or New * */ public static StoryResponse parseStoryList(Context context, Page page, Request request, String moreFnid) { String urlExtension = generateUrlExtension(request, page, moreFnid); StoryResponse response = parseStories(context, page, urlExtension); // parseStories() doesn't know about MORE, so potentially set it here if (response.result == Result.SUCCESS && moreFnid != null && request == Request.MORE) { response.result = Result.MORE; } return response; } /** Generate the extension that we're trying to load (goes on the end of ConnectionManager.BASE_URL) * */ private static String generateUrlExtension(Request request, Page page, String moreFnid) { String urlExtension = "/"; if (moreFnid != null && request == Request.MORE) urlExtension += moreFnid; switch (page) { case ASK: urlExtension += "ask"; break; case BEST: urlExtension += "best"; break; case NEW: urlExtension += "newest"; break; case ACTIVE: urlExtension += "active"; break; default: break; } return urlExtension; } /** Parse stories from the user's submissions page * */ public static StoryResponse parseUserSubmissions(Context context, String username, String moreFnid) { if (StringUtils.isBlank(username)) { throw new RuntimeException("StoryParser.parseUserSubmissions received a blank username"); } String urlExtension = StringUtils.isNotBlank(moreFnid) ? "/" + moreFnid : "/submitted?id=" + username; StoryResponse response = parseStories(context, Page.USER, urlExtension); if (StringUtils.isNotBlank(moreFnid) && response.result == Result.SUCCESS) { // switch result to MORE response.result = Result.MORE; } return response; } private static StoryResponse parseStories(Context context, Page page, String urlExtension) { StoryResponse response = new StoryResponse(); response.stories = new ArrayList<Story>(); response.result = Result.SUCCESS; // success unless error state is tripped try { UserPrefs data = new UserPrefs(context); String userCookie = data.getUserCookie(); Document doc = getDocument(urlExtension, userCookie); // check for expired fnid Element body = doc.body(); String bodyText = body.text(); if (bodyText.equals("Unknown or expired link.")) { response.result = Result.FNID_EXPIRED; return response; } Elements titles = doc.select("span.rank"); // html changed, story rank numbers now have this class Elements subtexts = doc.select("td.subtext"); ListIterator<Element> titlesIterator = titles.listIterator(); ListIterator<Element> subtextIterator = subtexts.listIterator(); while (titlesIterator.hasNext() && subtextIterator.hasNext()) { Element child = titlesIterator.next(); Element titleElement = child.parent().parent(); Element subtextElement = subtextIterator.next(); Story story = parseStory(titleElement, subtextElement, userCookie != null); story.page = page; response.stories.add(story); } response.timestamp = getNewTimestamp(doc); } catch (IOException e) { response.result = Result.FAILURE; } catch (NumberFormatException e) { response.result = Result.FAILURE; } catch (NullPointerException e) { response.result = Result.FAILURE; } if (response.stories == null || response.stories.size() < 1) { response.result = Result.FAILURE; } return response; } private static Document getDocument(String urlExtension, String userCookie) throws IOException { Connection con; if (userCookie != null) con = ConnectionManager.authConnect(urlExtension, userCookie); else con = ConnectionManager.anonConnect(urlExtension); return con.get(); } /** Creates a new timestamp if the more element exists on the page, else returns null. * */ private static StoryTimestamp getNewTimestamp(Document doc) { // get new moreFnid & Timestamp Element more = doc.select("td.title a:matchesOwn(^More$)") .first(); if (more == null) return null; String fnid = more.attr("href"); // strip leading slash (/) since it's added by the urlExtension code above if (fnid.startsWith("/")) fnid = fnid.substring(1); StoryTimestamp timestamp = new StoryTimestamp(); timestamp.fnid = fnid; timestamp.time = System.currentTimeMillis(); return timestamp; } public static class StoryResponse { // NULL_RESPONSE :: A response with all fields set to `null` public static final StoryResponse NULL_RESPONSE = new StoryResponse(); public Result result = null; public List<Story> stories = null; public StoryTimestamp timestamp = null; public boolean isNull() { return (this.equals(NULL_RESPONSE)); } @Override public boolean equals(Object other) { if (other == null) return false; if (other == this) return true; if (!(other instanceof StoryResponse)) return false; StoryResponse o = (StoryResponse) other; return (result == o.result && stories == o.stories && timestamp == o.timestamp); } } /** * Parses a story from the two tags we can reach with "td.title:containsOwn(.)" and * "td.subtext" * TODO figure out a better way of parsing than try / catching exceptions */ public static Story parseStory(Element title, Element subtext, boolean loggedIn) { Story story = new Story(); story.position = parsePosition(title); String potentialJobsUrl = null; try { Element titleLink = title.select("td.title > a") .first(); story.title = titleLink.text(); // try to get url & domain, if it fails you're on a self post try { story.url = titleLink.attr("href"); // if url starts with item?id, it's a self post & may potentially be a url for a jobs post if (story.url.startsWith("item?id=")) potentialJobsUrl = ConnectionManager.BASE_URL + "/" + story.url; story.domain = parseDomain(title); } catch (NullPointerException e) { story.url = null; story.domain = null; } story.ago = parseAgo(subtext); story.storyId = parseStoryId(subtext); // if the user is logged in, get isUpvoted, go_to, and auth if (loggedIn) { story.isUpvoted = true; story.whence = null; story.auth = null; Element voteAnchor = title.select("a[href^=vote]") .first(); if (voteAnchor != null) { String[] voteHref = voteAnchor.attr("href") .split("[=&]"); story.isUpvoted = false; story.whence = voteHref[voteHref.length - 1]; story.auth = voteHref[7]; } } story.numPoints = parseNumPoints(subtext); story.username = (subtext.select("a[href^=user]").text()); story.numComments = parseNumComments(subtext); } catch (Exception e) { // this means it's a YCombinator jobs post story.storyId = 0; story.whence = null; story.numPoints = 0; story.username = null; story.numComments = 0; if (potentialJobsUrl != null) story.url = potentialJobsUrl; } return story; } /** try to get number of comments. If it fails there are 0 comments. * */ private static int parseNumComments(Element subtext) { // last child is <a href="item?id=9029159">20 comments</a> try { int lastIndex = subtext.children().size() - 1; Element numComments = subtext.child(lastIndex); Matcher matcher = NUM_COMMENTS_PATTERN.matcher(numComments.text()); if (matcher.find()) return Integer.parseInt(matcher.group()); } catch (NumberFormatException e) { Log.i(TAG, "Error parsing number of comments from: ", e);// + numComments.text()); } catch (Throwable t) { Log.i(TAG, "Other error", t); } return 0; } private static int parseNumPoints(Element subtext) { return Integer.parseInt(subtext.select("span.score").first().text().split("\\s")[0]); } // private static boolean parseHasUpvoteButton(Element voteAnchor) { // Elements voteButtons = voteAnchor.select("img[src=http://ycombinator.com/images/grayarrow.gif]"); // return voteButtons.size() == 1; // } private static long parseStoryId(Element subtext) { return Long.parseLong(subtext.select("a[href^=item]") .attr("href") .split("=")[1]); } private static String parseAgo(Element subtext) { Element agoLink = subtext.select("a").get(1); return agoLink.text().replace("|", "").trim(); } private static String parseDomain(Element title) { String domain = title.select("span.comhead") .first() .text() .trim(); // trim parens from domain; domain = domain.substring(1, domain.length() - 1); return domain; } /** Get the stories position (i.e. 1st, 2nd, 3rd, etc) on the page. * */ private static int parsePosition(Element title) { try { String position = title.child(0) .text() .replace(".", ""); return Integer.parseInt(position); } catch (Exception e) { // TODO fix exception catch'em all! // this means we're on the comments page return NO_POSITION; } } }