package me.ccrama.redditslide.util;

import org.apache.commons.lang3.StringEscapeUtils;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Utility methods to transform html received from Reddit into a more parsable
 * format.
 *
 * The output will unescape all html, except for table tags and some special delimiter
 * tokens such as the ones used for code blocks.
 */
public class SubmissionParser {
    private static final Pattern SPOILER_PATTERN = Pattern.compile("<a[^>]*title=\"([^\"]*)\"[^>]*>([^<]*)</a>");
    // "<li>" followed (possibly after whitespace) by the "<div>" that replaced a "<p>".
    private static final Pattern LIST_ITEM_DIV_OPEN_PATTERN = Pattern.compile("<li>\\s*<div>");
    // "</div>" followed (possibly after whitespace) by the closing "</li>".
    private static final Pattern LIST_ITEM_DIV_CLOSE_PATTERN = Pattern.compile("</div>\\s*</li>");
    private static final String TABLE_START_TAG = "<table>";
    private static final String HR_TAG = "<hr/>";
    private static final String TABLE_END_TAG = "</table>";
    private static final String SC_OFF_TAG = "<!-- SC_OFF -->";
    private static final String SC_ON_TAG = "<!-- SC_ON -->";

    private SubmissionParser() {
    }

    /**
     * Parses html and returns a list corresponding to blocks of text to be
     * formatted.
     *
     * Each block is one of:
     *  - Vanilla text
     *  - Code block
     *  - Table
     *
     * Note that this method will unescape html entities, so this is best called
     * with the raw html received from reddit.
     *
     * @param html html to be formatted. Can be raw from the api. Must not be null.
     * @return list of text blocks
     */
    public static List<String> getBlocks(String html) {
        html = StringEscapeUtils.unescapeHtml4(html)
                .replace("<p>", "<div>")
                .replace("</p>", "</div>");

        // String.replace() is literal, so the whitespace-tolerant variants must be
        // applied as regular expressions to have any effect.
        html = LIST_ITEM_DIV_OPEN_PATTERN.matcher(html).replaceAll("<li>");
        html = LIST_ITEM_DIV_CLOSE_PATTERN.matcher(html).replaceAll("</li>");

        html = html.replace("<del>", "[[d[")
                .replace("<sup>", "<sup><small>")
                .replace("</sup>", "</small></sup>")
                .replace("</del>", "]d]]");

        // Drop everything from the last line break onwards.
        int lastNewline = html.lastIndexOf("\n");
        if (lastNewline != -1) {
            html = html.substring(0, lastNewline);
        }

        // Keep only what sits between the SC_OFF and SC_ON markers. The start marker is
        // looked up instead of assuming it occupies the first 15 characters, so input
        // without it (or shorter than it) no longer throws.
        int scOn = html.lastIndexOf(SC_ON_TAG);
        if (scOn != -1) {
            int scOff = html.indexOf(SC_OFF_TAG);
            int contentStart = (scOff != -1 && scOff < scOn) ? scOff + SC_OFF_TAG.length() : 0;
            html = html.substring(contentStart, scOn);
        }

        html = parseSpoilerTags(html);
        if (html.contains("<ol") || html.contains("<ul")) {
            html = parseLists(html);
        }

        List<String> codeBlockSeparated = parseCodeTags(html);

        if (html.contains(HR_TAG)) {
            codeBlockSeparated = parseHR(codeBlockSeparated);
        }

        if (html.contains("<table")) {
            return parseTableTags(codeBlockSeparated);
        } else {
            return codeBlockSeparated;
        }
    }

    /**
     * Rewrites html lists into plain text lines: every list item is prefixed with either
     * its running number (ordered lists) or a bullet (unordered lists), terminated with
     * <code>&lt;br/&gt;</code>, and indented according to the nesting depth of ordered
     * lists. The list tags themselves are removed afterwards.
     *
     * @param html html containing at least one <code>&lt;ol</code> or <code>&lt;ul</code> tag
     * @return the html with list markup replaced by numbered / bulleted lines
     */
    private static String parseLists(String html) {
        int firstIndex;
        boolean isNumbered;
        int firstOl = html.indexOf("<ol");
        int firstUl = html.indexOf("<ul");

        // Start at whichever list tag comes first.
        if ((firstUl != -1 && firstOl > firstUl) || firstOl == -1) {
            firstIndex = firstUl;
            isNumbered = false;
        } else {
            firstIndex = firstOl;
            isNumbered = true;
        }

        // listNumbers.get(depth) is the next number to print at that ordered-list depth.
        List<Integer> listNumbers = new ArrayList<>();
        int indent = -1;

        int i = firstIndex;
        while (i < html.length() - 4 && i != -1) {
            // startsWith(prefix, offset) never reads past the end of the string, unlike
            // substring(i, i + n).
            if (html.startsWith("<ol", i) || html.startsWith("<ul", i)) {
                if (html.startsWith("<ol", i)) {
                    isNumbered = true;
                    indent++;
                    listNumbers.add(indent, 1);
                } else {
                    isNumbered = false;
                }
                i = html.indexOf("<li", i);
            } else if (html.startsWith("<li", i)) {
                int tagEnd = html.indexOf(">", i);
                if (tagEnd == -1) {
                    // Unterminated <li tag: nothing sensible left to rewrite.
                    break;
                }
                int itemClose = html.indexOf("</li", tagEnd);
                int ulClose = html.indexOf("<ul", tagEnd);
                int olClose = html.indexOf("<ol", tagEnd);

                // The item text ends at whatever is closest: </li>, <ul>, or <ol>.
                int closeTag = nearestIndex(itemClose, olClose, ulClose);
                if (closeTag == -1) {
                    // Malformed list without any terminator; leave the rest untouched.
                    break;
                }

                String text = html.substring(tagEnd + 1, closeTag);

                // Written as entities so the indentation is not collapsed like ordinary
                // whitespace when the html is rendered.
                StringBuilder indentSpacing = new StringBuilder();
                for (int j = 0; j < indent; j++) {
                    indentSpacing.append("&nbsp;&nbsp;&nbsp;&nbsp;");
                }

                if (isNumbered) {
                    html = html.substring(0, tagEnd + 1)
                            + indentSpacing
                            + listNumbers.get(indent) + ". "
                            + text + "<br/>"
                            + html.substring(closeTag);
                    listNumbers.set(indent, listNumbers.get(indent) + 1);
                    i = closeTag + 3;
                } else {
                    html = html.substring(0, tagEnd + 1)
                            + indentSpacing
                            + "• "
                            + text + "<br/>"
                            + html.substring(closeTag);
                    i = closeTag + 2;
                }
            } else {
                // Skip ahead to the next tag; a closing </ol> pops one nesting level.
                i = html.indexOf("<", i + 1);
                if (i != -1 && html.startsWith("</ol", i)) {
                    indent--;
                    if (indent == -1) {
                        isNumbered = false;
                    }
                }
            }
        }

        // Remove the tags, which actually work in Android 7.0 on
        html = html.replace("<ol>", "")
                .replace("<ul>", "")
                .replace("<li>", "")
                .replace("</li>", "")
                .replace("</ol>", "")
                .replace("</ul>", "");

        return html;
    }

    /**
     * Returns the smallest of the given indices that is not -1.
     *
     * @param indices results of <code>indexOf</code> calls, -1 meaning "not found"
     * @return the smallest found index, or -1 if none was found
     */
    private static int nearestIndex(int... indices) {
        int nearest = -1;
        for (int index : indices) {
            if (index != -1 && (nearest == -1 || index < nearest)) {
                nearest = index;
            }
        }
        return nearest;
    }

    /**
     * Splits every block at its <code>&lt;hr/&gt;</code> tags, so that each horizontal
     * rule becomes a block of its own between the surrounding pieces of text.
     *
     * @param blocks list of html blocks
     * @return list of html blocks with horizontal rules as separate entries
     */
    private static List<String> parseHR(List<String> blocks) {
        List<String> newBlocks = new ArrayList<>();
        for (String block : blocks) {
            if (!block.contains(HR_TAG)) {
                newBlocks.add(block);
                continue;
            }

            // Limit -1 keeps trailing empty segments, so a rule at the very end of a
            // block (or a block that is nothing but a rule) is not silently lost.
            String[] segments = block.split(HR_TAG, -1);
            int last = segments.length - 1;
            for (int i = 0; i < last; i++) {
                newBlocks.add(segments[i]);
                newBlocks.add(HR_TAG);
            }
            // Nothing follows a trailing rule, so do not emit an empty block for it.
            if (!segments[last].isEmpty()) {
                newBlocks.add(segments[last]);
            }
        }

        return newBlocks;
    }

    /**
     * For code within <code>&lt;pre&gt;&lt;code&gt;</code> tags, line breaks are converted to
     * <code>&lt;br/&gt;</code> tags, and spaces to &amp;nbsp;. This allows for Html.fromHtml
     * to preserve indents of these blocks.
     * <p/>
     * In addition, <code>[[&lt;[</code> and <code>]&gt;]]</code> are inserted to denote the
     * beginning and end of code segments, for styling later.
     *
     * @param html the unparsed HTML
     * @return the code parsed HTML with additional markers, split by code blocks
     */
    private static List<String> parseCodeTags(String html) {
        final String startTag = "<pre><code>";
        final String endTag = "</code></pre>";
        String[] startSeparated = html.split(startTag);
        List<String> preSeparated = new ArrayList<>();

        // split() yields an empty array when html is nothing but the start tag.
        String leadingText = startSeparated.length > 0 ? startSeparated[0] : "";
        preSeparated.add(markInlineCode(leadingText));

        for (int i = 1; i < startSeparated.length; i++) {
            // Limit 2: one piece of code, then everything after the first end tag. This
            // also guarantees at least one element for an empty code block.
            String[] split = startSeparated[i].split(endTag, 2);

            String code = split[0]
                    .replace("\n", "<br/>")
                    .replace(" ", "&nbsp;");

            preSeparated.add(startTag + "[[<[" + code + "]>]]" + endTag);
            if (split.length > 1 && !split[1].isEmpty()) {
                preSeparated.add(markInlineCode(split[1]));
            }
        }

        return preSeparated;
    }

    /**
     * Wraps the content of inline <code>&lt;code&gt;</code> tags in the
     * <code>[[&lt;[</code> / <code>]&gt;]]</code> styling markers.
     *
     * @param html html outside of any pre-formatted code block
     * @return the html with markers inserted inside every inline code tag
     */
    private static String markInlineCode(String html) {
        return html.replace("<code>", "<code>[[<[").replace("</code>", "]>]]</code>");
    }

    /**
     * Move the spoil text inside of the "title" attribute to inside the link
     * tag. Then surround the spoil text with <code>[[s[</code> and <code>]s]]</code>.
     * <p/>
     * If there is no text inside of the link tag, insert "spoiler".
     * Links whose tag starts with an http(s) href are left untouched.
     *
     * @param html html possibly containing spoiler links
     * @return the html with spoiler links rewritten
     */
    private static String parseSpoilerTags(String html) {
        // The matcher runs over the original string; replacements go into the copy.
        Matcher matcher = SPOILER_PATTERN.matcher(html);

        while (matcher.find()) {
            String tag = matcher.group(0);
            String spoilerText = matcher.group(1);
            String spoilerTeaser = matcher.group(2);
            // Remove the last </a> tag, but keep the < for parsing.
            if (!tag.contains("<a href=\"http")) {
                html = html.replace(tag,
                        tag.substring(0, tag.length() - 4)
                                + (spoilerTeaser.isEmpty() ? "spoiler" : "")
                                + "&lt; [[s[ " + spoilerText + "]s]]</a>");
            }
        }

        return html;
    }

    /**
     * Parse a given list of html strings, splitting by table blocks.
     *
     * Every table ends up in a block of its own, wrapped in its table tags.
     *
     * @param blocks list of html, possibly containing table blocks
     * @return list of html with each table split into its own entry
     */
    private static List<String> parseTableTags(List<String> blocks) {
        List<String> newBlocks = new ArrayList<>();
        for (String block : blocks) {
            if (!block.contains(TABLE_START_TAG)) {
                newBlocks.add(block);
                continue;
            }

            String[] startSeparated = block.split(TABLE_START_TAG);
            // split() yields an empty array when the block is nothing but the start tag.
            newBlocks.add(startSeparated.length > 0 ? startSeparated[0].trim() : "");
            for (int i = 1; i < startSeparated.length; i++) {
                // Limit 2: the table body, then everything after the first end tag. This
                // also guarantees at least one element for an empty table.
                String[] split = startSeparated[i].split(TABLE_END_TAG, 2);
                newBlocks.add("<table>" + split[0] + "</table>");
                if (split.length > 1 && !split[1].isEmpty()) {
                    newBlocks.add(split[1]);
                }
            }
        }

        return newBlocks;
    }
}