/*******************************************************************************
 * This file is part of RedReader.
 *
 * RedReader is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * RedReader is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with RedReader. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/

package org.quantumbadger.redreader.reddit.prepared.markdown;

import org.apache.commons.lang3.StringEscapeUtils;

import java.util.HashSet;

/**
 * Converts a run of characters into a stream of ints in which non-negative
 * values are literal code points and negative values are the {@code TOKEN_*}
 * constants declared below.
 */
public final class MarkdownTokenizer {

	// TODO support double graves

	public static final int
			TOKEN_UNDERSCORE = -1,
			TOKEN_UNDERSCORE_DOUBLE = -2,
			TOKEN_ASTERISK = -3,
			TOKEN_ASTERISK_DOUBLE = -4,
			TOKEN_TILDE_DOUBLE = -5,
			TOKEN_CARET = -6,
			TOKEN_GRAVE = -7,
			TOKEN_BRACKET_SQUARE_OPEN = -8,
			TOKEN_BRACKET_SQUARE_CLOSE = -9,
			TOKEN_PAREN_OPEN = -10,
			TOKEN_PAREN_CLOSE = -11,
			TOKEN_UNICODE_OPEN = -12,
			TOKEN_UNICODE_CLOSE = -13;

	// Maps a token back to its literal characters. Indexed by (20 + token).
	private static final char[][] reverseLookup = new char[20][];

	private static final char[][] linkPrefixes = {
			"http://".toCharArray(),
			"https://".toCharArray(),
			"www.".toCharArray()
	};

	private static final char[][] linkPrefixes_reddit = {
			"/r/".toCharArray(),
			"r/".toCharArray(),
			"/u/".toCharArray(),
			"u/".toCharArray(),
			"/user/".toCharArray()
	};

	// Code points which are replaced by a plain space when decoded from an entity
	private static final HashSet<Integer> unicodeWhitespace = new HashSet<>();

	static {
		reverseLookup[20 + TOKEN_UNDERSCORE] = new char[] {'_'};
		reverseLookup[20 + TOKEN_UNDERSCORE_DOUBLE] = new char[] {'_', '_'};
		reverseLookup[20 + TOKEN_ASTERISK] = new char[] {'*'};
		reverseLookup[20 + TOKEN_ASTERISK_DOUBLE] = new char[] {'*', '*'};
		reverseLookup[20 + TOKEN_TILDE_DOUBLE] = new char[] {'~', '~'};
		reverseLookup[20 + TOKEN_CARET] = new char[] {'^'};
		reverseLookup[20 + TOKEN_GRAVE] = new char[] {'`'};
		reverseLookup[20 + TOKEN_BRACKET_SQUARE_OPEN] = new char[] {'['};
		reverseLookup[20 + TOKEN_BRACKET_SQUARE_CLOSE] = new char[] {']'};
		reverseLookup[20 + TOKEN_PAREN_OPEN] = new char[] {'('};
		reverseLookup[20 + TOKEN_PAREN_CLOSE] = new char[] {')'};
		reverseLookup[20 + TOKEN_UNICODE_OPEN] = new char[] {'&'};
		reverseLookup[20 + TOKEN_UNICODE_CLOSE] = new char[] {';'};

		unicodeWhitespace.add(0x0009);
		unicodeWhitespace.add(0x000B);
		unicodeWhitespace.add(0x00A0);
		unicodeWhitespace.add(0x1680);
		unicodeWhitespace.add(0x2000);
		unicodeWhitespace.add(0x2001);
		unicodeWhitespace.add(0x2002);
		unicodeWhitespace.add(0x2003);
		unicodeWhitespace.add(0x2004);
		unicodeWhitespace.add(0x2005);
		unicodeWhitespace.add(0x2006);
		unicodeWhitespace.add(0x2007);
		unicodeWhitespace.add(0x2008);
		unicodeWhitespace.add(0x2009);
		unicodeWhitespace.add(0x200A);
		unicodeWhitespace.add(0x202F);
		unicodeWhitespace.add(0x205F);
		unicodeWhitespace.add(0x3000);
	}

	/**
	 * Tests whether a code point is in this class's whitespace set.
	 *
	 * @param codepoint the code point to test
	 * @return true if the code point is one of the whitespace values registered
	 * in the static initialiser
	 */
	public static boolean isUnicodeWhitespace(int codepoint) {
		return unicodeWhitespace.contains(codepoint);
	}

	/**
	 * Tokenizes the input by running, in order: {@link #naiveTokenize},
	 * {@link #clean}, the auto-linking pass, and {@link #clean} again.
	 *
	 * @param input the characters to tokenize
	 * @return the buffer holding the final token stream. Its backing array is
	 * allocated with three times the input length.
	 */
	public static IntArrayLengthPair tokenize(final CharArrSubstring input) {

		final IntArrayLengthPair tmp1 = new IntArrayLengthPair(input.length * 3);
		final IntArrayLengthPair tmp2 = new IntArrayLengthPair(input.length * 3);

		tmp1.pos = input.length;

		for(int i = 0; i < input.length; i++) {
			tmp1.data[i] = input.charAt(i);
		}

		// Markdown is evil.

		naiveTokenize(tmp1, tmp2);
		clean(tmp2, tmp1);
		linkify(tmp1, tmp2);
		clean(tmp2, tmp1);

		return tmp1;
	}

	/**
	 * Copies the input to the output, rewriting bare URLs and reddit-style
	 * references (those starting with an entry of {@code linkPrefixes} or
	 * {@code linkPrefixes_reddit}) into the token sequence
	 * {@code [ text ] ( text )}. A candidate is only considered when it is not
	 * inside brackets and the preceding token permits a link start.
	 */
	private static void linkify(final IntArrayLengthPair input, final IntArrayLengthPair output) {

		if(input.data.length > output.data.length * 3) {
			throw new RuntimeException();
		}

		output.clear();

		int inBrackets = 0;
		boolean lastCharOk = true;

		for(int i = 0; i < input.pos; i++) {

			final int token = input.data[i];

			switch(token) {

				case TOKEN_BRACKET_SQUARE_OPEN:
				case TOKEN_PAREN_OPEN:
					output.data[output.pos++] = token;
					inBrackets++;
					lastCharOk = true;
					break;

				case TOKEN_BRACKET_SQUARE_CLOSE:
				case TOKEN_PAREN_CLOSE:
					output.data[output.pos++] = token;
					inBrackets--;
					lastCharOk = true;
					break;

				case ' ':
					output.data[output.pos++] = ' ';
					lastCharOk = true;
					break;

				case 'h':
				case 'w':

					if(inBrackets == 0 && lastCharOk) {

						final int linkStartType = getLinkStartType(input.data, i, input.pos);

						if(linkStartType >= 0) {

							// Greedily read to space, or <>, or etc

							final int linkStartPos = i;
							final int linkPrefixEndPos
									= linkPrefixes[linkStartType].length + linkStartPos;
							int linkEndPos = linkPrefixEndPos;

							while(linkEndPos < input.pos) {

								final int lToken = input.data[linkEndPos];

								final boolean isValidChar =
										lToken != ' '
												&& lToken != '<'
												&& lToken != '>'
												&& lToken != TOKEN_GRAVE
												&& lToken != TOKEN_BRACKET_SQUARE_OPEN
												&& lToken != TOKEN_BRACKET_SQUARE_CLOSE;

								if(isValidChar) {
									linkEndPos++;
								} else {
									break;
								}
							}

							// discard many final chars if they are '.', ',', '?', ';' etc
							// THEN, discard single final char if it is '\'', '"', etc

							while(input.data[linkEndPos - 1] == '.'
									|| input.data[linkEndPos - 1] == ','
									|| input.data[linkEndPos - 1] == '?'
									|| input.data[linkEndPos - 1] == ';') {
								linkEndPos--;
							}

							if(input.data[linkEndPos - 1] == '"') {
								linkEndPos--;
							}

							if(input.data[linkEndPos - 1] == '\'') {
								linkEndPos--;
							}

							if(input.data[linkEndPos - 1] == ')') {
								linkEndPos--;
							}

							if(linkEndPos - linkPrefixEndPos >= 2) {
								appendLink(output, input.data, linkStartPos, linkEndPos);
								i = linkEndPos - 1;

							} else {
								output.data[output.pos++] = token;
							}

						} else {
							output.data[output.pos++] = token;
						}

					} else {
						output.data[output.pos++] = token;
					}

					lastCharOk = false;
					break;

				case 'r':
				case 'u':
				case '/':

					if(inBrackets == 0 && lastCharOk) {

						final int linkStartType
								= getRedditLinkStartType(input.data, i, input.pos);

						if(linkStartType >= 0) {

							final int linkStartPos = i;
							final int linkPrefixEndPos
									= linkPrefixes_reddit[linkStartType].length + linkStartPos;
							int linkEndPos = linkPrefixEndPos;

							while(linkEndPos < input.pos) {

								final int lToken = input.data[linkEndPos];

								final boolean isValidChar =
										(lToken >= 'a' && lToken <= 'z')
												|| (lToken >= 'A' && lToken <= 'Z')
												|| (lToken >= '0' && lToken <= '9')
												|| lToken == '_'
												|| lToken == TOKEN_UNDERSCORE
												|| lToken == TOKEN_UNDERSCORE_DOUBLE
												|| lToken == '+'
												|| lToken == '-';

								if(isValidChar) {
									linkEndPos++;
								} else {
									break;
								}
							}

							if(linkEndPos - linkPrefixEndPos > 2) {
								appendLink(output, input.data, linkStartPos, linkEndPos);
								i = linkEndPos - 1;

							} else {
								output.data[output.pos++] = token;
							}

						} else {
							output.data[output.pos++] = token;
						}

					} else {
						output.data[output.pos++] = token;
					}

					lastCharOk = false;
					break;

				default:
					// TODO test this against reddits impl
					lastCharOk = token < 0 || (!Character.isLetterOrDigit(token));
					output.data[output.pos++] = token;
					break;
			}
		}
	}

	/**
	 * Appends {@code [ text ] ( text )} to the output, where {@code text} is the
	 * given token range with every token reverted to its literal characters.
	 */
	private static void appendLink(
			final IntArrayLengthPair output,
			final int[] tokens,
			final int startInclusive,
			final int endExclusive) {

		final int[] reverted = revert(tokens, startInclusive, endExclusive);

		output.data[output.pos++] = TOKEN_BRACKET_SQUARE_OPEN;
		output.append(reverted);
		output.data[output.pos++] = TOKEN_BRACKET_SQUARE_CLOSE;
		output.data[output.pos++] = TOKEN_PAREN_OPEN;
		output.append(reverted);
		output.data[output.pos++] = TOKEN_PAREN_CLOSE;
	}

	/**
	 * Copies the input token stream to the output, reverting tokens which turn
	 * out not to be valid markup (unpaired emphasis markers, brackets which do
	 * not form a link, and so on) to their literal characters, decoding
	 * character entities, and deleting redundant spaces.
	 *
	 * <p>Numeric character references are only decoded when they contain at
	 * least one digit and denote a code point in the range
	 * {@code 1..Character.MAX_CODE_POINT}. Anything else is left as literal
	 * text, so a decoded value can never be negative and be mistaken for a
	 * token.</p>
	 *
	 * <p>Note that the input buffer's data is modified in place during this
	 * call.</p>
	 *
	 * @param input the token stream to clean
	 * @param output receives the cleaned token stream (cleared first)
	 */
	public static void clean(final IntArrayLengthPair input, final IntArrayLengthPair output) {

		// TODO use single byte array, flags
		final boolean[] toRevert = new boolean[input.pos];
		final boolean[] toDelete = new boolean[input.pos];

		int openingUnderscore = -1, openingUnderscoreDouble = -1;
		int openingAsterisk = -1, openingAsteriskDouble = -1;
		int openingTildeDouble = -1;

		int lastBracketSquareOpen = -1;

		for(int i = 0; i < input.pos; i++) {

			final int c = input.data[i];

			final boolean beforeASpace = i + 1 < input.pos && input.data[i + 1] == ' ';
			final boolean afterASpace = i > 0 && input.data[i - 1] == ' ';

			switch(c) {

				case TOKEN_UNDERSCORE:

					if(openingUnderscore < 0) {
						// Opening underscore
						if(beforeASpace) {
							toRevert[i] = true;
						} else {
							openingUnderscore = i;
						}

					} else {
						// Closing underscore
						if(afterASpace) {
							toRevert[i] = true;
						} else {
							openingUnderscore = -1;
						}
					}

					break;

				case TOKEN_UNDERSCORE_DOUBLE:

					if(i != 0 && openingUnderscoreDouble == i - 1) {
						// Two adjacent double tokens enclose nothing: both are literal
						toRevert[openingUnderscoreDouble] = true;
						toRevert[i] = true;
						openingUnderscoreDouble = -1;

					} else {

						if(openingUnderscoreDouble < 0) {
							// Opening double underscore
							if(beforeASpace) {
								toRevert[i] = true;
							} else {
								openingUnderscoreDouble = i;
							}

						} else {
							// Closing double underscore
							if(afterASpace) {
								toRevert[i] = true;
							} else {
								openingUnderscoreDouble = -1;
							}
						}
					}

					break;

				case TOKEN_ASTERISK:

					if(openingAsterisk < 0) {
						// Opening asterisk
						if(beforeASpace) {
							toRevert[i] = true;
						} else {
							openingAsterisk = i;
						}

					} else {
						// Closing asterisk
						if(afterASpace) {
							toRevert[i] = true;
						} else {
							openingAsterisk = -1;
						}
					}

					break;

				case TOKEN_ASTERISK_DOUBLE:

					if(i != 0 && openingAsteriskDouble == i - 1) {
						toRevert[openingAsteriskDouble] = true;
						toRevert[i] = true;
						openingAsteriskDouble = -1;

					} else {

						if(openingAsteriskDouble < 0) {
							// Opening double asterisk
							if(beforeASpace) {
								toRevert[i] = true;
							} else {
								openingAsteriskDouble = i;
							}

						} else {
							// Closing double asterisk
							if(afterASpace) {
								toRevert[i] = true;
							} else {
								openingAsteriskDouble = -1;
							}
						}
					}

					break;

				case TOKEN_TILDE_DOUBLE:

					if(i != 0 && openingTildeDouble == i - 1) {
						toRevert[openingTildeDouble] = true;
						toRevert[i] = true;
						openingTildeDouble = -1;

					} else {

						if(openingTildeDouble < 0) {
							// Opening double tilde
							if(beforeASpace) {
								toRevert[i] = true;
							} else {
								openingTildeDouble = i;
							}

						} else {
							// Closing double tilde
							if(afterASpace) {
								toRevert[i] = true;
							} else {
								openingTildeDouble = -1;
							}
						}
					}

					break;

				case TOKEN_GRAVE: {

					final int openingGrave = i;
					final int closingGrave
							= indexOf(input.data, TOKEN_GRAVE, i + 1, input.pos);

					if(closingGrave < 0) {
						toRevert[i] = true;

					} else {

						// Everything between the graves is literal
						for(int j = openingGrave + 1; j < closingGrave; j++) {
							if(input.data[j] < 0) {
								toRevert[j] = true;
							}
						}

						i = closingGrave;
					}

					break;
				}

				case TOKEN_BRACKET_SQUARE_OPEN:

					if(lastBracketSquareOpen < 0) {

						// Attempt to parse link text with well-bracketed square brackets
						final int closingSquareBracket = findCloseWellBracketed(
								input.data,
								TOKEN_BRACKET_SQUARE_OPEN,
								TOKEN_BRACKET_SQUARE_CLOSE,
								i,
								input.pos);

						if(closingSquareBracket > i) {

							final int parenOpenPos = indexOf(
									input.data,
									TOKEN_PAREN_OPEN,
									closingSquareBracket + 1,
									input.pos);

							if(parenOpenPos > closingSquareBracket
									&& isSpaces(
											input.data,
											closingSquareBracket + 1,
											parenOpenPos)) {

								lastBracketSquareOpen = i;

								// Nested brackets inside the link text become literal
								for(int j = i + 1; j < closingSquareBracket; j++) {
									if(input.data[j] == TOKEN_BRACKET_SQUARE_OPEN) {
										input.data[j] = '[';
									} else if(input.data[j] == TOKEN_BRACKET_SQUARE_CLOSE) {
										input.data[j] = ']';
									}
								}

							} else {
								toRevert[i] = true;
							}

						} else {
							toRevert[i] = true;
						}

					} else {
						toRevert[lastBracketSquareOpen] = true;
						lastBracketSquareOpen = i;
					}

					break;

				case TOKEN_BRACKET_SQUARE_CLOSE:

					if(lastBracketSquareOpen < 0) {
						toRevert[i] = true;

					} else {

						final int lastBracketSquareClose = i;

						final int parenOpenPos = indexOf(
								input.data,
								TOKEN_PAREN_OPEN,
								lastBracketSquareClose + 1,
								input.pos);

						boolean linkParseSuccess = false;

						if(parenOpenPos >= 0) {

							if(isSpaces(input.data, lastBracketSquareClose + 1, parenOpenPos)) {

								final int parenClosePos
										= findParenClosePos(input, parenOpenPos + 1);

								if(parenClosePos >= 0) {

									linkParseSuccess = true;

									for(int j = lastBracketSquareOpen + 1;
											j < lastBracketSquareClose;
											j++) {

										if(input.data[j] == TOKEN_BRACKET_SQUARE_OPEN
												|| input.data[j] == TOKEN_BRACKET_SQUARE_CLOSE) {
											toRevert[j] = true;
										}
									}

									// Remove the spaces between "]" and "("
									for(int j = lastBracketSquareClose + 1;
											j < parenOpenPos;
											j++) {
										toDelete[j] = true;
									}

									// The link target is literal, with runs of spaces collapsed
									for(int j = parenOpenPos + 1; j < parenClosePos; j++) {
										if(input.data[j] < 0) {
											toRevert[j] = true;
										} else if(input.data[j] == ' '
												&& input.data[j - 1] == ' ') {
											toDelete[j] = true;
										}
									}

									// Trim leading and trailing spaces of the link target
									for(int j = parenOpenPos + 1; input.data[j] == ' '; j++) {
										toDelete[j] = true;
									}

									for(int j = parenClosePos - 1; input.data[j] == ' '; j--) {
										toDelete[j] = true;
									}

									i = parenClosePos;
								}
							}
						}

						if(!linkParseSuccess) {
							toRevert[lastBracketSquareOpen] = true;
							toRevert[lastBracketSquareClose] = true;
						}
					}

					lastBracketSquareOpen = -1;
					break;

				case TOKEN_PAREN_OPEN:
				case TOKEN_PAREN_CLOSE:
				case TOKEN_UNICODE_CLOSE:
					toRevert[i] = true;
					break;

				case TOKEN_UNICODE_OPEN: {

					final int openingUnicode = i;

					// The terminating ';' is only searched for within the next few tokens
					final int closingUnicode = indexOf(
							input.data,
							TOKEN_UNICODE_CLOSE,
							i + 1,
							Math.min(input.pos, i + 20));

					// Decoded code point, or negative if this is not a valid entity
					final int codePoint;

					if(closingUnicode < 0) {
						codePoint = -1;

					} else if(input.data[i + 1] == '#') {

						// data[i + 1] is '#', so the closing token is at i + 2 or later
						// and reading data[i + 2] is in bounds
						if(input.data[i + 2] == 'x') {
							codePoint = parseCodePoint(
									input.data,
									openingUnicode + 3,
									closingUnicode,
									16);
						} else {
							codePoint = parseCodePoint(
									input.data,
									openingUnicode + 2,
									closingUnicode,
									10);
						}

					} else {
						codePoint = lookupNamedEntity(
								input.data,
								openingUnicode + 1,
								closingUnicode);
					}

					if(codePoint < 0) {
						toRevert[i] = true;

					} else {

						if(unicodeWhitespace.contains(codePoint)) {
							input.data[openingUnicode] = ' ';
						} else {
							input.data[openingUnicode] = codePoint;
						}

						for(int j = openingUnicode + 1; j <= closingUnicode; j++) {
							toDelete[j] = true;
						}

						i = closingUnicode;
					}

					break;
				}

				case TOKEN_CARET:

					if(input.pos <= i + 1 || input.data[i + 1] == ' ') {
						toRevert[i] = true;
					}

					break;

				case ' ':

					if(i < 1 || input.data[i - 1] == ' ') {
						toDelete[i] = true;
					}

					break;
			}
		}

		// Any marker still waiting for a partner is literal text
		if(openingUnderscore >= 0) {
			toRevert[openingUnderscore] = true;
		}
		if(openingUnderscoreDouble >= 0) {
			toRevert[openingUnderscoreDouble] = true;
		}
		if(openingAsterisk >= 0) {
			toRevert[openingAsterisk] = true;
		}
		if(openingAsteriskDouble >= 0) {
			toRevert[openingAsteriskDouble] = true;
		}
		if(openingTildeDouble >= 0) {
			toRevert[openingTildeDouble] = true;
		}
		if(lastBracketSquareOpen >= 0) {
			toRevert[lastBracketSquareOpen] = true;
		}

		// Trailing spaces
		for(int j = input.pos - 1; j >= 0 && input.data[j] == ' '; j--) {
			toDelete[j] = true;
		}

		output.clear();

		for(int i = 0; i < input.pos; i++) {

			if(toDelete[i]) {
				continue;
			}

			if(toRevert[i]) {
				final char[] revertTo = reverseLookup[20 + input.data[i]];
				output.append(revertTo);

			} else {
				output.data[output.pos++] = input.data[i];
			}
		}
	}

	/**
	 * Resolves the named entity whose name occupies the given range.
	 *
	 * @return the value of the single char the entity decodes to, or -1 if the
	 * name is not recognised, decodes to more than one char, or cannot be
	 * converted to a string
	 */
	private static int lookupNamedEntity(
			final int[] tokens,
			final int nameStartInclusive,
			final int nameEndExclusive) {

		try {
			final String name = new String(
					tokens,
					nameStartInclusive,
					nameEndExclusive - nameStartInclusive);

			final String result = StringEscapeUtils.unescapeHtml4("&" + name + ";");

			if(result.length() == 1) {
				return result.charAt(0);

			} else if(name.equalsIgnoreCase("apos")) {
				return '\'';

			} else if(name.equalsIgnoreCase("nsub")) {
				return '\u2284';
			}

		} catch(final Throwable ignored) {
			// Deliberately best-effort: the String constructor rejects ranges which
			// contain tokens (negative values). Treat any failure as "not an entity".
		}

		return -1;
	}

	/**
	 * Parses a run of ASCII digits as a Unicode code point.
	 *
	 * @param radix either 10 or 16
	 * @return the code point, or -1 if the range is empty, contains anything
	 * other than digits of the given radix, or denotes a value outside
	 * {@code 1..Character.MAX_CODE_POINT}
	 */
	private static int parseCodePoint(
			final int[] chars,
			final int startInclusive,
			final int endExclusive,
			final int radix) {

		if(startInclusive >= endExclusive) {
			return -1;
		}

		int result = 0;

		for(int i = startInclusive; i < endExclusive; i++) {

			final int digit = digitValue(chars[i], radix);

			if(digit < 0) {
				return -1;
			}

			// result <= MAX_CODE_POINT here, so this cannot overflow an int
			result = result * radix + digit;

			if(result > Character.MAX_CODE_POINT) {
				return -1;
			}
		}

		return result > 0 ? result : -1;
	}

	/**
	 * @return the value of an ASCII digit in the given radix (10 or 16), or -1
	 * if the character is not such a digit
	 */
	private static int digitValue(final int ch, final int radix) {

		if(ch >= '0' && ch <= '9') {
			return ch - '0';
		}

		if(radix == 16) {
			if(ch >= 'a' && ch <= 'f') {
				return 10 + ch - 'a';
			}
			if(ch >= 'A' && ch <= 'F') {
				return 10 + ch - 'A';
			}
		}

		return -1;
	}

	/**
	 * Finds the closing paren token at or after {@code startPos}, skipping over
	 * anything enclosed in double quotes.
	 *
	 * @return the index of the closing paren, or -1 if there is none or a quote
	 * is left unterminated
	 */
	private static int findParenClosePos(final IntArrayLengthPair tokens, int startPos) {

		for(int i = startPos; i < tokens.pos; i++) {

			switch(tokens.data[i]) {

				case TOKEN_PAREN_CLOSE:
					return i;

				case '"':
					i = indexOfIgnoreEscaped(tokens, '"', i + 1);
					if(i < 0) {
						return -1;
					}
					break;
			}
		}

		return -1;
	}

	/**
	 * Finds the first occurrence of {@code needle} at or after {@code startPos},
	 * skipping the element which follows each backslash.
	 *
	 * @return the index found, or -1
	 */
	private static int indexOfIgnoreEscaped(
			final IntArrayLengthPair haystack,
			int needle,
			int startPos) {

		for(int i = startPos; i < haystack.pos; i++) {
			if(haystack.data[i] == '\\') {
				i++;
			} else if(haystack.data[i] == needle) {
				return i;
			}
		}

		return -1;
	}

	/**
	 * First tokenizer pass: replaces markup characters with their
	 * {@code TOKEN_*} value without checking that they form valid markup.
	 * Backslash escapes are resolved here, and tab, carriage return, form feed
	 * and newline are each replaced with a space.
	 *
	 * @param input the raw characters
	 * @param output receives the token stream (cleared first)
	 */
	public static void naiveTokenize(
			final IntArrayLengthPair input,
			final IntArrayLengthPair output) {

		output.clear();

		for(int i = 0; i < input.pos; i++) {

			final int c = input.data[i];

			switch(c) {

				case '*':

					if(i < input.pos - 1 && input.data[i + 1] == '*') {
						i++;
						output.data[output.pos++] = TOKEN_ASTERISK_DOUBLE;
					} else {
						output.data[output.pos++] = TOKEN_ASTERISK;
					}

					break;

				case '_':

					if(i < input.pos - 1 && input.data[i + 1] == '_') {
						i++;
						output.data[output.pos++] = TOKEN_UNDERSCORE_DOUBLE;

					} else {

						// A single underscore is only markup next to a space, or at
						// either end of the input. Otherwise it stays literal.
						if((i < input.pos - 1 && input.data[i + 1] == ' ')
								|| (i > 0 && input.data[i - 1] == ' ')
								|| (i == 0)
								|| (i == input.pos - 1)) {

							output.data[output.pos++] = TOKEN_UNDERSCORE;

						} else {
							output.data[output.pos++] = c;
						}
					}

					break;

				case '~':

					if(i < input.pos - 1 && input.data[i + 1] == '~') {
						i++;
						output.data[output.pos++] = TOKEN_TILDE_DOUBLE;
					} else {
						output.data[output.pos++] = '~';
					}

					break;

				case '^':
					output.data[output.pos++] = TOKEN_CARET;
					break;

				case '`':
					output.data[output.pos++] = TOKEN_GRAVE;
					break;

				case '[':
					output.data[output.pos++] = TOKEN_BRACKET_SQUARE_OPEN;
					break;

				case ']':
					output.data[output.pos++] = TOKEN_BRACKET_SQUARE_CLOSE;
					break;

				case '(':
					output.data[output.pos++] = TOKEN_PAREN_OPEN;
					break;

				case ')':
					output.data[output.pos++] = TOKEN_PAREN_CLOSE;
					break;

				case '&':
					output.data[output.pos++] = TOKEN_UNICODE_OPEN;
					break;

				case ';':
					output.data[output.pos++] = TOKEN_UNICODE_CLOSE;
					break;

				case '\\':

					// The escaped character is copied verbatim. A trailing backslash
					// is kept as a literal backslash.
					if(i < input.pos - 1) {
						output.data[output.pos++] = input.data[++i];
					} else {
						output.data[output.pos++] = '\\';
					}

					break;

				case '\t':
				case '\r':
				case '\f':
				case '\n':
					output.data[output.pos++] = ' ';
					break;

				default:
					output.data[output.pos++] = c;
					break;
			}
		}
	}

	/**
	 * @return the index of the first occurrence of {@code needle} within the
	 * given range, or -1
	 */
	private static int indexOf(
			final int[] haystack,
			final int needle,
			final int startInclusive,
			final int endExclusive) {

		for(int i = startInclusive; i < endExclusive; i++) {
			if(haystack[i] == needle) {
				return i;
			}
		}

		return -1;
	}

	/**
	 * @return the index of the last occurrence of {@code needle} at or before
	 * {@code startInclusive}, or -1
	 */
	private static int reverseIndexOf(
			final int[] haystack,
			final int needle,
			final int startInclusive) {

		for(int i = startInclusive; i >= 0; i--) {
			if(haystack[i] == needle) {
				return i;
			}
		}

		return -1;
	}

	/**
	 * Finds the bracket which closes the one at {@code startInclusive}, taking
	 * nested brackets of the same kind into account.
	 *
	 * @param haystack the tokens to search
	 * @param openBracket the value which opens a bracket
	 * @param closeBracket the value which closes a bracket
	 * @param startInclusive the index of the opening bracket to match
	 * @param endExclusive the index at which to stop searching
	 * @return the index of the matching close bracket, or -1 if there is none
	 * within the range
	 * @throws RuntimeException if {@code haystack[startInclusive]} is not
	 * {@code openBracket}
	 */
	public static int findCloseWellBracketed(
			final int[] haystack,
			final int openBracket,
			final int closeBracket,
			final int startInclusive,
			final int endExclusive) {

		if(haystack[startInclusive] != openBracket) {
			throw new RuntimeException("Internal markdown parser error");
		}

		int b = 1;

		for(int i = startInclusive + 1; i < endExclusive; i++) {

			if(haystack[i] == openBracket) {
				b++;
			} else if(haystack[i] == closeBracket) {
				b--;
			}

			if(b == 0) {
				return i;
			}
		}

		return -1;
	}

	/**
	 * @return true if every element in the range is a space (also true for an
	 * empty range)
	 */
	private static boolean isSpaces(
			final int[] haystack,
			final int startInclusive,
			final int endExclusive) {

		for(int i = startInclusive; i < endExclusive; i++) {
			if(haystack[i] != ' ') {
				return false;
			}
		}

		return true;
	}

	/**
	 * @return true if the haystack contains the needle at the given position.
	 * The caller must ensure the haystack is long enough.
	 */
	private static boolean equals(
			final int[] haystack,
			final char[] needle,
			int startInclusive) {

		for(int i = 0; i < needle.length; i++) {
			if(haystack[startInclusive + i] != needle[i]) {
				return false;
			}
		}

		return true;
	}

	/**
	 * @return the index into {@code linkPrefixes} of the first prefix found at
	 * {@code startInclusive}, or -1 if none matches
	 */
	private static int getLinkStartType(
			final int[] haystack,
			final int startInclusive,
			final int endExclusive) {

		final int maxLen = endExclusive - startInclusive;

		for(int type = 0; type < linkPrefixes.length; type++) {
			if(linkPrefixes[type].length <= maxLen
					&& equals(haystack, linkPrefixes[type], startInclusive)) {
				return type;
			}
		}

		return -1;
	}

	/**
	 * @return the index into {@code linkPrefixes_reddit} of the first prefix
	 * found at {@code startInclusive}, or -1 if none matches
	 */
	private static int getRedditLinkStartType(
			final int[] haystack,
			final int startInclusive,
			final int endExclusive) {

		final int maxLen = endExclusive - startInclusive;

		for(int type = 0; type < linkPrefixes_reddit.length; type++) {
			if(linkPrefixes_reddit[type].length <= maxLen
					&& equals(haystack, linkPrefixes_reddit[type], startInclusive)) {
				return type;
			}
		}

		return -1;
	}

	/**
	 * Produces a copy of the given range in which every token (negative value)
	 * has been replaced by its literal characters.
	 */
	// TODO avoid generating new array
	private static int[] revert(
			final int[] tokens,
			final int startInclusive,
			final int endExclusive) {

		int outputLen = 0;

		for(int i = startInclusive; i < endExclusive; i++) {
			final int token = tokens[i];
			if(token < 0) {
				outputLen += reverseLookup[20 + token].length;
			} else {
				outputLen++;
			}
		}

		final int[] result = new int[outputLen];
		int resultPos = 0;

		for(int i = startInclusive; i < endExclusive; i++) {
			final int token = tokens[i];
			if(token < 0) {
				for(final char c : reverseLookup[20 + token]) {
					result[resultPos++] = c;
				}
			} else {
				result[resultPos++] = token;
			}
		}

		return result;
	}
}