/*
* Overchan Android (Meta Imageboard Client)
* Copyright (C) 2014-2016 miku-nyan <https://github.com/miku-nyan>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package nya.miku.wishmaster.chans.krautchan;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.TimeZone;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import nya.miku.wishmaster.api.models.AttachmentModel;
import nya.miku.wishmaster.api.models.BadgeIconModel;
import nya.miku.wishmaster.api.models.PostModel;
import nya.miku.wishmaster.api.models.ThreadModel;
import nya.miku.wishmaster.api.util.CryptoUtils;
import nya.miku.wishmaster.api.util.RegexUtils;
import nya.miku.wishmaster.common.Logger;
import org.apache.commons.lang3.StringEscapeUtils;
public class KrautReader implements Closeable {
private static final String TAG = "KrautReader";
private static final boolean LINKIFY = true;
private static final DateFormat KRAUT_DATEFORMAT;
static {
KRAUT_DATEFORMAT = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US);
KRAUT_DATEFORMAT.setTimeZone(TimeZone.getTimeZone("Europe/Berlin"));
}
private static final Pattern ATTACHMENT_FILENAME_PATTERN =
Pattern.compile("<span id=\"filename_[^>]*>(.*?)</span>", Pattern.DOTALL);
private static final Pattern ATTACHMENT_LINKS_PATTERN =
Pattern.compile("a\\s+href=\"/files/(\\d+\\..+?)\"(?:.+?src=\"?/thumbnails/(\\d+\\..+?)\\s\"?)?", Pattern.DOTALL);
private static final Pattern ATTACHMENT_INFO_PATTERN =
Pattern.compile("<span class=\"fileinfo\">\\s*(.*?),\\s*(.*?),\\s*(.*?)</span>", Pattern.DOTALL);
private static final Pattern ATTACHMENT_PX_SIZE_PATTERN = Pattern.compile("(\\d+)[x×х](\\d+)"); // \u0078 \u00D7 \u0445
private static final Pattern ATTACHMENT_SIZE_PATTERN = Pattern.compile("([,\\.\\d]+) ?([km])?b", Pattern.CASE_INSENSITIVE);
private static final Pattern ICON_DESCRIPTION_PATTERN = Pattern.compile("helpTip\\('([^']*)'");
private static final Pattern BAN_MARK_PATTERN = Pattern.compile("<span class=\"ban_mark\">([^<]*)</span>");
private static final char[] THREAD_START = "id=\"thread_".toCharArray();
private static final char[] BLOCKQUOTE_CLOSE = "</blockquote>".toCharArray();
private static final int FILTER_THREAD_END = 0;
private static final int FILTER_POSTNUMBER = 1;
private static final int FILTER_COUNTRYBALL = 2;
private static final int FILTER_COUNTRYBALL_WAR = 3;
private static final int FILTER_SUBJECT = 4;
private static final int FILTER_POSTERNAME = 5;
private static final int FILTER_TRIPCODE = 6;
private static final int FILTER_ADMINMARK = 7;
private static final int FILTER_DATE = 8;
private static final int FILTER_SAGE = 9;
private static final int FILTER_ATTACHMENT = 10;
private static final int FILTER_ATTACHMENT_OP = 11;
private static final int FILTER_START_COMMENT = 12;
private static final int FILTER_OMITTEDPOSTS = 13;
public static final char[][] FILTERS_OPEN = {
"class=\"thread\"".toCharArray(),
//"<div class=\"postheader\">".toCharArray(),
"<input name=\"post_".toCharArray(),
"<img src=\"/images/balls/".toCharArray(),
"<img src=\"/images/warballs/".toCharArray(),
"<span class=\"postsubject\">".toCharArray(),
"<span class=\"postername\">".toCharArray(),
"<span class=\"tripcode\">".toCharArray(),
"<span class=\"authority_".toCharArray(),
"<span class=\"postdate\">".toCharArray(),
//"<span class=\"postnumber\"".toCharArray(),
"<span class=\"sage\">".toCharArray(),
"div class=\"file_reply\">".toCharArray(),
"div class=\"file_thread\">".toCharArray(),
"<p id=\"post_text_".toCharArray(),
"<span class=\"omittedinfo\">".toCharArray(),
};
private static final char[][] FILTERS_CLOSE = {
null,
"\"".toCharArray(),
">".toCharArray(),
">".toCharArray(),
"</span>".toCharArray(),
"</span>".toCharArray(),
"</span>".toCharArray(),
"</span>".toCharArray(),
"</span>".toCharArray(),
null,
"<blockquote>".toCharArray(),
"<blockquote>".toCharArray(),
">".toCharArray(),
"</span>".toCharArray()
};
private final Reader _in;
private StringBuilder readBuffer = new StringBuilder();
private List<ThreadModel> threads;
private ThreadModel currentThread;
private List<PostModel> postsBuf;
private PostModel currentPost;
private StringBuilder commentBuffer = new StringBuilder();
private StringBuilder omittedDigitsBuffer = new StringBuilder();
private List<AttachmentModel> currentAttachments;
public KrautReader(Reader reader) {
_in = reader;
}
public KrautReader(InputStream in) {
this(new BufferedReader(new InputStreamReader(in)));
}
public ThreadModel[] readPage() throws IOException {
threads = new ArrayList<ThreadModel>();
initThreadModel();
initPostModel();
skipUntilSequence(THREAD_START);
readData();
return threads.toArray(new ThreadModel[threads.size()]);
}
private void readData() throws IOException {
int filtersCount = FILTERS_OPEN.length;
int[] pos = new int[filtersCount];
int[] len = new int[filtersCount];
for (int i=0; i<filtersCount; ++i) len[i] = FILTERS_OPEN[i].length;
int curChar;
while ((curChar = _in.read()) != -1) {
for (int i=0; i<filtersCount; ++i) {
if (curChar == FILTERS_OPEN[i][pos[i]]) {
++pos[i];
if (pos[i] == len[i]) {
handleFilter(i);
pos[i] = 0;
}
} else {
if (pos[i] != 0) pos[i] = curChar == FILTERS_OPEN[i][0] ? 1 : 0;
}
}
}
finalizeThread();
}
private void initThreadModel() {
currentThread = new ThreadModel();
currentThread.postsCount = 0;
currentThread.attachmentsCount = 0;
postsBuf = new ArrayList<PostModel>();
}
private void initPostModel() {
currentPost = new PostModel();
currentPost.trip = "";
currentAttachments = new ArrayList<AttachmentModel>();
}
private void finalizeThread() {
if (postsBuf.size() > 0) {
currentThread.posts = postsBuf.toArray(new PostModel[postsBuf.size()]);
currentThread.threadNumber = currentThread.posts[0].number;
for (PostModel post : currentThread.posts) post.parentThread = currentThread.threadNumber;
threads.add(currentThread);
initThreadModel();
}
}
private void finalizePost() {
if (currentPost.number != null && currentPost.number.length() > 0) {
++currentThread.postsCount;
currentPost.attachments = currentAttachments.toArray(new AttachmentModel[currentAttachments.size()]);
if (currentPost.name == null) currentPost.name = "";
if (currentPost.subject == null) currentPost.subject = "";
if (currentPost.comment == null) currentPost.comment = "";
if (currentPost.email == null) currentPost.email = "";
if (currentPost.trip == null) currentPost.trip = "";
postsBuf.add(currentPost);
}
initPostModel();
}
private void handleFilter(int filterIndex) throws IOException {
switch (filterIndex) {
case FILTER_THREAD_END:
finalizeThread();
break;
case FILTER_POSTNUMBER:
currentPost.number = readUntilSequence(FILTERS_CLOSE[filterIndex]).trim();
break;
case FILTER_COUNTRYBALL:
case FILTER_COUNTRYBALL_WAR:
parseIcon(readUntilSequence(FILTERS_CLOSE[filterIndex]), filterIndex == FILTER_COUNTRYBALL_WAR);
break;
case FILTER_SUBJECT:
currentPost.subject = StringEscapeUtils.unescapeHtml4(readUntilSequence(FILTERS_CLOSE[filterIndex])).trim();
currentPost.subject = CryptoUtils.fixCloudflareEmails(currentPost.subject);
break;
case FILTER_POSTERNAME:
currentPost.name = StringEscapeUtils.unescapeHtml4(readUntilSequence(FILTERS_CLOSE[filterIndex])).trim();
break;
case FILTER_TRIPCODE:
currentPost.trip += StringEscapeUtils.unescapeHtml4(readUntilSequence(FILTERS_CLOSE[filterIndex])).trim();
break;
case FILTER_ADMINMARK:
skipUntilSequence(">".toCharArray());
currentPost.trip += StringEscapeUtils.unescapeHtml4(readUntilSequence(FILTERS_CLOSE[filterIndex])).trim();
break;
case FILTER_DATE:
String date = readUntilSequence(FILTERS_CLOSE[filterIndex]);
int ms = 0;
try {
int dotPosition = date.lastIndexOf('.');
if (dotPosition != -1) {
ms = Integer.parseInt(date.substring(dotPosition + 1)) / 1000;
date = date.substring(0, dotPosition);
}
} catch (NumberFormatException e) {}
try {
currentPost.timestamp = KRAUT_DATEFORMAT.parse(date).getTime() + ms;
} catch (Exception e) {
Logger.e(TAG, "unable to parse date", e);
}
break;
case FILTER_SAGE:
currentPost.sage = true;
break;
case FILTER_ATTACHMENT:
case FILTER_ATTACHMENT_OP:
String[] attachments = readUntilSequence(FILTERS_CLOSE[filterIndex]).split("</div>");
for (String attachment : attachments) parseAttachment(attachment);
break;
case FILTER_START_COMMENT:
skipUntilSequence(FILTERS_CLOSE[filterIndex]);
currentPost.comment = readPostComment();
finalizePost();
break;
case FILTER_OMITTEDPOSTS:
parseOmittedString(readUntilSequence(FILTERS_CLOSE[filterIndex]));
break;
}
}
private String readPostComment() throws IOException {
commentBuffer.setLength(0);
commentBuffer.append("<p>");
int len = BLOCKQUOTE_CLOSE.length;
int pos = 0;
int curChar;
while ((curChar = _in.read()) != -1) {
commentBuffer.append((char) curChar);
if (curChar == BLOCKQUOTE_CLOSE[pos]) {
++pos;
if (pos == len) break;
} else {
if (pos != 0) pos = curChar == BLOCKQUOTE_CLOSE[0] ? 1 : 0;
}
}
int buflen = commentBuffer.length();
if (buflen > len) {
commentBuffer.setLength(buflen - len);
String comment = RegexUtils.replaceAll(commentBuffer, BAN_MARK_PATTERN, "<b><font color=\"red\">$1</font></b>");
if (LINKIFY) comment = RegexUtils.linkify(comment);
return CryptoUtils.fixCloudflareEmails(comment);
} else {
return "";
}
}
private void parseOmittedString(String omitted) {
int postsOmitted = -1;
int filesOmitted = -1;
try {
int len = omitted.length();
for (int i=0; i<=len; ++i) {
char ch = i == len ? ' ' : omitted.charAt(i);
if (ch >= '0' && ch <= '9') {
omittedDigitsBuffer.append(ch);
} else {
if (omittedDigitsBuffer.length() > 0) {
int parsedValue = Integer.parseInt(omittedDigitsBuffer.toString());
omittedDigitsBuffer.setLength(0);
if (postsOmitted == -1) postsOmitted = parsedValue;
else filesOmitted = parsedValue;
}
}
}
} catch (NumberFormatException e) {}
if (postsOmitted > 0) currentThread.postsCount += postsOmitted;
if (filesOmitted > 0) currentThread.attachmentsCount += filesOmitted;
}
private void parseAttachment(String html) {
Matcher attachmentMatcher = ATTACHMENT_LINKS_PATTERN.matcher(html);
if (attachmentMatcher.find()) {
AttachmentModel model = new AttachmentModel();
model.type = AttachmentModel.TYPE_OTHER_FILE;
model.size = -1;
model.width = -1;
model.height = -1;
model.path = "/files/" + attachmentMatcher.group(1);
String thumbnailGroup = attachmentMatcher.group(2);
model.thumbnail = thumbnailGroup == null ? null : "/thumbnails/" + thumbnailGroup;
String ext = model.path.substring(model.path.lastIndexOf('.') + 1).toLowerCase(Locale.US);
if (ext.equals("png") || ext.equals("jpg") || ext.equals("jpeg")) model.type = AttachmentModel.TYPE_IMAGE_STATIC;
else if (ext.equals("gif")) model.type = AttachmentModel.TYPE_IMAGE_GIF;
else if (ext.equals("webm")) model.type = AttachmentModel.TYPE_VIDEO;
else if (ext.equals("mp3") || ext.equals("ogg")) model.type = AttachmentModel.TYPE_AUDIO;
Matcher origFilenameMatcher = ATTACHMENT_FILENAME_PATTERN.matcher(html);
if (origFilenameMatcher.find()) {
model.originalName = StringEscapeUtils.unescapeHtml4(RegexUtils.removeHtmlTags(origFilenameMatcher.group(1)).trim());
}
Matcher infoMatcher = ATTACHMENT_INFO_PATTERN.matcher(html);
if (infoMatcher.find()) {
Matcher pxSizeMatcher = ATTACHMENT_PX_SIZE_PATTERN.matcher(infoMatcher.group(2));
if (pxSizeMatcher.find()) {
try {
int width = Integer.parseInt(pxSizeMatcher.group(1));
int height = Integer.parseInt(pxSizeMatcher.group(2));
model.width = width;
model.height = height;
} catch (NumberFormatException e) {}
}
Matcher byteSizeMatcher = ATTACHMENT_SIZE_PATTERN.matcher(infoMatcher.group(3));
if (byteSizeMatcher.find()) {
try {
String digits = byteSizeMatcher.group(1).replace(',', '.');
int multiplier = 1;
String prefix = byteSizeMatcher.group(2);
if (prefix != null) {
if (prefix.equalsIgnoreCase("k")) multiplier = 1024;
else if (prefix.equalsIgnoreCase("m")) multiplier = 1024 * 1024;
}
int value = Math.round(Float.parseFloat(digits) / 1024 * multiplier);
model.size = value;
} catch (NumberFormatException e) {}
}
}
++currentThread.attachmentsCount;
currentAttachments.add(model);
}
}
private void parseIcon(String html, boolean warball) {
int fqp = html.indexOf('\"');
if (fqp != -1) {
BadgeIconModel model = new BadgeIconModel();
model.source = (warball ? "/images/warballs/" : "/images/balls/") + html.substring(0, fqp);
Matcher descMatcher = ICON_DESCRIPTION_PATTERN.matcher(html);
if (descMatcher.find()) model.description = descMatcher.group(1);
currentPost.icons = new BadgeIconModel[] { model };
}
}
private void skipUntilSequence(char[] sequence) throws IOException {
int len = sequence.length;
if (len == 0) return;
int pos = 0;
int curChar;
while ((curChar = _in.read()) != -1) {
if (curChar == sequence[pos]) {
++pos;
if (pos == len) break;
} else {
if (pos != 0) pos = curChar == sequence[0] ? 1 : 0;
}
}
}
private String readUntilSequence(char[] sequence) throws IOException {
int len = sequence.length;
if (len == 0) return "";
readBuffer.setLength(0);
int pos = 0;
int curChar;
while ((curChar = _in.read()) != -1) {
readBuffer.append((char) curChar);
if (curChar == sequence[pos]) {
++pos;
if (pos == len) break;
} else {
if (pos != 0) pos = curChar == sequence[0] ? 1 : 0;
}
}
int buflen = readBuffer.length();
if (buflen >= len) {
readBuffer.setLength(buflen - len);
return readBuffer.toString();
} else {
return "";
}
}
@Override
public void close() throws IOException {
_in.close();
}
}