/* * Overchan Android (Meta Imageboard Client) * Copyright (C) 2014-2016 miku-nyan <https://github.com/miku-nyan> * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package nya.miku.wishmaster.chans.cirno; import java.io.BufferedReader; import java.io.Closeable; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Set; import java.util.TimeZone; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang3.StringEscapeUtils; import nya.miku.wishmaster.api.models.AttachmentModel; import nya.miku.wishmaster.api.models.PostModel; import nya.miku.wishmaster.api.models.ThreadModel; import nya.miku.wishmaster.api.util.CryptoUtils; import nya.miku.wishmaster.common.Logger; /** * Парсер страниц борды hatsune.ru * @author miku-nyan * */ public class MikubaReader implements Closeable { private static final String TAG = "MikubaReader"; private static final DateFormat DATEFORMAT; static { DATEFORMAT = new SimpleDateFormat("EEE dd MMM yyyy hh:mm:ss", Locale.US); DATEFORMAT.setTimeZone(TimeZone.getTimeZone("GMT+3")); } private static final char[] DATA_START = "<div id=\"page\">".toCharArray(); private static final int FILTER_PAGE_END = 0; private static final int FILTER_THREAD_END = 1; private static final int FILTER_ATTACHMENT = 2; private static final int FILTER_POSTNUMBER_OP = 3; private static final int FILTER_POSTNUMBER = 4; private static final int FILTER_SUBJECT = 5; private static final int FILTER_ENDDATE = 6; private static final int FILTER_START_COMMENT = 7; private static final char[][] FILTERS_OPEN = { "<center>".toCharArray(), "<hr".toCharArray(), "<td class=\"image\"".toCharArray(), "<td class=\"post\" id=\"".toCharArray(), "<td class=\"reply\" id=\"".toCharArray(), "<span class=\"replytitle\">".toCharArray(), "</label>".toCharArray(), "<blockquote".toCharArray() }; private static final char[][] FILTERS_CLOSE = { null, null, "</td>".toCharArray(), "\"".toCharArray(), "\"".toCharArray(), "</span>".toCharArray(), null, ">".toCharArray() }; //in comment private static final char[] BLOCKQUOTE_OPEN = "<blockquote".toCharArray(); private static final char[] BLOCKQUOTE_CLOSE = "</blockquote>".toCharArray(); private static final char[] OMITTED_OPEN = "<span class=\"omitted\">".toCharArray(); private static final char[] OMITTED_CLOSE = "</span>".toCharArray(); private static final Pattern POST_REFERENCE = Pattern.compile("<a href=\"/reply/(\\d+)"); private final Reader _in; private StringBuilder readBuffer = new StringBuilder(); private List<ThreadModel> threads; private ThreadModel currentThread; private List<PostModel> postsBuf; private Set<String> postsNumBuf; private PostModel currentPost; private boolean inDate; private StringBuilder dateBuffer = new StringBuilder(); private StringBuilder commentBuffer = new StringBuilder(); private StringBuilder omittedDigitsBuffer = new StringBuilder(); private List<AttachmentModel> currentAttachments; public MikubaReader(InputStream in) { _in = new BufferedReader(new InputStreamReader(in)); } private void initThreadModel() { currentThread = new ThreadModel(); currentThread.postsCount = 0; currentThread.attachmentsCount = -1; postsBuf = new ArrayList<>(); postsNumBuf = new HashSet<>(); } private void initPostModel() { currentPost = new PostModel(); currentPost.name = ""; currentPost.email = ""; currentPost.trip = ""; currentAttachments = new ArrayList<AttachmentModel>(); inDate = false; dateBuffer.setLength(0); } private void finalizeThread() { if (postsBuf.size() > 0) { currentThread.posts = postsBuf.toArray(new PostModel[postsBuf.size()]); currentThread.threadNumber = currentThread.posts[0].number; for (PostModel post : currentThread.posts) post.parentThread = currentThread.threadNumber; threads.add(currentThread); initThreadModel(); } } private void finalizePost() { if (currentPost.number != null && currentPost.number.length() > 0) { ++currentThread.postsCount; currentPost.attachments = currentAttachments.toArray(new AttachmentModel[currentAttachments.size()]); if (currentPost.subject == null) currentPost.subject = ""; if (currentPost.comment == null) currentPost.comment = ""; postsBuf.add(currentPost); postsNumBuf.add(currentPost.number); } initPostModel(); } public ThreadModel[] readPage() throws IOException { threads = new ArrayList<ThreadModel>(); initThreadModel(); initPostModel(); skipUntilSequence(DATA_START); readData(); return threads.toArray(new ThreadModel[threads.size()]); } private void readData() throws IOException { int filtersCount = FILTERS_OPEN.length; int[] pos = new int[filtersCount]; int[] len = new int[filtersCount]; for (int i=0; i<filtersCount; ++i) len[i] = FILTERS_OPEN[i].length; int curChar; while ((curChar = _in.read()) != -1) { if (inDate) dateBuffer.append((char) curChar); for (int i=0; i<filtersCount; ++i) { if (curChar == FILTERS_OPEN[i][pos[i]]) { ++pos[i]; if (pos[i] == len[i]) { if (i == FILTER_PAGE_END) { finalizeThread(); return; } handleFilter(i); pos[i] = 0; } } else { if (pos[i] != 0) pos[i] = curChar == FILTERS_OPEN[i][0] ? 1 : 0; } } } finalizeThread(); } private void handleFilter(int filterIndex) throws IOException { if (inDate && filterIndex != FILTER_ENDDATE) dateBuffer.setLength(0); switch (filterIndex) { case FILTER_THREAD_END: finalizeThread(); break; case FILTER_ATTACHMENT: parseAttachment(readUntilSequence(FILTERS_CLOSE[filterIndex])); break; case FILTER_POSTNUMBER: case FILTER_POSTNUMBER_OP: currentPost.number = readUntilSequence(FILTERS_CLOSE[filterIndex]).trim().substring(1); break; case FILTER_SUBJECT: currentPost.subject = StringEscapeUtils.unescapeHtml4(readUntilSequence(FILTERS_CLOSE[filterIndex])).trim(); currentPost.subject = CryptoUtils.fixCloudflareEmails(currentPost.subject); inDate = true; break; case FILTER_ENDDATE: if (dateBuffer.length() > FILTERS_OPEN[FILTER_ENDDATE].length) { String date = dateBuffer.substring(0, dateBuffer.length() - FILTERS_OPEN[FILTER_ENDDATE].length).trim(); if (date.length() > 0) { try { currentPost.timestamp = DATEFORMAT.parse(date).getTime(); } catch (Exception e) { Logger.e(TAG, "cannot parse date; make sure you choose the right DateFormat for this chan", e); } } } inDate = false; dateBuffer.setLength(0); break; case FILTER_START_COMMENT: skipUntilSequence(FILTERS_CLOSE[filterIndex]); currentPost.comment = readPostComment(); finalizePost(); break; } } private String readPostComment() throws IOException { commentBuffer.setLength(0); int len1 = BLOCKQUOTE_OPEN.length; int len2 = BLOCKQUOTE_CLOSE.length; int len3 = OMITTED_OPEN.length; int pos1 = 0; int pos2 = 0; int pos3 = 0; int tagCounter = 1; int curChar; while ((curChar = _in.read()) != -1) { commentBuffer.append((char) curChar); if (curChar == BLOCKQUOTE_OPEN[pos1]) { ++pos1; if (pos1 == len1) { ++tagCounter; pos1 = 0; } } else { if (pos1 != 0) pos1 = curChar == BLOCKQUOTE_OPEN[0] ? 1 : 0; } if (curChar == BLOCKQUOTE_CLOSE[pos2]) { ++pos2; if (pos2 == len2) { --tagCounter; if (tagCounter == 0) break; pos2 = 0; } } else { if (pos2 != 0) pos2 = curChar == BLOCKQUOTE_CLOSE[0] ? 1 : 0; } if (curChar == OMITTED_OPEN[pos3]) { ++pos3; if (pos3 == len3) { parseOmittedString(readUntilSequence(OMITTED_CLOSE)); pos3 = 0; } } else { if (pos3 != 0) pos3 = curChar == BLOCKQUOTE_OPEN[0] ? 1 : 0; } } int buflen = commentBuffer.length(); if (buflen > len2) { commentBuffer.setLength(buflen - len2); return CryptoUtils.fixCloudflareEmails(fixPostRefs(commentBuffer)); } else { return ""; } } private String fixPostRefs(StringBuilder commentBuffer) { String comment = commentBuffer.toString(); commentBuffer.setLength(0); if (postsBuf == null || postsBuf.size() == 0) return comment; Matcher matcher = POST_REFERENCE.matcher(comment); if (!matcher.find()) return comment; String threadNum = postsBuf.get(0).number; int appendPos = 0; boolean replacements = false; do { String num = matcher.group(1); commentBuffer.append(comment, appendPos, matcher.start(1)); appendPos = matcher.end(); if (!num.equals(threadNum) && postsNumBuf.contains(num)) { replacements = true; commentBuffer.append(threadNum); } else { commentBuffer.append(num); } } while (matcher.find()); commentBuffer.append(comment, appendPos, comment.length()); if (replacements) comment = commentBuffer.toString(); commentBuffer.setLength(0); return comment; } private void parseOmittedString(String omitted) { try { int len = omitted.length(); for (int i=0; i<=len; ++i) { char ch = i == len ? ' ' : omitted.charAt(i); if (ch >= '0' && ch <= '9') { omittedDigitsBuffer.append(ch); } else { if (omittedDigitsBuffer.length() > 0) { currentThread.postsCount += Integer.parseInt(omittedDigitsBuffer.toString()); omittedDigitsBuffer.setLength(0); break; } } } } catch (NumberFormatException e) {} } private void parseAttachment(String html) { int index = html.indexOf("<img"); if (index != -1) { index = html.indexOf("src=\"", index + 4); if (index != -1) { int start = index + 5; int end = html.indexOf("\"", start); if (end != -1) { AttachmentModel attachment = new AttachmentModel(); attachment.size = -1; attachment.thumbnail = html.substring(start, end); if (attachment.thumbnail.contains("/thu/")) { attachment.path = attachment.thumbnail.replace("/thu/", "/src/"); attachment.type = attachment.path.toLowerCase(Locale.US).endsWith(".gif") ? AttachmentModel.TYPE_IMAGE_GIF : AttachmentModel.TYPE_IMAGE_STATIC; } else { attachment.path = attachment.thumbnail; attachment.type = AttachmentModel.TYPE_OTHER_FILE; int startHref, endHref; if ((startHref = html.indexOf("href=\"")) != -1 && (endHref = html.indexOf('\"', startHref + 6)) != -1) { attachment.path = html.substring(startHref + 6, endHref); String pathLower = attachment.path.toLowerCase(Locale.US); if (pathLower.endsWith(".mp3") || pathLower.endsWith(".ogg")) attachment.type = AttachmentModel.TYPE_AUDIO; } } currentAttachments.add(attachment); return; } } } index = html.indexOf("<embed"); if (index != -1) { index = html.indexOf("src=\"", index + 6); if (index != -1) { int start = index + 5; int end = html.indexOf("\"", start); if (end != -1) { AttachmentModel attachment = new AttachmentModel(); attachment.size = -1; attachment.path = html.substring(start, end); if (attachment.path.contains("youtube")) { int youtubeIdIndex = attachment.path.indexOf("/v/"); if (youtubeIdIndex != -1) { String youtubeId = attachment.path.substring(youtubeIdIndex + 3); attachment.path = "http://youtube.com/watch?v=" + youtubeId; attachment.thumbnail = "http://img.youtube.com/vi/" + youtubeId + "/default.jpg"; } } attachment.type = AttachmentModel.TYPE_OTHER_NOTFILE; currentAttachments.add(attachment); } } } } private void skipUntilSequence(char[] sequence) throws IOException { int len = sequence.length; if (len == 0) return; int pos = 0; int curChar; while ((curChar = _in.read()) != -1) { if (curChar == sequence[pos]) { ++pos; if (pos == len) break; } else { if (pos != 0) pos = curChar == sequence[0] ? 1 : 0; } } } private String readUntilSequence(char[] sequence) throws IOException { int len = sequence.length; if (len == 0) return ""; readBuffer.setLength(0); int pos = 0; int curChar; while ((curChar = _in.read()) != -1) { readBuffer.append((char) curChar); if (curChar == sequence[pos]) { ++pos; if (pos == len) break; } else { if (pos != 0) pos = curChar == sequence[0] ? 1 : 0; } } int buflen = readBuffer.length(); if (buflen >= len) { readBuffer.setLength(buflen - len); return readBuffer.toString(); } else { return ""; } } @Override public void close() throws IOException { _in.close(); } }