package nya.miku.wishmaster.chans.arhivach;

import android.annotation.SuppressLint;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.text.DateFormat;
import java.text.DateFormatSymbols;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.TimeZone;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import nya.miku.wishmaster.api.models.AttachmentModel;
import nya.miku.wishmaster.api.models.PostModel;
import nya.miku.wishmaster.api.models.ThreadModel;

/**
 * Streaming reader that extracts thread previews from an HTML page.
 *
 * <p>The page is consumed character by character from the wrapped {@link Reader}. A fixed set of
 * "open" markers ({@link #FILTERS_OPEN}) is searched for in a single pass; when one is found the
 * matching handler reads the following markup up to the corresponding "close" marker and fills
 * the thread/post/attachment currently being built.
 *
 * <p>Instances keep mutable parsing state in fields and are therefore meant to be used for one
 * page by one caller at a time.
 *
 * Created by Kalaver <Kalaver@users.noreply.github.com> on 03.07.2015.
 */
@SuppressLint("SimpleDateFormat")
public class ArhivachBoardReader implements Closeable {
    //private static final String TAG = "ArhivachBoardReader";

    /** Date format kept for the (not yet implemented) date parser, see {@link #parseDate(String)}. */
    private static final DateFormat CHAN_DATEFORMAT;
    static {
        DateFormatSymbols chanSymbols = new DateFormatSymbols();
        chanSymbols.setShortWeekdays(new String[] { "", "Вск", "Пнд", "Втр", "Срд", "Чтв", "Птн", "Суб" });
        CHAN_DATEFORMAT = new SimpleDateFormat("dd/MM/yy EEE HH:mm:ss", chanSymbols);
        CHAN_DATEFORMAT.setTimeZone(TimeZone.getTimeZone("GMT+3"));
    }

    /** Charset used when the reader is built from a raw byte stream. */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    private static final Pattern URL_PATTERN =
            Pattern.compile("((https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|])");
    /** Matches one tag link; group 1 is the link text. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<a[^>]*>([^<]*)</a>", Pattern.MULTILINE);
    /** Matches the {@code href} attribute of the thread link; group 1 is the whole attribute. */
    private static final Pattern HREF_PATTERN = Pattern.compile("(href *= *\"(.*)\")");
    /** Matches the first run of decimal digits. */
    private static final Pattern NUMBER_PATTERN = Pattern.compile("(\\d+)");
    /** Matches the bold subject followed by a dash separator; group 1 is the subject. */
    private static final Pattern SUBJECT_PATTERN = Pattern.compile("<b>(.*)</b>\\s*—\\s*");

    /** Marker after which the thread rows start; everything before it is skipped. */
    private static final char[] DATA_START = "id=\"thread_row_".toCharArray();

    // Indices into FILTERS_OPEN / FILTERS_CLOSE.
    private static final int FILTER_THREAD_END = 0;
    private static final int FILTER_DATE = 1;
    private static final int FILTER_ATTACHMENT = 2;
    private static final int FILTER_ATTACHMENT_ORIGINAL = 3;
    private static final int FILTER_ATTACHMENT_THUMBNAIL = 4;
    private static final int FILTER_START_COMMENT = 5;
    private static final int FILTER_THREAD_LINK = 6;
    private static final int FILTER_END_COMMENT = 7;
    private static final int FILTER_OMITTEDPOSTS = 8;
    private static final int FILTER_THREAD_SAVED = 9;
    private static final int FILTER_THREAD_TAGS = 10;

    /** Character sequences that start a region of interest, indexed by the FILTER_* constants. */
    public static final char[][] FILTERS_OPEN = {
        "</tr>".toCharArray(),
        "class=\"thread_date\">".toCharArray(),
        "<div class=\"post_image_block\"".toCharArray(),
        "<a".toCharArray(),
        "<img".toCharArray(),
        "<div class=\"thread_text\">".toCharArray(),
        "<a".toCharArray(),
        "</a>".toCharArray(),
        "class=\"thread_posts_count\"".toCharArray(),
        "label-thread-saved\">".toCharArray(),
        "class=\"thread_tags\"".toCharArray(),
    };

    /** Character sequences that end the region opened by the same index of {@link #FILTERS_OPEN}. */
    private static final char[][] FILTERS_CLOSE = {
        null,
        "</td>".toCharArray(),
        ">".toCharArray(),
        ">".toCharArray(),
        ">".toCharArray(),
        ">".toCharArray(),
        ">".toCharArray(),
        "</a>".toCharArray(),
        "</span>".toCharArray(),
        "</span>".toCharArray(),
        "</div>".toCharArray(),
    };

    private final Reader _in;

    /** Scratch buffer reused by {@link #readUntilSequence(char[])}. */
    private StringBuilder readBuffer = new StringBuilder();
    private List<ThreadModel> threads;
    private ThreadModel currentThread;
    private List<PostModel> postsBuf;
    private PostModel currentPost;
    private List<AttachmentModel> currentAttachments;

    /**
     * Creates a reader over already decoded character data.
     *
     * @param reader source of the HTML page; closed by {@link #close()}
     */
    public ArhivachBoardReader(Reader reader) {
        _in = reader;
    }

    /**
     * Creates a reader over a raw byte stream, decoding it as UTF-8.
     *
     * @param in source of the HTML page; closed by {@link #close()}
     */
    public ArhivachBoardReader(InputStream in) {
        // Explicit charset: the no-arg InputStreamReader would use the platform default.
        this(new BufferedReader(new InputStreamReader(in, UTF_8)));
    }

    /**
     * Parses the whole input and returns the threads found in it.
     *
     * @return the parsed threads, possibly an empty array
     * @throws IOException if reading from the underlying reader fails
     */
    public ThreadModel[] readPage() throws IOException {
        threads = new ArrayList<ThreadModel>();
        initThreadModel();
        initPostModel();
        skipUntilSequence(DATA_START);
        readData();
        return threads.toArray(new ThreadModel[threads.size()]);
    }

    /**
     * Main scanning loop: reads the input to its end and dispatches to
     * {@link #handleFilter(int)} whenever one of the open markers has been matched completely.
     */
    private void readData() throws IOException {
        int filtersCount = FILTERS_OPEN.length;
        // pos[i] is the number of characters of FILTERS_OPEN[i] matched so far.
        int[] pos = new int[filtersCount];
        int[] len = new int[filtersCount];
        for (int i = 0; i < filtersCount; ++i) {
            len[i] = FILTERS_OPEN[i].length;
        }

        int curChar;
        while ((curChar = _in.read()) != -1) {
            for (int i = 0; i < filtersCount; ++i) {
                if (curChar == FILTERS_OPEN[i][pos[i]]) {
                    ++pos[i];
                    if (pos[i] == len[i]) {
                        handleFilter(i);
                        pos[i] = 0;
                    }
                } else {
                    // On mismatch restart, allowing the current char to begin a new match.
                    if (pos[i] != 0) {
                        pos[i] = curChar == FILTERS_OPEN[i][0] ? 1 : 0;
                    }
                }
            }
        }
        // Flush a thread whose end marker was never seen.
        finalizeThread();
    }

    /** Starts a fresh thread with zeroed counters and an empty post buffer. */
    private void initThreadModel() {
        currentThread = new ThreadModel();
        currentThread.postsCount = 0;
        currentThread.attachmentsCount = 0;
        postsBuf = new ArrayList<PostModel>();
    }

    /** Starts a fresh post with placeholder number and an empty attachment list. */
    private void initPostModel() {
        currentPost = new PostModel();
        currentPost.number = "unknown";
        currentPost.trip = "";
        currentAttachments = new ArrayList<AttachmentModel>();
    }

    /**
     * Stores the current thread if it has at least one post and starts a new one.
     * The thread number is taken from the first post, and every post is linked to it.
     */
    private void finalizeThread() {
        if (postsBuf.size() > 0) {
            currentThread.posts = postsBuf.toArray(new PostModel[postsBuf.size()]);
            currentThread.threadNumber = currentThread.posts[0].number;
            for (PostModel post : currentThread.posts) {
                post.parentThread = currentThread.threadNumber;
            }
            threads.add(currentThread);
            initThreadModel();
        }
    }

    /**
     * Adds the current post (with its attachments) to the current thread, replacing missing
     * text fields by empty strings, and starts a new post.
     */
    private void finalizePost() {
        if (currentPost.number != null && currentPost.number.length() > 0) {
            ++currentThread.postsCount;
            currentPost.attachments = currentAttachments.toArray(new AttachmentModel[currentAttachments.size()]);
            if (currentPost.name == null) currentPost.name = "";
            if (currentPost.subject == null) currentPost.subject = "";
            if (currentPost.comment == null) currentPost.comment = "";
            if (currentPost.email == null) currentPost.email = "";
            if (currentPost.trip == null) currentPost.trip = "";
            postsBuf.add(currentPost);
        }
        initPostModel();
    }

    /**
     * Reacts to a completely matched open marker.
     *
     * @param filterIndex one of the FILTER_* constants
     * @throws IOException if reading from the underlying reader fails
     */
    private void handleFilter(int filterIndex) throws IOException {
        switch (filterIndex) {
            case FILTER_THREAD_END:
                finalizeThread();
                break;
            case FILTER_DATE:
                String date = readUntilSequence(FILTERS_CLOSE[filterIndex]);
                parseDate(date);
                // The date cell is where the collected post gets committed.
                finalizePost();
                break;
            case FILTER_ATTACHMENT:
                skipUntilSequence(FILTERS_CLOSE[filterIndex]);
                parseAttachment();
                break;
            case FILTER_START_COMMENT:
                readPost();
                break;
            case FILTER_OMITTEDPOSTS:
                parseOmittedString(readUntilSequence(FILTERS_CLOSE[filterIndex]));
                break;
            case FILTER_THREAD_SAVED:
                String data = readUntilSequence(FILTERS_CLOSE[filterIndex]);
                if (data.contains("Сохранен")) {
                    currentThread.isClosed = true;
                }
                break;
            case FILTER_THREAD_TAGS:
                parseTags(readUntilSequence(FILTERS_CLOSE[filterIndex]));
                break;
            // The remaining markers (attachment original/thumbnail, thread link, comment end)
            // are only consumed from within parseAttachment() and readPost().
        }
    }

    /**
     * Collects the text of every link in the tag block and stores it, each tag wrapped in
     * square brackets, as the name of the current post.
     *
     * @param s markup of the tag block
     * @throws IOException never thrown here; declared so overriding methods may perform I/O
     */
    protected void parseTags(String s) throws IOException {
        Matcher matcher = TAG_PATTERN.matcher(s);
        StringBuilder tags = new StringBuilder();
        while (matcher.find()) {
            tags.append('[').append(matcher.group(1)).append(']');
        }
        currentPost.name = tags.toString();
    }

    /**
     * Reads the thread link and the text inside it: the first number found in the link's
     * {@code href} becomes the post number, an optional leading bold part becomes the subject
     * and the rest becomes the comment.
     *
     * @throws IOException if reading from the underlying reader fails
     */
    protected void readPost() throws IOException {
        skipUntilSequence(FILTERS_OPEN[FILTER_THREAD_LINK]);
        String link = readUntilSequence(FILTERS_CLOSE[FILTER_THREAD_LINK]);

        Matcher hrefMatcher = HREF_PATTERN.matcher(link);
        String url = hrefMatcher.find() ? hrefMatcher.group(1) : "";

        Matcher numberMatcher = NUMBER_PATTERN.matcher(url);
        if (numberMatcher.find()) {
            currentPost.number = numberMatcher.group(0);
        }

        String commentData = readUntilSequence(FILTERS_CLOSE[FILTER_END_COMMENT]);
        Matcher subjectMatcher = SUBJECT_PATTERN.matcher(commentData);
        if (subjectMatcher.find()) {
            currentPost.subject = subjectMatcher.group(1);
            // Cut at the end of the match, not at its length: the match need not start at
            // index 0 (e.g. leading whitespace), and the separator must not leak into the text.
            currentPost.comment = commentData.substring(subjectMatcher.end());
        } else {
            currentPost.comment = commentData;
        }
    }

    /**
     * Adds the posts that are not shown on the page to the post counter of the current thread.
     * The first number found in the text is used, reduced by one
     * (presumably because the visible opening post is counted in finalizePost() - TODO confirm).
     *
     * @param omitted text of the posts-count element
     */
    private void parseOmittedString(String omitted) {
        Matcher matcher = NUMBER_PATTERN.matcher(omitted);
        if (!matcher.find()) {
            return;
        }
        int postsOmitted;
        try {
            postsOmitted = Integer.parseInt(matcher.group(0)) - 1;
        } catch (NumberFormatException ignored) {
            // Digit run too long for an int: leave the counter untouched.
            return;
        }
        if (postsOmitted > 0) {
            currentThread.postsCount += postsOmitted;
        }
    }

    /**
     * Reads the next link and image tags and, if the link contains a URL, adds an attachment
     * built from it to the current post. The attachment type is derived from the file extension.
     *
     * @throws IOException if reading from the underlying reader fails
     */
    private void parseAttachment() throws IOException {
        skipUntilSequence(FILTERS_OPEN[FILTER_ATTACHMENT_ORIGINAL]);
        String attachment = readUntilSequence(FILTERS_CLOSE[FILTER_ATTACHMENT_ORIGINAL]);
        String thumbnail = "";
        String original = "";
        Matcher matcher = URL_PATTERN.matcher(attachment);
        if (matcher.find()) {
            original = matcher.group(1);
        }

        skipUntilSequence(FILTERS_OPEN[FILTER_ATTACHMENT_THUMBNAIL]);
        attachment = readUntilSequence(FILTERS_CLOSE[FILTER_ATTACHMENT_THUMBNAIL]);
        matcher = URL_PATTERN.matcher(attachment);
        if (matcher.find()) {
            thumbnail = matcher.group(1);
        }

        if (original.length() > 0) {
            AttachmentModel model = new AttachmentModel();
            model.type = AttachmentModel.TYPE_OTHER_FILE;
            model.size = -1;
            model.width = -1;
            model.height = -1;
            model.path = original;
            // Fall back to the original file when no thumbnail URL was found.
            model.thumbnail = thumbnail.length() > 0 ? thumbnail : original;

            String ext = model.path.substring(model.path.lastIndexOf('.') + 1).toLowerCase(Locale.US);
            if (ext.equals("png") || ext.equals("jpg") || ext.equals("jpeg")) {
                model.type = AttachmentModel.TYPE_IMAGE_STATIC;
            } else if (ext.equals("gif")) {
                model.type = AttachmentModel.TYPE_IMAGE_GIF;
            } else if (ext.equals("webm")) {
                model.type = AttachmentModel.TYPE_VIDEO;
            } else if (ext.equals("mp3") || ext.equals("ogg")) {
                // NOTE(review): audio extensions are classified as video here; check whether
                // AttachmentModel offers a dedicated audio type that should be used instead.
                model.type = AttachmentModel.TYPE_VIDEO;
            }
            ++currentThread.attachmentsCount;
            currentAttachments.add(model);
        }
    }

    /**
     * Hook for parsing the post date. Currently does nothing.
     *
     * @param date raw content of the date cell
     */
    protected void parseDate(String date) {
        //TODO: Implement date parser
        /*
        if (date.length() > 0) {
            try {
                currentPost.timestamp = CHAN_DATEFORMAT.parse(date).getTime();
            } catch (Exception e) {
                currentPost.timestamp = Calendar.getInstance().getTimeInMillis();
                Logger.e(TAG, "cannot parse date; make sure you choose the right DateFormat for this chan", e);
            }
        }
        */
    }

    /**
     * Consumes input up to and including the first occurrence of {@code sequence}, or to the
     * end of the input if the sequence does not occur.
     *
     * @param sequence the terminator to look for
     * @throws IOException if reading from the underlying reader fails
     */
    private void skipUntilSequence(char[] sequence) throws IOException {
        int len = sequence.length;
        if (len == 0) {
            return;
        }
        int pos = 0;
        int curChar;
        while ((curChar = _in.read()) != -1) {
            if (curChar == sequence[pos]) {
                ++pos;
                if (pos == len) {
                    break;
                }
            } else {
                if (pos != 0) {
                    pos = curChar == sequence[0] ? 1 : 0;
                }
            }
        }
    }

    /**
     * Consumes input up to and including the first occurrence of {@code sequence} and returns
     * what was read before it. If the input ends before the sequence is found, everything that
     * was read is returned unmodified.
     *
     * @param sequence the terminator to look for
     * @return the text preceding the terminator (without the terminator itself)
     * @throws IOException if reading from the underlying reader fails
     */
    private String readUntilSequence(char[] sequence) throws IOException {
        int len = sequence.length;
        if (len == 0) {
            return "";
        }
        readBuffer.setLength(0);
        int pos = 0;
        boolean matched = false;
        int curChar;
        while ((curChar = _in.read()) != -1) {
            readBuffer.append((char) curChar);
            if (curChar == sequence[pos]) {
                ++pos;
                if (pos == len) {
                    matched = true;
                    break;
                }
            } else {
                if (pos != 0) {
                    pos = curChar == sequence[0] ? 1 : 0;
                }
            }
        }
        if (matched) {
            // The buffer ends with the terminator: strip it. Only done on a real match so
            // that data read before an unexpected end of input is not truncated.
            readBuffer.setLength(readBuffer.length() - len);
        }
        return readBuffer.toString();
    }

    /**
     * Closes the underlying reader.
     *
     * @throws IOException if closing fails
     */
    @Override
    public void close() throws IOException {
        _in.close();
    }
}