/* * Copyright 2016 Hippo Seven * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.hippo.ehviewer.client.parser; import android.support.annotation.NonNull; import android.support.annotation.Nullable; import android.text.TextUtils; import com.hippo.ehviewer.Settings; import com.hippo.ehviewer.client.EhUrl; import com.hippo.ehviewer.client.EhUtils; import com.hippo.ehviewer.client.data.GalleryComment; import com.hippo.ehviewer.client.data.GalleryDetail; import com.hippo.ehviewer.client.data.GalleryTagGroup; import com.hippo.ehviewer.client.data.LargePreviewSet; import com.hippo.ehviewer.client.data.NormalPreviewSet; import com.hippo.ehviewer.client.data.PreviewSet; import com.hippo.ehviewer.client.exception.EhException; import com.hippo.ehviewer.client.exception.OffensiveException; import com.hippo.ehviewer.client.exception.ParseException; import com.hippo.ehviewer.client.exception.PiningException; import com.hippo.util.JsoupUtils; import com.hippo.yorozuya.NumberUtils; import com.hippo.yorozuya.StringUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.LinkedList; import java.util.List; import java.util.Locale; import java.util.TimeZone; import java.util.regex.Matcher; import java.util.regex.Pattern; public class GalleryDetailParser { private static final Pattern PATTERN_ERROR = Pattern.compile("<div class=\"d\">\n<p>([^<]+)</p>"); private static final Pattern PATTERN_DETAIL = Pattern.compile("var gid = (\\d+);.+?var token = \"([a-f0-9]+)\";.+?var apiuid = ([\\-\\d]+);.+?var apikey = \"([a-f0-9]+)\";", Pattern.DOTALL); private static final Pattern PATTERN_TORRENT = Pattern.compile("<a[^<>]*onclick=\"return popUp\\('([^']+)'[^)]+\\)\">Torrent Download \\( (\\d+) \\)</a>"); private static final Pattern PATTERN_ARCHIVE = Pattern.compile("<a[^<>]*onclick=\"return popUp\\('([^']+)'[^)]+\\)\">Archive Download</a>"); private static final Pattern PATTERN_TAG_GROUP = Pattern.compile("<tr><td[^<>]+>([\\w\\s]+):</td><td>(?:<div[^<>]+><a[^<>]+>[\\w\\s]+</a></div>)+</td></tr>"); private static final Pattern PATTERN_TAG = Pattern.compile("<div[^<>]+><a[^<>]+>([\\w\\s]+)</a></div>"); private static final Pattern PATTERN_COMMENT = Pattern.compile("<div class=\"c3\">Posted on ([^<>]+) by:   <a[^<>]+>([^<>]+)</a>.+?<div class=\"c6\"[^>]*>(.+?)</div><div class=\"c[78]\""); private static final Pattern PATTERN_PAGES = Pattern.compile("<tr><td[^<>]*>Length:</td><td[^<>]*>([\\d,]+) pages</td></tr>"); private static final Pattern PATTERN_PREVIEW_PAGES = Pattern.compile("<td[^>]+><a[^>]+>([\\d,]+)</a></td><td[^>]+>(?:<a[^>]+>)?>(?:</a>)?</td>"); private static final Pattern PATTERN_NORMAL_PREVIEW = Pattern.compile("<div class=\"gdtm\"[^<>]*><div[^<>]*width:(\\d+)[^<>]*height:(\\d+)[^<>]*\\((.+?)\\)[^<>]*-(\\d+)px[^<>]*><a[^<>]*href=\"(.+?)\"[^<>]*><img alt=\"([\\d,]+)\""); private static final Pattern PATTERN_LARGE_PREVIEW = Pattern.compile("<div class=\"gdtl\".+?<a href=\"(.+?)\"><img alt=\"([\\d,]+)\".+?src=\"(.+?)\""); private static final GalleryTagGroup[] EMPTY_GALLERY_TAG_GROUP_ARRAY = new GalleryTagGroup[0]; private static final GalleryComment[] EMPTY_GALLERY_COMMENT_ARRAY = new GalleryComment[0]; private static final DateFormat WEB_COMMENT_DATE_FORMAT = new SimpleDateFormat("dd MMMMM yyyy, HH:mm z", Locale.US); static { WEB_COMMENT_DATE_FORMAT.setTimeZone(TimeZone.getTimeZone("UTC")); } private static final String OFFENSIVE_STRING = "<p>(And if you choose to ignore this warning, you lose all rights to complain about it in the future.)</p>"; private static final String PINING_STRING = "<p>This gallery is pining for the fjords.</p>"; public static GalleryDetail parse(String body) throws EhException { if (body.contains(OFFENSIVE_STRING)) { throw new OffensiveException(); } if (body.contains(PINING_STRING)) { throw new PiningException(); } // Error info Matcher m = PATTERN_ERROR.matcher(body); if (m.find()) { throw new EhException(m.group(1)); } GalleryDetail galleryDetail = new GalleryDetail(); Document document = Jsoup.parse(body); parseDetail(galleryDetail, document, body); galleryDetail.tags = parseTagGroups(document); galleryDetail.comments = parseComments(document); galleryDetail.previewPages = parsePreviewPages(document, body); galleryDetail.previewSet = parsePreviewSet(document, body); return galleryDetail; } @SuppressWarnings("ConstantConditions") private static void parseDetail(GalleryDetail gd, Document d, String body) throws ParseException { Matcher matcher = PATTERN_DETAIL.matcher(body); if (matcher.find()) { gd.gid = Long.parseLong(matcher.group(1)); gd.token = matcher.group(2); gd.apiUid = NumberUtils.parseLongSafely(matcher.group(3), -1L); gd.apiKey = matcher.group(4); } else { throw new ParseException("Can't parse gallery detail", body); } matcher = PATTERN_TORRENT.matcher(body); if (matcher.find()) { gd.torrentUrl = StringUtils.unescapeXml(StringUtils.trim(matcher.group(1))); gd.torrentCount = NumberUtils.parseIntSafely(matcher.group(2), 0); } else { gd.torrentCount = 0; gd.torrentUrl = ""; } matcher = PATTERN_ARCHIVE.matcher(body); if (matcher.find()) { gd.archiveUrl = StringUtils.unescapeXml(StringUtils.trim(matcher.group(1))); } else { gd.archiveUrl = ""; } try { Element gm = JsoupUtils.getElementByClass(d, "gm"); // Thumb url Element gd1 = gm.getElementById("gd1"); try { gd.thumb = EhUtils.handleThumbUrlResolution(StringUtils.trim(gd1.child(0).attr("src"))); } catch (Exception e) { gd.thumb = ""; } // Title Element gn = gm.getElementById("gn"); if (null != gn) { gd.title = StringUtils.trim(gn.text()); } else { gd.title = ""; } // Jpn title Element gj = gm.getElementById("gj"); if (null != gj) { gd.titleJpn = StringUtils.trim(gj.text()); } else { gd.titleJpn = ""; } // Category Element gdc = gm.getElementById("gdc"); try { String href = gdc.child(0).attr("href"); String category = href.substring(href.lastIndexOf('/') + 1); gd.category = EhUtils.getCategory(category); } catch (Exception e) { gd.category = EhUtils.UNKNOWN; } // Uploader Element gdn = gm.getElementById("gdn"); if (null != gdn) { gd.uploader = StringUtils.trim(gdn.text()); } else { gd.uploader = ""; } Element gdd = gm.getElementById("gdd"); gd.posted = ""; gd.parent = ""; gd.visible = ""; gd.visible = ""; gd.size = ""; gd.pages = 0; gd.favoriteCount = 0; try { Elements es = gdd.child(0).child(0).children(); for (int i = 0, n = es.size(); i < n; i++) { parseDetailInfo(gd, es.get(i), body); } } catch (Exception e) { // Ignore } // Rating count Element rating_count = gm.getElementById("rating_count"); if (null != rating_count) { gd.ratingCount = NumberUtils.parseIntSafely( StringUtils.trim(rating_count.text()), 0); } else { gd.ratingCount = 0; } // Rating Element rating_label = gm.getElementById("rating_label"); if (null != rating_label) { String ratingStr = StringUtils.trim(rating_label.text()); if ("Not Yet Rated".equals(ratingStr)) { gd.rating = -1.0f; } else { int index = ratingStr.indexOf(' '); if (index == -1 || index >= ratingStr.length()) { gd.rating = 0f; } else { gd.rating = NumberUtils.parseFloatSafely(ratingStr.substring(index + 1), 0f); } } } else { gd.rating = -1.0f; } // isFavorited Element gdf = gm.getElementById("gdf"); gd.isFavorited = null != gdf && !StringUtils.trim(gdf.text()).equals("Add to Favorites"); } catch (Exception e) { throw new ParseException("Can't parse gallery detail", body); } } private static void parseDetailInfo(GalleryDetail gd, Element e, String body) { Elements es = e.children(); if (es.size() < 2) { return; } String key = StringUtils.trim(es.get(0).text()); String value = StringUtils.trim(es.get(1).ownText()); if (key.startsWith("Posted")) { gd.posted = value; } else if (key.startsWith("Parent")) { gd.parent = value; } else if (key.startsWith("Visible")) { gd.visible = value; } else if (key.startsWith("Language")) { gd.language = value; } else if (key.startsWith("File Size")) { gd.size = value; } else if (key.startsWith("Length")) { int index = value.indexOf(' '); if (index >= 0) { gd.pages = NumberUtils.parseIntSafely(value.substring(0, index), 1); } else { gd.pages = 1; } } else if (key.startsWith("Favorited")) { switch (value) { case "Never": gd.favoriteCount = 0; break; case "Once": gd.favoriteCount = 1; break; default: int index = value.indexOf(' '); if (index == -1) { gd.favoriteCount = 0; } else { gd.favoriteCount = NumberUtils.parseIntSafely(value.substring(0, index), 0); } break; } } } @Nullable private static GalleryTagGroup parseTagGroup(Element element) { try { GalleryTagGroup group = new GalleryTagGroup(); String nameSpace = element.child(0).text(); // Remove last ':' nameSpace = nameSpace.substring(0, nameSpace.length() - 1); group.groupName = nameSpace; Elements tags = element.child(1).children(); for (int i = 0, n = tags.size(); i < n; i++) { String tag = tags.get(i).text(); // Sometimes parody tag is followed with '|' and english translate, just remove them int index = tag.indexOf('|'); if (index >= 0) { tag = tag.substring(0, index); } group.addTag(tag); } return group.size() > 0 ? group : null; } catch (Exception e) { e.printStackTrace(); return null; } } /** * Parse tag groups with html parser */ @NonNull public static GalleryTagGroup[] parseTagGroups(Document document) { try { Element taglist = document.getElementById("taglist"); Elements tagGroups = taglist.child(0).child(0).children(); List<GalleryTagGroup> list = new ArrayList<>(tagGroups.size()); for (int i = 0, n = tagGroups.size(); i < n; i++) { GalleryTagGroup group = parseTagGroup(tagGroups.get(i)); if (null != group) { list.add(group); } } return list.toArray(new GalleryTagGroup[list.size()]); } catch (Exception e) { e.printStackTrace(); return EMPTY_GALLERY_TAG_GROUP_ARRAY; } } /** * Parse tag groups with regular expressions */ @NonNull private static GalleryTagGroup[] parseTagGroups(String body) throws EhException { List<GalleryTagGroup> list = new LinkedList<>(); Matcher m = PATTERN_TAG_GROUP.matcher(body); while (m.find()) { GalleryTagGroup tagGroup = new GalleryTagGroup(); tagGroup.groupName = ParserUtils.trim(m.group(1)); parseGroup(tagGroup, m.group(0)); list.add(tagGroup); } return list.toArray(new GalleryTagGroup[list.size()]); } private static void parseGroup(GalleryTagGroup tagGroup, String body) { Matcher m = PATTERN_TAG.matcher(body); while (m.find()) { tagGroup.addTag(ParserUtils.trim(m.group(1))); } } @Nullable @SuppressWarnings("ConstantConditions") public static GalleryComment parseComment(Element element) { try { GalleryComment comment = new GalleryComment(); // Id Element a = element.previousElementSibling(); String name = a.attr("name"); comment.id = Integer.parseInt(StringUtils.trim(name).substring(1)); // Vote up and vote down Element c4 = JsoupUtils.getElementByClass(element, "c4"); if (null != c4) { Elements es = c4.children(); if (2 == es.size()) { comment.voteUp = !TextUtils.isEmpty(StringUtils.trim(es.get(0).attr("style"))); comment.voteDown = !TextUtils.isEmpty(StringUtils.trim(es.get(1).attr("style"))); } } // Vote state Element c7 = JsoupUtils.getElementByClass(element, "c7"); if (null != c7) { comment.voteState = StringUtils.trim(c7.text()); } // Score Element c5 = JsoupUtils.getElementByClass(element, "c5"); if (null != c5) { Elements es = c5.children(); if (!es.isEmpty()) { comment.score = NumberUtils.parseIntSafely(StringUtils.trim(es.get(0).text()), 0); } } // time Element c3 = JsoupUtils.getElementByClass(element, "c3"); String temp = c3.ownText(); temp = temp.substring("Posted on ".length(), temp.length() - " by:  ".length()); comment.time = WEB_COMMENT_DATE_FORMAT.parse(temp).getTime(); // user comment.user = c3.child(0).text(); // comment comment.comment = JsoupUtils.getElementByClass(element, "c6").html(); return comment; } catch (Exception e) { e.printStackTrace(); return null; } } /** * Parse comments with html parser */ @NonNull public static GalleryComment[] parseComments(Document document) { try { Element cdiv = document.getElementById("cdiv"); Elements c1s = cdiv.getElementsByClass("c1"); List<GalleryComment> list = new ArrayList<>(c1s.size()); for (int i = 0, n = c1s.size(); i < n; i++) { GalleryComment comment = parseComment(c1s.get(i)); if (null != comment) { list.add(comment); } } return list.toArray(new GalleryComment[list.size()]); } catch (Exception e) { e.printStackTrace(); return EMPTY_GALLERY_COMMENT_ARRAY; } } /** * Parse comments with regular expressions */ @NonNull public static GalleryComment[] parseComments(String body) { List<GalleryComment> list = new LinkedList<>(); Matcher m = PATTERN_COMMENT.matcher(body); while (m.find()) { String webDateString = ParserUtils.trim(m.group(1)); Date date; try { date = WEB_COMMENT_DATE_FORMAT.parse(webDateString); } catch (java.text.ParseException e) { date = new Date(0L); } GalleryComment comment = new GalleryComment(); comment.time = date.getTime(); comment.user = ParserUtils.trim(m.group(2)); comment.comment = m.group(3); list.add(comment); } return list.toArray(new GalleryComment[list.size()]); } /** * Parse preview pages with html parser */ public static int parsePreviewPages(Document document, String body) throws ParseException { try { Elements elements = document.getElementsByClass("ptt").first().child(0).child(0).children(); return Integer.parseInt(elements.get(elements.size() - 2).text()); } catch (Exception e) { e.printStackTrace(); throw new ParseException("Can't parse preview pages", body); } } /** * Parse preview pages with regular expressions */ public static int parsePreviewPages(String body) throws ParseException { Matcher m = PATTERN_PREVIEW_PAGES.matcher(body); int previewPages = -1; if (m.find()) { previewPages = ParserUtils.parseInt(m.group(1)); } if (previewPages <= 0) { throw new ParseException("Parse preview page count error", body); } return previewPages; } /** * Parse pages with regular expressions */ public static int parsePages(String body) throws ParseException { Matcher m = PATTERN_PAGES.matcher(body); if (m.find()) { return ParserUtils.parseInt(m.group(1)); } else { throw new ParseException("Parse pages error", body); } } public static PreviewSet parsePreviewSet(Document d, String body) throws ParseException { try { return parseLargePreviewSet(d, body); } catch (ParseException e) { return parseNormalPreviewSet(body); } } public static PreviewSet parsePreviewSet(String body) throws ParseException { try { return parseLargePreviewSet(body); } catch (ParseException e) { return parseNormalPreviewSet(body); } } /** * Parse large previews with regular expressions */ private static LargePreviewSet parseLargePreviewSet(Document d, String body) throws ParseException { try { LargePreviewSet largePreviewSet = new LargePreviewSet(); Element gdt = d.getElementById("gdt"); Elements gdtls = gdt.getElementsByClass("gdtl"); int n = gdtls.size(); if (n <= 0) { throw new ParseException("Can't parse large preview", body); } for (int i = 0; i < n; i++) { Element element = gdtls.get(i).child(0); String pageUrl = element.attr("href"); element = element.child(0); String imageUrl = element.attr("src"); if (Settings.getFixThumbUrl()) { imageUrl = EhUrl.getFixedPreviewThumbUrl(imageUrl); } int index = Integer.parseInt(element.attr("alt")) - 1; largePreviewSet.addItem(index, imageUrl, pageUrl); } return largePreviewSet; } catch (Exception e) { e.printStackTrace(); throw new ParseException("Can't parse large preview", body); } } /** * Parse large previews with regular expressions */ private static LargePreviewSet parseLargePreviewSet(String body) throws ParseException { Matcher m = PATTERN_LARGE_PREVIEW.matcher(body); LargePreviewSet largePreviewSet = new LargePreviewSet(); while (m.find()) { int index = ParserUtils.parseInt(m.group(2)) - 1; String imageUrl = ParserUtils.trim(m.group(3)); String pageUrl = ParserUtils.trim(m.group(1)); if (Settings.getFixThumbUrl()) { imageUrl = EhUrl.getFixedPreviewThumbUrl(imageUrl); } largePreviewSet.addItem(index, imageUrl, pageUrl); } if (largePreviewSet.size() == 0) { throw new ParseException("Can't parse large preview", body); } return largePreviewSet; } /** * Parse normal previews with regular expressions */ private static NormalPreviewSet parseNormalPreviewSet(String body) throws ParseException { Matcher m = PATTERN_NORMAL_PREVIEW.matcher(body); NormalPreviewSet normalPreviewSet = new NormalPreviewSet(); while (m.find()) { normalPreviewSet.addItem(ParserUtils.parseInt(m.group(6)) - 1, ParserUtils.trim(m.group(3)), ParserUtils.parseInt((m.group(4))), 0, ParserUtils.parseInt(m.group(1)), ParserUtils.parseInt(m.group(2)), ParserUtils.trim(m.group(5))); } if (normalPreviewSet.size() == 0) { throw new ParseException("Can't parse normal preview", body); } return normalPreviewSet; } }