package com.erakk.lnreader.parser;

import android.preference.PreferenceManager;
import android.util.Log;

import com.erakk.lnreader.Constants;
import com.erakk.lnreader.LNReaderApplication;
import com.erakk.lnreader.UIHelper;
import com.erakk.lnreader.dao.NovelsDao;
import com.erakk.lnreader.helper.Util;
import com.erakk.lnreader.model.BookModel;
import com.erakk.lnreader.model.ImageModel;
import com.erakk.lnreader.model.NovelCollectionModel;
import com.erakk.lnreader.model.PageModel;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;
import java.util.TimeZone;

/**
 * Static helpers shared by the wiki page parsers: image path rewriting, title clean-up,
 * chapter/volume validation, wiki API page-info parsing and cover/image URL handling.
 */
public class CommonParser {

    private static final String TAG = CommonParser.class.toString();

    /**
     * Rewrites the image {@code src} attributes in an HTML fragment so they point to the
     * local image root ({@code file://<root>/project/...}) instead of the server.
     * <p>
     * Handles plain {@code /project/images/} sources and {@code /project/thumb.php?f=..&width=..}
     * thumbnails (mapped to {@code /project/thumbs/<first letter>/<name>-<width>px.<ext>}), and
     * disables {@code srcset} by renaming the attribute.
     *
     * @param content HTML content to rewrite
     * @return the rewritten content
     */
    public static String replaceImagePath(String content) {
        String root = UIHelper.getImageRoot(LNReaderApplication.getInstance().getApplicationContext());

        // standard image
        String imagePath = "src=\"file://" + root + "/project/images/";
        content = content.replace("src=\"/project/images/", imagePath);

        // new thumbnail handling
        // /project/thumb.php?f=Masou_Gakuen_HxH_V09_BW_03.png&width=85
        // /project/thumbs/M/Masou_Gakuen_HxH_V09_BW_03-85px.png
        String regex = "src=\"/project/thumb.php.f=(.)([\\w-]*)\\.(\\w*)&width=(\\d+)\"";
        String replace = "src=\"/project/thumbs/$1/$1$2-$4px.$3\"";
        content = content.replaceAll(regex, replace);
        content = content.replace("src=\"/project/thumbs/", "src=\"file://" + root + "/project/thumbs/");

        // remove srcset
        content = content.replace("srcset=", "srcset-disabled=");

        return content;
    }

    /**
     * Collects every {@code img} element of the document as an {@link ImageModel}.
     * <p>
     * The image name is the part of the URL starting at the last {@code '/'} (slash included);
     * when the URL contains no slash at all the whole string is used. An image whose URL cannot
     * be parsed is still returned, but without a URL set.
     *
     * @param doc parsed document to scan
     * @return the images found, in document order
     */
    public static ArrayList<ImageModel> processImagesFromContent(Document doc) {
        String baseUrl = UIHelper.getBaseUrl(LNReaderApplication.getInstance().getApplicationContext());
        Elements imageElements = doc.select("img");
        ArrayList<ImageModel> images = new ArrayList<ImageModel>();

        for (Element imageElement : imageElements) {
            ImageModel image = new ImageModel();
            String urlStr = imageElement.attr("src").replace("/project/", baseUrl + "/project/");

            // Guard against a src without any '/', e.g. a missing/empty src attribute:
            // substring(-1) would throw StringIndexOutOfBoundsException.
            int lastSlash = urlStr.lastIndexOf("/");
            String name = lastSlash >= 0 ? urlStr.substring(lastSlash) : urlStr;
            image.setName(name);

            try {
                image.setUrl(new URL(urlStr));
            } catch (MalformedURLException e) {
                // shouldn't happen
                Log.e(TAG, "Invalid URL: " + urlStr, e);
            }
            images.add(image);
        }
        return images;
    }

    /**
     * Sanitizes a title by stripping markup tags, {@code [..]} blocks, {@code (PDF)},
     * "Full Text" markers and a trailing dash.
     *
     * @param title       raw title
     * @param isAggresive when {@code true} and the aggressive clean-up preference is enabled
     *                    (default on), only the text before the first bracket is kept
     * @return the cleaned, trimmed title
     */
    public static String sanitize(String title, boolean isAggresive) {
        Log.v(TAG, "Before: " + title);
        title = title.replaceAll("<.+?>", "") // Strip tags
                .replaceAll("\\[.+?\\]", "") // Strip [___]s
                .replaceAll("\\(PDF\\)", "") // Strip (PDF)
                .replaceAll("\\(Full Text.*\\)| - Full Text", ""); // strip (Full Text)
        title = title.trim();
        if (title.endsWith("-")) {
            title = title.substring(0, title.length() - 1);
            title = title.trim();
        }
        Log.v(TAG, "After: " + title);

        if (isAggresive) {
            if (PreferenceManager.getDefaultSharedPreferences(LNReaderApplication.getInstance().getApplicationContext()).getBoolean(Constants.PREF_AGGRESIVE_TITLE_CLEAN_UP, true)) {
                // Leaves only the text before brackets (might be a bit too aggressive)
                title = title.replaceAll("^(.+?)[(\\[].*$", "$1");
                Log.d(TAG, "After Aggressive: " + title);
            }
        }
        return title.trim();
    }

    /**
     * Filters out user pages and ISBN ({@code Special:BookSources}) links from a book's
     * chapters and renumbers the remaining chapters from 0.
     *
     * @param book book whose chapter collection is validated; the kept chapters get their
     *             order updated in place
     * @return a new list holding only the valid chapters
     */
    public static ArrayList<PageModel> validateNovelChapters(BookModel book) {
        ArrayList<PageModel> chapters = book.getChapterCollection();
        ArrayList<PageModel> validatedChapters = new ArrayList<PageModel>();
        int chapterOrder = 0;

        for (PageModel chapter : chapters) {
            if (chapter.getPage().contains("User:") // user page
                    || chapter.getPage().contains("Special:BookSources") // ISBN handler
                // || chapter.getPage().contains("redlink=1") // missing page
            ) {
                Log.d(TAG, "Skipping: " + chapter.getPage());
                continue;
            }
            chapter.setOrder(chapterOrder);
            validatedChapters.add(chapter);
            ++chapterOrder;
        }
        return validatedChapters;
    }

    /**
     * Drops the books that have no valid chapter left and renumbers the remaining books from 0.
     * The kept books are modified in place (chapter collection and order).
     *
     * @param books books to validate
     * @return a new list holding only the books with at least one valid chapter
     */
    public static ArrayList<BookModel> validateNovelBooks(ArrayList<BookModel> books) {
        ArrayList<BookModel> validatedBooks = new ArrayList<BookModel>();
        int bookOrder = 0;

        for (BookModel book : books) {
            ArrayList<PageModel> validatedChapters = validateNovelChapters(book);

            // keep the book only if it still has chapters
            if (validatedChapters.size() > 0) {
                book.setChapterCollection(validatedChapters);
                book.setOrder(bookOrder);
                validatedBooks.add(book);
                ++bookOrder;
            }
        }
        return validatedBooks;
    }

    /**
     * Returns the URL-encoded redirect target of the page, or {@code null} if the page is not
     * redirected (or the target cannot be encoded).
     *
     * @param doc  not used by this method
     * @param page page to check
     * @return the redirect target with spaces replaced by underscores and UTF-8 URL-encoded,
     * or {@code null}
     */
    public static String redirectedFrom(Document doc, PageModel page) {
        if (page.getRedirectedTo() == null) {
            return null;
        }
        try {
            return URLEncoder.encode(page.getRedirectedTo().replace(" ", "_"), "UTF-8");
        } catch (UnsupportedEncodingException e) {
            Log.e(TAG, "Error when encoding redirected pages", e);
            return null;
        }
    }

    /**
     * Parses the page info from the Wiki API response for a single page.
     *
     * @param pageModel page to update
     * @param doc       parsed API response for the given page
     * @param url       request url, used for log messages only
     * @return the same page model, updated in place
     * @throws Exception if the page name cannot be decoded or the timestamp cannot be parsed
     */
    public static PageModel parsePageAPI(PageModel pageModel, Document doc, String url) throws Exception {
        ArrayList<PageModel> temp = new ArrayList<PageModel>();
        temp.add(pageModel);
        temp = parsePageAPI(temp, doc, url);
        return temp.get(0);
    }

    /**
     * Parses the page info from the Wiki API response for a list of pages.
     * <p>
     * For each page the name is resolved through the {@code n} (normalized) and {@code r}
     * (redirect) elements, then the matching {@code page} element is used to set the last
     * update date, wiki id, categories, missing flag and, when empty, the title.
     *
     * @param pageModels pages to update
     * @param doc        parsed API response for the given pages
     * @param url        request url, used for log messages only
     * @return the same list, with its page models updated in place
     * @throws Exception if a page name cannot be decoded or a timestamp cannot be parsed
     */
    public static ArrayList<PageModel> parsePageAPI(ArrayList<PageModel> pageModels, Document doc, String url) throws Exception {
        Elements normalized = doc.select("n");
        Elements redirects = doc.select("r");
        Elements pages = doc.select("page");
        Log.d(TAG, "parsePageAPI pages size: " + pages.size());

        // The timestamp is a fixed machine format, so it must not depend on the device locale.
        DateFormat formatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.US);
        formatter.setTimeZone(TimeZone.getTimeZone("UTC"));

        for (int i = 0; i < pageModels.size(); ++i) {
            PageModel temp = pageModels.get(i);
            String to = URLDecoder.decode(temp.getPage(), "utf-8");
            Log.v(TAG, "parsePageAPI source: " + to);
            if (Util.isStringNullOrEmpty(to)) {
                Log.e(TAG, "Empty source detected for url: " + url);
                continue;
            }

            // NOTE(review): the selectors below are built by concatenating the page title;
            // titles containing selector meta characters (e.g. brackets) may not match - confirm.

            // get normalized value for this page
            Elements nElements = normalized.select("n[from=" + to + "]");
            if (nElements != null && nElements.size() > 0) {
                Element nElement = nElements.first();
                to = nElement.attr("to");
                Log.v(TAG, "parsePageAPI normalized: " + to);
                if (Util.isStringNullOrEmpty(to)) {
                    Log.e(TAG, "Empty normalized source detected for url: " + url);
                    continue;
                }
            }

            // check redirects
            if (redirects != null && redirects.size() > 0) {
                Elements rElements = redirects.select("r[from=" + to + "]");
                if (rElements != null && rElements.size() > 0) {
                    Element rElement = rElements.first();
                    to = rElement.attr("to");
                    temp.setRedirectedTo(to);
                    Log.w(TAG, "parsePageAPI redirected: " + to);
                    if (Util.isStringNullOrEmpty(to)) {
                        Log.e(TAG, "Empty redirected source detected for url: " + url);
                        continue;
                    }
                }
            }

            Element pElement = pages.select("page[title=" + to + "]").first();
            if (pElement == null) {
                Log.w(TAG, "parsePageAPI " + temp.getPage() + ": No Info, please check the url: " + url);
            } else if (!pElement.hasAttr("missing")) {
                // parse date, default use touched attr, if rev not available
                String tempDate = pElement.attr("touched");
                Element rev = pElement.select("rev").first();
                if (rev != null) {
                    tempDate = rev.attr("timestamp");
                    Log.v(TAG, "Using timestamp from revision");
                }

                // parse wiki id
                int wikiId = -1;
                try {
                    wikiId = Integer.parseInt(pElement.attr("pageid"));
                } catch (NumberFormatException nex) {
                    Log.e(TAG, String.format("Invalid pageid: '%s' for %s", pElement.attr("pageid"), temp.getPage()));
                }

                // parse categories
                ArrayList<String> tempCat = new ArrayList<>();
                try {
                    Element eCategoryRoot = pElement.select("categories").first();
                    if (eCategoryRoot != null) {
                        Elements eCategories = eCategoryRoot.select("cl");
                        if (eCategories != null) {
                            for (Element eCategory : eCategories) {
                                String category = eCategory.attr("title");
                                Log.d(TAG, "Found category: " + category);
                                tempCat.add(category);
                            }
                        }
                    }
                } catch (Exception ex) {
                    Log.e(TAG, "Cannot get categories for: " + temp.getPage(), ex);
                }

                // update data
                if (!Util.isStringNullOrEmpty(tempDate)) {
                    Date lastUpdate = formatter.parse(tempDate);
                    temp.setLastUpdate(lastUpdate);
                    temp.setMissing(false);
                    temp.setWikiId(wikiId);
                    temp.setCategories(tempCat);
                    if (Util.isStringNullOrEmpty(temp.getTitle())) {
                        temp.setTitle(to);
                    }
                    Log.d(TAG, String.format("parsePageAPI [%s]%s Last Update: %s ", temp.getPage(), temp.getWikiId(), temp.getLastUpdate()));
                } else {
                    Log.w(TAG, "parsePageAPI " + temp.getPage() + " No Last Update Information!");
                }
            } else {
                temp.setMissing(true);
                Log.w(TAG, "parsePageAPI missing page info: " + to);
            }

            // a redlink is always a missing page
            if (temp.getPage().contains("redlink=1")) {
                temp.setMissing(true);
            }
        }
        return pageModels;
    }

    /**
     * Builds the url of the image file page
     * ({@code <base url>/project/index.php?title=File:xxx}) from an image or thumbnail url.
     *
     * @param imageUrl url (or local path) of the image or of one of its thumbnails
     * @return the url of the matching {@code File:} page
     */
    public static String getImageFilePageFromImageUrl(String imageUrl) {
        String pageUrl = "";
        // http://www.baka-tsuki.org/project/images/4/4a/Bakemonogatari_Up.png
        // http://www.baka-tsuki.org/project/images/thumb/4/4a/Bakemonogatari_Up.png/200px-Bakemonogatari_Up.png
        // http://www.baka-tsuki.org/project/index.php?title=File:Bakemonogatari_Up.png
        // http://www.baka-tsuki.org/project/thumb.php?f=KNT_V01_NewCover.jpg&width=250
        String[] tokens = imageUrl.split("/");
        if (imageUrl.contains("/thumb/")) {
            // from thumbnail: the file name is the segment right before the sized thumbnail name.
            // Indexed from the end so it does not depend on the scheme/host/prefix length.
            if (tokens.length >= 2) {
                pageUrl = tokens[tokens.length - 2];
            }
        } else if (imageUrl.contains("/thumbs/")) {
            // from new thumbnail
            // /storage/emulated/0/.bakareaderex2/project/thumbs/S/Survey_cover-300px.jpg
            pageUrl = tokens[tokens.length - 1];
            pageUrl = pageUrl.replaceAll("-\\d+px", "");
        } else if (imageUrl.contains("/thumb.php?")) {
            String[] temp = imageUrl.split("f=");
            temp = temp[1].split("&");
            pageUrl = temp[0];
        } else {
            // from full page: the file name is the last segment
            if (tokens.length >= 1) {
                pageUrl = tokens[tokens.length - 1];
            }
        }
        pageUrl = UIHelper.getBaseUrl(LNReaderApplication.getInstance()) + "/project/index.php?title=File:" + pageUrl;
        return pageUrl;
    }

    /**
     * Gets the image model from a {@code /project/index.php?title=File:xxx} page, using the
     * first link inside the {@code .fullMedia} element of {@code #mw-content-text}.
     * <p>
     * NOTE(review): throws NullPointerException when the page lacks those elements - callers
     * presumably handle that; confirm.
     *
     * @param doc parsed file page
     * @return an image model with its URL set (no URL if it is malformed)
     */
    public static ImageModel parseImagePage(Document doc) {
        ImageModel image = new ImageModel();

        Element mainContent = doc.select("#mw-content-text").first();
        Element fullMedia = mainContent.select(".fullMedia").first();
        String imageUrl = fullMedia.select("a").first().attr("href");

        try {
            image.setUrl(new URL(UIHelper.getBaseUrl(LNReaderApplication.getInstance().getApplicationContext()) + imageUrl));
        } catch (MalformedURLException e) {
            // shouldn't happen
            Log.e(TAG, "Invalid URL: " + UIHelper.getBaseUrl(LNReaderApplication.getInstance().getApplicationContext()) + imageUrl, e);
        }
        return image;
    }

    /**
     * Gets all distinct {@code /project/index.php?title=File:xxx} links from the content.
     * Links that do not start with {@code http} are prefixed with the base url.
     *
     * @param doc parsed content page
     * @return the absolute file page urls, without duplicates, in document order
     */
    public static ArrayList<String> parseImagesFromContentPage(Document doc) {
        ArrayList<String> result = new ArrayList<String>();

        Elements links = doc.select("a");
        for (Element link : links) {
            String href = link.attr("href");
            if (href.contains("/project/index.php?title=File:")) {
                if (!href.startsWith("http")) {
                    href = UIHelper.getBaseUrl(LNReaderApplication.getInstance().getApplicationContext()) + href;
                }
                if (!result.contains(href)) {
                    result.add(href);
                }
            }
        }

        Log.d(TAG, "Images Found: " + result.size());
        return result;
    }

    /**
     * Converts an {@code <a>} element to a chapter page model.
     *
     * @param title        raw chapter title, sanitized (non-aggressively) before use
     * @param parent       parent identifier of the chapter
     * @param chapterOrder order of the chapter
     * @param link         the link element
     * @param language     language of the chapter
     * @return the chapter, or {@code null} when the link is a redlink and redlinks are not
     * included in updates
     */
    public static PageModel processA(String title, String parent, int chapterOrder, Element link, String language) {
        String href = link.attr("href");

        // handle redlink
        if (!UIHelper.getUpdateIncludeRedlink(LNReaderApplication.getInstance().getApplicationContext()) && href.contains("&redlink=1")) {
            return null;
        }

        PageModel p = new PageModel();
        p.setTitle(CommonParser.sanitize(title, false));
        p.setParent(parent);
        p.setType(PageModel.TYPE_CONTENT);
        p.setOrder(chapterOrder);
        p.setLastUpdate(new Date(0));
        p.setLanguage(language);

        if (link.className().contains("external text")) {
            // External link
            p.setExternal(true);
            p.setPage(Util.SanitizeBaseUrl(href, false));
        } else {
            p.setExternal(false);
            String tempPage = normalizeInternalUrl(href);
            p.setPage(tempPage);
        }
        return p;
    }

    /**
     * Converts the links of an {@code li} element to chapters. {@code User_talk:} links are
     * skipped. Every chapter created here gets the same {@code chapterOrder}.
     *
     * @param li           list item element
     * @param parent       parent identifier of the chapters
     * @param chapterOrder order assigned to the chapters
     * @param language     language of the chapters
     * @return the chapters found, possibly empty
     */
    public static ArrayList<PageModel> processLI(Element li, String parent, int chapterOrder, String language) {
        ArrayList<PageModel> pageModels = new ArrayList<>();

        Elements links = li.select("a");
        if (links != null && links.size() > 0) {
            for (Element link : links) {
                // skip if User_talk:
                if (link.attr("href").contains("User_talk:")) {
                    continue;
                }

                // use only the link text when the link is a direct child of the li,
                // otherwise use the whole li text
                String linkText = link.text();
                if (link.parent() != li) {
                    linkText = li.text();
                }

                PageModel p = processA(linkText, parent, chapterOrder, link, language);
                if (p != null) {
                    pageModels.add(p);
                }
            }
        }
        return pageModels;
    }

    /**
     * Gets the volume name from a heading element, parses its chapter list and appends the
     * resulting book to {@code books}.
     * <p>
     * When no chapter list follows the heading, the first link inside the heading that points
     * to the site is used as the single chapter of the book.
     *
     * @param novel       novel being parsed
     * @param books       list the new book is appended to
     * @param bookElement heading element of the volume
     * @param bookOrder   order of the new book
     * @param language    language of the chapters
     * @return {@code bookOrder + 1}
     */
    public static int processH3(NovelCollectionModel novel, ArrayList<BookModel> books, Element bookElement, int bookOrder, String language) {
        BookModel book = new BookModel();
        // a heading containing a link gets the aggressive title clean-up
        if (bookElement.html().contains("href")) {
            book.setTitle(CommonParser.sanitize(bookElement.text(), true));
        } else {
            book.setTitle(CommonParser.sanitize(bookElement.text(), false));
        }
        String parent = novel.getPage() + Constants.NOVEL_BOOK_DIVIDER + book.getTitle();
        book.setOrder(bookOrder);

        ArrayList<PageModel> chapterCollection = parseChapters(novel, bookElement, language, parent);
        if (chapterCollection.size() == 0) {
            Elements bookLinks = bookElement.select("a");
            if (bookLinks != null) {
                for (Element a : bookLinks) {
                    Log.e(TAG, "Got linked Volume without chapter list: " + a.text() + " => " + a.attr("href"));
                    if (a.attr("href").startsWith(Constants.ROOT_URL) || a.attr("href").startsWith(UIHelper.getBaseUrl(LNReaderApplication.getInstance()))) {
                        PageModel p = processA(a.text(), parent, 0, a, language);
                        if (p != null) {
                            Log.i(TAG, "Added chapter list: " + a.text() + " => " + a.attr("href"));
                            chapterCollection.add(p);
                            break;
                        }
                    }
                }
            }
        }
        book.setChapterCollection(chapterCollection);
        books.add(book);
        ++bookOrder;
        return bookOrder;
    }

    /**
     * Parses the chapters from the {@code li} elements of the siblings following the given
     * element, stopping at the next {@code h2}/{@code h3}/{@code h4} sibling or at the end.
     *
     * @param novel       not used by this method
     * @param bookElement element whose following siblings are scanned
     * @param language    language of the chapters
     * @param parent      parent identifier of the chapters
     * @return the chapters found, possibly empty
     */
    public static ArrayList<PageModel> parseChapters(NovelCollectionModel novel, Element bookElement, String language, String parent) {
        ArrayList<PageModel> chapterCollection = new ArrayList<PageModel>();
        int chapterOrder = 0;

        Element chapterElement = bookElement.nextElementSibling();
        while (chapterElement != null && !isSectionHeading(chapterElement)) {
            Elements chapters = chapterElement.select("li");
            for (Element chapter : chapters) {
                ArrayList<PageModel> pageModels = processLI(chapter, parent, chapterOrder, language);
                for (PageModel p : pageModels) {
                    if (p != null) {
                        chapterCollection.add(p);
                        ++chapterOrder;
                    }
                }
            }
            chapterElement = chapterElement.nextElementSibling();
        }
        return chapterCollection;
    }

    /**
     * Tells whether the element is an h2, h3 or h4 heading. Uses equals(): comparing tag
     * names with == only checks reference identity.
     */
    private static boolean isSectionHeading(Element element) {
        String tagName = element.tagName();
        return "h2".equals(tagName) || "h3".equals(tagName) || "h4".equals(tagName);
    }

    /**
     * Removes {@code /project/index.php?title=} and the site root (https and http constants)
     * from an internal url.
     *
     * @param url internal url
     * @return the page name part of the url
     */
    public static String normalizeInternalUrl(String url) {
        return url.replace("/project/index.php?title=", "").replace(Constants.ROOT_HTTPS, "").replace(Constants.ROOT_URL, "");
    }

    /**
     * Parses the novel cover from the first element with css class {@code .thumbimage} and
     * stores it on the novel (cover string, plus cover URL when it is well formed).
     * <p>
     * When the big cover option is enabled, a thumbnail url is converted to the url of the
     * original image; for {@code .php?} thumbnails the original is looked up through
     * {@link NovelsDao}.
     *
     * @param doc   parsed novel page
     * @param novel novel to update
     * @return the cover url, or an empty string when no cover is found
     */
    public static String parseNovelCover(Document doc, NovelCollectionModel novel) {
        String imageUrl = "";
        Elements images = doc.select(".thumbimage");
        if (images.size() > 0) {
            imageUrl = images.first().attr("src");
            if (!imageUrl.startsWith("http")) {
                imageUrl = UIHelper.getBaseUrl(LNReaderApplication.getInstance()) + imageUrl;
            }

            // http://www.baka-tsuki.org/project/images/thumb/f/f5/Daimaou_v01_cover.jpg/294px-Daimaou_v01_cover.jpg
            if (UIHelper.isUseBigCover(LNReaderApplication.getInstance())) {
                if (imageUrl.contains("/thumb/")) {
                    // drop the thumb directory and the trailing sized file name
                    imageUrl = imageUrl.replace("/thumb/", "/");
                    imageUrl = imageUrl.substring(0, imageUrl.lastIndexOf("/"));
                } else if (imageUrl.contains(".php?")) {
                    // http://www.baka-tsuki.org/project/thumb.php?f=KNT_V01_NewCover.jpg&width=250
                    // http://www.baka-tsuki.org/project/index.php?title=File:Znt_novel_cover.jpg
                    // need to check the original file
                    String filePage = getImageFilePageFromImageUrl(imageUrl);
                    ImageModel image = new ImageModel();
                    image.setName(filePage);
                    try {
                        image = NovelsDao.getInstance().getImageModelFromInternet(image, novel.getPage(), null);
                        imageUrl = image.getUrl().toString();
                    } catch (Exception ex) {
                        // best effort: keep the thumbnail url
                        Log.e(TAG, "Failed parsing big cover: " + filePage, ex);
                    }
                }
            }
            Log.d(TAG, "Cover: " + imageUrl);
        }

        novel.setCover(imageUrl);
        if (imageUrl != null && imageUrl.length() > 0) {
            try {
                URL url = new URL(imageUrl);
                novel.setCoverUrl(url);
            } catch (MalformedURLException e) {
                Log.e(TAG, "Invalid URL: " + imageUrl, e);
            }
        }
        return imageUrl;
    }
}