package com.dotcool.reader.parser; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URL; import java.net.URLDecoder; import java.net.URLEncoder; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.Iterator; import java.util.Locale; import java.util.TimeZone; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import android.preference.PreferenceManager; import android.util.Log; import com.dotcool.reader.Constants; import com.dotcool.reader.LNReaderApplication; import com.dotcool.reader.UIHelper; import com.dotcool.reader.helper.Util; import com.dotcool.reader.model.BookModel; import com.dotcool.reader.model.ImageModel; import com.dotcool.reader.model.PageModel; public class CommonParser { private static final String TAG = CommonParser.class.toString(); /** * Set Up image path * * @param content * @return */ public static String replaceImagePath(String content) { String imagePath = "src=\"file://" + UIHelper.getImageRoot(LNReaderApplication.getInstance().getApplicationContext()) + "/project/images/"; content = content.replace("src=\"/project/images/", imagePath); return content; } /** * Get all img element and update the src from /project/ to rootImagePath/project/ * @param doc * @param rootImagePath * @return */ public static ArrayList<ImageModel> getAllImagesFromContent(Document doc, String rootImagePath) { Elements imageElements = doc.select("img"); ArrayList<ImageModel> images = new ArrayList<ImageModel>(); for (Element imageElement : imageElements) { ImageModel image = new ImageModel(); String urlStr = imageElement.attr("src").replace("/project/", rootImagePath + "/project/"); //imageElement.attr("src", urlStr); String name = urlStr.substring(urlStr.lastIndexOf("/")); image.setName(name); try { image.setUrl(new URL(urlStr)); } catch (MalformedURLException e) { // shouldn't happened Log.e(TAG, "Invalid URL: " + urlStr, e); } images.add(image); // Log.d("ParseNovelContent", image.getName() + "==>" + image.getUrl().toString()); } return images; } /** * Sanitizes a title by removing unnecessary stuff. * * @param title * @return */ public static String sanitize(String title, boolean isAggresive) { Log.d(TAG, "Before: " + title); title = title.replaceAll("<.+?>", "") // Strip tags .replaceAll("\\[.+?\\]", "") // Strip [___]s .replaceAll("- PDF", "").replaceAll("\\(PDF\\)", "") // Strip (PDF) // Strip - (Full Text) .replaceAll("- (Full Text)", "").replaceAll("- \\(.*Full Text.*\\)", "").replace("\\(.*Full Text.*\\)", ""); Log.d(TAG, "After: " + title); if (isAggresive) { if (PreferenceManager.getDefaultSharedPreferences(LNReaderApplication.getInstance().getApplicationContext()).getBoolean(Constants.PREF_AGGRESIVE_TITLE_CLEAN_UP, true)) { // Leaves only the text before brackets (might be a bit too aggressive) title = title.replaceAll("^(.+?)[(\\[].*$", "$1"); Log.d(TAG, "After Aggresive: " + title); } } return title.trim(); } /** * Remove redlink, user, and ISBN page * * @param book * @return */ public static ArrayList<PageModel> validateNovelChapters(BookModel book) { ArrayList<PageModel> chapters = book.getChapterCollection(); ArrayList<PageModel> validatedChapters = new ArrayList<PageModel>(); int chapterOrder = 0; for (Iterator<PageModel> iChapter = chapters.iterator(); iChapter.hasNext();) { PageModel chapter = iChapter.next(); // redlink=1 means chapter is missing, commented out to include missing chapters if (!(//chapter.getPage().contains("redlink=1") || // missing page chapter.getPage().contains("User:") || // user page chapter.getPage().contains("Special:BookSources") // ISBN handler )) { chapter.setOrder(chapterOrder); validatedChapters.add(chapter); ++chapterOrder; } } return validatedChapters; } /** * Remove invalid chapter from volumes * * @param books * @return */ public static ArrayList<BookModel> validateNovelBooks(ArrayList<BookModel> books) { ArrayList<BookModel> validatedBooks = new ArrayList<BookModel>(); int bookOrder = 0; for (Iterator<BookModel> iBooks = books.iterator(); iBooks.hasNext();) { BookModel book = iBooks.next(); BookModel validatedBook = new BookModel(); ArrayList<PageModel> validatedChapters = validateNovelChapters(book); // check if have any chapters if (validatedChapters.size() > 0) { validatedBook = book; validatedBook.setChapterCollection(validatedChapters); validatedBook.setOrder(bookOrder); validatedBooks.add(validatedBook); // Log.d("validateNovelBooks", "Adding: " + validatedBook.getTitle() + " order: " + // validatedBook.getOrder()); ++bookOrder; } } return validatedBooks; } /** * Check if the page is redirected. Return null if not. * * @param doc * @param page * @return */ public static String redirectedFrom(Document doc, PageModel page) { if (page.getRedirectedTo() != null) { try { return URLEncoder.encode(page.getRedirectedTo().replace(" ", "_"), "UTF-8"); } catch (UnsupportedEncodingException e) { Log.e(TAG, "Error when encoding redirected pages", e); return null; } } return null; } /** * parse page info from Wiki API * * @param pageModel * page name * @param doc * parsed page for given pageName * @return PageModel status, no parent and type defined */ public static PageModel parsePageAPI(PageModel pageModel, Document doc, String url) throws Exception { ArrayList<PageModel> temp = new ArrayList<PageModel>(); temp.add(pageModel); temp = parsePageAPI(temp, doc, url); return temp.get(0); } /** * parse pages info from Wiki API * * @param pageModels * ArrayList of pages * @param doc * parsed page for given pages * @return PageModel status, no parent and type defined */ public static ArrayList<PageModel> parsePageAPI(ArrayList<PageModel> pageModels, Document doc, String url) throws Exception { Elements normalized = doc.select("n"); Elements redirects = doc.select("r"); // Log.d(TAG, "parsePageAPI redirected size: " + redirects.size()); Elements pages = doc.select("page"); Log.d(TAG, "parsePageAPI pages size: " + pages.size()); DateFormat formatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.getDefault()); formatter.setTimeZone(TimeZone.getTimeZone("UTC")); for (int i = 0; i < pageModels.size(); ++i) { PageModel temp = pageModels.get(i); String to = URLDecoder.decode(temp.getPage(), "utf-8"); Log.d(TAG, "parsePageAPI source: " + to); if (Util.isStringNullOrEmpty(to)) { Log.e(TAG, "Empty source detected for url: " + url); continue; } // get normalized value for this page Elements nElements = normalized.select("n[from=" + to + "]"); if (nElements != null && nElements.size() > 0) { Element nElement = nElements.first(); to = nElement.attr("to"); Log.d(TAG, "parsePageAPI normalized: " + to); if (Util.isStringNullOrEmpty(to)) { Log.e(TAG, "Empty normalized source detected for url: " + url); continue; } } // check redirects if (redirects != null && redirects.size() > 0) { Elements rElements = redirects.select("r[from=" + to + "]"); if (rElements != null && rElements.size() > 0) { Element rElement = rElements.first(); to = rElement.attr("to"); temp.setRedirectedTo(to); Log.i(TAG, "parsePageAPI redirected: " + to); if (Util.isStringNullOrEmpty(to)) { Log.e(TAG, "Empty redirected source detected for url: " + url); continue; } } } Element pElement = pages.select("page[title=" + to + "]").first(); if (pElement == null) { Log.w(TAG, "parsePageAPI " + temp.getPage() + ": No Info, please check the url: " + url); } else if (!pElement.hasAttr("missing")) { // parse date String tempDate = pElement.attr("touched"); if (!Util.isStringNullOrEmpty(tempDate)) { Date lastUpdate = formatter.parse(tempDate); temp.setLastUpdate(lastUpdate); temp.setMissing(false); Log.i(TAG, "parsePageAPI " + temp.getPage() + " Last Update: " + temp.getLastUpdate()); } else { Log.w(TAG, "parsePageAPI " + temp.getPage() + " No Last Update Information!"); } } else { temp.setMissing(true); Log.w(TAG, "parsePageAPI missing page info: " + to); } if (temp.getPage().contains("redlink=1")) { temp.setMissing(true); } } return pageModels; } /** * Get the url for the big image http://www.baka-tsuki.org/project/index.php?title=File:xxx * @param imageUrl * @return */ public static String getImageFilePageFromImageUrl(String imageUrl) { String pageUrl = ""; // http://www.baka-tsuki.org/project/images/4/4a/Bakemonogatari_Up.png // http://www.baka-tsuki.org/project/images/thumb/4/4a/Bakemonogatari_Up.png/200px-Bakemonogatari_Up.png // http://www.baka-tsuki.org/project/index.php?title=File:Bakemonogatari_Up.png String[] tokens = imageUrl.split("/"); if (imageUrl.contains("/thumb/")) { // from thumbnail pageUrl = tokens[8]; } else { // from full page pageUrl = tokens[7]; } pageUrl = UIHelper.getBaseUrl(LNReaderApplication.getInstance()) + "/project/index.php?title=File:" + pageUrl; return pageUrl; } /** * Get the image model from /project/index.php?title=File:xxx * @param doc * @return */ public static ImageModel parseImagePage(Document doc) { ImageModel image = new ImageModel(); Element mainContent = doc.select("#mw-content-text").first(); Element fullMedia = mainContent.select(".fullMedia").first(); String imageUrl = fullMedia.select("a").first().attr("href"); try { image.setUrl(new URL(UIHelper.getBaseUrl(LNReaderApplication.getInstance().getApplicationContext()) + imageUrl)); } catch (MalformedURLException e) { // shouldn't happened Log.e(TAG, "Invalid URL: " + UIHelper.getBaseUrl(LNReaderApplication.getInstance().getApplicationContext()) + imageUrl, e); } return image; } /** * Get all /project/index.php?title=File:xxx from content * @param doc * @return */ public static ArrayList<String> parseImagesFromContentPage(Document doc) { ArrayList<String> result = new ArrayList<String>(); Elements links = doc.select("a"); for (Element link : links) { String href = link.attr("href"); if (href.contains("/project/index.php?title=File:")) { if (!href.startsWith("http")) href = UIHelper.getBaseUrl(LNReaderApplication.getInstance().getApplicationContext()) + href; result.add(href); } } Log.d(TAG, "Images Found: " + result.size()); return result; } }