CommonParser.java example

Explorer
coolreader-master
- src
  - com
    - dotcool
    - nil
      - lu
        ads
        L.java
        MainAdsActivity.java
        MyPointsReceiver.java
        UpdateHelper.java
package com.dotcool.reader.parser;

import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.TimeZone;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import android.preference.PreferenceManager;
import android.util.Log;

import com.dotcool.reader.Constants;
import com.dotcool.reader.LNReaderApplication;
import com.dotcool.reader.UIHelper;
import com.dotcool.reader.helper.Util;
import com.dotcool.reader.model.BookModel;
import com.dotcool.reader.model.ImageModel;
import com.dotcool.reader.model.PageModel;

/**
 * Static helpers shared by the parsers: image path rewriting, title clean-up, chapter/volume
 * validation, and extraction of page and image information from parsed wiki documents.
 */
public class CommonParser {

	private static final String TAG = CommonParser.class.toString();

	/**
	 * Rewrites wiki image references in an HTML fragment so they point at the local image root.
	 * Every occurrence of {@code src="/project/images/} is replaced with
	 * {@code src="file://<imageRoot>/project/images/}.
	 * 
	 * @param content
	 *            HTML content to rewrite; must not be null
	 * @return the content with rewritten image sources
	 */
	public static String replaceImagePath(String content) {
		String imageRoot = UIHelper.getImageRoot(LNReaderApplication.getInstance().getApplicationContext());
		return content.replace("src=\"/project/images/", "src=\"file://" + imageRoot + "/project/images/");
	}

	/**
	 * Collects every {@code img} element of the document as an {@link ImageModel}. In each src,
	 * {@code /project/} is replaced with {@code rootImagePath + "/project/"}; the document itself is
	 * not modified.
	 * <p>
	 * The image name is the part of the rewritten src starting at its last {@code /} (the slash is
	 * kept). Images without a src are skipped. If the rewritten src is not a valid URL the error is
	 * logged and the image is still returned, with its URL left unset.
	 * 
	 * @param doc
	 *            parsed document to scan
	 * @param rootImagePath
	 *            prefix inserted in front of {@code /project/}
	 * @return the images found, in document order
	 */
	public static ArrayList<ImageModel> getAllImagesFromContent(Document doc, String rootImagePath) {
		ArrayList<ImageModel> images = new ArrayList<ImageModel>();
		for (Element imageElement : doc.select("img")) {
			String src = imageElement.attr("src");
			if (src.length() == 0) {
				// Nothing to derive a name or URL from; this used to crash in substring(-1).
				Log.w(TAG, "Skipping img element without src attribute");
				continue;
			}

			String urlStr = src.replace("/project/", rootImagePath + "/project/");
			int lastSlash = urlStr.lastIndexOf('/');
			// The leading "/" stays part of the name, exactly as before.
			String name = lastSlash >= 0 ? urlStr.substring(lastSlash) : urlStr;

			ImageModel image = new ImageModel();
			image.setName(name);
			try {
				image.setUrl(new URL(urlStr));
			} catch (MalformedURLException e) {
				// shouldn't happen
				Log.e(TAG, "Invalid URL: " + urlStr, e);
			}
			images.add(image);
		}
		return images;
	}

	/**
	 * Sanitizes a title by removing tags, {@code [...]} blocks, PDF markers and "Full Text"
	 * markers. When {@code isAggresive} is true and the
	 * {@link Constants#PREF_AGGRESIVE_TITLE_CLEAN_UP} preference is enabled (default: true), only
	 * the text before the first remaining {@code (} or {@code [} is kept.
	 * 
	 * @param title
	 *            raw title; null is treated as an empty title
	 * @param isAggresive
	 *            whether the aggressive clean-up may be applied
	 * @return the cleaned, trimmed title; never null
	 */
	public static String sanitize(String title, boolean isAggresive) {
		if (title == null) {
			return "";
		}
		Log.d(TAG, "Before: " + title);
		title = title.replaceAll("<.+?>", "") // Strip tags
				.replaceAll("\\[.+?\\]", "") // Strip [___]s
				.replaceAll("- PDF", "").replaceAll("\\(PDF\\)", "") // Strip (PDF)
				// The parentheses below are an unescaped regex group, so this strips a bare "- Full Text";
				// the parenthesised forms are handled by the two patterns that follow.
				.replaceAll("- (Full Text)", "").replaceAll("- \\(.*Full Text.*\\)", "")
				// Must be replaceAll: String.replace would look for the regex text literally and never match.
				.replaceAll("\\(.*Full Text.*\\)", "");
		Log.d(TAG, "After: " + title);

		if (isAggresive
				&& PreferenceManager.getDefaultSharedPreferences(LNReaderApplication.getInstance().getApplicationContext()).getBoolean(Constants.PREF_AGGRESIVE_TITLE_CLEAN_UP, true)) {
			// Leaves only the text before brackets (might be a bit too aggressive)
			title = title.replaceAll("^(.+?)[(\\[].*$", "$1");
			Log.d(TAG, "After Aggresive: " + title);
		}
		return title.trim();
	}

	/**
	 * Returns the chapters of a book without user pages ({@code User:}) and ISBN handler pages
	 * ({@code Special:BookSources}). Missing pages ({@code redlink=1}) are deliberately kept.
	 * <p>
	 * The kept chapters are renumbered in place: their order is set to 0, 1, 2, ... in the order
	 * they are encountered.
	 * 
	 * @param book
	 *            book whose chapter collection is validated
	 * @return a new list with the valid chapters; empty if the book has no chapter collection
	 */
	public static ArrayList<PageModel> validateNovelChapters(BookModel book) {
		ArrayList<PageModel> validatedChapters = new ArrayList<PageModel>();
		ArrayList<PageModel> chapters = book.getChapterCollection();
		if (chapters == null) {
			return validatedChapters;
		}

		int chapterOrder = 0;
		for (PageModel chapter : chapters) {
			String page = chapter.getPage();
			boolean isUserPage = page.contains("User:");
			boolean isIsbnHandler = page.contains("Special:BookSources");
			if (isUserPage || isIsbnHandler) {
				continue;
			}
			chapter.setOrder(chapterOrder);
			validatedChapters.add(chapter);
			++chapterOrder;
		}
		return validatedChapters;
	}

	/**
	 * Removes invalid chapters from each volume and drops volumes that end up without chapters.
	 * <p>
	 * The books passed in are modified: a kept book gets its chapter collection replaced by the
	 * validated one and its order set to its position in the returned list. Chapters are renumbered
	 * by {@link #validateNovelChapters(BookModel)} even when their book is dropped.
	 * 
	 * @param books
	 *            volumes to validate
	 * @return a new list with the volumes that still have at least one chapter
	 */
	public static ArrayList<BookModel> validateNovelBooks(ArrayList<BookModel> books) {
		ArrayList<BookModel> validatedBooks = new ArrayList<BookModel>();
		int bookOrder = 0;
		for (BookModel book : books) {
			ArrayList<PageModel> validatedChapters = validateNovelChapters(book);
			if (validatedChapters.isEmpty()) {
				// no chapters left, drop the volume
				continue;
			}
			book.setChapterCollection(validatedChapters);
			book.setOrder(bookOrder);
			validatedBooks.add(book);
			++bookOrder;
		}
		return validatedBooks;
	}

	/**
	 * Returns the URL-encoded redirect target of a page, with spaces replaced by underscores
	 * before encoding.
	 * 
	 * @param doc
	 *            unused; kept for interface compatibility
	 * @param page
	 *            page to inspect
	 * @return the encoded redirect target, or null if the page is not redirected or the encoding
	 *         failed
	 */
	public static String redirectedFrom(Document doc, PageModel page) {
		String redirectedTo = page.getRedirectedTo();
		if (redirectedTo == null) {
			return null;
		}
		try {
			return URLEncoder.encode(redirectedTo.replace(" ", "_"), "UTF-8");
		} catch (UnsupportedEncodingException e) {
			Log.e(TAG, "Error when encoding redirected pages", e);
			return null;
		}
	}

	/**
	 * Parses page info from the Wiki API for a single page. Convenience wrapper around
	 * {@link #parsePageAPI(ArrayList, Document, String)}.
	 * 
	 * @param pageModel
	 *            page to update
	 * @param doc
	 *            parsed API response for the given page
	 * @param url
	 *            URL the response was fetched from; only used in log messages
	 * @return the same page model, updated in place
	 * @throws Exception
	 *             see {@link #parsePageAPI(ArrayList, Document, String)}
	 */
	public static PageModel parsePageAPI(PageModel pageModel, Document doc, String url) throws Exception {
		ArrayList<PageModel> single = new ArrayList<PageModel>();
		single.add(pageModel);
		return parsePageAPI(single, doc, url).get(0);
	}

	/**
	 * Parses page info from the Wiki API for a list of pages and updates each model in place.
	 * <p>
	 * For every page the URL-decoded page name is first mapped through the {@code n} (normalized)
	 * and {@code r} (redirect) elements of the response, then looked up among the {@code page}
	 * elements by title. Depending on what is found, the model's redirect target, last-update date
	 * (from the {@code touched} attribute, parsed as UTC) and missing flag are set. Pages whose
	 * name contains {@code redlink=1} are always flagged as missing. Pages with an empty name are
	 * skipped.
	 * 
	 * @param pageModels
	 *            pages to update
	 * @param doc
	 *            parsed API response for the given pages
	 * @param url
	 *            URL the response was fetched from; only used in log messages
	 * @return the same list that was passed in
	 * @throws Exception
	 *             if a {@code touched} timestamp cannot be parsed or the UTF-8 decoder is
	 *             unavailable
	 */
	public static ArrayList<PageModel> parsePageAPI(ArrayList<PageModel> pageModels, Document doc, String url) throws Exception {
		Elements normalized = doc.select("n");
		Elements redirects = doc.select("r");
		Elements pages = doc.select("page");
		Log.d(TAG, "parsePageAPI pages size: " + pages.size());

		// The timestamp is machine-generated ASCII, so parsing must not depend on the device locale.
		DateFormat formatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.US);
		formatter.setTimeZone(TimeZone.getTimeZone("UTC"));

		for (PageModel temp : pageModels) {
			String rawPage = temp.getPage();
			String to = rawPage == null ? null : decodePageName(rawPage);
			Log.d(TAG, "parsePageAPI source: " + to);
			if (Util.isStringNullOrEmpty(to)) {
				Log.e(TAG, "Empty source detected for url: " + url);
				continue;
			}

			// get normalized value for this page
			Element nElement = findFirstWithAttribute(normalized, "from", to);
			if (nElement != null) {
				to = nElement.attr("to");
				Log.d(TAG, "parsePageAPI normalized: " + to);
				if (Util.isStringNullOrEmpty(to)) {
					Log.e(TAG, "Empty normalized source detected for url: " + url);
					continue;
				}
			}

			// check redirects
			Element rElement = findFirstWithAttribute(redirects, "from", to);
			if (rElement != null) {
				to = rElement.attr("to");
				temp.setRedirectedTo(to);
				Log.i(TAG, "parsePageAPI redirected: " + to);
				if (Util.isStringNullOrEmpty(to)) {
					Log.e(TAG, "Empty redirected source detected for url: " + url);
					continue;
				}
			}

			Element pElement = findFirstWithAttribute(pages, "title", to);
			if (pElement == null) {
				Log.w(TAG, "parsePageAPI " + rawPage + ": No Info, please check the url: " + url);
			} else if (pElement.hasAttr("missing")) {
				temp.setMissing(true);
				Log.w(TAG, "parsePageAPI missing page info: " + to);
			} else {
				// parse date
				String tempDate = pElement.attr("touched");
				if (Util.isStringNullOrEmpty(tempDate)) {
					Log.w(TAG, "parsePageAPI " + rawPage + " No Last Update Information!");
				} else {
					Date lastUpdate = formatter.parse(tempDate);
					temp.setLastUpdate(lastUpdate);
					temp.setMissing(false);
					Log.i(TAG, "parsePageAPI " + rawPage + " Last Update: " + temp.getLastUpdate());
				}
			}

			if (rawPage.contains("redlink=1")) {
				temp.setMissing(true);
			}
		}
		return pageModels;
	}

	/**
	 * URL-decodes a page name. A name that is not valid URL encoding (for example one containing a
	 * literal {@code %}) is returned unchanged instead of aborting the whole batch.
	 */
	private static String decodePageName(String page) throws UnsupportedEncodingException {
		try {
			return URLDecoder.decode(page, "utf-8");
		} catch (IllegalArgumentException e) {
			Log.w(TAG, "Page name is not valid URL encoding, using it as-is: " + page, e);
			return page;
		}
	}

	/**
	 * Finds the first element whose attribute equals the expected value. The comparison ignores
	 * case and surrounding whitespace, like jsoup's {@code [attr=value]} selector, but the value is
	 * never spliced into a selector string, so titles containing selector metacharacters such as
	 * {@code ]} or quotes are matched safely.
	 * 
	 * @return the first matching element, or null if there is none
	 */
	private static Element findFirstWithAttribute(Elements candidates, String attributeName, String expected) {
		String wanted = expected.trim();
		for (Element candidate : candidates) {
			if (candidate.hasAttr(attributeName) && wanted.equalsIgnoreCase(candidate.attr(attributeName).trim())) {
				return candidate;
			}
		}
		return null;
	}

	/**
	 * Builds the URL of the wiki file page ({@code <base>/project/index.php?title=File:xxx}) for an
	 * image URL.
	 * <p>
	 * Examples of accepted image URLs:
	 * <ul>
	 * <li>{@code http://www.baka-tsuki.org/project/images/4/4a/Bakemonogatari_Up.png}</li>
	 * <li>{@code http://www.baka-tsuki.org/project/images/thumb/4/4a/Bakemonogatari_Up.png/200px-Bakemonogatari_Up.png}</li>
	 * </ul>
	 * Both yield {@code <base>/project/index.php?title=File:Bakemonogatari_Up.png}. The file name is
	 * located relative to the end of the path, so relative or {@code file://} image paths work as
	 * well.
	 * 
	 * @param imageUrl
	 *            URL or path of the full image or of one of its thumbnails; must not be null
	 * @return URL of the corresponding file page
	 */
	public static String getImageFilePageFromImageUrl(String imageUrl) {
		String[] tokens = imageUrl.split("/");
		// Full image: the name is the last segment.
		// Thumbnail: the last segment is the scaled file, the original name is the one before it.
		int nameIndex = imageUrl.contains("/thumb/") ? tokens.length - 2 : tokens.length - 1;
		// split() returns an empty array for input made of slashes only
		String fileName = tokens.length == 0 ? "" : tokens[Math.max(nameIndex, 0)];
		return UIHelper.getBaseUrl(LNReaderApplication.getInstance()) + "/project/index.php?title=File:" + fileName;
	}

	/**
	 * Builds the image model from a parsed file page ({@code /project/index.php?title=File:xxx}).
	 * The image URL is the href of the first link inside the first {@code .fullMedia} element of
	 * {@code #mw-content-text}; a href that does not start with {@code http} is prefixed with the
	 * base URL.
	 * 
	 * @param doc
	 *            parsed file page
	 * @return a new image model; its URL is left unset (and the problem logged) if the page has no
	 *         full-size image link or the link is not a valid URL
	 */
	public static ImageModel parseImagePage(Document doc) {
		ImageModel image = new ImageModel();

		Element mainContent = doc.select("#mw-content-text").first();
		Element fullMedia = mainContent == null ? null : mainContent.select(".fullMedia").first();
		Element fileLink = fullMedia == null ? null : fullMedia.select("a").first();
		if (fileLink == null) {
			Log.e(TAG, "No full-size image link found in image page");
			return image;
		}

		String imageUrl = fileLink.attr("href");
		if (!imageUrl.startsWith("http")) {
			// relative link; absolute ones must not be prefixed again
			imageUrl = UIHelper.getBaseUrl(LNReaderApplication.getInstance().getApplicationContext()) + imageUrl;
		}

		try {
			image.setUrl(new URL(imageUrl));
		} catch (MalformedURLException e) {
			// shouldn't happen
			Log.e(TAG, "Invalid URL: " + imageUrl, e);
		}
		return image;
	}

	/**
	 * Collects the file page links ({@code /project/index.php?title=File:xxx}) found in a parsed
	 * content page. Links that do not start with {@code http} are prefixed with the base URL.
	 * 
	 * @param doc
	 *            parsed content page
	 * @return the file page URLs, in document order
	 */
	public static ArrayList<String> parseImagesFromContentPage(Document doc) {
		ArrayList<String> result = new ArrayList<String>();

		for (Element link : doc.select("a")) {
			String href = link.attr("href");
			if (!href.contains("/project/index.php?title=File:")) {
				continue;
			}
			if (!href.startsWith("http")) {
				href = UIHelper.getBaseUrl(LNReaderApplication.getInstance().getApplicationContext()) + href;
			}
			result.add(href);
		}

		Log.d(TAG, "Images Found: " + result.size());
		return result;
	}
}