/**
* Parse baka-tsuki wiki page - Alternative Language
*/
package com.dotcool.reader.parser;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.Map.Entry;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import android.util.Log;
import com.dotcool.reader.AlternativeLanguageInfo;
import com.dotcool.reader.Constants;
import com.dotcool.reader.LNReaderApplication;
import com.dotcool.reader.UIHelper;
import com.dotcool.reader.helper.Util;
import com.dotcool.reader.model.BookModel;
import com.dotcool.reader.model.ImageModel;
import com.dotcool.reader.model.NovelCollectionModel;
import com.dotcool.reader.model.NovelContentModel;
import com.dotcool.reader.model.PageModel;
/**
* @author freedomofkeima
* Modified from: BakaTsukiParser by Nandaka
*
*/
public class BakaTsukiParserAlternative {
private static final String TAG = BakaTsukiParserAlternative.class.toString();
/**
 * Parse Alternative Language list from
 * http://www.baka-tsuki.org/project/index.php?title=[Alternative Category]
 *
 * Each {@code li} under the {@code #mw-pages} element that contains a link is turned into
 * one {@link PageModel} of type {@link PageModel#TYPE_NOVEL}. The page name is the link's
 * href with the index.php prefix and the base URLs stripped, the title is the link text,
 * and both language and status are set to {@code language}.
 *
 * @param doc category page document, must not be null
 * @param language key into the alternative language table; when null the parent category
 *            of every page is left as an empty string
 * @return the parsed pages in document order; empty when {@code #mw-pages} is absent
 * @throws NullPointerException if {@code doc} is null
 */
public static ArrayList<PageModel> ParseAlternativeList(Document doc, String language) {
    // Reject a missing document before doing any lookup work.
    if (doc == null)
        throw new NullPointerException("Document cannot be null.");

    ArrayList<PageModel> result = new ArrayList<PageModel>();
    String category = "";
    if (language != null)
        category = AlternativeLanguageInfo.getAlternativeLanguageInfo().get(language).getCategoryInfo();

    Element stage = doc.select("#mw-pages").first();
    if (stage == null)
        return result;

    int order = 0;
    for (Element element : stage.select("li")) {
        Element link = element.select("a").first();
        // A list item without a link cannot be mapped to a page; skip it instead of crashing.
        if (link == null)
            continue;

        PageModel page = new PageModel();
        page.setParent(category);
        String tempPage = link.attr("href").replace("/project/index.php?title=", "").replace(Constants.BASE_URL_HTTPS, "").replace(Constants.BASE_URL, "");
        page.setPage(tempPage);
        page.setLanguage(language);
        page.setType(PageModel.TYPE_NOVEL);
        page.setTitle(link.text());
        page.setStatus(language);
        page.setOrder(order);
        result.add(page);
        ++order;
    }
    return result;
}
/**
 * Builds book models from a JSON array of book records.
 *
 * The fields {@code id}, {@code title}, {@code bookid}, {@code chapter}, {@code createtime}
 * and {@code updatetime} are read with the lenient {@code opt*} accessors. Every book gets
 * a chapter collection holding exactly one page whose id and page name are copied from the
 * book's id and title.
 *
 * @param jsonString JSON text whose root is an array of objects
 * @return one book per array entry, in array order
 * @throws JSONException if the text is not a JSON array or an entry is not an object
 */
private static ArrayList<BookModel> parseDetails(String jsonString) throws JSONException {
    final JSONArray records = new JSONArray(jsonString);
    final ArrayList<BookModel> books = new ArrayList<BookModel>();

    int index = 0;
    while (index < records.length()) {
        final JSONObject record = records.getJSONObject(index);

        final BookModel book = new BookModel();
        book.setId(record.optInt("id"));
        book.setTitle(record.optString("title"));
        book.setBookId(record.optInt("bookid"));
        book.setChapter(record.optInt("chapter"));
        book.setCreatetime(record.optString("createtime"));
        book.setUpdatetime(record.optString("updatetime"));

        // The single chapter page mirrors the book's own id and title.
        final PageModel chapter = new PageModel();
        chapter.setBook(book);
        chapter.setDownloaded(false);
        chapter.setId(book.getId());
        chapter.setPage(book.getTitle());

        final ArrayList<PageModel> chapters = new ArrayList<PageModel>();
        chapters.add(chapter);
        book.setChapterCollection(chapters);

        books.add(book);
        index++;
    }
    return books;
}
/**
 * Assembles a novel collection from a JSON book list and its owning page.
 *
 * The novel's page name and synopsis come from {@code page} (the synopsis is the page
 * title), the redirect target is cleared, the cover URL is
 * {@code Constants.BASE_IMAGE_URL} followed by the page's image url, and the book
 * collection is parsed from {@code doc}.
 *
 * @param doc JSON array text describing the books, must not be null
 * @param page page model the novel belongs to
 * @return the populated novel collection
 * @throws NullPointerException if {@code doc} is null
 * @throws MalformedURLException if the cover URL cannot be built
 * @throws JSONException if {@code doc} is not a valid JSON array of objects
 */
public static NovelCollectionModel ParseNovelDetails(String doc, PageModel page) throws MalformedURLException, JSONException {
    if (doc == null) {
        throw new NullPointerException("Document cannot be null.");
    }

    final NovelCollectionModel collection = new NovelCollectionModel();
    collection.setPage(page.getPage());
    collection.setPageModel(page);

    // This source never redirects, so the redirect target is explicitly cleared.
    final String noRedirect = null;
    collection.setRedirectTo(noRedirect);

    collection.setSynopsis(page.getTitle());
    collection.setCoverUrl(new URL(Constants.BASE_IMAGE_URL + page.getImgurl()));
    collection.setBookCollections(parseDetails(doc));
    return collection;
}
/**
 * Derives the status flags of a novel from its wiki page and stores them on the page.
 *
 * A flag is raised when the document contains a link whose title is the matching template
 * (STALLED, Abandoned, Warning:ATP). The teaser flag is raised when the page's parent is
 * "Category:Teasers", compared case-insensitively. The raised flags are joined with "|"
 * in the order teaser, stalled, abandoned, pending and written with
 * {@code page.setStatus}; any status previously held by the page is replaced.
 *
 * @param doc novel page document
 * @param page page model to update
 * @return the same {@code page} instance
 */
private static PageModel parseNovelStatus(Document doc, PageModel page) {
    // Template:STALLED
    boolean isStalled = !doc.select("a[title=Template:STALLED]").isEmpty();
    if (isStalled)
        Log.i(TAG, "Novel is stalled: " + page.getPage());

    // Template:Abandoned
    boolean isAbandoned = !doc.select("a[title=Template:Abandoned]").isEmpty();
    if (isAbandoned)
        Log.i(TAG, "Novel is abandoned: " + page.getPage());

    // Template:Warning:ATP
    boolean isPending = !doc.select("a[title=Template:Warning:ATP]").isEmpty();
    if (isPending)
        Log.i(TAG, "Novel is pending authorization: " + page.getPage());

    // Teaser => parent = Category:Teasers.
    // Comparing from the literal keeps this safe when the page has no parent set.
    boolean isTeaser = "Category:Teasers".equalsIgnoreCase(page.getParent());
    if (isTeaser)
        Log.i(TAG, "Novel is Teaser Project: " + page.getPage());

    // update the status
    ArrayList<String> statuses = new ArrayList<String>();
    if (isTeaser)
        statuses.add(Constants.STATUS_TEASER);
    if (isStalled)
        statuses.add(Constants.STATUS_STALLED);
    if (isAbandoned)
        statuses.add(Constants.STATUS_ABANDONED);
    if (isPending)
        statuses.add(Constants.STATUS_PENDING);
    page.setStatus(Util.join(statuses, "|"));
    return page;
}
/**
 * Parses the book/chapter structure of a novel page and stores it on {@code novel}.
 *
 * Every {@code h1}/{@code h2} heading is inspected. A heading qualifies when one of its
 * spans has an id containing a language specific marker, the novel's page name, or the
 * novel's redirect target. For a qualifying heading the three book parsing strategies are
 * tried in turn: method 1 always, methods 2 and 3 only while no book has been found yet or
 * the novel is being treated as a single-book novel and the previous attempt found nothing.
 *
 * Any exception raised while walking the document is logged and parsing stops, keeping the
 * books collected so far. The result is always passed through
 * {@code CommonParser.validateNovelBooks} before being stored.
 *
 * @param doc novel page document
 * @param novel novel to receive the book collection
 * @param language key into the alternative language table; when null no language specific
 *            markers are used and only the page name / redirect target can qualify a heading
 */
private static void parseNovelChapters(Document doc, NovelCollectionModel novel, String language) {
    ArrayList<BookModel> books = new ArrayList<BookModel>();
    boolean oneBookOnly = false;

    ArrayList<String> parser = null;
    if (language != null)
        parser = AlternativeLanguageInfo.getAlternativeLanguageInfo().get(language).getParserInfo();
    // Without a marker list the id check below used to throw and abort the whole parse;
    // fall back to "no markers" so the page-name and redirect checks still run.
    if (parser == null)
        parser = new ArrayList<String>();

    try {
        Elements h2s = doc.select("h1,h2");
        for (Element h2 : h2s) {
            Elements spans = h2.select("span");
            if (spans.size() == 0)
                continue;

            // find span with id containing one of the language markers,
            // or the page name, or if redirected, the redirect page name.
            boolean containsBy = false;
            for (Element s : spans) {
                String id = s.id();
                Log.d(TAG, "Checking: " + id);
                boolean matchesMarker = false;
                for (String marker : parser) {
                    if (id.contains(marker)) {
                        matchesMarker = true;
                        break;
                    }
                }
                if (matchesMarker || id.contains(novel.getPage()) || (novel.getRedirectTo() != null && id.contains(novel.getRedirectTo()))) {
                    containsBy = true;
                    Log.d(TAG, "Got valid id: " + id);
                    break;
                }
                Log.d(TAG, "Not valid id: " + id);
            }
            if (!containsBy)
                continue;

            ArrayList<BookModel> tempBooks = parseBooksMethod1(novel, h2, language);
            if (tempBooks != null && tempBooks.size() > 0) {
                books.addAll(tempBooks);
            }
            if (books.size() == 0 || (oneBookOnly && (tempBooks == null || tempBooks.size() == 0))) {
                Log.d(TAG, "No books found, use method 2: Only have 1 book, chapter in <p> tag.");
                tempBooks = parseBooksMethod2(novel, h2, language);
                if (tempBooks != null && tempBooks.size() > 0) {
                    oneBookOnly = true;
                    books.addAll(tempBooks);
                }
            }
            if (books.size() == 0 || (oneBookOnly && (tempBooks == null || tempBooks.size() == 0))) {
                Log.d(TAG, "No books found, use method 3: Only have 1 book.");
                tempBooks = parseBooksMethod3(novel, h2, language);
                if (tempBooks != null && tempBooks.size() > 0) {
                    oneBookOnly = true;
                    books.addAll(tempBooks);
                }
            }
        }
    } catch (Exception e) {
        // Best effort: keep whatever was parsed before the failure.
        Log.e(TAG, "Unknown Exception for " + novel.getPage() + ": " + e.getMessage(), e);
    }
    novel.setBookCollections(CommonParser.validateNovelBooks(books));
}
/**
 * Parse book method 1: look for {@code h3} elements after the {@code h2} containing the
 * volume list and treat each one as a book (see {@code processH3}).
 *
 * The siblings following {@code h2} are walked until the next {@code h2} or the end of the
 * parent. A sibling that is itself an {@code h3} is processed directly; any other sibling
 * is searched for nested {@code h3} elements.
 *
 * @param novel novel the books belong to
 * @param h2 heading that starts the volume list
 * @param language language assigned to every parsed chapter
 * @return the books found, possibly empty, never null
 */
private static ArrayList<BookModel> parseBooksMethod1(NovelCollectionModel novel, Element h2, String language) {
    ArrayList<BookModel> books = new ArrayList<BookModel>();
    int bookOrder = 0;
    for (Element bookElement = h2.nextElementSibling(); bookElement != null; bookElement = bookElement.nextElementSibling()) {
        // Tag names must be compared by content; reference comparison of strings is not
        // guaranteed to match equal text.
        String tag = bookElement.tagName();
        if ("h2".equals(tag))
            break;
        if ("h3".equals(tag)) {
            bookOrder = processH3(novel, books, bookElement, bookOrder, language);
        } else {
            for (Element h3 : bookElement.select("h3")) {
                bookOrder = processH3(novel, books, h3, bookOrder, language);
            }
        }
    }
    return books;
}
/**
 * Turns one {@code h3} heading into a book and appends it to {@code books}.
 *
 * The book title is the sanitized heading text. Chapters are collected from every
 * {@code li} found in the siblings following the heading, stopping at the next {@code h2}
 * or {@code h3} or at the end of the parent. List items that {@code processLI} rejects are
 * skipped and do not consume a chapter order number. A book is appended even when no
 * chapter was found.
 *
 * @param novel novel the book belongs to; its page name prefixes the chapters' parent key
 * @param books list receiving the new book
 * @param bookElement the {@code h3} element naming the book
 * @param bookOrder order number to assign to the new book
 * @param language language assigned to every parsed chapter
 * @return {@code bookOrder + 1}, the order number for the next book
 */
public static int processH3(NovelCollectionModel novel, ArrayList<BookModel> books, Element bookElement, int bookOrder, String language) {
    BookModel book = new BookModel();
    book.setTitle(CommonParser.sanitize(bookElement.text(), true));
    book.setOrder(bookOrder);
    ArrayList<PageModel> chapterCollection = new ArrayList<PageModel>();
    String parent = novel.getPage() + Constants.NOVEL_BOOK_DIVIDER + book.getTitle();

    // parse the chapters.
    int chapterOrder = 0;
    for (Element chapterElement = bookElement.nextElementSibling(); chapterElement != null; chapterElement = chapterElement.nextElementSibling()) {
        // Compare tag names by content, not by reference.
        String tag = chapterElement.tagName();
        if ("h2".equals(tag) || "h3".equals(tag))
            break;
        for (Element chapter : chapterElement.select("li")) {
            PageModel p = processLI(chapter, parent, chapterOrder, language);
            if (p != null) {
                chapterCollection.add(p);
                ++chapterOrder;
            }
        }
    }
    book.setChapterCollection(chapterCollection);
    books.add(book);
    return bookOrder + 1;
}
/**
 * Converts a list item into a chapter page.
 *
 * Only the first link inside the item is considered (multiple links per item are not
 * handled yet). The chapter title is the text of the whole list item.
 *
 * @param li list item element
 * @param parent parent key assigned to the chapter
 * @param chapterOrder order number assigned to the chapter
 * @param language language assigned to the chapter
 * @return the chapter, or null when the item has no link or its first link's href
 *         contains "User_talk:"
 */
private static PageModel processLI(Element li, String parent, int chapterOrder, String language) {
    final Elements anchors = li.select("a");
    if (anchors == null || anchors.size() <= 0) {
        return null;
    }

    // TODO: need to handle multiple link in one list item
    final Element firstAnchor = anchors.first();
    final boolean isUserTalk = firstAnchor.attr("href").contains("User_talk:");
    return isUserTalk ? null : processA(li.text(), parent, chapterOrder, firstAnchor, language);
}
/**
 * Converts a link into a chapter page.
 *
 * A link whose class attribute contains "external text" is stored as an external chapter
 * with the href kept as-is. Otherwise the page name is the href with the index.php prefix
 * and the base URLs stripped. The last-update date is initialised to the epoch.
 *
 * @param title raw chapter title; sanitized before being stored
 * @param parent parent key assigned to the chapter
 * @param chapterOrder order number assigned to the chapter
 * @param link anchor element pointing at the chapter
 * @param language language assigned to the chapter
 * @return the new content-type page, never null
 */
private static PageModel processA(String title, String parent, int chapterOrder, Element link, String language) {
    final PageModel chapter = new PageModel();
    chapter.setTitle(CommonParser.sanitize(title, false));
    chapter.setParent(parent);
    chapter.setType(PageModel.TYPE_CONTENT);
    chapter.setOrder(chapterOrder);
    chapter.setLastUpdate(new Date(0));
    chapter.setLanguage(language);

    final boolean external = link.className().contains("external text");
    chapter.setExternal(external);

    final String href = link.attr("href");
    if (external) {
        // External link: keep the full address.
        chapter.setPage(href);
        return chapter;
    }

    // Internal link: reduce the address to the bare wiki page name.
    String pageName = href.replace("/project/index.php?title=", "");
    pageName = pageName.replace(Constants.BASE_URL_HTTPS, "");
    pageName = pageName.replace(Constants.BASE_URL, "");
    chapter.setPage(pageName);
    return chapter;
}
/**
 * Parse book method 2: look for {@code p} elements after the {@code h2} containing the
 * chapter list; each one becomes a book. Usually there is only one book. See 7_Nights.
 *
 * For every {@code p} sibling found before the next {@code h2}, the following siblings are
 * scanned for chapters: {@code li} items inside {@code dl}, {@code ul} or {@code div}
 * elements are added until the next {@code p} or the end of the parent. While no chapter
 * has been collected, the first link inside the {@code p} itself is used as the chapter.
 *
 * NOTE(review): the chapter scan stops only at a {@code p} or the end of the parent, not
 * at an {@code h2} - confirm whether crossing section boundaries is intended.
 *
 * @param novel novel the books belong to; its page name prefixes the chapters' parent key
 * @param h2 heading that starts the chapter list
 * @param language language assigned to every parsed chapter
 * @return the books found, possibly empty, never null
 */
private static ArrayList<BookModel> parseBooksMethod2(NovelCollectionModel novel, Element h2, String language) {
    ArrayList<BookModel> books = new ArrayList<BookModel>();
    int bookOrder = 0;
    for (Element bookElement = h2.nextElementSibling(); bookElement != null; bookElement = bookElement.nextElementSibling()) {
        // Compare tag names by content, not by reference.
        String bookTag = bookElement.tagName();
        if ("h2".equals(bookTag))
            break;
        if (!"p".equals(bookTag))
            continue;

        BookModel book = new BookModel();
        book.setTitle(CommonParser.sanitize(bookElement.text(), true));
        book.setOrder(bookOrder);
        ArrayList<PageModel> chapterCollection = new ArrayList<PageModel>();
        String parent = novel.getPage() + Constants.NOVEL_BOOK_DIVIDER + book.getTitle();

        // parse the chapters.
        boolean walkChapter = true;
        int chapterOrder = 0;
        Element chapterElement = bookElement;
        do {
            chapterElement = chapterElement.nextElementSibling();
            if (chapterElement == null) {
                walkChapter = false;
            } else {
                String chapterTag = chapterElement.tagName();
                if ("p".equals(chapterTag)) {
                    walkChapter = false;
                } else if ("dl".equals(chapterTag) || "ul".equals(chapterTag) || "div".equals(chapterTag)) {
                    for (Element chapter : chapterElement.select("li")) {
                        PageModel p = processLI(chapter, parent, chapterOrder, language);
                        if (p != null) {
                            chapterCollection.add(p);
                            ++chapterOrder;
                        }
                    }
                }
            }
            // no subchapter: the paragraph's own link is the chapter.
            if (chapterCollection.size() == 0) {
                Element link = bookElement.select("a").first();
                if (link != null) {
                    // The language comes from the caller; the collection is empty here, so
                    // it cannot be read back from an existing chapter.
                    PageModel p = processA(link.text(), parent, chapterOrder, link, language);
                    chapterCollection.add(p);
                    ++chapterOrder;
                }
            }
        } while (walkChapter);

        book.setChapterCollection(chapterCollection);
        books.add(book);
        ++bookOrder;
    }
    return books;
}
/**
 * Parse book method 3: only one book, with the chapter list nested in {@code ul}/{@code dl},
 * e.g. Fate/Apocrypha, Gekkou. Each {@code li} is parsed as a chapter.
 *
 * Every {@code ul} or {@code dl} sibling found between {@code h2} and the next {@code h2}
 * becomes a book titled with the sanitized text of {@code h2}. List items that
 * {@code processLI} rejects are skipped and do not consume a chapter order number.
 *
 * @param novel novel the books belong to; its page name prefixes the chapters' parent key
 * @param h2 heading that names the book and precedes the chapter list
 * @param language language assigned to every parsed chapter
 * @return the books found, possibly empty, never null
 */
private static ArrayList<BookModel> parseBooksMethod3(NovelCollectionModel novel, Element h2, String language) {
    ArrayList<BookModel> books = new ArrayList<BookModel>();
    int bookOrder = 0;
    for (Element bookElement = h2.nextElementSibling(); bookElement != null; bookElement = bookElement.nextElementSibling()) {
        // Compare tag names by content, not by reference.
        String tag = bookElement.tagName();
        if ("h2".equals(tag))
            break;
        if (!"ul".equals(tag) && !"dl".equals(tag))
            continue;

        BookModel book = new BookModel();
        book.setTitle(CommonParser.sanitize(h2.text(), true));
        book.setOrder(bookOrder);
        ArrayList<PageModel> chapterCollection = new ArrayList<PageModel>();
        String parent = novel.getPage() + Constants.NOVEL_BOOK_DIVIDER + book.getTitle();

        // parse the chapters.
        int chapterOrder = 0;
        for (Element chapter : bookElement.select("li")) {
            PageModel p = processLI(chapter, parent, chapterOrder, language);
            if (p != null) {
                chapterCollection.add(p);
                ++chapterOrder;
            }
        }
        book.setChapterCollection(chapterCollection);
        books.add(book);
        ++bookOrder;
    }
    return books;
}
/**
 * Finds the cover image of a novel page and records it on {@code novel}.
 *
 * The cover is the {@code src} of the first element with class {@code thumbimage}; when it
 * does not start with "http" it is prefixed with the baka-tsuki host. The resulting string
 * (empty when there is no such element) is stored with {@code setCover}. A non-empty value
 * is additionally stored as a {@link URL} with {@code setCoverUrl}; a malformed address is
 * logged and the cover URL is left untouched.
 *
 * @param doc novel page document
 * @param novel novel to receive the cover information
 * @return the cover address, or an empty string when none was found
 */
private static String parseNovelCover(Document doc, NovelCollectionModel novel) {
    String coverAddress = "";

    final Element thumbnail = doc.select(".thumbimage").first();
    if (thumbnail != null) {
        final String src = thumbnail.attr("src");
        coverAddress = src.startsWith("http") ? src : "http://www.baka-tsuki.org" + src;
        Log.d(TAG, "Cover: " + coverAddress);
    }
    novel.setCover(coverAddress);

    if (coverAddress == null || coverAddress.length() == 0) {
        return coverAddress;
    }

    try {
        novel.setCoverUrl(new URL(coverAddress));
    } catch (MalformedURLException e) {
        Log.e(TAG, "Invalid URL: " + coverAddress, e);
    }
    return coverAddress;
}
/**
 * Extracts the synopsis text of a novel page and stores it on {@code novel}.
 *
 * The language's synopsis marker selector is tried first. When it is not configured or
 * matches nothing, the selector {@code #mw-content-text,p} is used instead. If the match
 * came from a configured marker, the text is read starting at the element following the
 * marker's parent; otherwise it starts at the first child of the match. When that start
 * element is missing or contains no paragraph, only the first matched element itself is
 * considered.
 *
 * Starting there, non-paragraph siblings are skipped until the first {@code p}; then
 * consecutive paragraphs are appended, each followed by a newline, up to a maximum of 10.
 *
 * @param doc novel page document
 * @param novel novel to receive the synopsis
 * @param language key into the alternative language table; may be null
 * @return the synopsis, or an empty string when none was found
 */
private static String parseNovelSynopsis(Document doc, NovelCollectionModel novel, String language) {
    final int maxParagraphs = 10;
    StringBuilder synopsis = new StringBuilder();

    String source = "";
    if (language != null)
        source = AlternativeLanguageInfo.getAlternativeLanguageInfo().get(language).getMarkerSynopsis();

    // from the language specific synopsis marker; an empty selector is never queried
    Elements stage = null;
    if (source != null && source.length() > 0)
        stage = doc.select(source);

    // from main text
    if (stage == null || stage.size() <= 0) {
        source = "#mw-content-text,p";
        stage = doc.select(source);
        Log.i(TAG, "Synopsis from: " + source);
    }

    if (stage.size() > 0) {
        Element first = stage.first();
        Element synopsisE = first.children().first();

        // If the match came from a language marker, the synopsis follows the marker's
        // parent. The language table is only read here; entries must not be removed.
        for (AlternativeLanguageInfo info : AlternativeLanguageInfo.getAlternativeLanguageInfo().values()) {
            if (source.equals(info.getMarkerSynopsis())) {
                Element markerParent = first.parent();
                synopsisE = markerParent == null ? null : markerParent.nextElementSibling();
                break;
            }
        }

        boolean processOne = false;
        if (synopsisE == null || synopsisE.select("p").size() == 0) {
            // cannot found any synopsis, take the first available p
            synopsisE = first;
            processOne = true;
        }

        int paragraphs = 0;
        while (synopsisE != null) {
            // Compare tag names by content, not by reference.
            if (!"p".equals(synopsisE.tagName())) {
                synopsisE = synopsisE.nextElementSibling();
                continue;
            }
            paragraphs++;
            synopsis.append(synopsisE.text()).append("\n");
            synopsisE = synopsisE.nextElementSibling();
            // stop at the end of the first run of consecutive paragraphs
            if (synopsisE != null && !"p".equals(synopsisE.tagName()))
                break;
            if (paragraphs >= maxParagraphs)
                break; // limit only first 10 paragraph.
            if (processOne)
                break;
        }
    }

    String result = synopsis.toString();
    novel.setSynopsis(result);
    return result;
}
/**
 * Builds the content model of a chapter from an API response document.
 *
 * The page is marked as downloaded first. The text of the first {@code text} element is
 * taken as the chapter's markup: it is re-parsed to collect the image list and stored,
 * after image path replacement, as the content. Scroll positions are reset to 0 and the
 * zoom to {@code Constants.DISPLAY_SCALE}.
 *
 * @param doc response document containing a {@code text} element
 * @param page page model the content belongs to; flagged as downloaded
 * @return the populated content model
 * @throws Exception with message "Empty content!" when no {@code text} element exists
 */
public static NovelContentModel ParseNovelContent(Document doc, PageModel page) throws Exception {
    final NovelContentModel model = new NovelContentModel();
    page.setDownloaded(true);
    model.setPage(page.getPage());
    model.setPageModel(page);

    final Element payload = doc.select("text").first();
    if (payload == null) {
        throw new Exception("Empty content!");
    }
    final String markup = payload.text();

    // get valid image list
    final Document markupDoc = Jsoup.parse(markup);
    final String baseUrl = UIHelper.getBaseUrl(LNReaderApplication.getInstance().getApplicationContext());
    final ArrayList<ImageModel> foundImages = CommonParser.getAllImagesFromContent(markupDoc, baseUrl);
    model.setImages(foundImages);

    model.setContent(CommonParser.replaceImagePath(markup));

    // A freshly parsed chapter starts at the top-left with the default zoom.
    model.setLastXScroll(0);
    model.setLastYScroll(0);
    model.setLastZoom(Constants.DISPLAY_SCALE);
    return model;
}
}