/**
* Parse baka-tsuki wiki page
*/
package com.erakk.lnreader.parser;
import android.util.Log;
import com.erakk.lnreader.Constants;
import com.erakk.lnreader.helper.BakaReaderException;
import com.erakk.lnreader.helper.Util;
import com.erakk.lnreader.model.BookModel;
import com.erakk.lnreader.model.ImageModel;
import com.erakk.lnreader.model.NovelCollectionModel;
import com.erakk.lnreader.model.NovelContentModel;
import com.erakk.lnreader.model.PageModel;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.Iterator;
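/**
 * Static parsers that turn Baka-Tsuki wiki pages (jsoup documents) into the
 * reader's model objects: novel lists, novel details and chapter content.
 *
 * @author Nandaka
 */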
public class BakaTsukiParser {
private static final String TAG = BakaTsukiParser.class.toString();
/**
 * Parses the "#mw-pages" section of a listing page into novel pages.
 * <p>
 * Every {@code <li>} inside the first {@code #mw-pages} element that contains
 * a link becomes one {@link PageModel} of type {@link PageModel#TYPE_NOVEL}
 * with language {@link Constants#LANG_ENGLISH}. Created pages are numbered in
 * document order starting at 0. List items without an {@code <a>} element are
 * skipped.
 *
 * @param doc    parsed listing page; must not be null
 * @param parent value stored as the parent of every created page
 * @param status value stored as the status of every created page
 * @return the pages found, in document order; empty when the document has no
 *         "#mw-pages" element
 * @throws NullPointerException if {@code doc} is null
 */
public static ArrayList<PageModel> parseGenericNovelList(Document doc, String parent, String status) {
    ArrayList<PageModel> result = new ArrayList<PageModel>();
    if (doc == null) {
        throw new NullPointerException("Document cannot be null.");
    }

    Element stage = doc.select("#mw-pages").first();
    if (stage == null) {
        return result;
    }

    int order = 0;
    for (Element item : stage.select("li")) {
        Element link = item.select("a").first();
        if (link == null) {
            // An <li> without a link has no target page; skip it instead of
            // failing the whole list with a NullPointerException.
            continue;
        }

        PageModel page = new PageModel();
        page.setParent(parent);
        page.setPage(CommonParser.normalizeInternalUrl(link.attr("href")));
        page.setLanguage(Constants.LANG_ENGLISH);
        page.setType(PageModel.TYPE_NOVEL);
        page.setTitle(link.text());
        page.setStatus(status);
        page.setOrder(order);
        result.add(page);
        ++order;
    }
    return result;
}
/**
 * Builds the novel details (redirect target, synopsis, cover and book/chapter
 * list) from a novel's main page.
 * <p>
 * As a side effect the status of {@code page} is recomputed from the document
 * by {@code parseNovelStatus}.
 *
 * @param doc  parsed novel page; must not be null
 * @param page page model describing the novel; stored in the result
 * @return the populated novel collection
 * @throws NullPointerException if {@code doc} is null
 */
public static NovelCollectionModel ParseNovelDetails(Document doc, PageModel page) {
    NovelCollectionModel novel = new NovelCollectionModel();
    if (null == doc) {
        throw new NullPointerException("Document cannot be null.");
    }

    // Identity of the novel.
    novel.setPage(page.getPage());
    novel.setPageModel(page);
    novel.setRedirectTo(CommonParser.redirectedFrom(doc, page));

    // Content extracted from the document, in the same order as before.
    parseNovelSynopsis(doc, novel);
    CommonParser.parseNovelCover(doc, novel);
    parseNovelChapters(doc, novel);
    parseNovelStatus(doc, page);

    return novel;
}
/**
 * Extracts the chapter content and its image list from a fetched page.
 * <p>
 * The content is taken from the first {@code text} element of the document.
 * When there is none, the first element matching
 * {@code .noarticletext, error[code=missingtitle]} is used instead (its
 * {@code info} attribute if non-empty, otherwise its inner HTML) and the page
 * is flagged as missing. The page is always flagged as downloaded.
 *
 * @param doc  fetched chapter document
 * @param page page model of the chapter; updated in place
 * @return the content model holding the processed text and images
 * @throws Exception a {@link BakaReaderException} with code
 *                   {@link BakaReaderException#EMPTY_CONTENT} when neither a
 *                   content nor a "missing" element is present
 */
public static NovelContentModel ParseNovelContent(Document doc, PageModel page) throws Exception {
    NovelContentModel content = new NovelContentModel();
    page.setDownloaded(true);
    content.setPage(page.getPage());
    content.setPageModel(page);

    String text;
    Element body = doc.select("text").first();
    if (body != null) {
        text = body.text();
    } else {
        Element missing = doc.select(".noarticletext, error[code=missingtitle]").first();
        if (missing == null) {
            // Nothing usable at all: dump the document for diagnosis and give up.
            Log.d(TAG, "Content: \r\n" + doc.html());
            throw new BakaReaderException("Empty Content", BakaReaderException.EMPTY_CONTENT);
        }
        String info = missing.attr("info");
        text = Util.isStringNullOrEmpty(info) ? missing.html() : info;
        page.setMissing(true);
        Log.e(TAG, "Chapter is missing/deleted: " + page.getPage());
    }

    // get valid image list
    ArrayList<ImageModel> images = CommonParser.processImagesFromContent(Jsoup.parse(text));
    content.setImages(images);
    content.setContent(CommonParser.replaceImagePath(text));
    return content;
}
/**
 * Recomputes the status string of a novel page from its document.
 * <p>
 * The flags are derived as follows:
 * <ul>
 * <li>stalled: the document has a link titled {@code Template:STALLED}</li>
 * <li>abandoned: the document has a link titled {@code Template:Abandoned}</li>
 * <li>pending: the document has a link titled {@code Template:Warning:ATP}</li>
 * <li>teaser: the page's parent equals {@link Constants#ROOT_TEASER},
 * ignoring case</li>
 * </ul>
 * The matching status constants are joined with "|" in the order teaser,
 * stalled, abandoned, pending and stored on the page. A page without a parent
 * is treated as not being a teaser.
 *
 * @param doc  parsed novel page
 * @param page page model to update
 * @return the same {@code page} instance, with its status replaced
 */
private static PageModel parseNovelStatus(Document doc, PageModel page) {
    // Template:STALLED
    boolean isStalled = !doc.select("a[title=Template:STALLED]").isEmpty();
    if (isStalled) {
        Log.i(TAG, "Novel is stalled: " + page.getPage());
    }

    // Template:Abandoned
    boolean isAbandoned = !doc.select("a[title=Template:Abandoned]").isEmpty();
    if (isAbandoned) {
        Log.i(TAG, "Novel is abandoned: " + page.getPage());
    }

    // Template:Warning:ATP
    boolean isPending = !doc.select("a[title=Template:Warning:ATP]").isEmpty();
    if (isPending) {
        Log.i(TAG, "Novel is pending authorization: " + page.getPage());
    }

    // Teaser => parent = Category:Teasers.
    // The parent may be unset, so guard against null before comparing.
    String parent = page.getParent();
    boolean isTeaser = parent != null && parent.equalsIgnoreCase(Constants.ROOT_TEASER);
    if (isTeaser) {
        Log.i(TAG, "Novel is Teaser Project: " + page.getPage());
    }

    // update the status
    ArrayList<String> statuses = new ArrayList<String>();
    if (isTeaser) {
        statuses.add(Constants.STATUS_TEASER);
    }
    if (isStalled) {
        statuses.add(Constants.STATUS_STALLED);
    }
    if (isAbandoned) {
        statuses.add(Constants.STATUS_ABANDONED);
    }
    if (isPending) {
        statuses.add(Constants.STATUS_PENDING);
    }
    page.setStatus(Util.join(statuses, "|"));
    return page;
}
/**
 * Decides whether a heading span marks a section that lists books/chapters.
 * <p>
 * For English, the span qualifies when its id contains any of: the novel's
 * page name, the novel's redirect target, "_by", "Full_Text", "_Series",
 * "_series", "Side_Stor", "Short_Stor", "Parody_Stor" or "Bonus_Track".
 * Null or empty candidates are ignored. Any other language never qualifies.
 *
 * @param s        span element taken from a heading
 * @param novel    novel being parsed; supplies the page name and redirect target
 * @param language language code, compared case-insensitively with
 *                 {@link Constants#LANG_ENGLISH}
 * @return {@code true} if the span id matches one of the markers
 */
private static boolean validateH2(Element s, NovelCollectionModel novel, String language) {
    if (!language.equalsIgnoreCase(Constants.LANG_ENGLISH)) {
        return false;
    }

    String[] markers = {
            novel.getPage(), novel.getRedirectTo(),
            "_by", "Full_Text", "_Series", "_series",
            "Side_Stor", "Short_Stor", "Parody_Stor", "Bonus_Track"
    };
    for (String marker : markers) {
        if (Util.isStringNullOrEmpty(marker)) {
            continue;
        }
        if (s.id().contains(marker)) {
            return true;
        }
    }
    return false;
}
/**
 * Collects the books (volumes) and their chapters of a novel and stores them
 * on {@code novel}.
 * <p>
 * Every {@code h1}/{@code h2} heading that has a span accepted by
 * {@code validateH2} is processed with up to three strategies:
 * method 1 always runs; method 2 and then method 3 run only while no book has
 * been found at all, or when the novel was already detected as a single-book
 * novel and the previous strategy produced nothing for this heading.
 * Any exception aborts the scan, is logged, and the books gathered so far are
 * still stored after passing through {@code CommonParser.validateNovelBooks}.
 *
 * @param doc   parsed novel page
 * @param novel novel to receive the book collection
 */
private static void parseNovelChapters(Document doc, NovelCollectionModel novel) {
    ArrayList<BookModel> books = new ArrayList<BookModel>();
    boolean oneBookOnly = false;
    try {
        for (Element heading : doc.select("h1,h2")) {
            Elements spans = heading.select("span");
            if (spans.isEmpty()) {
                continue;
            }

            // Only headings carrying a recognised span id introduce a book list.
            boolean accepted = false;
            for (Element span : spans) {
                Log.d(TAG, "Checking: " + span.id());
                if (validateH2(span, novel, Constants.LANG_ENGLISH)) {
                    Log.d(TAG, "Got valid id: " + span.id());
                    accepted = true;
                    break;
                }
                Log.d(TAG, "Not valid id: " + span.id());
            }
            if (!accepted) {
                continue;
            }

            // Method 1: books introduced by <h3> headings.
            ArrayList<BookModel> tempBooks = parseBooksMethod1(novel, heading);
            if (tempBooks != null && !tempBooks.isEmpty()) {
                books.addAll(tempBooks);
            }

            // Method 2: single book whose title sits in a <p>.
            if (books.isEmpty() || (oneBookOnly && tempBooks.isEmpty())) {
                Log.d(TAG, "No books found, use method 2: Only have 1 book, chapter in <p> tag.");
                tempBooks = parseBooksMethod2(novel, heading);
                if (tempBooks != null && !tempBooks.isEmpty()) {
                    oneBookOnly = true;
                    books.addAll(tempBooks);
                }
            }

            // Method 3: single book whose chapters hang directly under the heading.
            if (books.isEmpty() || (oneBookOnly && tempBooks.isEmpty())) {
                Log.d(TAG, "No books found, use method 3: Only have 1 book.");
                tempBooks = parseBooksMethod3(novel, heading);
                if (tempBooks != null && !tempBooks.isEmpty()) {
                    oneBookOnly = true;
                    books.addAll(tempBooks);
                }
            }
        }
    } catch (Exception e) {
        Log.e(TAG, "Unknown Exception for " + novel.getPage() + ": " + e.getMessage(), e);
    }
    novel.setBookCollections(CommonParser.validateNovelBooks(books));
}
/**
 * Book parsing method 1: books are introduced by {@code <h3>} headings that
 * follow the given heading.
 * <p>
 * Walks the following siblings of {@code h2} until the end of the parent or
 * the next {@code <h2>}. A sibling that is itself an {@code <h3>} is handed to
 * {@code CommonParser.processH3}; for any other sibling, every {@code <h3>}
 * nested inside it is handed over instead.
 *
 * @param novel novel being parsed
 * @param h2    heading that starts the section to scan
 * @return the books found, possibly empty; never null
 */
private static ArrayList<BookModel> parseBooksMethod1(NovelCollectionModel novel, Element h2) {
    ArrayList<BookModel> books = new ArrayList<BookModel>();
    int bookOrder = 0;

    // Tag names are compared with equals(): reference comparison (==) on
    // strings only works when both sides happen to be the same interned object.
    for (Element sibling = h2.nextElementSibling();
         sibling != null && !"h2".equals(sibling.tagName());
         sibling = sibling.nextElementSibling()) {

        if ("h3".equals(sibling.tagName())) {
            bookOrder = CommonParser.processH3(novel, books, sibling, bookOrder, Constants.LANG_ENGLISH);
        } else {
            // The <h3> headings may be wrapped in another container element.
            for (Element h3 : sibling.select("h3")) {
                bookOrder = CommonParser.processH3(novel, books, h3, bookOrder, Constants.LANG_ENGLISH);
            }
        }
    }
    return books;
}
/**
 * Book parsing method 2: each {@code <p>} following the heading is a book
 * title and the chapter list follows it; usually there is only one book.
 * See 7_Nights.
 * <p>
 * Siblings of {@code h2} are walked until the end of the parent or the next
 * {@code <h2>}. For every {@code <p>}, the siblings after it are scanned up to
 * the next {@code <p>}; each {@code <li>} inside a {@code dl}, {@code ul} or
 * {@code div} is turned into chapters via {@code CommonParser.processLI}.
 * While no chapter has been collected, the first link inside the {@code <p>}
 * itself is used as a chapter.
 *
 * @param novel novel being parsed
 * @param h2    heading that starts the section to scan
 * @return the books found, possibly empty; never null
 */
private static ArrayList<BookModel> parseBooksMethod2(NovelCollectionModel novel, Element h2) {
    ArrayList<BookModel> books = new ArrayList<BookModel>();
    int bookOrder = 0;

    // Tag names are compared with equals(): reference comparison (==) on
    // strings only works when both sides happen to be the same interned object.
    for (Element bookElement = h2.nextElementSibling();
         bookElement != null && !"h2".equals(bookElement.tagName());
         bookElement = bookElement.nextElementSibling()) {

        if (!"p".equals(bookElement.tagName())) {
            continue;
        }

        BookModel book = new BookModel();
        // The second sanitize() argument tells whether the title contains a link.
        boolean titleContainsLink = bookElement.html().contains("href");
        book.setTitle(CommonParser.sanitize(bookElement.text(), titleContainsLink));
        book.setOrder(bookOrder);

        ArrayList<PageModel> chapterCollection = new ArrayList<PageModel>();
        String parent = novel.getPage() + Constants.NOVEL_BOOK_DIVIDER + book.getTitle();

        // parse the chapters.
        // NOTE(review): this scan stops at the next <p> or at the end of the
        // parent, but not at an <h2>; lists of a following section can be
        // picked up when no <p> intervenes. Kept as-is; confirm intent.
        boolean walkChapter = true;
        int chapterOrder = 0;
        Element chapterElement = bookElement;
        do {
            chapterElement = chapterElement.nextElementSibling();
            if (chapterElement == null) {
                walkChapter = false;
            } else {
                String tag = chapterElement.tagName();
                if ("p".equals(tag)) {
                    walkChapter = false;
                } else if ("dl".equals(tag) || "ul".equals(tag) || "div".equals(tag)) {
                    for (Element chapter : chapterElement.select("li")) {
                        ArrayList<PageModel> pageModels =
                                CommonParser.processLI(chapter, parent, chapterOrder, Constants.LANG_ENGLISH);
                        for (PageModel p : pageModels) {
                            if (p != null) {
                                chapterCollection.add(p);
                                ++chapterOrder;
                            }
                        }
                    }
                }
            }

            // no subchapter: fall back to the link inside the title paragraph.
            // NOTE(review): evaluated on every pass of the scan, so the
            // fallback can be added before a later list is reached.
            if (chapterCollection.isEmpty()) {
                Element link = bookElement.select("a").first();
                if (link != null) {
                    PageModel p = CommonParser.processA(link.text(), parent, chapterOrder, link, Constants.LANG_ENGLISH);
                    if (p != null) {
                        chapterCollection.add(p);
                        ++chapterOrder;
                    }
                }
            }
            book.setChapterCollection(chapterCollection);
        } while (walkChapter);

        books.add(book);
        ++bookOrder;
    }
    return books;
}
/**
 * Book parsing method 3: the novel has only one book and the chapter list is
 * nested in a {@code ul}/{@code dl} after the heading, e.g. Fate/Apocrypha,
 * Gekkou.
 * <p>
 * Siblings of {@code h2} are walked until the end of the parent or the next
 * {@code <h2>}. Every {@code ul} or {@code dl} sibling yields one book titled
 * after the heading text, with each {@code <li>} turned into chapters via
 * {@code CommonParser.processLI}.
 *
 * @param novel novel being parsed
 * @param h2    heading that starts the section to scan and names the book
 * @return the books found, possibly empty; never null
 */
private static ArrayList<BookModel> parseBooksMethod3(NovelCollectionModel novel, Element h2) {
    ArrayList<BookModel> books = new ArrayList<BookModel>();
    int bookOrder = 0;

    // Tag names are compared with equals(): reference comparison (==) on
    // strings only works when both sides happen to be the same interned object.
    for (Element bookElement = h2.nextElementSibling();
         bookElement != null && !"h2".equals(bookElement.tagName());
         bookElement = bookElement.nextElementSibling()) {

        String tag = bookElement.tagName();
        if (!"ul".equals(tag) && !"dl".equals(tag)) {
            continue;
        }

        BookModel book = new BookModel();
        // The second sanitize() argument tells whether the title contains a link.
        boolean titleContainsLink = h2.html().contains("href");
        book.setTitle(CommonParser.sanitize(h2.text(), titleContainsLink));
        book.setOrder(bookOrder);

        ArrayList<PageModel> chapterCollection = new ArrayList<PageModel>();
        String parent = novel.getPage() + Constants.NOVEL_BOOK_DIVIDER + book.getTitle();

        // parse the chapters.
        int chapterOrder = 0;
        for (Element chapter : bookElement.select("li")) {
            ArrayList<PageModel> pageModels =
                    CommonParser.processLI(chapter, parent, chapterOrder, Constants.LANG_ENGLISH);
            for (PageModel p : pageModels) {
                if (p != null) {
                    chapterCollection.add(p);
                    ++chapterOrder;
                }
            }
        }
        book.setChapterCollection(chapterCollection);
        books.add(book);
        ++bookOrder;
    }
    return books;
}
/**
 * Extracts the synopsis text of a novel and stores it on {@code novel}.
 * <p>
 * The starting point is the sibling following the parent of the first element
 * whose id contains "Synopsis". When no such element exists, the first child
 * of the first match of {@code #mw-content-text,p} is used. If that starting
 * point is missing or contains no {@code <p>}, the first matched element
 * itself becomes the starting point and only one paragraph is taken.
 * From the starting point, non-{@code <p>} siblings are skipped until the
 * first paragraph; then consecutive {@code <p>} siblings are appended (each
 * followed by a newline) until a non-{@code <p>} sibling, the end of the
 * parent, or the limit of 10 paragraphs is reached.
 *
 * @param doc   parsed novel page
 * @param novel novel to receive the synopsis
 * @return the synopsis text; empty when nothing was found
 */
private static String parseNovelSynopsis(Document doc, NovelCollectionModel novel) {
    final int maxParagraphs = 10; // limit only first 10 paragraph.
    StringBuilder synopsis = new StringBuilder();

    // from Story_Synopsis id
    Elements stage = doc.select("[id*=Synopsis]");
    // Remember which selector matched in a flag rather than comparing the
    // selector strings by reference.
    boolean fromSynopsisId = !stage.isEmpty();
    if (!fromSynopsisId) {
        // from main text
        String source = "#mw-content-text,p";
        stage = doc.select(source);
        Log.i(TAG, "Synopsis From: " + source);
    }

    if (!stage.isEmpty()) {
        Element first = stage.first();
        Element synopsisE;
        if (fromSynopsisId) {
            Element container = first.parent();
            synopsisE = (container == null) ? null : container.nextElementSibling();
        } else {
            synopsisE = first.children().first();
        }

        boolean processOne = false;
        if (synopsisE == null || synopsisE.select("p").isEmpty()) {
            // cannot found any synopsis, take the first available p
            synopsisE = first;
            processOne = true;
        }

        int paragraphs = 0;
        while (synopsisE != null) {
            if (!"p".equals(synopsisE.tagName())) {
                // Still looking for the first paragraph.
                synopsisE = synopsisE.nextElementSibling();
                continue;
            }
            paragraphs++;
            synopsis.append(synopsisE.text()).append("\n");
            synopsisE = synopsisE.nextElementSibling();

            // Stop at the first non-paragraph after the paragraph run.
            if (synopsisE != null && !"p".equals(synopsisE.tagName())) {
                break;
            }
            if (paragraphs >= maxParagraphs || processOne) {
                break;
            }
        }
    }

    String result = synopsis.toString();
    novel.setSynopsis(result);
    return result;
}
}