/**
* @author Niki Parmar <nikijitp@usc.edu>
*/
package edu.usc.cssl.tacit.crawlers.latin.services;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URL;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.eclipse.core.runtime.Assert;
import org.eclipse.core.runtime.IProgressMonitor;
import org.eclipse.core.runtime.Platform;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import edu.usc.cssl.tacit.common.ui.views.ConsoleView;
public class LatinCrawler {
private StringBuilder readMe = new StringBuilder();
String outputDir;
private Map<String, String> authorNames;
Set<String> skipBooks;
private IProgressMonitor monitor;
private int work;
public LatinCrawler() {
authorNames = new HashMap<String, String>();
// authorUrl = new HashSet<String>();
skipBooks = new HashSet<String>();
skipBooks.add("The Latin Library");
skipBooks.add("The Classics Page");
skipBooks.add("The Classics Homepage");
skipBooks.add("Christian Latin");
skipBooks.add("Thomas May");
skipBooks.add("Contemporary Latin");
skipBooks.add("Apollinaris Sidonius");
skipBooks.add("The Miscellany");
skipBooks.add("St. Thomas Aquinas");
skipBooks.add("St. Jerome");
// skipBooks.add("Leo the Great"); //check this
skipBooks.add("Isidore of Seville");
skipBooks.add("Seneca the Younger");
skipBooks.add("Seneca the Elder");
skipBooks.add("Miscellanea Carminum");
skipBooks.add("Velleius");
skipBooks.add("Neo-Latin");
skipBooks.add("The Bible");
skipBooks.add("Medieval Latin");
skipBooks.add("Christian");
skipBooks.add("Christina Latin");
skipBooks.add("Medieval");
skipBooks.add("Ius Romanum");
skipBooks.add("Miscellany");
skipBooks.add("Paulus Diaconus");
}
/**
* Connect to the url and retrieve the document
*/
protected Document retrieveDocumentFromUrl(String url) {
Document doc = null;
try {
monitor.subTask("Retrieving content from " +url);
doc = Jsoup.connect(url).timeout(10 * 1000).get();
} catch (IOException e) {
// Error handling->will do later
}
return doc;
}
/**
* Crawls the website http://www.thelatinlibrary.com/ and extract all books
* into specified output folder
*
* @throws IOException
*/
// public void crawl(SubProgressMonitor monitor, int work) throws IOException {
// this.monitor = monitor;
// this.work = work;
// monitor.beginTask("crawling Author list...", work);
// getAllBooks();
// writeReadMe(this.outputDir);
// this.monitor.done();
// }
//
// private void getAllBooks() throws IOException {
// Set<String> authors = (Set<String>) authorNames.keySet();
// int singleWork = work / authors.size();
// for (String name : authors) {
// monitor.subTask("Crawling " + name);
// if (monitor.isCanceled()) {
// throw new OperationCanceledException();
// }
// getBooksByAuthor(name, authorNames.get(name));
// monitor.worked(singleWork);
// }
// return;
// }
/**
*
* @param name
* : name of the author
* @param url
* : url of author's page which lists all his books
* @param monitor2
* @throws IOException
*/
public int getBooksByAuthor(String author, String url, IProgressMonitor monitor2) throws IOException {
int totalFilesCreated = 0;
Assert.isNotNull(author, "Parameter author can't be empty");
Assert.isNotNull(author, "Parameter url can't be empty");
this.monitor = monitor2;
String authDir = outputDir + File.separator + author;
createIfMissing(authDir);
Map<String, BookData> myBooks = new HashMap<String, BookData>();
try {
Document doc = retrieveDocumentFromUrl(url);// Jsoup.connect(url).timeout(10*1000).get();
Boolean isText = doc.getElementsByTag("a").get(0).attr("abs:href")
.contains("#");
if (!isText) {
getBooksByPage(doc, author, authDir, myBooks);
}
if (isText || myBooks.size() == 0) {
myBooks.put(url, new BookData(author, url, authDir));
}
} catch (Exception e) {
System.out.println(e);
}
for (Map.Entry<String, BookData> entry : myBooks.entrySet()) {
String bookName = entry.getValue().getBookName();
String bookUrl = entry.getValue().getBookUrl();
String bookDir = entry.getValue().getBookDir();
totalFilesCreated += getBookContent(bookUrl, bookName, bookDir);
}
return totalFilesCreated;
}
class BookData {
private String bookName;
private String bookUrl;
private String bookDir;
BookData(String bookName, String bookUrl, String bookDir) {
this.bookName = bookName;
this.bookUrl = bookUrl;
this.bookDir = bookDir;
}
public String getBookUrl() {
return bookUrl;
}
public String getBookDir() {
return bookDir;
}
public String getBookName() {
return bookName;
}
public void setBookName(String bookName) {
this.bookName = bookName;
}
public void setBookUrl(String bookUrl) {
this.bookUrl = bookUrl;
}
public void setBookDir(String bookDir) {
this.bookDir = bookDir;
}
}
/**
*
* @param doc
* This is the document from which books are collected.
* @param author
* This is the author of all the books in this page
* @param authDir
* This is the output directory of author. Books are to be
* extracted into this directory
* @param bookList
* The book information gathered from the page is filled into
* this Map for the callee function to access
*/
private void getBooksByPage(Document doc, String author, String authDir,
Map<String, BookData> bookList) {
Elements subLists = doc.select("div.work"); // the books are listed
// inside div.work
Elements subHeaders = doc.select("h2.work"); // headers of sections
// sometimes there are no div.work . Instead books are listed inside a
// table element.
if (subLists.size() == 0) {
Elements tableLists = doc.select("table");
for (Element table : tableLists) {
if (table.getElementsByTag("a").size() > 0) {
subLists.add(table);
} else {
subHeaders.add(table);
}
}
}
int i = 0, size1 = subLists.size(), size2 = subHeaders.size(), j = 0;
Element head = null;
String subDir = "";
authDir += File.separator; // books will be inside authDir>subDir>
while (i < size1 || j < size2) {
try {
if (j < size2) {
head = subHeaders.get(j);
Elements booksLink = head.getElementsByTag("a");
boolean booksPresent = getBooksFromElement(booksLink,
bookList, subDir, author);
if (booksPresent) {
j++;
continue;
}
subDir = authDir + head.text() + File.separator;
} else
subDir = authDir + File.separator;
if (i < size1) {
Elements booksLink = subLists.get(i).getElementsByTag("a");
getBooksFromElement(booksLink, bookList, subDir, author);
i++;
}
j++;
} catch (Exception e) {
}
}
}
private boolean getBooksFromElement(Elements bookLinks,
Map<String, BookData> bookList, String bookDir, String author) {
if (bookLinks.size() <= 0)
return false;
for (int i = 0; i < bookLinks.size(); i++) {
String bookUrl = bookLinks.get(i).attr("abs:href");
String bookName = bookLinks.get(i).text();
if (skipBooks.contains(bookName))
continue;
if (authorNames.containsKey(bookName)
|| (bookName.toLowerCase()).equals(author.toLowerCase()))
continue;
BookData book = new BookData(bookName, bookUrl, bookDir);
bookList.put(bookUrl, book);
}
return true;
}
/**
* Creates a directory in the file system if it does not already exists
*
* @param folder
* : full path of the directory which has to be created.
*/
private void createIfMissing(String folder) {
File path = new File(folder);
if (!path.exists()) {
path.mkdirs();
}
}
public void initialize(String outputDir) {
this.outputDir = outputDir;
}
public Map<String, String> getAuthorNames() throws IOException {
Map<String, String> authorNames = new HashMap<String, String>();
authorNames.putAll(getAuthorsList("http://www.thelatinlibrary.com/",
false));
authorNames.putAll(getAuthorsList(
"http://www.thelatinlibrary.com/medieval.html", true));
authorNames.putAll(getAuthorsList(
"http://www.thelatinlibrary.com/christian.html", true));
authorNames.putAll(getAuthorsList(
"http://www.thelatinlibrary.com/neo.html", true));
authorNames.putAll(getAuthorsList(
"http://www.thelatinlibrary.com/misc.html", true));
authorNames.putAll(getAuthorsList(
"http://www.thelatinlibrary.com/ius.html", true));
// authorNames.put("<All>", "All authors in the list");
return authorNames;
}
public Map<String, String> getAuthorsList(String url, boolean isSubAuthor)
throws IOException {
int i, size = 0;
String name;
Map<String, String> authorNames = new HashMap<String, String>();
Document doc = Jsoup.connect(url).timeout(10 * 1000).get();
if (!isSubAuthor) {
Elements authorsList = doc.getElementsByTag("option");
size = authorsList.size();
int count = 0;
for (i = 0; i < size; i++) {
name = authorsList.get(i).text();
// url = "http://www.thelatinlibrary.com/" +
// authorsList.get(i).attr("value");
if (skipBooks.contains(name))
continue;
authorNames.put(name, "http://www.thelatinlibrary.com/"
+ authorsList.get(i).attr("value"));
count++;
}
}
Element secondList = null;
if (isSubAuthor)
secondList = doc.getElementsByTag("table").get(0);
else
secondList = doc.getElementsByTag("table").get(1);
Elements auth2List = secondList.getElementsByTag("td");
for (Element auth : auth2List) {
name = auth.text();
// url = auth.getElementsByTag("a").attr("abs:href");
if (authorNames.containsKey(name))
continue;
if (skipBooks.contains(name))
continue;
authorNames.put(name, auth.getElementsByTag("a").attr("abs:href"));
}
return authorNames;
}
public void getAllAuthors() throws IOException {
Set<String> authors = (Set<String>) authorNames.keySet();
for (String name : authors) {
getSingleAuthor(name, authorNames.get(name));
}
return;
}
public void getSingleAuthor(String name, String url) throws IOException {
String aurl = outputDir + File.separator + name;
File authorDir = new File(aurl);
if (!authorDir.exists()) {
authorDir.mkdirs();
}
getBooks(name, url, aurl);
}
/*
* Get all books of a single author Recursive function to trace all books
* and links of a particular author
*/
private void getBooks(String author, String aurl, String apath)
throws IOException {
try {
Document doc = Jsoup.connect(aurl).timeout(10 * 1000).get();
Elements subLists = doc.select("div.work");
if (subLists != null && subLists.size() > 0) {
getBooksList(author, aurl, doc, apath);
return;
}
Elements booksList = doc.getElementsByTag("td");
int count = 0;
String bookname = "";
int i = 0;
if (booksList != null) {
int size = booksList.size();
for (i = 0; i < size; i++) {
Element bookItem = booksList.get(i);
String bookText = bookItem.getElementsByTag("a").attr(
"abs:href");
if (bookText.contains("#"))
continue;
bookname = bookItem.text();
if (bookText.isEmpty() || bookText == null)
continue;
if (skipBooks.contains(bookname))
continue;
if (authorNames.containsKey(bookname))
continue;
getBooks(bookname, bookText, apath);
count++;
}
}
if (count == 0) {
if (doc.select("title") != null
&& doc.select("title").size() > 0)
bookname = doc.select("title").first().text();
else if (doc.select("p.pagehead") != null
&& doc.select("p.pagehead").size() > 0)
bookname = doc.select("p.pagehead").first().text();
else if (doc.select("h1") != null
&& doc.select("h1").size() > 0)
bookname = doc.select("h1").first().text();
else
bookname = author;
getContent(aurl, bookname, apath);
}
} catch (Exception e) {
// ConsoleView.writeInConsole("Something went wrong when extracting books of Author "
// + author + e);
}
}
private void getBooksList(String author, String aurl, Document doc,
String apath) {
Elements subLists = doc.select("div.work");
Elements subHeaders = doc.select("h2.work");
int i = 0, size1 = subLists.size(), size2 = subHeaders.size(), j = 0;
String bookText = "";
String bookname = "";
Element head = null;
int k;
// handle count of headers and div
while (i < size1 || j < size2) {
try {
if (j < size2) {
head = subHeaders.get(j);
Elements bookLink = head.getElementsByTag("a");
if (bookLink != null && bookLink.size() > 0) {
// make for all links
bookText = bookLink.get(0).attr("abs:href");
bookname = bookLink.get(0).text();
if (skipBooks.contains(bookname))
continue;
if (authorNames.containsKey(bookname)
|| (bookname.toLowerCase()).equals(author
.toLowerCase()))
continue;
File authorDir = new File(apath + File.separator
+ bookname);
if (!authorDir.exists()) {
authorDir.mkdirs();
}
String apath1 = authorDir.toString();
getBooks(bookname, bookText, apath1);
j++;
continue;
}
}
if (i < size1) {
Element list = subLists.get(i);
Elements booksList = list.getElementsByTag("td");
String authorNewDir = apath + File.separator;
if (j < size2)
authorNewDir += head.text();
else
authorNewDir += "Others";
File authorDir = new File(authorNewDir);
if (!authorDir.exists()) {
authorDir.mkdirs();
}
String apath1 = authorDir.toString();
k = 0;
int count1 = 0;
Element bookItem = null;
if (booksList != null) {
for (k = 0; k < booksList.size(); k++) {
bookItem = booksList.get(k);
bookText = bookItem.getElementsByTag("a").attr(
"abs:href");
if (bookText == null)
continue;
bookname = bookItem.text();
if (skipBooks.contains(bookname))
continue;
if (authorNames.containsKey(bookname))
continue;
getBooks(bookname, bookText, apath1);
count1++;
}
}
if (count1 == 0) {
if (doc.select("title") != null)
bookname = doc.select("title").first().text();
else if (doc.select("p.pagehead") != null
&& doc.select("p.pagehead").size() > 0)
bookname = doc.select("p.pagehead").first().text();
else if (doc.select("h1") != null
&& doc.select("h1").size() > 0)
bookname = doc.select("h1").first().text();
else
bookname = author;
getContent(aurl, bookname, apath1);
}
i++;
}
j++;
} catch (Exception e) {
// ConsoleView.writeInConsole("Something went wrong when extracting books of Author "
// + author);
}
}
}
private void getContent(String bookUri, String bookDir, String authorDir)
throws IOException {
BufferedWriter csvWriter = null;
try {
bookDir = bookDir.replaceAll(
"[.,;\"!-()\\[\\]{}:?'/\\`~$%#@&*_=+<>*$]", "");
csvWriter = new BufferedWriter(new FileWriter(new File(authorDir
+ System.getProperty("file.separator") + bookDir + ".txt")));
Document doc = Jsoup.connect(bookUri).timeout(10 * 10000).get();
Elements content = doc.getElementsByTag("p");
if (content.size() == 0) {
if (csvWriter != null)
csvWriter.close();
getBookContent(bookUri, bookDir, authorDir);
return;
}
ConsoleView.printlInConsoleln("Writing Content at " + authorDir
+ System.getProperty("file.separator") + bookDir + ".txt");
for (Element c : content) {
csvWriter.write(c.text() + "\n");
}
} catch (Exception e) {
getBookContent(bookUri, bookDir, authorDir);
} finally {
if (csvWriter != null)
csvWriter.close();
}
}
private int getBookContent(String bookUri, String bookName, String bookDir)
throws IOException {
BufferedWriter csvWriter = null;
try {
Document doc = retrieveDocumentFromUrl(bookUri);// Jsoup.connect(bookUri).timeout(10*10000).get();
// bookName = getBookNameFromDoc(doc, bookName);
bookName = bookName.replaceAll(
"[.,;\"!-()\\[\\]{}:?'/\\`~$%#@&*_=+<>*$]", "");
createIfMissing(bookDir);
File fName = new File(bookDir
+ System.getProperty("file.separator") + bookName
+ ".txt");
csvWriter = new BufferedWriter(
new FileWriter(fName));
Elements content = doc.getElementsByTag("p");
if (content.size() == 0) {
if (csvWriter != null)
csvWriter.close();
return getBookContent2(bookUri, bookName, bookDir);
}
for (Element c : content) {
csvWriter.write(c.text() + "\n");
}
ConsoleView.printlInConsoleln("Writing Content at"+fName.getAbsolutePath());
} catch (Exception e) {
return getBookContent2(bookUri, bookName, bookDir);
} finally {
if (csvWriter != null)
csvWriter.close();
}
return 1;
}
private int getBookContent2(String bookUri, String bookDir,
String authorDir) throws IOException {
BufferedWriter csvWriter = null;
try {
File fName = new File(authorDir
+ System.getProperty("file.separator") + bookDir + ".txt");
csvWriter = new BufferedWriter(new FileWriter(fName));
Document doc = Jsoup.parse(new URL(bookUri).openStream(), "UTF-16",
bookUri);
Elements content = doc.getElementsByTag("p");
ConsoleView.printlInConsoleln("Writing Contents at " +fName.getAbsolutePath());
for (Element c : content) {
csvWriter.write(c.text() + "\n");
}
} catch (Exception e) {
} finally {
if (csvWriter != null)
csvWriter.close();
}
return 1;
} /* The sub libraries */
/*
* public void getAllSubAuthors(String connectUrl, String output) throws
* IOException{ int i = 0, size = 0; String name, url; Document doc =
* Jsoup.connect(connectUrl).timeout(10*1000).get();
*
* Element secondList = doc.getElementsByTag("table").get(0); Elements
* auth2List = secondList.getElementsByTag("td"); for(Element auth :
* auth2List) { name = auth.text(); url =
* auth.getElementsByTag("a").attr("abs:href");
* if(authorNames.containsKey(name)) continue; //authorNames.add(name);
* if(skipBooks.contains(name)) continue; String aurl = output +
* File.separator + name; File authorDir = new File(aurl);
* if(!authorDir.exists()){ authorDir.mkdirs(); }
* //ConsoleView.writeInConsole("Extracting Books of Author "+ name
* +"..."); appendLog("\nExtracting Books of Author "+ name +"...");
* getBooks(name, url, aurl); i++;
*
* }
*
* return; }
*/
/*
* private void callSpecialAuthors(String url, String name) throws
* IOException{ File authorDir = new File(outputDir + File.separator +
* name); if(!authorDir.exists()){ authorDir.mkdirs(); }
* //getAllSubAuthors(url, outputDir + File.separator + name); }
*
*
* }
*/
public void writeReadMe(String location) {
monitor.subTask("Writing Read Me ...");
File readme = new File(location + File.separator + "README.txt");
try {
BufferedWriter bw = new BufferedWriter(new FileWriter(readme));
String plugV = Platform
.getBundle("edu.usc.cssl.tacit.plugins.latincrawler")
.getHeaders().get("Bundle-Version");
String appV = Platform
.getBundle("edu.usc.cssl.tacit.application")
.getHeaders().get("Bundle-Version");
Date date = new Date();
bw.write("Latin Crawler Output\n--------------------\n\nApplication Version: "
+ appV
+ "\nPlugin Version: "
+ plugV
+ "\nDate: "
+ date.toString() + "\n\n");
bw.write(readMe.toString());
bw.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}