LatinCrawler.java example

Explorer

TACIT-master
- edu.usc.cssl.tacit.classify.naivebayes
  - src
    - edu
      - usc
        cssl
        tacit
        classify
        naivebayes
        Activator.java
        services
        CommandOption.java
        CrossValidationIterator.java
        CrossValidator.java
        NaiveBayesClassifier.java
        Text2Vectors.java
        Vectors2Classify.java
- edu.usc.cssl.tacit.classify.naivebayes.ui
  - src
    - edu
      - usc
        cssl
        tacit
        classify
        naivebayes
        ui
        Activator.java
        NaiveBayesClassifierView.java
        handlers
        NaiveBayesClassifierViewHandler.java
        internal
        INaiveBayesClassifierViewConstants.java
        NaiveBayesClassifierViewImageRegistry.java
- edu.usc.cssl.tacit.classify.svm
  - src
    - edu
      - usc
        cssl
        tacit
        classify
        svm
        Activator.java
        services
        CrossValidator.java
        SVMClassify.java
        SVMPredict.java
        SVMTrain.java
- edu.usc.cssl.tacit.classify.svm.ui
  - src
    - edu
      - usc
        cssl
        tacit
        classifiy
        svm
        ui
        Activator.java
        SVMView.java
        classify
        svm
        ui
        handlers
        OpenSVMViewHandler.java
        internal
        ISVMViewConstants.java
        SVMViewImageRegistry.java
- edu.usc.cssl.tacit.cluster.hierarchical
  - src
    - edu
      - usc
        cssl
        tacit
        cluster
        hierarchical
        Activator.java
        services
        HierarchicalClusterAnalysis.java
- edu.usc.cssl.tacit.cluster.hierarchical.ui
  - src
    - edu
      - usc
        cssl
        tacit
        cluster
        hierarchical
        ui
        Activator.java
        HierarchicalClusterView.java
        handlers
        OpenHierarchicalClusterViewHandler.java
        internal
        HeirarchicalClusterViewImageRegistry.java
        IHeirarchicalClusterViewConstants.java
- edu.usc.cssl.tacit.cluster.kmeans
  - src
    - edu
      - uc
        cssl
        tacit
        cluster
        kmeans
        Activator.java
        services
        KmeansClusterAnalysis.java
- edu.usc.cssl.tacit.cluster.kmeans.ui
  - src
    - edu
      - usc
        cssl
        tacit
        cluster
        kmeans
        ui
        Activator.java
        KmeansClusterView.java
        handlers
        OpenKmeansClusterViewHandler.java
        internal
        IKmeansClusterViewConstants.java
        KmeansClusterViewImageRegistry.java
- edu.usc.cssl.tacit.common
  - src
    - edu
      - usc
        cssl
        tacit
        common
        Activator.java
        JsonHandler.java
        Preprocess.java
        TacitUtility.java
        crawlers
        ICrawler.java
        snowballstemmer
        Among.java
        DanishStemmer.java
        DutchStemmer.java
        EnglishStemmer.java
        FinnishStemmer.java
        FrenchStemmer.java
        GermanStemmer.java
        HungarianStemmer.java
        ItalianStemmer.java
        NorwegianStemmer.java
        PorterStemmer.java
        PortugueseStemmer.java
        RomanianStemmer.java
        RussianStemmer.java
        SnowballProgram.java
        SnowballStemmer.java
        SpanishStemmer.java
        SwedishStemmer.java
        TurkishStemmer.java
- edu.usc.cssl.tacit.common.ui
  - src
    - edu
      - usc
        cssl
        tacit
        common
        ui
        CommonUiActivator.java
        ICommonUiConstants.java
        IPreprocessorSettingsConstant.java
        composite
        from
        TacitFormComposite.java
        handlers
        OpenConsoleViewHandler.java
        OpenCorpusManagementViewHandler.java
        OpenPreProcessViewHandler.java
        OpenProblemsViewHandler.java
        OpenProgressViewHandler.java
        internal
        CommonUiViewImageRegistry.java
        TargetLocationContentProvider.java
        TargetLocationLabelProvider.java
        TargetLocationsGroup.java
        TreeParent.java
        outputdata
        OutputLayoutData.java
        TableLayoutData.java
        preferencepage
        ELanguageType.java
        MainPreferencePage.java
        PrepocessorSettings.java
        utility
        INlpCommonUiConstants.java
        IconRegistry.java
        validation
        OutputPathValidation.java
        views
        ConsoleView.java
        CorpusManagementView.java
        PreprocessorView.java
- edu.usc.cssl.tacit.crawlers.latin
  - src
    - edu
      - usc
        cssl
        tacit
        crawlers
        latin
        Activator.java
        ILatinCrawlerConstants.java
        services
        LatinCrawler.java
- edu.usc.cssl.tacit.crawlers.latin.ui
  - src
    - edu
      - usc
        cssl
        tacit
        crawlers
        latin
        ui
        Activator.java
        AuthorListDialog.java
        LatinCrawlerView.java
        handlers
        OpenLatinCrawlerViewHandler.java
        internal
        ILatinCrawlerUIConstants.java
        LatinCrawlerImageRegistry.java
- edu.usc.cssl.tacit.crawlers.reddit
  - src
    - edu
      - usc
        cssl
        tacit
        crawlers
        reddit
        Activator.java
        services
        RedditCrawler.java
        RedditPlugin.java
- edu.usc.cssl.tacit.crawlers.reddit.ui
  - src
    - edu
      - usc
        cssl
        tacit
        crawlers
        reddit
        ui
        Activator.java
        RedditCrawlerView.java
        handlers
        RedditCrawlerViewHandler.java
        internal
        IRedditCrawlerViewConstants.java
        RedditCrawlerViewImageRegistry.java
- edu.usc.cssl.tacit.crawlers.senate
  - src
    - edu
      - usc
        cssl
        tacit
        crawlers
        senate
        Activator.java
        services
        AvailableRecords.java
        SenateCrawler.java
        SenatorDetails.java
- edu.usc.cssl.tacit.crawlers.senate.ui
  - src
    - edu
      - usc
        cssl
        tacit
        crawlers
        senate
        ui
        Activator.java
        SenateCrawlerView.java
        SenatorListDialog.java
        handlers
        SenateCrawlerViewHandler.java
        internal
        AbstractElementListSelectionDialog.java
        ElementListSelectionDialog.java
        FilteredList.java
        ISenateCrawlerViewConstants.java
        SenateCrawlerViewImageRegistry.java
- edu.usc.cssl.tacit.crawlers.supremecourt
  - src
    - edu
      - uc
        cssl
        tacit
        crawlers
        supremecourt
        Activator.java
        services
        CrawlerData.java
        CrawlerJob.java
        MyThread.java
        SupremCrawlerFilter.java
        SupremeCourtCrawler.java
- edu.usc.cssl.tacit.crawlers.supremecourt.ui
  - src
    - edu
      - usc
        cssl
        tacit
        crawlers
        supremecourt
        ui
        Activator.java
        SupremeCrawlerView.java
        handlers
        OpenSupremeCrawlerViewHandler.java
        internal
        ISupremeCrawlerUIConstants.java
        SupremeCrawlerImageRegistry.java
- edu.usc.cssl.tacit.crawlers.twitter
  - src
    - edu
      - usc
        cssl
        tacit
        crawlers
        twitter
        Activator.java
        services
        TwitterStreamApi.java
        TwitterTestProgram.java
- edu.usc.cssl.tacit.crawlers.twitter.ui
  - src
    - edu
      - usc
        cssl
        tacit
        crawlers
        twitter
        ui
        Activator.java
        TwitterCrawlerView.java
        handlers
        OpenTwitterCrawlerViewHandler.java
        internal
        ITwitterCrawlerUIConstants.java
        TwitterCrawlerImageRegistry.java
        preferencepage
        ITwitterConstant.java
        TwitterUserConfiguration.java
- edu.usc.cssl.tacit.help
  - src
    - edu
      - usc
        cssl
        tacit
        help
        Activator.java
- edu.usc.cssl.tacit.repository
  - src
    - edu
      - usc
        cssl
        tacit
        repository
        Activator.java
        Application.java
        ApplicationActionBarAdvisor.java
        ApplicationWorkbenchAdvisor.java
        ApplicationWorkbenchWindowAdvisor.java
        Perspective.java
- edu.usc.cssl.tacit.topicmodel.lda
  - src
    - edu
      - usc
        cssl
        tacit
        topicmodel
        lda
        Activator.java
        services
        LdaAnalysis.java
        Text2Vectors.java
        Vectors2Topics.java
- edu.usc.cssl.tacit.topicmodel.lda.ui
  - src
    - edu
      - usc
        cssl
        tacit
        topicmodel
        lda
        ui
        Activator.java
        LdaTopicModelView.java
        handlers
        OpenLdaTopicModelViewHandler.java
        internal
        ILdaTopicModelClusterViewConstants.java
        LdaTopicModelViewImageRegistry.java
- edu.usc.cssl.tacit.topicmodel.zlda
  - src
    - edu
      - usc
        cssl
        tacit
        topicmodel
        zlda
        Activator.java
        services
        DTWC.java
        ZlabelLDA.java
        ZlabelTopicModelAnalysis.java
- edu.usc.cssl.tacit.topicmodel.zlda.ui
  - src
    - edu
      - usc
        cssl
        tacit
        topicmodel
        zlda
        ui
        Activator.java
        ZlabelLdaTopicModelView.java
        handlers
        OpenZlabelLdaTopicModelViewHandler.java
        internal
        IZlabelLdaTopicModelClusterViewConstants.java
        ZlabelLdaTopicModelViewImageRegistry.java
- edu.usc.cssl.tacit.wordcount.cooccurrence
  - src
    - edu
      - usc
        cssl
        tacit
        wordcount
        cooccurrence
        Activator.java
        services
        CooccurrenceAnalysis.java
- edu.usc.cssl.tacit.wordcount.cooccurrence.ui
  - src
    - edu
      - usc
        cssl
        tacit
        wordcount
        cooccurrence
        ui
        Activator.java
        CooccurrenceWordCountView.java
        handlers
        OpenCooccurrenceWordCountViewHandler.java
        internal
        CooccurrenceWordCountImageRegistry.java
        ICooccurrenceWordCountViewConstants.java
- edu.usc.cssl.tacit.wordcount.standard
  - src
    - edu
      - usc
        cssl
        tacit
        wordcount
        standard
        Activator.java
        services
        WordCountPlugin.java
- edu.usc.cssl.tacit.wordcount.standard.ui
  - src
    - edu
      - usc
        cssl
        tacit
        wordcount
        standard
        ui
        Activator.java
        StandardWordCountView.java
        handlers
        OpenStandardWordCountViewHandler.java
        internal
        IStandardWordCountViewConstants.java
        StandardWordCountImageRegistry.java
- edu.usc.cssl.tacit.wordcount.weighted
  - src
    - edu
      - uc
        cssl
        tacit
        wordcount
        weighted
        Activator.java
        services
        Trie.java
        WordCountApi.java
- edu.usc.cssl.tacit.wordcount.weighted.ui
  - src
    - edu
      - usc
        cssl
        tacit
        wordcount
        weighted
        ui
        Activator.java
        WeightedWordCountView.java
        handlers
        OpenWeightedWordCountViewHandler.java
        internal
        AddLocationPage.java
        EditDirectoryContainerPage.java
        IWeightedWordCountViewConstants.java
        WeightedWordCountImageRegistry.java

/**
 * @author Niki Parmar <nikijitp@usc.edu>
 */

package edu.usc.cssl.tacit.crawlers.latin.services;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.eclipse.core.runtime.Assert;
import org.eclipse.core.runtime.IProgressMonitor;
import org.eclipse.core.runtime.Platform;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import edu.usc.cssl.tacit.common.ui.views.ConsoleView;

public class LatinCrawler {
	private StringBuilder readMe = new StringBuilder();
	String outputDir;
	private Map<String, String> authorNames;
	Set<String> skipBooks;
	private IProgressMonitor monitor;
	private int work;

	public LatinCrawler() {
		authorNames = new HashMap<String, String>();
		// authorUrl = new HashSet<String>();
		skipBooks = new HashSet<String>();

		skipBooks.add("The Latin Library");
		skipBooks.add("The Classics Page");
		skipBooks.add("The Classics Homepage");
		skipBooks.add("Christian Latin");
		skipBooks.add("Thomas May");
		skipBooks.add("Contemporary Latin");
		skipBooks.add("Apollinaris Sidonius");
		skipBooks.add("The Miscellany");
		skipBooks.add("St. Thomas Aquinas");
		skipBooks.add("St. Jerome");
		// skipBooks.add("Leo the Great"); //check this
		skipBooks.add("Isidore of Seville");
		skipBooks.add("Seneca the Younger");
		skipBooks.add("Seneca the Elder");
		skipBooks.add("Miscellanea Carminum");
		skipBooks.add("Velleius");
		skipBooks.add("Neo-Latin");
		skipBooks.add("The Bible");
		skipBooks.add("Medieval Latin");
		skipBooks.add("Christian");
		skipBooks.add("Christina Latin");
		skipBooks.add("Medieval");
		skipBooks.add("Ius Romanum");
		skipBooks.add("Miscellany");
		skipBooks.add("Paulus Diaconus");
	}

	/**
	 * Connects to the url and retrieves the parsed document.
	 * 
	 * @param url
	 *            : address of the page to download
	 * @return the parsed page, or <code>null</code> when the download fails
	 *         with an IOException (the failure is reported on the console)
	 */

	protected Document retrieveDocumentFromUrl(String url) {
		Document doc = null;
		try {
			// The monitor is only assigned by getBooksByAuthor(); the
			// getSingleAuthor() path reaches this method without one, so a
			// missing monitor must not abort the download.
			if (monitor != null) {
				monitor.subTask("Retrieving content from " +url);
			}
			doc = Jsoup.connect(url).timeout(10 * 1000).get();
		} catch (IOException e) {
			// Callers treat null as "page unavailable"; tell the user why.
			ConsoleView.printlInConsoleln("Could not retrieve " + url + " : "
					+ e.getMessage());
		}
		return doc;
	}

	/**
	 * Crawls the website http://www.thelatinlibrary.com/ and extract all books
	 * into specified output folder
	 * 
	 * @throws IOException
	 */
//	public void crawl(SubProgressMonitor monitor, int work) throws IOException {
//		this.monitor = monitor;
//		this.work = work;
//		monitor.beginTask("crawling Author list...", work);
//		getAllBooks();
//		writeReadMe(this.outputDir);
//		this.monitor.done();
//	}
//
//	private void getAllBooks() throws IOException {
//		Set<String> authors = (Set<String>) authorNames.keySet();
//		int singleWork = work / authors.size();
//		for (String name : authors) {
//			monitor.subTask("Crawling " + name);
//			if (monitor.isCanceled()) {
//				throw new OperationCanceledException();
//			}
//			getBooksByAuthor(name, authorNames.get(name));
//			monitor.worked(singleWork);
//		}
//		return;
//	}

	/**
	 * Downloads the books of one author into a folder named after the author
	 * below the configured output directory.
	 * 
	 * @param author
	 *            : name of the author
	 * @param url
	 *            : url of author's page which lists all his books
	 * @param monitor2
	 *            : progress monitor used for sub-task messages; it is stored
	 *            in this crawler for the following downloads
	 * @return sum of the values returned by getBookContent for every book
	 *         found for this author
	 * @throws IOException
	 */
	public int getBooksByAuthor(String author, String url, IProgressMonitor monitor2) throws IOException {
		int totalFilesCreated = 0;
		Assert.isNotNull(author, "Parameter author can't be empty");
		// The original checked author twice, so a null url slipped through.
		Assert.isNotNull(url, "Parameter url can't be empty");
		this.monitor = monitor2;
		String authDir = outputDir + File.separator + author;
		createIfMissing(authDir);

		Map<String, BookData> myBooks = new HashMap<String, BookData>();
		try {
			Document doc = retrieveDocumentFromUrl(url);
			if (doc == null) {
				// retrieveDocumentFromUrl returns null when the page could
				// not be downloaded; nothing can be collected in that case.
				ConsoleView.printlInConsoleln("No content could be retrieved for author "
						+ author);
			} else {
				// A first link pointing to an in-page anchor ("#") marks a
				// page that holds the text itself rather than a book list.
				boolean isText = doc.getElementsByTag("a").get(0)
						.attr("abs:href").contains("#");
				if (!isText) {
					getBooksByPage(doc, author, authDir, myBooks);
				}
				if (isText || myBooks.size() == 0) {
					myBooks.put(url, new BookData(author, url, authDir));
				}
			}
		} catch (Exception e) {
			// e.g. a page without any link; keep crawling other authors
			ConsoleView.printlInConsoleln("Could not list the books of " + author
					+ " : " + e);
		}

		for (Map.Entry<String, BookData> entry : myBooks.entrySet()) {
			BookData book = entry.getValue();
			totalFilesCreated += getBookContent(book.getBookUrl(),
					book.getBookName(), book.getBookDir());
		}
		return totalFilesCreated;
	}

	/**
	 * Simple mutable holder describing one book: its display name, the url
	 * it is downloaded from and the directory it is written to.
	 */
	class BookData {

		private String bookName, bookUrl, bookDir;

		BookData(String bookName, String bookUrl, String bookDir) {
			this.bookDir = bookDir;
			this.bookUrl = bookUrl;
			this.bookName = bookName;
		}

		/** @return the name of the book */
		public String getBookName() {
			return this.bookName;
		}

		/** @param bookName the new name of the book */
		public void setBookName(String bookName) {
			this.bookName = bookName;
		}

		/** @return the url the book is read from */
		public String getBookUrl() {
			return this.bookUrl;
		}

		/** @param bookUrl the new url of the book */
		public void setBookUrl(String bookUrl) {
			this.bookUrl = bookUrl;
		}

		/** @return the directory the book is written to */
		public String getBookDir() {
			return this.bookDir;
		}

		/** @param bookDir the new output directory of the book */
		public void setBookDir(String bookDir) {
			this.bookDir = bookDir;
		}
	}

	/**
	 * Collects the books linked from an author page into bookList, pairing
	 * the page's section headers (h2.work) with its book lists (div.work).
	 * 
	 * @param doc
	 *            This is the document from which books are collected.
	 * @param author
	 *            This is the author of all the books in this page
	 * @param authDir
	 *            This is the output directory of author. Books are to be
	 *            extracted into this directory
	 * @param bookList
	 *            The book information gathered from the page is filled into
	 *            this Map for the caller to access
	 */
	private void getBooksByPage(Document doc, String author, String authDir,
			Map<String, BookData> bookList) {
		Elements subLists = doc.select("div.work"); // the books are listed
													// inside div.work
		Elements subHeaders = doc.select("h2.work"); // headers of sections

		// sometimes there are no div.work . Instead books are listed inside a
		// table element. Tables that contain links are then used as book
		// lists, tables without links as section headers.
		if (subLists.size() == 0) {
			Elements tableLists = doc.select("table");
			for (Element table : tableLists) {
				if (table.getElementsByTag("a").size() > 0) {
					subLists.add(table);
				} else {
					subHeaders.add(table);
				}
			}
		}

		// i walks the book lists, j walks the headers
		int i = 0, size1 = subLists.size(), size2 = subHeaders.size(), j = 0;
		Element head = null;
		String subDir = "";
		authDir += File.separator; // books will be inside authDir>subDir>

		while (i < size1 || j < size2) {
			try {
				if (j < size2) {
					head = subHeaders.get(j);
					Elements booksLink = head.getElementsByTag("a");
					// A header that itself contains links is treated as a
					// list of books; they are registered under the subDir of
					// the previous section.
					// NOTE(review): subDir is still "" when this happens for
					// the first header - confirm that is intended.
					boolean booksPresent = getBooksFromElement(booksLink,
							bookList, subDir, author);
					if (booksPresent) {
						// header consumed, no book list is paired with it
						j++;
						continue;
					}
					// plain header: its text names the section directory
					subDir = authDir + head.text() + File.separator;
				} else
					// no headers left: remaining lists go directly under
					// authDir (which already ends with a separator)
					subDir = authDir + File.separator;

				if (i < size1) {
					Elements booksLink = subLists.get(i).getElementsByTag("a");
					getBooksFromElement(booksLink, bookList, subDir, author);
					i++;
				}
				j++;
			} catch (Exception e) {
				// NOTE(review): exceptions are silently ignored; if one is
				// thrown before i/j advance, the same iteration is retried.
			}
		}
	}

	/**
	 * Registers the given links as books of an author, keyed by their
	 * absolute url. Links whose caption is in the skip list, is a known
	 * author name or equals the author's own name (ignoring case) are left
	 * out.
	 * 
	 * @param bookLinks
	 *            links to inspect
	 * @param bookList
	 *            map receiving one BookData per accepted link
	 * @param bookDir
	 *            directory stored in every created BookData
	 * @param author
	 *            name of the author the links belong to
	 * @return false when bookLinks contains no link at all; true otherwise,
	 *         even if every link was filtered out
	 */
	private boolean getBooksFromElement(Elements bookLinks,
			Map<String, BookData> bookList, String bookDir, String author) {
		if (bookLinks.isEmpty()) {
			return false;
		}
		for (Element link : bookLinks) {
			String href = link.attr("abs:href");
			String caption = link.text();
			boolean excluded = skipBooks.contains(caption)
					|| authorNames.containsKey(caption)
					|| caption.toLowerCase().equals(author.toLowerCase());
			if (!excluded) {
				bookList.put(href, new BookData(caption, href, bookDir));
			}
		}
		return true;
	}

	/**
	 * Makes sure a directory exists, creating it together with any missing
	 * parent directories.
	 * 
	 * @param folder
	 *            : full path of the directory which has to be created.
	 */
	private void createIfMissing(String folder) {
		File directory = new File(folder);
		if (directory.exists()) {
			return;
		}
		directory.mkdirs();
	}

	/**
	 * Sets the root directory below which the author folders and book files
	 * are created.
	 * 
	 * @param outputDir
	 *            : root output directory of the crawler
	 */
	public void initialize(String outputDir) {
		this.outputDir = outputDir;
	}

	/**
	 * Collects the authors of the main page and of the sub-library pages of
	 * thelatinlibrary.com.
	 * <p>
	 * NOTE(review): the result is only returned; the authorNames field of
	 * this crawler is not filled here - confirm that callers do not rely on
	 * it.
	 * 
	 * @return map from author name to the url of the author's page
	 * @throws IOException
	 *             if one of the pages cannot be downloaded
	 */
	public Map<String, String> getAuthorNames() throws IOException {
		final String[] subLibraryPages = {
				"http://www.thelatinlibrary.com/medieval.html",
				"http://www.thelatinlibrary.com/christian.html",
				"http://www.thelatinlibrary.com/neo.html",
				"http://www.thelatinlibrary.com/misc.html",
				"http://www.thelatinlibrary.com/ius.html" };

		Map<String, String> collected = new HashMap<String, String>();
		// the main page lists its authors differently from the sub libraries
		collected.putAll(getAuthorsList("http://www.thelatinlibrary.com/",
				false));
		for (String page : subLibraryPages) {
			collected.putAll(getAuthorsList(page, true));
		}
		return collected;
	}

	/**
	 * Reads the authors listed on one page of the library.
	 * <p>
	 * For the main page (isSubAuthor == false) the entries of the option
	 * elements are read first and the second table of the page afterwards;
	 * for a sub-library page only the first table is read. Names contained
	 * in the skip list are ignored and a name already found is not replaced.
	 * 
	 * @param url
	 *            : address of the page to read
	 * @param isSubAuthor
	 *            : true for a sub-library page, false for the main page
	 * @return map from author name to the url of the author's page
	 * @throws IOException
	 *             if the page cannot be downloaded
	 */
	public Map<String, String> getAuthorsList(String url, boolean isSubAuthor)
			throws IOException {
		Map<String, String> found = new HashMap<String, String>();
		Document page = Jsoup.connect(url).timeout(10 * 1000).get();

		if (!isSubAuthor) {
			for (Element option : page.getElementsByTag("option")) {
				String optionName = option.text();
				if (!skipBooks.contains(optionName)) {
					// option values are relative to the site root
					found.put(optionName, "http://www.thelatinlibrary.com/"
							+ option.attr("value"));
				}
			}
		}

		int tableIndex = isSubAuthor ? 0 : 1;
		Element authorTable = page.getElementsByTag("table").get(tableIndex);
		for (Element cell : authorTable.getElementsByTag("td")) {
			String cellName = cell.text();
			boolean alreadyKnown = found.containsKey(cellName);
			if (alreadyKnown || skipBooks.contains(cellName)) {
				continue;
			}
			found.put(cellName, cell.getElementsByTag("a").attr("abs:href"));
		}

		return found;
	}

	/**
	 * Downloads the books of every author stored in the authorNames field.
	 * <p>
	 * NOTE(review): in the visible code the field is only created empty by
	 * the constructor - verify that it is populated before this is called.
	 * 
	 * @throws IOException
	 */
	public void getAllAuthors() throws IOException {
		for (Map.Entry<String, String> author : authorNames.entrySet()) {
			getSingleAuthor(author.getKey(), author.getValue());
		}
	}

	/**
	 * Creates the author's folder below the output directory and downloads
	 * all of the author's books into it.
	 * 
	 * @param name
	 *            : name of the author, also used as folder name
	 * @param url
	 *            : url of the author's page
	 * @throws IOException
	 */
	public void getSingleAuthor(String name, String url) throws IOException {
		String authorPath = outputDir + File.separator + name;
		createIfMissing(authorPath);
		getBooks(name, url, authorPath);
	}

	/**
	 * Get all books of a single author. Recursive function to trace all
	 * books and links of a particular author.
	 * <p>
	 * A page with div.work sections is handed over to getBooksList().
	 * Otherwise every link found in a table cell is followed recursively,
	 * except in-page "#" anchors, captions in the skip list and known author
	 * names. When no link was followed, the page itself is saved as a book.
	 * <p>
	 * NOTE(review): visited pages are not remembered, so pages linking to
	 * each other would be followed again and again - confirm this cannot
	 * happen on the crawled site.
	 * 
	 * @param author
	 *            : name of the author (or caption of the followed link)
	 * @param aurl
	 *            : url of the page to examine
	 * @param apath
	 *            : directory the books are written to
	 * @throws IOException
	 *             declared for callers; failures inside this method are
	 *             reported on the console instead of being rethrown
	 */
	private void getBooks(String author, String aurl, String apath)
			throws IOException {
		// Table cells without a link yield an empty url. Connecting to it
		// can only fail, so leave quietly as before.
		if (aurl == null || aurl.isEmpty()) {
			return;
		}

		try {
			Document doc = Jsoup.connect(aurl).timeout(10 * 1000).get();
			if (doc.select("div.work").size() > 0) {
				getBooksList(author, aurl, doc, apath);
				return;
			}

			int followed = 0;
			for (Element bookItem : doc.getElementsByTag("td")) {
				String bookText = bookItem.getElementsByTag("a").attr(
						"abs:href");
				// attr() yields "" for a cell without link; "#" marks an
				// anchor inside the current page
				if (bookText == null || bookText.isEmpty()
						|| bookText.contains("#"))
					continue;
				String linkName = bookItem.text();
				if (skipBooks.contains(linkName)
						|| authorNames.containsKey(linkName))
					continue;
				getBooks(linkName, bookText, apath);
				followed++;
			}

			if (followed == 0) {
				// Leaf page: name the book after the first available
				// heading, falling back to the author name.
				String bookname;
				if (doc.select("title").size() > 0)
					bookname = doc.select("title").first().text();
				else if (doc.select("p.pagehead").size() > 0)
					bookname = doc.select("p.pagehead").first().text();
				else if (doc.select("h1").size() > 0)
					bookname = doc.select("h1").first().text();
				else
					bookname = author;
				getContent(aurl, bookname, apath);
			}
		} catch (Exception e) {
			// best effort: one broken page must not stop the whole crawl
			ConsoleView.printlInConsoleln("Something went wrong when extracting books of Author "
					+ author + " : " + e);
		}
	}

	/**
	 * Downloads the books of a page that is organised in sections: h2.work
	 * headers paired with div.work book lists.
	 * <p>
	 * A header that contains a link is followed on its own and does not
	 * consume a book list. Otherwise the next book list is stored in a
	 * folder named after the header text, or "Others" once the headers are
	 * exhausted.
	 * <p>
	 * Both counters advance on every pass, also when a header link is
	 * skipped or a step fails. The previous version could repeat the same
	 * pass forever in those cases.
	 * 
	 * @param author
	 *            : name of the author the page belongs to
	 * @param aurl
	 *            : url of the page, used when a section has no followable
	 *            book
	 * @param doc
	 *            : the already downloaded page
	 * @param apath
	 *            : directory below which the section folders are created
	 */
	private void getBooksList(String author, String aurl, Document doc,
			String apath) {
		Elements subLists = doc.select("div.work");
		Elements subHeaders = doc.select("h2.work");
		int i = 0, size1 = subLists.size(), size2 = subHeaders.size(), j = 0;

		// handle count of headers and div
		while (i < size1 || j < size2) {
			Element head = (j < size2) ? subHeaders.get(j) : null;

			if (head != null) {
				Elements bookLink = head.getElementsByTag("a");
				if (bookLink.size() > 0) {
					try {
						followHeaderLink(author, bookLink.get(0), apath);
					} catch (Exception e) {
						ConsoleView.printlInConsoleln("Something went wrong when extracting books of Author "
								+ author + " : " + e);
					}
					j++;
					continue;
				}
			}

			if (i < size1) {
				String sectionName = (head != null) ? head.text() : "Others";
				try {
					saveSectionBooks(author, aurl, doc, subLists.get(i),
							apath + File.separator + sectionName);
				} catch (Exception e) {
					ConsoleView.printlInConsoleln("Something went wrong when extracting books of Author "
							+ author + " : " + e);
				}
				i++;
			}
			j++;
		}
	}

	/**
	 * Follows the link of a section header into its own folder, unless the
	 * caption is skipped, is a known author or names the author itself.
	 */
	private void followHeaderLink(String author, Element link, String apath)
			throws IOException {
		String bookText = link.attr("abs:href");
		String bookname = link.text();
		if (skipBooks.contains(bookname)
				|| authorNames.containsKey(bookname)
				|| (bookname.toLowerCase()).equals(author.toLowerCase())) {
			return;
		}
		File authorDir = new File(apath + File.separator + bookname);
		if (!authorDir.exists()) {
			authorDir.mkdirs();
		}
		getBooks(bookname, bookText, authorDir.toString());
	}

	/**
	 * Downloads every book of one div.work list into sectionDir; when no
	 * cell of the list is followed, the page itself is saved instead.
	 */
	private void saveSectionBooks(String author, String aurl, Document doc,
			Element list, String sectionDir) throws IOException {
		File authorDir = new File(sectionDir);
		if (!authorDir.exists()) {
			authorDir.mkdirs();
		}
		String apath1 = authorDir.toString();

		int followed = 0;
		for (Element bookItem : list.getElementsByTag("td")) {
			String bookText = bookItem.getElementsByTag("a").attr("abs:href");
			String bookname = bookItem.text();
			if (skipBooks.contains(bookname)
					|| authorNames.containsKey(bookname))
				continue;
			getBooks(bookname, bookText, apath1);
			followed++;
		}

		if (followed == 0) {
			getContent(aurl, fallbackBookName(doc, author), apath1);
		}
	}

	/**
	 * Picks a book name from the page: title, then p.pagehead, then h1, and
	 * finally the author name when none of them is present.
	 */
	private String fallbackBookName(Document doc, String author) {
		// select() never returns null, so the element count has to be checked
		if (doc.select("title").size() > 0)
			return doc.select("title").first().text();
		if (doc.select("p.pagehead").size() > 0)
			return doc.select("p.pagehead").first().text();
		if (doc.select("h1").size() > 0)
			return doc.select("h1").first().text();
		return author;
	}

	/**
	 * Saves the paragraphs of a book page as a text file in the author's
	 * directory, with punctuation stripped from the file name.
	 * <p>
	 * When the page has no paragraph or anything goes wrong, the download is
	 * retried through getBookContent(). The writer of this attempt is closed
	 * before the retry starts, so two writers never work on the same file.
	 * 
	 * @param bookUri
	 *            : url of the book page
	 * @param bookDir
	 *            : name of the book, used as file name
	 * @param authorDir
	 *            : directory the file is written to
	 * @throws IOException
	 *             if closing the file or the retry fails
	 */
	private void getContent(String bookUri, String bookDir, String authorDir)
			throws IOException {
		boolean written = false;
		BufferedWriter csvWriter = null;
		try {
			bookDir = bookDir.replaceAll(
					"[.,;\"!-()\\[\\]{}:?'/\\`~$%#@&*_=+<>*$]", "");
			// NOTE(review): 10 * 10000 ms is ten times the timeout used by
			// the other downloads of this class - confirm it is intended.
			Document doc = Jsoup.connect(bookUri).timeout(10 * 10000).get();
			Elements content = doc.getElementsByTag("p");
			if (content.size() > 0) {
				String target = authorDir
						+ System.getProperty("file.separator") + bookDir
						+ ".txt";
				csvWriter = new BufferedWriter(new FileWriter(new File(
						target)));
				ConsoleView.printlInConsoleln("Writing Content at " + target);
				for (Element c : content) {
					csvWriter.write(c.text() + "\n");
				}
				written = true;
			}
		} catch (Exception e) {
			// not fatal: the retry below rewrites the file from scratch
			written = false;
		} finally {
			if (csvWriter != null)
				csvWriter.close();
		}

		if (!written) {
			getBookContent(bookUri, bookDir, authorDir);
		}
	}

	/**
	 * Downloads a book page and writes its paragraphs to a text file in
	 * bookDir, with punctuation stripped from the file name.
	 * <p>
	 * When the page cannot be retrieved, has no paragraph or writing fails,
	 * the work is handed to getBookContent2(). The writer of this attempt is
	 * closed first, so two writers never work on the same file.
	 * 
	 * @param bookUri
	 *            : url of the book page
	 * @param bookName
	 *            : name of the book, used as file name
	 * @param bookDir
	 *            : directory the file is written to (created if missing)
	 * @return 1 when the file was written here, otherwise the result of
	 *         getBookContent2()
	 * @throws IOException
	 *             if closing the file or the fallback fails
	 */
	private int getBookContent(String bookUri, String bookName, String bookDir)
			throws IOException {
		boolean written = false;
		BufferedWriter csvWriter = null;
		try {
			// sanitise first so the fallback gets the cleaned name as well
			bookName = bookName.replaceAll(
					"[.,;\"!-()\\[\\]{}:?'/\\`~$%#@&*_=+<>*$]", "");
			createIfMissing(bookDir);
			Document doc = retrieveDocumentFromUrl(bookUri);
			// retrieveDocumentFromUrl returns null when the download failed
			if (doc != null) {
				Elements content = doc.getElementsByTag("p");
				if (content.size() > 0) {
					File fName = new File(bookDir
							+ System.getProperty("file.separator") + bookName
							+ ".txt");
					csvWriter = new BufferedWriter(new FileWriter(fName));
					for (Element c : content) {
						csvWriter.write(c.text() + "\n");
					}
					ConsoleView.printlInConsoleln("Writing Content at "
							+ fName.getAbsolutePath());
					written = true;
				}
			}
		} catch (Exception e) {
			// not fatal: the fallback below rewrites the file from scratch
			written = false;
		} finally {
			if (csvWriter != null)
				csvWriter.close();
		}

		if (written) {
			return 1;
		}
		return getBookContent2(bookUri, bookName, bookDir);
	}

	/**
	 * Fallback download: reads the page as UTF-16 directly from the url
	 * stream and writes its paragraphs to a text file in authorDir.
	 * <p>
	 * The page is fetched before the file is opened, so a failed download
	 * leaves no empty file behind. The url stream is always closed.
	 * 
	 * @param bookUri
	 *            : url of the book page
	 * @param bookDir
	 *            : name of the book, used as file name
	 * @param authorDir
	 *            : directory the file is written to
	 * @return 1 when the file was written, 0 when the download or the
	 *         writing failed (the failure is reported on the console)
	 * @throws IOException
	 *             if closing the file or the url stream fails
	 */
	private int getBookContent2(String bookUri, String bookDir,
			String authorDir) throws IOException {
		InputStream pageStream = null;
		BufferedWriter csvWriter = null;
		try {
			pageStream = new URL(bookUri).openStream();
			Document doc = Jsoup.parse(pageStream, "UTF-16", bookUri);
			Elements content = doc.getElementsByTag("p");

			File fName = new File(authorDir
					+ System.getProperty("file.separator") + bookDir + ".txt");
			csvWriter = new BufferedWriter(new FileWriter(fName));
			ConsoleView.printlInConsoleln("Writing Contents at " +fName.getAbsolutePath());
			for (Element c : content) {
				csvWriter.write(c.text() + "\n");
			}
			return 1;
		} catch (Exception e) {
			// last resort failed as well; do not count the book as saved
			ConsoleView.printlInConsoleln("Could not save content of "
					+ bookUri + " : " + e);
			return 0;
		} finally {
			try {
				if (csvWriter != null)
					csvWriter.close();
			} finally {
				if (pageStream != null)
					pageStream.close();
			}
		}
	} /* The sub libraries */

	/*
	 * public void getAllSubAuthors(String connectUrl, String output) throws
	 * IOException{ int i = 0, size = 0; String name, url; Document doc =
	 * Jsoup.connect(connectUrl).timeout(10*1000).get();
	 * 
	 * Element secondList = doc.getElementsByTag("table").get(0); Elements
	 * auth2List = secondList.getElementsByTag("td"); for(Element auth :
	 * auth2List) { name = auth.text(); url =
	 * auth.getElementsByTag("a").attr("abs:href");
	 * if(authorNames.containsKey(name)) continue; //authorNames.add(name);
	 * if(skipBooks.contains(name)) continue; String aurl = output +
	 * File.separator + name; File authorDir = new File(aurl);
	 * if(!authorDir.exists()){ authorDir.mkdirs(); }
	 * //ConsoleView.writeInConsole("Extracting Books of Author  "+ name
	 * +"..."); appendLog("\nExtracting Books of Author  "+ name +"...");
	 * getBooks(name, url, aurl); i++;
	 * 
	 * }
	 * 
	 * return; }
	 */
	/*
	 * private void callSpecialAuthors(String url, String name) throws
	 * IOException{ File authorDir = new File(outputDir + File.separator +
	 * name); if(!authorDir.exists()){ authorDir.mkdirs(); }
	 * //getAllSubAuthors(url, outputDir + File.separator + name); }
	 * 
	 * 
	 * }
	 */

	/**
	 * Writes README.txt into the given directory. The file holds the
	 * application and plugin versions, the current date and the text
	 * collected in the readMe buffer.
	 * <p>
	 * A missing progress monitor or an unknown bundle no longer aborts the
	 * method, and the file is closed even when writing fails.
	 * 
	 * @param location
	 *            : directory in which README.txt is created
	 */
	public void writeReadMe(String location) {
		// the monitor is only assigned by getBooksByAuthor()
		if (monitor != null) {
			monitor.subTask("Writing Read Me ...");
		}
		File readme = new File(location + File.separator + "README.txt");
		BufferedWriter bw = null;
		try {
			// NOTE(review): these symbolic names were kept as found; verify
			// that bundles with exactly these ids are installed.
			String plugV = bundleVersion("edu.usc.cssl.tacit.plugins.latincrawler");
			String appV = bundleVersion("edu.usc.cssl.tacit.application");
			Date date = new Date();
			bw = new BufferedWriter(new FileWriter(readme));
			bw.write("Latin Crawler Output\n--------------------\n\nApplication Version: "
					+ appV
					+ "\nPlugin Version: "
					+ plugV
					+ "\nDate: "
					+ date.toString() + "\n\n");
			bw.write(readMe.toString());
		} catch (IOException e) {
			ConsoleView.printlInConsoleln("Could not write "
					+ readme.getAbsolutePath() + " : " + e.getMessage());
		} finally {
			if (bw != null) {
				try {
					bw.close();
				} catch (IOException e) {
					ConsoleView.printlInConsoleln("Could not close "
							+ readme.getAbsolutePath() + " : "
							+ e.getMessage());
				}
			}
		}
	}

	/**
	 * Looks up the Bundle-Version header of a bundle; returns "unknown" when
	 * the bundle is not installed or declares no version.
	 */
	private String bundleVersion(String symbolicName) {
		// Platform.getBundle returns null for a bundle that is not installed.
		// It is called twice so that no bundle type has to be imported here.
		if (Platform.getBundle(symbolicName) == null) {
			return "unknown";
		}
		Object version = Platform.getBundle(symbolicName).getHeaders()
				.get("Bundle-Version");
		return (version == null) ? "unknown" : version.toString();
	}
}