package edu.uc.cssl.tacit.crawlers.supremecourt.services;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.eclipse.core.runtime.IProgressMonitor;
import org.eclipse.core.runtime.OperationCanceledException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Drives a paginated crawl of a listing page: it determines how many result
 * pages exist for the configured filter and hands each page URL to a
 * {@link CrawlerJob}.
 *
 * <p>The listing URL is built as {@code crawlUrl + filter + "?order=title&sort=asc"}.
 * Truncation and audio download are off by default and can be switched on
 * through the setters before {@link #looper(IProgressMonitor)} is called.
 */
public class SupremeCourtCrawler {
    /** Extracts the page index from the href of the "last page" pager link. */
    private static final Pattern LAST_PAGE_PATTERN = Pattern.compile("page=([0-9]+)");

    private final String filter;
    private final String url;
    private final String outputDir;
    private final String baseUrl;
    private boolean truncate;
    private boolean downloadAudio;
    private CrawlerJob crawler;

    /**
     * Creates a crawler for one filter of the listing.
     *
     * @param filter    path fragment appended to {@code crawlUrl} to select the listing
     * @param outputDir directory passed on to the {@link CrawlerJob} for its output
     * @param crawlUrl  base URL of the site; also passed unchanged to the {@link CrawlerJob}
     */
    public SupremeCourtCrawler(String filter, String outputDir, String crawlUrl) {
        this.filter = filter;
        this.outputDir = outputDir;
        this.truncate = false;
        this.downloadAudio = false;
        this.baseUrl = crawlUrl;
        this.url = crawlUrl + filter + "?order=title&sort=asc";
    }

    /** @return the output directory given at construction time */
    public String getOutputDir() {
        return outputDir;
    }

    /** @param truncate value forwarded to the {@link CrawlerJob} as its truncate flag */
    public void setTruncate(boolean truncate) {
        this.truncate = truncate;
    }

    /** @return the current truncate flag ({@code false} unless set) */
    public boolean isTruncate() {
        return truncate;
    }

    /** @param downloadAudio value forwarded to the {@link CrawlerJob} as its audio flag */
    public void setDownloadAudio(boolean downloadAudio) {
        this.downloadAudio = downloadAudio;
    }

    /** @return the current audio-download flag ({@code false} unless set) */
    public boolean isDownloadAudio() {
        return downloadAudio;
    }

    /**
     * Fetches the first listing page to discover the index of the last page,
     * then runs the {@link CrawlerJob} once for every page index from 0 up to
     * and including that last index.
     *
     * <p>The job's summary file is closed on every exit path, including
     * cancellation and unexpected runtime failures.
     *
     * @param monitor progress monitor used for reporting and cancellation checks
     * @throws IOException                if fetching the listing or crawling a page fails
     * @throws OperationCanceledException if the monitor reports cancellation
     */
    public void looper(IProgressMonitor monitor) throws IOException {
        Document doc = Jsoup.connect(url).timeout(10 * 1000).get();
        monitor.subTask("Finding Total number of cases...");
        int noOfPages = findLastPageIndex(doc);
        monitor.worked(10);
        checkCanceled(monitor);

        crawler = new CrawlerJob(this.filter, getOutputDir(), this.baseUrl,
                monitor, isDownloadAudio(), isTruncate());
        try {
            // Inclusive upper bound: the pager link appears to carry the index of
            // the last page (presumably zero-based), so that page must be visited too.
            for (int i = 0; i <= noOfPages; i++) {
                checkCanceled(monitor);
                crawler.run(url + "&page=" + i, noOfPages);
            }
        } finally {
            // finally (not catch IOException) so that cancellation, which is an
            // unchecked exception, does not leave the summary file open.
            crawler.summaryFileClose();
        }
        monitor.worked(100);
    }

    /**
     * Reads the last page index from the first "pager-last" element of the
     * document.
     *
     * @return the parsed index, or 0 when the pager element is absent or has
     *         no {@code page=N} parameter
     */
    private static int findLastPageIndex(Document doc) {
        Elements pages = doc.getElementsByClass("pager-last");
        // Sometimes the pager element is absent.
        if (pages.isEmpty()) {
            return 0;
        }
        Element pageList = pages.get(0);
        Matcher matcher = LAST_PAGE_PATTERN.matcher(pageList.toString());
        return matcher.find() ? Integer.parseInt(matcher.group(1)) : 0;
    }

    /** Throws {@link OperationCanceledException} if the monitor has been canceled. */
    private static void checkCanceled(IProgressMonitor monitor) {
        if (monitor.isCanceled()) {
            throw new OperationCanceledException();
        }
    }
}