package edu.uc.cssl.tacit.crawlers.supremecourt.services;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.commons.io.FileUtils;
import org.eclipse.core.runtime.IProgressMonitor;
import org.eclipse.core.runtime.OperationCanceledException;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import edu.usc.cssl.tacit.common.ui.views.ConsoleView;
public class CrawlerJob {
private String filter;
private String outputDir;
private boolean truncate;
private boolean downloadAudio;
private String baseUrl;
private String url;
private IProgressMonitor monitor;
private FileWriter fileWriter;
private BufferedWriter bw;
public CrawlerJob(String filter, String outputDir, String crawlUrl,
IProgressMonitor monitor, boolean downloadAudio, boolean truncate) {
this.filter = filter;
this.outputDir = outputDir;
this.truncate = truncate;
this.downloadAudio = downloadAudio;
this.baseUrl = crawlUrl;
this.monitor = monitor;
openSummaryFile();
}
private void openSummaryFile() {
DateFormat df = new SimpleDateFormat("MM-dd-yyyy-HH-mm-ss");
Date dateobj = new Date();
try {
fileWriter = new FileWriter(this.outputDir + "/"
+ "supremecourt-crawler-summary-" + df.format(dateobj)
+ ".csv");
bw = new BufferedWriter(fileWriter);
addContentsToSummary("Case", "Location", "Docket No", "Argued",
"Decided", "Majority Author", "Vote", "File Type",
"File name");
} catch (IOException e) {
}
}
private void addContentsToSummary(String... contents) {
try {
for (String content : contents) {
if (content.contains(",")) {
content = content.replace(",", " ");
}
bw.write(content);
bw.write(",");
}
bw.newLine();
} catch (IOException e) {
e.printStackTrace();
}
}
public void summaryFileClose() {
try {
bw.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public void run(String url, int noOfPages) throws IOException {
crawl(url, noOfPages);
}
protected Document retrieveDocumentFromUrl(String url) throws IOException {
Document doc = null;
doc = Jsoup.connect(url).timeout(10 * 1000).get();
return doc;
}
public void crawl(String url, int noOfPages) throws IOException {
Document doc = retrieveDocumentFromUrl(url);
Element table = doc.select("tbody").get(0);
Elements rows = table.select("tr");
int totalDone = 0;
int remaining = 0;
if (rows.size() > 0)
remaining = 9900 / rows.size();
if (noOfPages > 0)
remaining = remaining / noOfPages;
if (remaining == 0) {
remaining = 1;
}
for (Element row : rows) {
if (monitor.isCanceled()) {
throw new OperationCanceledException();
}
// ConsoleView.writeInConsole(row.select("a").get(0).attr("href"));
String contenturl = baseUrl + row.select("a").get(0).attr("href");
// ConsoleView.writeInConsole(row.select("td").get(1).text().trim());
String date = row.select("td").get(2).text().trim();
// Skip if no argument date
if (date.equals("")) {
ConsoleView.printlInConsoleln("No argument date found for "
+ row.select("td").get(1).text().trim()
+ ". Hence it will not be crawled ");
continue;
}
String[] casesSplit = row.select("a").get(0).attr("href")
.split("/");
monitor.subTask("Crawling " + "Case : "
+ row.select("a").get(0).text() + " year : "
+ casesSplit[casesSplit.length - 2] + " url : " + url);
ConsoleView.printlInConsole("Crawling " + "Case : "
+ row.select("a").get(0).text() + " year : "
+ casesSplit[casesSplit.length - 2]);
String filename = row.select("td").get(1).text().trim() + "_"
+ date.substring(6) + date.substring(0, 2)
+ date.substring(3, 5);
ConsoleView.printlInConsoleln(" url :" + contenturl);
// Fixing the unhandled exception without cascading.
try {
CrawlerData crawlDetails = getFiles(contenturl, filename);
if (crawlDetails.getFileLocation().length() > 1) {
if (crawlDetails.getFileLocation().contains(",")) {
addContentsToSummary(row.select("td").get(0).text(),
row.select("td").get(1).text(),
crawlDetails.getLocation(), row.select("td")
.get(2).text(), row.select("td").get(3)
.text(),
row.select("td").get(4).text(), row
.select("td").get(5).text(),
"Transcript", crawlDetails.getFileLocation()
.split(",")[0]);
addContentsToSummary(row.select("td").get(0).text(),
crawlDetails.getLocation(), row.select("td")
.get(1).text(), row.select("td").get(2)
.text(),
row.select("td").get(3).text(), row
.select("td").get(4).text(), row
.select("td").get(5).text(), "Mp3",
crawlDetails.getFileLocation().split(",")[1]);
} else {
addContentsToSummary(row.select("td").get(0).text(),
crawlDetails.getLocation(), row.select("td")
.get(1).text(), row.select("td").get(2)
.text(),
row.select("td").get(3).text(), row
.select("td").get(4).text(), row
.select("td").get(5).text(),
"Transcript", crawlDetails.getFileLocation());
}
}
totalDone += remaining;
if (totalDone <= 9900)
monitor.worked(remaining);
} catch (IOException e) {
ConsoleView.printlInConsoleln("Error Accessing the URL "
+ contenturl);
e.printStackTrace();
} finally {
}
// break;
}
}
private CrawlerData getFiles(String contenturl, String filename)
throws IOException {
File trans = new File(this.outputDir + "/" + filename
+ "-transcript.txt");
BufferedWriter bw = new BufferedWriter(new FileWriter(trans));
Document doc = retrieveDocumentFromUrl(contenturl);
CrawlerData crawlData = new CrawlerData();
Elements hidden = doc.select("div.hidden");
if (hidden.size() == 0) {
ConsoleView.printlInConsoleln("No data. Skipping page "
+ contenturl);
bw.close();
return crawlData;
}
if (monitor.isCanceled()) {
bw.close();
throw new OperationCanceledException();
}
// "-transcript.txt"
String outputDetail = "Writing " + outputDir + "/" + filename
+ "-transcript.txt";
ConsoleView.printlInConsoleln(outputDetail);
this.monitor.subTask(outputDetail);
// Element transcript = doc.select("div.hidden").get(0);
Element transcript = hidden.get(0);
Elements lines = transcript.select("p");
if (lines.size() < 1) {
lines = doc.select("div.content").select("p");
}
for (Element line : lines) {
if (monitor.isCanceled()) {
bw.close();
throw new OperationCanceledException();
}
bw.write(line.text() + "\n");
}
bw.close();
boolean exist = true;
if (FileUtils.sizeOf(trans) <= 20) {
trans.delete();
exist = false;
}
String dwn = "";
if (this.downloadAudio) {
if (monitor.isCanceled()) {
throw new OperationCanceledException();
}
dwn = downloadAudioFilesFromWebPage(filename, doc);
}
if (dwn.length() > 1) {
crawlData.setFileLocation(outputDir + "/" + filename
+ "-transcript.txt" + "," + dwn);
} else {
if (exist)
crawlData.setFileLocation(outputDir + "/" + filename
+ "-transcript.txt");
}
if (doc.select("div.case-location") != null
&& doc.select("div.case-location").select("a") != null
&& doc.select("div.case-location").select("a").size() > 0
&& doc.select("div.case-location").select("a").get(0).text() != "")
crawlData.setLocation(doc.select("div.case-location").select("a")
.get(0).text());
else {
crawlData.setLocation("");
}
return crawlData;
}
public String downloadAudioFilesFromWebPage(String filename, Document doc)
throws IOException {
// "-argument.mp3"
Elements links = doc.select(".audio");
for (Element mp3 : links) {
if (mp3.attr("href").contains(".mp3")) {
if (monitor.isCanceled()) {
throw new OperationCanceledException();
}
return downloadTranscriptMp3File(filename, mp3);
// Once mp3 found, no need to continue for loop
}
}
return "";
}
private String downloadTranscriptMp3File(String filename, Element mp3)
throws IOException {
ConsoleView.printlInConsoleln("Downloading " + baseUrl
+ mp3.attr("href"));
Response audio;
FileOutputStream fos;
audio = downloadAudio(mp3);
File file = new File(outputDir + "/" + filename + "-argument.mp3");
fos = new FileOutputStream(file);
fos.write(audio.bodyAsBytes());
fos.close();
if (FileUtils.sizeOf(file) <= 0) {
file.delete();
}
return file.getAbsolutePath();
}
protected Response downloadAudio(Element mp3) throws IOException {
Response audio;
if (!this.truncate)
audio = Jsoup.connect(baseUrl + mp3.attr("href"))
.cookie("oyez-tos", "1.0").maxBodySize(0)
.ignoreContentType(true).execute();
else
audio = Jsoup.connect(baseUrl + mp3.attr("href"))
.cookie("oyez-tos", "1.0").ignoreContentType(true)
.execute();
return audio;
}
protected Document parseContentFromUrl(String crawlUrl) throws IOException {
return Jsoup.connect(crawlUrl.toString()).get();
}
}