import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.DefaultHttpClient;
import org.jsoup.Jsoup;
import org.jsoup.Connection;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.sql.Time;
import java.text.ParseException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Command-line scraper for two Purdue event RSS feeds (BoilerLink and Purdue
 * News). Each feed {@code item} is converted to an {@code Event}; the event's
 * send format is printed to standard output, and {@link #sendData(Event)} can
 * post it to the collection endpoint.
 */
@SuppressWarnings("deprecation")
public class Scraper {

    /** English month names; index + 1 is the calendar month number. */
    private static final String[] MONTH_NAMES = {
        "January", "February", "March", "April", "May", "June", "July",
        "August", "September", "October", "November", "December"
    };

    /** Matches "<Month> <day>": group 1 is the month name, group 2 a 1-2 digit day. */
    private static final Pattern DATE_PATTERN = Pattern.compile(
            "(January|February|March|April|May|June|July|August|September|October|November|December)"
            + "\\s+(\\d{1,2})(?!\\d)");

    /**
     * Matches "h a.m.", "h:mm p.m." and similar in lower-cased text: group 1 is
     * the hour, group 2 the optional minutes, group 3 is "a" or "p".
     */
    private static final Pattern TIME_PATTERN = Pattern.compile(
            "(?<![\\d:])(\\d{1,2})(?::(\\d{2}))?\\s+([ap])\\.m\\.");

    // NOTE(review): the year is hard-coded because the Purdue News description
    // parsing below only extracts month and day — confirm how the year should
    // really be derived.
    private static final String PURDUE_NEWS_YEAR = "2014";

    /**
     * Fetches and scans the feed for the named site.
     *
     * @param website either "BoilerLink" or "PurdueNews"; any other name is
     *                reported and ignored
     */
    private static void runScanner(String website) {
        String webURL = "the internet.";
        try {
            if (website.equals("BoilerLink")) {
                webURL = "https://boilerlink.purdue.edu/EventRss/EventsRss";
                scanBoilerLink(webURL);
            } else if (website.equals("PurdueNews")) {
                webURL = "http://www.purdue.edu/newsroom/rss/EventNews.xml";
                scanPurdueNews(webURL);
            } else {
                System.out.println("ERROR: Unknown website \"" + website + "\"; nothing scanned.");
            }
        } catch (IOException | ParseException e) {
            System.out.println("ERROR: Unable to connect to "+website+" at "+webURL+
                    "\nTry again in a few seconds.");
            e.printStackTrace();
        }
    }

    /**
     * Downloads the BoilerLink feed and prints the send format of every item.
     * An item whose markup cannot be parsed is reported and skipped so that it
     * does not abort the rest of the scan.
     *
     * @param url feed URL
     * @throws IOException if the feed cannot be fetched
     */
    private static void scanBoilerLink(String url) throws IOException, ParseException {
        Document feed = Jsoup.connect(url).ignoreContentType(true).get();
        for (Element item : feed.select("item")) {
            try {
                Event e = parseBoilerEvent(item);
                System.out.println(e.getSendFormat());
                // sendData(e);
            } catch (IndexOutOfBoundsException | NumberFormatException ex) {
                System.out.println("ERROR: Skipping malformed BoilerLink item: " + ex);
            }
        }
    }

    /**
     * Downloads the Purdue News feed and prints the send format of every item
     * that parsed without an error flag. An item whose markup cannot be parsed
     * is reported and skipped.
     *
     * @param url feed URL
     * @throws IOException if the feed cannot be fetched
     */
    private static void scanPurdueNews(String url) throws IOException, ParseException {
        Document feed = Jsoup.connect(url).ignoreContentType(true).get();
        for (Element item : feed.select("item")) {
            try {
                Event e = parsePurdueNewsEvent(item);
                if (e.getError() == 0) {
                    System.out.println(e.getSendFormat());
                    //sendData(e);
                }
            } catch (IndexOutOfBoundsException | NumberFormatException ex) {
                System.out.println("ERROR: Skipping malformed PurdueNews item: " + ex);
            }
        }
    }

    /**
     * Builds an {@code Event} from one BoilerLink feed item.
     * <p>
     * Author, category and title come from the item's child elements; start
     * time, end time, location and body text are cut out of the HTML carried in
     * the item's description. Times are stored as {@code MM-DD-YYYY HH:MM:SS};
     * a missing start time falls back to the end time, and missing values are
     * stored as the string "error".
     *
     * @param event a feed {@code item} element
     * @return the populated event
     * @throws IndexOutOfBoundsException if the description lacks the expected
     *         time, location or description markup
     */
    public static Event parseBoilerEvent(Element event) {
        // Create new event from parsed url
        String pageUrl = event.text().split("<link />")[0].split(" ")[0];
        Event k = new Event(pageUrl);

        k.setAuthor(event.select("author").text());
        // TODO Implement ORGS
        //k.setOrganization(org);

        // Categories are "/"-separated; a text of only "/" splits to an empty array.
        String[] categories = event.select("category").text().split("/");
        if (categories.length == 0 || categories[0].length() == 0) {
            k.setCategories(new String[] {"error"});
        } else {
            k.setCategories(categories);
        }
        k.setTitle(event.select("title").text());

        // The description holds HTML; each <span class="..."> starts a new chunk.
        String description = event.select("description").text();
        String[] times = description.split("<span class=\"");

        // Start time: an empty dtstart span means no start was published.
        String startTime = times[1];
        if (startTime.equals("dtstart\">")) {
            k.setStartTime("error");
        } else {
            startTime = startTime.split("title=\"")[1].split("\">")[0];
            if (!startTime.contains("T")) {
                startTime = startTime + "T00:00:00";
            }
            k.setStartTime(startTime);
        }

        // End time, defaulting to midnight when only a date is given.
        k.setEndTime(times[2].split("</span>")[0].split("title=\"")[1].split("\">")[0]);
        if (!k.getEndTime().contains("T") && !k.getEndTime().contains("error")) {
            k.setEndTime(k.getEndTime() + "T00:00:00");
        }

        // Fall back to the end time when only the start is missing.
        if ("error".equals(k.getStartTime()) && !"error".equals(k.getEndTime())) {
            k.setStartTime(k.getEndTime());
        }

        k.setStartTime(toMonthDayYear(k.getStartTime()));
        k.setEndTime(toMonthDayYear(k.getEndTime()));

        // Parse Location
        k.setLocation(description.split("<span class=\"location\">")[1].split("</span>")[0]);

        // Parse Description
        description = description.split("<div class=\"description\">")[1];
        if (description.equals("</div>")) {
            k.setDescription("error");
        } else {
            description = description.split("</div>")[0];
            // NOTE(review): these two literals are kept exactly as found; the first
            // strips every occurrence of the quoted character. Confirm whether HTML
            // entities (non-breaking space / ampersand) were the intended targets.
            description = description.replace(" ", "");
            description = description.replace("&", "");
            // Strip stray tags from description
            description = description.replaceAll("<.+?>", "");
            k.setDescription(description);
        }
        return k;
    }

    /**
     * Converts {@code YYYY-MM-DDTHH:MM:SS} to {@code MM-DD-YYYY HH:MM:SS}.
     * Values containing "error" are returned unchanged.
     */
    private static String toMonthDayYear(String timestamp) {
        if (timestamp.contains("error")) {
            return timestamp;
        }
        String[] parts = timestamp.replace("T", " ").split("-");
        String[] dayAndClock = parts[2].split(" ");
        return parts[1] + "-" + dayAndClock[0] + "-" + parts[0] + " " + dayAndClock[1];
    }

    /**
     * Builds an {@code Event} from one Purdue News feed item.
     * <p>
     * Link, title and description are cut out of the item's serialized markup.
     * The date is the last "<Month> <day>" found in the description and the
     * first two clock times found ("7 p.m.", "9:30 a.m.") become start and end
     * time. The event's error flag is set when no valid date or no time is
     * found; in the latter case the start time is left unset.
     *
     * @param event a feed {@code item} element
     * @return the populated event; check {@code getError()} before using it
     * @throws IndexOutOfBoundsException if the item lacks link, title or
     *         description markup
     */
    public static Event parsePurdueNewsEvent(Element event) {
        String raw = event.toString();
        String pageUrl = raw.split("<link />")[1].split("<description>")[0].trim();
        Event k = new Event(pageUrl);

        k.setTitle(raw.split("<title>")[1].split("</title>")[0].trim());
        String desc = raw.split("<description>")[1].split("</description>")[0].trim();
        k.setDescription(desc);
        k.setCategories(new String[] {"University Event"});

        // Date: the last match in the description wins.
        int month = 0;
        int day = 0;
        Matcher dateMatcher = DATE_PATTERN.matcher(desc);
        while (dateMatcher.find()) {
            month = monthNumber(dateMatcher.group(1));
            day = Integer.parseInt(dateMatcher.group(2));
        }
        // Indicate invalid month or day
        if (month == 0 || day < 1 || day > 31) {
            k.setError();
        }

        // Times: first valid match is the start, second is the end.
        String startClock = null;
        int found = 0;
        Matcher timeMatcher = TIME_PATTERN.matcher(desc.toLowerCase());
        while (timeMatcher.find()) {
            int hour = Integer.parseInt(timeMatcher.group(1));
            String minuteText = timeMatcher.group(2);
            int minute = (minuteText == null) ? 0 : Integer.parseInt(minuteText);
            if (hour < 1 || hour > 12 || minute > 59) {
                continue; // not a 12-hour clock time
            }
            String clock = toTwentyFourHour(hour, minute, "p".equals(timeMatcher.group(3)));
            if (found == 0) {
                startClock = clock;
            } else if (found == 1) {
                // NOTE(review): the end time carries no date prefix, unlike the
                // start time — confirm what the receiver expects.
                k.setEndTime(clock);
            }
            found++;
        }

        if (found == 0) {
            // Indicate no valid times found
            k.setError();
        } else {
            // Build event time format
            k.setStartTime(twoDigits(month) + "-" + twoDigits(day) + "-" + PURDUE_NEWS_YEAR
                    + " " + startClock);
        }

        k.setLocation("Purdue University");
        return k;
    }

    /** Returns the 1-based month number for an English month name, or 0 if unknown. */
    private static int monthNumber(String name) {
        for (int i = 0; i < MONTH_NAMES.length; i++) {
            if (MONTH_NAMES[i].equals(name)) {
                return i + 1;
            }
        }
        return 0;
    }

    /** Converts a 12-hour clock reading to {@code HH:MM:00}; 12 a.m. is 00 and 12 p.m. is 12. */
    private static String toTwentyFourHour(int hour, int minute, boolean pm) {
        int hour24 = hour % 12 + (pm ? 12 : 0);
        return twoDigits(hour24) + ":" + twoDigits(minute) + ":00";
    }

    /** Left-pads a non-negative number with a zero to at least two digits. */
    private static String twoDigits(int value) {
        return value < 10 ? "0" + value : String.valueOf(value);
    }

    /**
     * Posts the event's send format to the collection endpoint.
     * <p>
     * The body is encoded as UTF-8 to match the declared JSON content type. Any
     * failure prints the stack trace and terminates the JVM with exit code 2.
     *
     * @param e event to send
     */
    public static void sendData(Event e) {
        HttpClient httpClient = new DefaultHttpClient();
        try {
            HttpPost request = new HttpPost("http://54.213.17.69:9000/scraper_handle");
            // Without an explicit charset StringEntity falls back to ISO-8859-1.
            StringEntity params = new StringEntity(e.getSendFormat(), "UTF-8");
            request.addHeader("content-type", "application/json");
            request.setEntity(params);
            httpClient.execute(request);
            // handle response here...
        } catch (Exception ex) {
            // NOTE(review): exiting the JVM on a failed send is kept from the
            // original — confirm this is intended once sendData is enabled.
            ex.printStackTrace();
            System.exit(2);
        } finally {
            httpClient.getConnectionManager().shutdown();
            System.out.println("SENT!");
        }
    }

    /**
     * Entry point. Currently performs a single Purdue News scan; the polling
     * loop and the BoilerLink scan are disabled for debugging.
     */
    public static void main(String[] args) throws InterruptedException {
        boolean run = true;
        while (run) {
            //runScanner("BoilerLink");
            runScanner("PurdueNews");
            // Thread.sleep(10 * 1000);
            // TODO DEBUG
            run = false;
        }
    }
}