package org.frasermccrossan.ltc;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import android.annotation.SuppressLint;
import android.content.Context;
import android.content.pm.PackageInfo;
import android.content.pm.PackageManager;
import android.content.res.Resources;
import android.database.sqlite.SQLiteException;
import android.os.AsyncTask;
import android.os.Build;
import android.text.TextUtils.SimpleStringSplitter;
import android.text.TextUtils.StringSplitter;
// everything required to load the LTC_supplied data into the database
@SuppressLint("UseSparseArrays")
public class LTCScraper {
LoadDataTask task = null;
ScrapingStatus status = null;
Context context;
static final String PROXY_BASE = "";
static final Boolean PROXY_ENABLED = false;
static final String LTC_BASE = "http://www.ltconline.ca";
static final String ROUTE_PATH = "/WebWatch/MobileAda.aspx";
public static final String ROUTE_URL = LTC_BASE + ROUTE_PATH; // used when calling the diagnostic screen
static final String DIRECTION_PATH = "/WebWatch/MobileAda.aspx?r=%s";
static final String STOPS_PATH = "/WebWatch/MobileAda.aspx?r=%s&d=%d";
static final String LOCATIONS_PATH = "/WebWatch/UpdateWebMap.aspx?u=%s";
static final String PROXY_PREDICTION_PATH = "/WebWatch/ProxyMobileAda.aspx?r=%s&d=%s&s=%s";
static final String LTC_PREDICTION_PATH = "/WebWatch/MobileAda.aspx?r=%s&d=%s&s=%s";
static final Pattern TIME_PATTERN = Pattern.compile("(\\d{1,2}):(\\d{2}) ?([AP])?");
// matches arrival text in the MobileAda.aspx prediction
static final Pattern ARRIVAL_PATTERN = Pattern.compile("(?i) *(\\d{1,2}:\\d{2} *[\\.apm]*) +(to .*)");
// pattern for route number in a[href]
static final Pattern ROUTE_NUM_PATTERN = Pattern.compile("\\?r=(\\d{1,3})");
// pattern for direction number in a[href]
static final Pattern DIRECTION_NUM_PATTERN = Pattern.compile("\\&d=(\\d+)");
// pattern for stop number in a[href]
static final Pattern STOP_NUM_PATTERN = Pattern.compile("\\&s=(\\d+)");
// if no buses are found
static final Pattern NO_INFO_PATTERN = Pattern.compile("(?mi)no stop information");
static final Pattern NO_BUS_PATTERN = Pattern.compile("(?mi)no further buses");
static final Pattern LOCATION_STOP_PATTERN = Pattern.compile("(\\d+)");
/* parseDocFromUri() starts with the initial timeout then retries doubling the timeout each time until
* greater than the maximum timeout, thus we get (for example) 1, 2, 4, 8, 16
*/
static final int INITIAL_FETCH_TIMEOUT = 2000;
static final int MAXIMUM_FETCH_TIMEOUT = 128*1000;
static final int FAILURE_LIMIT = 40;
LTCScraper(Context c, ScrapingStatus s) {
context = c;
status = s;
}
LTCScraper(Context c) {
/* instantiate this way if you plan only to check bus predictions
*/
context = c;
}
public void close() {
if (task != null) {
task.cancel(true);
}
}
@SuppressLint("NewApi")
public void loadAll() {
task = new LoadDataTask();
if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.HONEYCOMB) {
/* on Honeycomb and later, ASyncTasks run on a serial executor, and since
* we might have another asynctask running in an activity (e.g. fetching stop lists),
* we don't really want them all to block
*/
task.executeOnExecutor(AsyncTask.THREAD_POOL_EXECUTOR);
}
else {
task.execute();
}
}
private Document parseDocFromUri(String proxyPath, String ltcPath) throws IOException, MalformedURLException {
return parseDocFromUri(proxyPath, ltcPath, INITIAL_FETCH_TIMEOUT);
}
/*
* the first path is the one to try with the proxy, the second is for the real
* site; these are only different for fetching predictions, and only because
* we want to change the caching time and need a different path in nginx */
private Document parseDocFromUri(String proxyPath, String ltcPath, int initial_timeout) throws IOException, MalformedURLException {
Document doc;
URL url;
boolean try_proxy = PROXY_ENABLED;
int timeout = initial_timeout;
while (true) {
try {
if (try_proxy) {
url = new URL(PROXY_BASE + proxyPath);
}
else {
url = new URL(LTC_BASE + ltcPath);
}
HttpURLConnection connection = (HttpURLConnection) url.openConnection();
connection.setConnectTimeout(timeout);
connection.setReadTimeout(timeout);
doc = null;
try {
InputStream in = new BufferedInputStream(connection.getInputStream(), 8192);
doc = Jsoup.parse(in, null, proxyPath);
}
finally {
connection.disconnect();
}
break;
}
catch (SocketTimeoutException e) {
timeout *= 8;
if (timeout <= MAXIMUM_FETCH_TIMEOUT) {
continue;
}
else {
if (try_proxy) {
// reached the maximum timeout, try switching from proxy to real site
try_proxy=false;
timeout = initial_timeout;
continue;
}
throw e;
}
}
catch (SocketException e) {
/*
* if we get a connection refused while trying the proxy then the proxy may be down,
* so switch back to the LTC site
*/
if (try_proxy) {
try_proxy = false;
timeout = initial_timeout;
continue;
}
else {
throw e;
}
}
}
return doc;
}
public String ltcPredictionPath(LTCRoute route, String stopNumber) {
return String.format(LTC_PREDICTION_PATH, route.number, route.direction, stopNumber);
}
public String proxyPredictionPath(LTCRoute route, String stopNumber) {
return String.format(PROXY_PREDICTION_PATH, route.number, route.direction, stopNumber);
}
public ArrayList<Prediction> getPredictions(LTCRoute route, String stopNumber, ScrapeStatus scrapeStatus) {
ArrayList<Prediction> predictions = new ArrayList<Prediction>(3); // usually get 3 of them
Resources res = context.getResources();
try {
Calendar now = Calendar.getInstance();
now.set(Calendar.SECOND, 0);
now.set(Calendar.MILLISECOND, 0); // now we have 'now' set to the current time
Document doc = parseDocFromUri(proxyPredictionPath(route,stopNumber),
ltcPredictionPath(route, stopNumber),
INITIAL_FETCH_TIMEOUT);
Elements divs = doc.select("div");
if (divs.size() == 0) {
throw new ScrapeException("LTC down?", ScrapeStatus.PROBLEM_IMMEDIATELY, true);
}
//Log.i("GP", String.format("rows=%d", timeRows.size()));
for (Element div: divs) {
//Log.i("GP", String.format("cols=%d", cols.size()));
List<TextNode> textNodes = div.textNodes();
for (TextNode node: textNodes) {
String text = node.text();
Matcher noBusMatcher = NO_BUS_PATTERN.matcher(text);
if (noBusMatcher.find()) {
throw new ScrapeException(res.getString(R.string.no_further), ScrapeStatus.PROBLEM_IF_ALL, false);
}
Matcher noStopMatcher = NO_INFO_PATTERN.matcher(text);
if (noStopMatcher.find()) {
throw new ScrapeException(res.getString(R.string.no_service), ScrapeStatus.PROBLEM_IF_ALL, false);
}
Matcher arrivalMatcher = ARRIVAL_PATTERN.matcher(text);
while (arrivalMatcher.find()) {
String textTime = arrivalMatcher.group(1);
String destination = arrivalMatcher.group(2);
predictions.add(new Prediction(route, textTime, destination, now));
}
}
}
if (predictions.size() == 0) {
throw new ScrapeException(res.getString(R.string.no_bus), ScrapeStatus.PROBLEM_IF_ALL, true);
}
scrapeStatus.setStatus(ScrapeStatus.OK, ScrapeStatus.NOT_PROBLEM, null);
}
catch (ScrapeException e) {
scrapeStatus.setStatus(ScrapeStatus.FAILED, e.problemType, e.getMessage());
predictions.add(new Prediction(route, e.getMessage(), e.seriousProblem));
}
catch (SocketTimeoutException e) {
scrapeStatus.setStatus(ScrapeStatus.FAILED, ScrapeStatus.PROBLEM_IMMEDIATELY, e.getMessage());
predictions.add(new Prediction(context, route, R.string.times_timeout, true));
}
catch (IOException e) {
scrapeStatus.setStatus(ScrapeStatus.FAILED, ScrapeStatus.PROBLEM_IMMEDIATELY, e.getMessage());
predictions.add(new Prediction(context, route, R.string.times_fail, true));
}
return predictions;
}
public ArrayList<LTCRoute> loadRoutes() throws ScrapeException, IOException {
ArrayList<LTCRoute> routes = new ArrayList<LTCRoute>();
Document doc = parseDocFromUri(ROUTE_PATH, ROUTE_PATH);
Elements routeLinks = doc.select("a[href]");
for (Element routeLink : routeLinks) {
String name = routeLink.text();
Attributes attrs = routeLink.attributes();
String href = attrs.get("href");
Matcher m = ROUTE_NUM_PATTERN.matcher(href);
if (m.find()) {
String number = m.group(1);
LTCRoute route = new LTCRoute(number, name/*, href*/);
routes.add(route);
}
}
return routes;
}
ArrayList<LTCDirection> loadDirections(String routeNum) throws ScrapeException, IOException {
ArrayList<LTCDirection> directions = new ArrayList<LTCDirection>(2); // probably 2
String path = String.format(DIRECTION_PATH, routeNum);
Document doc = parseDocFromUri(path, path);
Elements dirLinks = doc.select("a[href]");
for (Element dirLink : dirLinks) {
String name = dirLink.text();
Attributes attrs = dirLink.attributes();
String href = attrs.get("href");
Matcher m = DIRECTION_NUM_PATTERN.matcher(href);
if (m.find()) {
Integer number = Integer.valueOf(m.group(1));
LTCDirection dir = new LTCDirection(number, name);
directions.add(dir);
}
}
return directions;
}
HashMap<Integer, LTCStop> loadStops(String routeNum, int direction) throws ScrapeException, IOException {
HashMap<Integer, LTCStop> stops = new HashMap<Integer, LTCStop>();
String path = String.format(STOPS_PATH, routeNum, direction);
Document doc = parseDocFromUri(path, path);
Elements stopLinks = doc.select("a[href]");
for (Element stopLink : stopLinks) {
String name = stopLink.text();
Attributes attrs = stopLink.attributes();
String href = attrs.get("href");
Matcher m = STOP_NUM_PATTERN.matcher(href);
if (m.find()) {
Integer number = Integer.valueOf(m.group(1));
LTCStop stop = new LTCStop(number, name);
stops.put(stop.number, stop);
}
}
return stops;
}
/* this just updates existing stops with any locations found from the google map URL */
void loadStopLocations(String routeNum, HashMap<Integer, LTCStop> stops) throws ScrapeException, IOException {
StringBuilder builder = new StringBuilder(8192);
String line;
URL url;
HttpURLConnection connection;
boolean try_proxy = PROXY_ENABLED;
while (true) {
try {
if (try_proxy) {
url = new URL(String.format(PROXY_BASE + LOCATIONS_PATH, routeNum));
connection = (HttpURLConnection) url.openConnection();
} else {
url = new URL(String.format(LTC_BASE + LOCATIONS_PATH, routeNum));
connection = (HttpURLConnection) url.openConnection();
}
connection.setConnectTimeout(MAXIMUM_FETCH_TIMEOUT);
connection.setReadTimeout(MAXIMUM_FETCH_TIMEOUT);
BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream()));
while ((line = reader.readLine()) != null) {
builder.append(line);
}
int offset = 0;
String stopData = builder.toString();
// skip past the crap at the start by finding the 1st asterisk
for (int i = 0; i < 1; ++i) {
offset = stopData.indexOf('*', offset) + 1;
}
/* get the interesting part, and while we're at it, remove all asterisks
* so we don't have to deal with them at the start of latitudes later
*/
String actualStopText = stopData.substring(offset).replace("*", "");
StringSplitter splitter = new SimpleStringSplitter(';');
splitter.setString(actualStopText);
for (String stopInfo : splitter) {
String elems[] = stopInfo.split("\\|");
if (elems.length == 7) {
double latitude = Double.valueOf(elems[0]);
double longitude = Double.valueOf(elems[1]);
Matcher stopNumMatch = LOCATION_STOP_PATTERN.matcher(elems[4]);
if (stopNumMatch.find()) {
int stopNum = Integer.valueOf(stopNumMatch.group(1));
LTCStop stop = stops.get(stopNum);
if (stop != null) {
stop.latitude = latitude;
stop.longitude = longitude;
}
}
}
}
connection.disconnect();
return;
} catch (SocketException e) {
if (try_proxy) {
try_proxy = false;
continue;
}
// if this is already not the proxy, give up
throw e;
}
}
}
private class LoadDataTask extends AsyncTask<Void, LoadProgress, Void> {
protected Void doInBackground(Void... voids) {
ArrayList<LTCRoute> routesToDo;
ArrayList<LTCRoute> routesDone;
// all distinct directions (should only end up with four)
HashMap<Integer, LTCDirection> allDirections = new HashMap<Integer, LTCDirection>(4);
// all distinct stops
HashMap<Integer, LTCStop> allStops = new HashMap<Integer, LTCStop>();
// all stops that each route stops at in each direction
ArrayList<RouteStopLink> links = new ArrayList<RouteStopLink>();
Resources res = context.getResources();
LoadProgress progress = new LoadProgress();
BusDb db;
try {
publishProgress(progress.title(res.getString(R.string.loading_stop_cache)).percent(0));
db = new BusDb(context);
db.ensureStopPreload();
db.close();
publishProgress(progress.title(res.getString(R.string.downloading_routes)).percent(3));
routesToDo = loadRoutes();
if (routesToDo.size() == 0) {
publishProgress(progress.title(res.getString(R.string.download_failed))
.message(res.getString(R.string.no_routes_found))
.failed());
}
else {
int totalToDo = routesToDo.size();
routesDone = new ArrayList<LTCRoute>(totalToDo);
int failures = 0;
MAINLOOP: while (routesToDo.size() > 0) {
int i = 0;
while (i < routesToDo.size()) {
try {
publishProgress(progress.message(String.format(res.getString(R.string.loading_route_nodir), routesToDo.get(i).name))
.percent(5 + 90 * routesDone.size() / totalToDo));
ArrayList<LTCDirection> routeDirections = loadDirections(routesToDo.get(i).number);
// Log.d("loadtask", String.format("route %s has %d directions", routes.get(i).number, routeDirections.size()));
int routeStopCount = 0;
for (LTCDirection dir: routeDirections) {
if (!allDirections.containsKey(dir.number)) {
allDirections.put(dir.number, dir);
}
if (isCancelled()) {
break MAINLOOP;
}
HashMap<Integer, LTCStop> dirStops = loadStops(routesToDo.get(i).number, dir.number);
routeStopCount += dirStops.size();
if (isCancelled()){
break MAINLOOP;
}
for (int stopNumber: dirStops.keySet()) {
if (!allStops.containsKey(stopNumber)) {
allStops.put(stopNumber, dirStops.get(stopNumber));
}
links.add(new RouteStopLink(routesToDo.get(i).number, dir.number, stopNumber));
}
}
db = new BusDb(context);
boolean doLoadStop = db.getStopCount(routesToDo.get(i)) < routeStopCount ||
db.getLocationlessStopCount(routesToDo.get(i)) > 0;
db.close();
if (doLoadStop) {
publishProgress(progress.message(String.format(res.getString(R.string.loading_route_stop_locations), routesToDo.get(i).name))
.percent(6 + 90 * routesDone.size() / totalToDo));
loadStopLocations(routesToDo.get(i).number, allStops);
}
routesDone.add(routesToDo.get(i));
routesToDo.remove(i); // don't increment i, just remove the one we just did
}
catch (IOException e) {
failures++; // note that one failed
if (failures > FAILURE_LIMIT) {
throw(new ScrapeException(res.getString(R.string.too_many_failures), ScrapeStatus.PROBLEM_IMMEDIATELY, true));
}
i++; // go to the next one
}
}
}
publishProgress(progress.message(res.getString(R.string.saving_database))
.percent(95));
if (!isCancelled()) {
db = new BusDb(context);
db.saveBusData(routesDone, allDirections.values(), allStops.values(), links, false);
db.close();
publishProgress(progress.title(res.getString(R.string.stop_download_complete))
.message(res.getString(R.string.database_ready))
.percent(100).complete());
}
}
}
catch (IOException e) {
publishProgress(progress.title(res.getString(R.string.unable_to_load_routes))
.message(e.getMessage())
.failed());
}
catch (ScrapeException e) {
publishProgress(progress.title(e.getMessage())
.message("")
.failed());
}
catch (SQLiteException e) {
publishProgress(progress.title(e.getMessage())
.message("")
.failed());
}
return null;
}
protected void onProgressUpdate(LoadProgress... progress) {
if (!isCancelled() && status != null) {
status.update(progress[0]);
}
}
}
}