package org.frasermccrossan.ltc; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.SocketException; import java.net.SocketTimeoutException; import java.net.URL; import java.util.ArrayList; import java.util.Calendar; import java.util.HashMap; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.jsoup.Jsoup; import org.jsoup.nodes.Attributes; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.TextNode; import org.jsoup.select.Elements; import android.annotation.SuppressLint; import android.content.Context; import android.content.pm.PackageInfo; import android.content.pm.PackageManager; import android.content.res.Resources; import android.database.sqlite.SQLiteException; import android.os.AsyncTask; import android.os.Build; import android.text.TextUtils.SimpleStringSplitter; import android.text.TextUtils.StringSplitter; // everything required to load the LTC_supplied data into the database @SuppressLint("UseSparseArrays") public class LTCScraper { LoadDataTask task = null; ScrapingStatus status = null; Context context; static final String PROXY_BASE = ""; static final Boolean PROXY_ENABLED = false; static final String LTC_BASE = "http://www.ltconline.ca"; static final String ROUTE_PATH = "/WebWatch/MobileAda.aspx"; public static final String ROUTE_URL = LTC_BASE + ROUTE_PATH; // used when calling the diagnostic screen static final String DIRECTION_PATH = "/WebWatch/MobileAda.aspx?r=%s"; static final String STOPS_PATH = "/WebWatch/MobileAda.aspx?r=%s&d=%d"; static final String LOCATIONS_PATH = "/WebWatch/UpdateWebMap.aspx?u=%s"; static final String PROXY_PREDICTION_PATH = "/WebWatch/ProxyMobileAda.aspx?r=%s&d=%s&s=%s"; static final String LTC_PREDICTION_PATH = "/WebWatch/MobileAda.aspx?r=%s&d=%s&s=%s"; static final Pattern TIME_PATTERN = Pattern.compile("(\\d{1,2}):(\\d{2}) ?([AP])?"); // matches arrival text in the MobileAda.aspx prediction static final Pattern ARRIVAL_PATTERN = Pattern.compile("(?i) *(\\d{1,2}:\\d{2} *[\\.apm]*) +(to .*)"); // pattern for route number in a[href] static final Pattern ROUTE_NUM_PATTERN = Pattern.compile("\\?r=(\\d{1,3})"); // pattern for direction number in a[href] static final Pattern DIRECTION_NUM_PATTERN = Pattern.compile("\\&d=(\\d+)"); // pattern for stop number in a[href] static final Pattern STOP_NUM_PATTERN = Pattern.compile("\\&s=(\\d+)"); // if no buses are found static final Pattern NO_INFO_PATTERN = Pattern.compile("(?mi)no stop information"); static final Pattern NO_BUS_PATTERN = Pattern.compile("(?mi)no further buses"); static final Pattern LOCATION_STOP_PATTERN = Pattern.compile("(\\d+)"); /* parseDocFromUri() starts with the initial timeout then retries doubling the timeout each time until * greater than the maximum timeout, thus we get (for example) 1, 2, 4, 8, 16 */ static final int INITIAL_FETCH_TIMEOUT = 2000; static final int MAXIMUM_FETCH_TIMEOUT = 128*1000; static final int FAILURE_LIMIT = 40; LTCScraper(Context c, ScrapingStatus s) { context = c; status = s; } LTCScraper(Context c) { /* instantiate this way if you plan only to check bus predictions */ context = c; } public void close() { if (task != null) { task.cancel(true); } } @SuppressLint("NewApi") public void loadAll() { task = new LoadDataTask(); if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.HONEYCOMB) { /* on Honeycomb and later, ASyncTasks run on a serial executor, and since * we might have another asynctask running in an activity (e.g. fetching stop lists), * we don't really want them all to block */ task.executeOnExecutor(AsyncTask.THREAD_POOL_EXECUTOR); } else { task.execute(); } } private Document parseDocFromUri(String proxyPath, String ltcPath) throws IOException, MalformedURLException { return parseDocFromUri(proxyPath, ltcPath, INITIAL_FETCH_TIMEOUT); } /* * the first path is the one to try with the proxy, the second is for the real * site; these are only different for fetching predictions, and only because * we want to change the caching time and need a different path in nginx */ private Document parseDocFromUri(String proxyPath, String ltcPath, int initial_timeout) throws IOException, MalformedURLException { Document doc; URL url; boolean try_proxy = PROXY_ENABLED; int timeout = initial_timeout; while (true) { try { if (try_proxy) { url = new URL(PROXY_BASE + proxyPath); } else { url = new URL(LTC_BASE + ltcPath); } HttpURLConnection connection = (HttpURLConnection) url.openConnection(); connection.setConnectTimeout(timeout); connection.setReadTimeout(timeout); doc = null; try { InputStream in = new BufferedInputStream(connection.getInputStream(), 8192); doc = Jsoup.parse(in, null, proxyPath); } finally { connection.disconnect(); } break; } catch (SocketTimeoutException e) { timeout *= 8; if (timeout <= MAXIMUM_FETCH_TIMEOUT) { continue; } else { if (try_proxy) { // reached the maximum timeout, try switching from proxy to real site try_proxy=false; timeout = initial_timeout; continue; } throw e; } } catch (SocketException e) { /* * if we get a connection refused while trying the proxy then the proxy may be down, * so switch back to the LTC site */ if (try_proxy) { try_proxy = false; timeout = initial_timeout; continue; } else { throw e; } } } return doc; } public String ltcPredictionPath(LTCRoute route, String stopNumber) { return String.format(LTC_PREDICTION_PATH, route.number, route.direction, stopNumber); } public String proxyPredictionPath(LTCRoute route, String stopNumber) { return String.format(PROXY_PREDICTION_PATH, route.number, route.direction, stopNumber); } public ArrayList<Prediction> getPredictions(LTCRoute route, String stopNumber, ScrapeStatus scrapeStatus) { ArrayList<Prediction> predictions = new ArrayList<Prediction>(3); // usually get 3 of them Resources res = context.getResources(); try { Calendar now = Calendar.getInstance(); now.set(Calendar.SECOND, 0); now.set(Calendar.MILLISECOND, 0); // now we have 'now' set to the current time Document doc = parseDocFromUri(proxyPredictionPath(route,stopNumber), ltcPredictionPath(route, stopNumber), INITIAL_FETCH_TIMEOUT); Elements divs = doc.select("div"); if (divs.size() == 0) { throw new ScrapeException("LTC down?", ScrapeStatus.PROBLEM_IMMEDIATELY, true); } //Log.i("GP", String.format("rows=%d", timeRows.size())); for (Element div: divs) { //Log.i("GP", String.format("cols=%d", cols.size())); List<TextNode> textNodes = div.textNodes(); for (TextNode node: textNodes) { String text = node.text(); Matcher noBusMatcher = NO_BUS_PATTERN.matcher(text); if (noBusMatcher.find()) { throw new ScrapeException(res.getString(R.string.no_further), ScrapeStatus.PROBLEM_IF_ALL, false); } Matcher noStopMatcher = NO_INFO_PATTERN.matcher(text); if (noStopMatcher.find()) { throw new ScrapeException(res.getString(R.string.no_service), ScrapeStatus.PROBLEM_IF_ALL, false); } Matcher arrivalMatcher = ARRIVAL_PATTERN.matcher(text); while (arrivalMatcher.find()) { String textTime = arrivalMatcher.group(1); String destination = arrivalMatcher.group(2); predictions.add(new Prediction(route, textTime, destination, now)); } } } if (predictions.size() == 0) { throw new ScrapeException(res.getString(R.string.no_bus), ScrapeStatus.PROBLEM_IF_ALL, true); } scrapeStatus.setStatus(ScrapeStatus.OK, ScrapeStatus.NOT_PROBLEM, null); } catch (ScrapeException e) { scrapeStatus.setStatus(ScrapeStatus.FAILED, e.problemType, e.getMessage()); predictions.add(new Prediction(route, e.getMessage(), e.seriousProblem)); } catch (SocketTimeoutException e) { scrapeStatus.setStatus(ScrapeStatus.FAILED, ScrapeStatus.PROBLEM_IMMEDIATELY, e.getMessage()); predictions.add(new Prediction(context, route, R.string.times_timeout, true)); } catch (IOException e) { scrapeStatus.setStatus(ScrapeStatus.FAILED, ScrapeStatus.PROBLEM_IMMEDIATELY, e.getMessage()); predictions.add(new Prediction(context, route, R.string.times_fail, true)); } return predictions; } public ArrayList<LTCRoute> loadRoutes() throws ScrapeException, IOException { ArrayList<LTCRoute> routes = new ArrayList<LTCRoute>(); Document doc = parseDocFromUri(ROUTE_PATH, ROUTE_PATH); Elements routeLinks = doc.select("a[href]"); for (Element routeLink : routeLinks) { String name = routeLink.text(); Attributes attrs = routeLink.attributes(); String href = attrs.get("href"); Matcher m = ROUTE_NUM_PATTERN.matcher(href); if (m.find()) { String number = m.group(1); LTCRoute route = new LTCRoute(number, name/*, href*/); routes.add(route); } } return routes; } ArrayList<LTCDirection> loadDirections(String routeNum) throws ScrapeException, IOException { ArrayList<LTCDirection> directions = new ArrayList<LTCDirection>(2); // probably 2 String path = String.format(DIRECTION_PATH, routeNum); Document doc = parseDocFromUri(path, path); Elements dirLinks = doc.select("a[href]"); for (Element dirLink : dirLinks) { String name = dirLink.text(); Attributes attrs = dirLink.attributes(); String href = attrs.get("href"); Matcher m = DIRECTION_NUM_PATTERN.matcher(href); if (m.find()) { Integer number = Integer.valueOf(m.group(1)); LTCDirection dir = new LTCDirection(number, name); directions.add(dir); } } return directions; } HashMap<Integer, LTCStop> loadStops(String routeNum, int direction) throws ScrapeException, IOException { HashMap<Integer, LTCStop> stops = new HashMap<Integer, LTCStop>(); String path = String.format(STOPS_PATH, routeNum, direction); Document doc = parseDocFromUri(path, path); Elements stopLinks = doc.select("a[href]"); for (Element stopLink : stopLinks) { String name = stopLink.text(); Attributes attrs = stopLink.attributes(); String href = attrs.get("href"); Matcher m = STOP_NUM_PATTERN.matcher(href); if (m.find()) { Integer number = Integer.valueOf(m.group(1)); LTCStop stop = new LTCStop(number, name); stops.put(stop.number, stop); } } return stops; } /* this just updates existing stops with any locations found from the google map URL */ void loadStopLocations(String routeNum, HashMap<Integer, LTCStop> stops) throws ScrapeException, IOException { StringBuilder builder = new StringBuilder(8192); String line; URL url; HttpURLConnection connection; boolean try_proxy = PROXY_ENABLED; while (true) { try { if (try_proxy) { url = new URL(String.format(PROXY_BASE + LOCATIONS_PATH, routeNum)); connection = (HttpURLConnection) url.openConnection(); } else { url = new URL(String.format(LTC_BASE + LOCATIONS_PATH, routeNum)); connection = (HttpURLConnection) url.openConnection(); } connection.setConnectTimeout(MAXIMUM_FETCH_TIMEOUT); connection.setReadTimeout(MAXIMUM_FETCH_TIMEOUT); BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream())); while ((line = reader.readLine()) != null) { builder.append(line); } int offset = 0; String stopData = builder.toString(); // skip past the crap at the start by finding the 1st asterisk for (int i = 0; i < 1; ++i) { offset = stopData.indexOf('*', offset) + 1; } /* get the interesting part, and while we're at it, remove all asterisks * so we don't have to deal with them at the start of latitudes later */ String actualStopText = stopData.substring(offset).replace("*", ""); StringSplitter splitter = new SimpleStringSplitter(';'); splitter.setString(actualStopText); for (String stopInfo : splitter) { String elems[] = stopInfo.split("\\|"); if (elems.length == 7) { double latitude = Double.valueOf(elems[0]); double longitude = Double.valueOf(elems[1]); Matcher stopNumMatch = LOCATION_STOP_PATTERN.matcher(elems[4]); if (stopNumMatch.find()) { int stopNum = Integer.valueOf(stopNumMatch.group(1)); LTCStop stop = stops.get(stopNum); if (stop != null) { stop.latitude = latitude; stop.longitude = longitude; } } } } connection.disconnect(); return; } catch (SocketException e) { if (try_proxy) { try_proxy = false; continue; } // if this is already not the proxy, give up throw e; } } } private class LoadDataTask extends AsyncTask<Void, LoadProgress, Void> { protected Void doInBackground(Void... voids) { ArrayList<LTCRoute> routesToDo; ArrayList<LTCRoute> routesDone; // all distinct directions (should only end up with four) HashMap<Integer, LTCDirection> allDirections = new HashMap<Integer, LTCDirection>(4); // all distinct stops HashMap<Integer, LTCStop> allStops = new HashMap<Integer, LTCStop>(); // all stops that each route stops at in each direction ArrayList<RouteStopLink> links = new ArrayList<RouteStopLink>(); Resources res = context.getResources(); LoadProgress progress = new LoadProgress(); BusDb db; try { publishProgress(progress.title(res.getString(R.string.loading_stop_cache)).percent(0)); db = new BusDb(context); db.ensureStopPreload(); db.close(); publishProgress(progress.title(res.getString(R.string.downloading_routes)).percent(3)); routesToDo = loadRoutes(); if (routesToDo.size() == 0) { publishProgress(progress.title(res.getString(R.string.download_failed)) .message(res.getString(R.string.no_routes_found)) .failed()); } else { int totalToDo = routesToDo.size(); routesDone = new ArrayList<LTCRoute>(totalToDo); int failures = 0; MAINLOOP: while (routesToDo.size() > 0) { int i = 0; while (i < routesToDo.size()) { try { publishProgress(progress.message(String.format(res.getString(R.string.loading_route_nodir), routesToDo.get(i).name)) .percent(5 + 90 * routesDone.size() / totalToDo)); ArrayList<LTCDirection> routeDirections = loadDirections(routesToDo.get(i).number); // Log.d("loadtask", String.format("route %s has %d directions", routes.get(i).number, routeDirections.size())); int routeStopCount = 0; for (LTCDirection dir: routeDirections) { if (!allDirections.containsKey(dir.number)) { allDirections.put(dir.number, dir); } if (isCancelled()) { break MAINLOOP; } HashMap<Integer, LTCStop> dirStops = loadStops(routesToDo.get(i).number, dir.number); routeStopCount += dirStops.size(); if (isCancelled()){ break MAINLOOP; } for (int stopNumber: dirStops.keySet()) { if (!allStops.containsKey(stopNumber)) { allStops.put(stopNumber, dirStops.get(stopNumber)); } links.add(new RouteStopLink(routesToDo.get(i).number, dir.number, stopNumber)); } } db = new BusDb(context); boolean doLoadStop = db.getStopCount(routesToDo.get(i)) < routeStopCount || db.getLocationlessStopCount(routesToDo.get(i)) > 0; db.close(); if (doLoadStop) { publishProgress(progress.message(String.format(res.getString(R.string.loading_route_stop_locations), routesToDo.get(i).name)) .percent(6 + 90 * routesDone.size() / totalToDo)); loadStopLocations(routesToDo.get(i).number, allStops); } routesDone.add(routesToDo.get(i)); routesToDo.remove(i); // don't increment i, just remove the one we just did } catch (IOException e) { failures++; // note that one failed if (failures > FAILURE_LIMIT) { throw(new ScrapeException(res.getString(R.string.too_many_failures), ScrapeStatus.PROBLEM_IMMEDIATELY, true)); } i++; // go to the next one } } } publishProgress(progress.message(res.getString(R.string.saving_database)) .percent(95)); if (!isCancelled()) { db = new BusDb(context); db.saveBusData(routesDone, allDirections.values(), allStops.values(), links, false); db.close(); publishProgress(progress.title(res.getString(R.string.stop_download_complete)) .message(res.getString(R.string.database_ready)) .percent(100).complete()); } } } catch (IOException e) { publishProgress(progress.title(res.getString(R.string.unable_to_load_routes)) .message(e.getMessage()) .failed()); } catch (ScrapeException e) { publishProgress(progress.title(e.getMessage()) .message("") .failed()); } catch (SQLiteException e) { publishProgress(progress.title(e.getMessage()) .message("") .failed()); } return null; } protected void onProgressUpdate(LoadProgress... progress) { if (!isCancelled() && status != null) { status.update(progress[0]); } } } }