/* * Copyright (C) 2010 Daniel Jacobi * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package de.questmaster.tudmensa; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.MalformedURLException; import java.net.URL; import java.util.Vector; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.w3c.dom.Document; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import android.os.Build; public class DataExtractor implements Runnable { private MensaMeals mActivity = null; private MealsDbAdapter mDbHelper = null; private String mFirstDate = null; private String mLocation = null; private boolean mWork_done = false; private MensaMealsSettings.Settings mSettings = new MensaMealsSettings.Settings(); public DataExtractor(MensaMeals c, String location) { this.mActivity = c; this.mDbHelper = c.mDbHelper; this.mLocation = location; // Read settings mSettings.ReadSettings(c); } public void run() { mWork_done = false; try { /* DOM implementation for Android is buggy till 2.1u1 */ if (Build.VERSION.SDK_INT < 8) throw new Exception("Workaround for XML parser bug."); parseWebsiteDOM(mLocation, "week"); parseWebsiteDOM(mLocation, "nextweek"); // parseWebsitePULL(mLocation, "week"); // parseWebsitePULL(mLocation, "nextweek"); } catch (Exception e) { /* home grown parser (Fallback) */ parseTable(getWebPage(mLocation, "week")); parseTable(getWebPage(mLocation, "nextweek")); } if (mSettings.m_bDeleteOldData) mDbHelper.deleteOldMeal(mFirstDate); mWork_done = true; mActivity.mHandler.sendEmptyMessage(0); } public boolean isAlive() { return !mWork_done; } /* parse Website and store in database */ private Vector<String> getWebPage(String task, String view) { Vector<String> webTable = new Vector<String>(); try { URL uTest = new URL("http://www.studentenwerkdarmstadt.de/index.php?option=com_spk&task=" + task + "&view=" + view); BufferedReader br = new BufferedReader(new InputStreamReader(uTest.openStream()), 2048); boolean store = false; String s; while ((s = br.readLine()) != null) { // find first line of meal tables if (s.indexOf("class=\"spk_table\">") >= 0) { // remove before table s = s.substring(s.indexOf("<tr><td")); store = true; } if (store) { if (s.indexOf("</table>") >= 0) { // remove after table s = "</table>"; store = false; } // append line webTable.add(s); if (!store) { break; // fertig } } } br.close(); } catch (MalformedURLException e) { // Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // Auto-generated catch block e.printStackTrace(); } return webTable; } private void parseTable(Vector<String> tbl) { Vector<String> days = new Vector<String>(); int day_index = 0; int meal_num = 0; String curCounter = ""; for (String s : tbl) { // new Row -> maybe new counter if (s.startsWith("<tr>")) { if (s.indexOf(" ") < 0) { // get new Counter name curCounter = extractData(s); meal_num = 0; } else if (!curCounter.equals("")) { // more meals at one counter meal_num++; } } if (s.startsWith("<td")) { String tmp = extractData(s); // date line if (curCounter.compareTo("") == 0 && tmp.length() == 10) { tmp = tmp.substring(6, 10) + tmp.substring(3, 5) + tmp.substring(0, 2); days.add(tmp); if (mFirstDate == null) { mFirstDate = tmp; } } else { parseMealString(days, day_index, meal_num, curCounter, tmp); } day_index++; } if (s.startsWith("</tr>")) { day_index = 0; } // </table>: end -> clean old entries if (s.equals("</table>")) { break; } } } private String extractData(String s) { // cut end "</td>" s = s.substring(0, s.length() - 5); // cut begining return s.substring(s.lastIndexOf(">") + 1, s.length()).trim(); } private String htmlDecode(String in) { String out = in; out = out.replaceAll("ä", "ä"); out = out.replaceAll("Ä", "Ä"); out = out.replaceAll("ö", "ö"); out = out.replaceAll("Ö", "Ö"); out = out.replaceAll("ü", "ü"); out = out.replaceAll("Ü", "Ü"); out = out.replaceAll("ß", "ß"); out = out.replaceAll("&", "&"); out = out.replaceAll(""", "\""); out = out.replaceAll("´", "\'"); out = out.replaceAll("è", "é"); return out; } /** * The DOM parser is buggy till at least 2.1. This code works form android * 2.2 on. But has a workaround for the bug, so can be used with all Android * versions. * * @param task * @param view * @throws SAXException */ private void parseWebsiteDOM(String task, String view) throws SAXException { Vector<String> days = new Vector<String>(); int day_index = 0; int meal_num = 0; String curCounter = ""; try { URL uMenuWebsite = new URL("http://www.studentenwerkdarmstadt.de/index.php?option=com_spk&task=" + task + "&view=" + view); DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); DocumentBuilder db = dbf.newDocumentBuilder(); Document doc; doc = db.parse(new BufferedInputStream(uMenuWebsite.openStream(), 2048)); doc.getDocumentElement().normalize(); NodeList tables = doc.getElementsByTagName("table"); // iterate tables for (int i = 0; i < tables.getLength(); i++) { // check table class attribute Node tblNode = tables.item(i); if (getAttribute(tblNode, "class").equals("spk_table")) { NodeList tblRows = tblNode.getChildNodes(); // iterate rows for (int j = 0; j < tblRows.getLength(); j++) { Node tblRow = tblRows.item(j); if (tblRow.getNodeType() == Node.ELEMENT_NODE) { NodeList tblEntries = tblRow.getChildNodes(); String counter_name; if (tblEntries.item(0).getFirstChild().getNodeType() == Node.ENTITY_REFERENCE_NODE) { counter_name = "nbsp"; } else { counter_name = tblEntries.item(0).getFirstChild().getNodeValue(); } if (counter_name.indexOf("nbsp") < 0) { // get new Counter name curCounter = counter_name; meal_num = 0; } else if (!curCounter.equals("")) { // more meals at one counter meal_num++; } // iterate tbl-data for (int k = 1; k < tblEntries.getLength(); k++) { Node tblEntry = tblEntries.item(k); if (tblEntry.getNodeType() == Node.ELEMENT_NODE) { if (j == 0) { String tmp = tblEntry.getFirstChild().getNodeValue(); // first row are the dates tmp = tmp.substring(6, 10) + tmp.substring(3, 5) + tmp.substring(0, 2); days.add(tmp); if (mFirstDate == null) { mFirstDate = tmp; } } else { // Rebuild String NodeList tblData = tblEntry.getChildNodes(); String tmp = ""; for (int l = 0; l < tblData.getLength(); l++) { Node data = tblData.item(l); if (data.getNodeType() == Node.TEXT_NODE) { tmp += data.getNodeValue(); } else if (data.getNodeType() == Node.ENTITY_REFERENCE_NODE) { tmp += "&" + data.getNodeName() + ";"; } } //tmp = htmlDecode(tmp); // further rows are data parseMealString(days, day_index, meal_num, curCounter, tmp); day_index++; } } // ENTITY_REFERENCE_NODEs } // Data day_index = 0; } } // Row } // Table } } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (ParserConfigurationException e) { e.printStackTrace(); } } /** * @param days * @param day_index * @param meal_num * @param curCounter * @param tmp */ private void parseMealString(Vector<String> days, int day_index, int meal_num, String curCounter, String tmp) { // ?-sign unfortunately not encoded, so checking price-tag if (tmp.lastIndexOf(",") > 0 && day_index < days.size() && Character.isDigit(tmp.charAt(tmp.lastIndexOf(",") - 1)) && Character.isDigit(tmp.charAt(tmp.lastIndexOf(",") + 1)) && Character.isDigit(tmp.charAt(tmp.lastIndexOf(",") + 2))) { // price tmp = tmp.substring(0, tmp.lastIndexOf(",") + 3); String price = tmp.substring(tmp.lastIndexOf(" ") + 1); // cut price from string tmp = tmp.substring(0, tmp.length() - price.length()).trim(); // add EUR sign to price string price += " €"; // type String type = tmp.substring(tmp.lastIndexOf(" ") + 1); // cut type from string String meal = tmp.substring(0, tmp.length() - type.length()).trim(); meal = htmlDecode(meal); // create detail information String info; if (type.equals("F")) { info = mActivity.getResources().getString(R.string.type_fish); } else if (type.equals("G")) { info = mActivity.getResources().getString(R.string.type_poultry); } else if (type.equals("K")) { info = mActivity.getResources().getString(R.string.type_calf); } else if (type.equals("R")) { info = mActivity.getResources().getString(R.string.type_beef); } else if (type.equals("RS")) { info = mActivity.getResources().getString(R.string.type_beefpig); } else if (type.equals("S")) { info = mActivity.getResources().getString(R.string.type_pig); } else if (type.equals("V")) { info = mActivity.getResources().getString(R.string.type_vegie); } else { info = ""; } // get additional information // (extract from meal name) String mealInspect = meal; while (mealInspect.contains("(") && mealInspect.contains(")")) { String additions = mealInspect.substring(mealInspect.indexOf("(") + 1, mealInspect.indexOf(")")); mealInspect = mealInspect.substring(mealInspect.indexOf(")") + 1); // skip current (...) String[] splitAdditions = additions.split(","); try { for (String s1 : splitAdditions) { switch (Integer.parseInt(s1)) { case 1: info += "\n(1) " + mActivity.getResources().getString(R.string.note_colorant); break; case 2: info += "\n(2) " + mActivity.getResources().getString(R.string.note_preservative); break; case 3: info += "\n(3) " + mActivity.getResources().getString(R.string.note_antioxidant); break; case 4: info += "\n(4) " + mActivity.getResources().getString(R.string.note_flavor_enhancer); break; case 5: info += "\n(5) " + mActivity.getResources().getString(R.string.note_sulphur_treated); break; case 6: info += "\n(6) " + mActivity.getResources().getString(R.string.note_blackened); break; case 7: info += "\n(7) " + mActivity.getResources().getString(R.string.note_waxed); break; case 8: info += "\n(8) " + mActivity.getResources().getString(R.string.note_phosphate); break; case 9: info += "\n(9) " + mActivity.getResources().getString(R.string.note_sweetening); break; case 11: info += "\n(11) " + mActivity.getResources().getString(R.string.note_phenylalanine_source); break; } } } catch (NumberFormatException e) { // No number, so its nothing // we care about } } // Add table entry String date = days.get(day_index); long rowId = 0; if ((rowId = mDbHelper.fetchMealId(mLocation, date, curCounter, meal_num)) >= 0) { mDbHelper.updateMeal(rowId, mLocation, date, meal_num, curCounter, meal, type, price, info); } else mDbHelper.createMeal(mLocation, date, meal_num, curCounter, meal, type, price, info); } } private String getAttribute(Node n, String attrib) { NamedNodeMap nnm = n.getAttributes(); if (nnm != null) { Node nod = nnm.getNamedItem(attrib); if (nod != null) return nod.getNodeValue(); } // its NOT there return ""; } // /** // * uses also buggy XML parser. :( // */ // private void parseWebsitePULL(String task, String view) { // XmlPullParser parser = Xml.newPullParser(); // boolean do_parse = false; // boolean new_row = false; // // Vector<String> days = new Vector<String>(); // int day_index = 0; // int meal_num = 0; // String curCounter = ""; // // try { // URL uMealList = new // URL("http://www.studentenwerkdarmstadt.de/index.php?option=com_spk&task=" // + task + "&view=" + view); // // auto-detect the encoding from the stream // parser.setInput(new BufferedReader(new InputStreamReader(new // HTMLInputStream(uMealList.openStream())), 2048)); // int eventType = parser.getEventType(); // boolean done = false; // while (eventType != XmlPullParser.END_DOCUMENT && !done) { // String name = null; // switch (eventType) { // case XmlPullParser.START_TAG: // name = parser.getName(); // if (name.equalsIgnoreCase("table")) { // if (parser.getAttributeValue(null, "class") != null && // "spk_table".equalsIgnoreCase(parser.getAttributeValue(null, "class"))) { // do_parse = true; // } // } else if (do_parse) { // if (name.equalsIgnoreCase("tr")) { // new_row = true; // // } else if (name.equalsIgnoreCase("td")) { // String tmp; // try { // tmp = parser.nextText(); // } catch (XmlPullParserException e2) { // // skip img tags // while (parser.next() != XmlPullParser.TEXT) { // } // tmp = parser.getText(); // } // // // date line // if (curCounter.compareTo("") == 0 && tmp.length() == 10) { // tmp = tmp.substring(6, 10) + tmp.substring(3, 5) + tmp.substring(0, 2); // days.add(tmp); // // if (mFirstDate == null) { // mFirstDate = tmp; // } // // } else if (new_row) { // if (!tmp.equals("")) { // // get new Counter name // curCounter = tmp; // meal_num = 0; // } else if (!curCounter.equals("")) { // // more meals at one counter // meal_num++; // } // new_row = false; // // // ?-sign unfortunately not encoded, so checking // // price-tag // } else // parseMealString(days, day_index, meal_num, curCounter, tmp); // // day_index++; // // } // } // break; // case XmlPullParser.END_TAG: // name = parser.getName(); // if (name.equalsIgnoreCase("table")) { // do_parse = false; // done = true; // } else if (do_parse) { // if (name.equalsIgnoreCase("tr")) { // day_index = 0; // } // } // break; // } // eventType = parser.next(); // } // } catch (XmlPullParserException e1) { // e1.printStackTrace(); // } catch (IOException e1) { // e1.printStackTrace(); // } // } }