package it.fdev.scraper; import it.fdev.unisaconnect.FragmentBiblioDoSearch; import it.fdev.unisaconnect.R; import it.fdev.unisaconnect.data.Book; import it.fdev.utils.Utils; import java.io.IOException; import java.net.CookieHandler; import java.net.CookieManager; import java.util.ArrayList; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.jsoup.Connection.Method; import org.jsoup.Connection.Response; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import android.app.IntentService; import android.content.Context; import android.content.Intent; import android.util.Log; public class BiblioSearchScraper extends IntentService { public final static String BROADCAST_STATE_BIBLIO_SEARCH = "it.fdev.biblio.status_search"; public static boolean isRunning = false; private Context mContext; private static CookieManager mCookieManager = new CookieManager(); private static String reqParam; public BiblioSearchScraper() { super("it.fdev.biblio.search_scraper_service"); } @Override protected void onHandleIntent(Intent intent) { try { String url = intent.getStringExtra(FragmentBiblioDoSearch.ARG_URI); if (url == null) { isRunning = false; stopForeground(true); stopSelf(); return; } isRunning = true; mContext = getApplicationContext(); Utils.sendLoadingMessage(mContext, R.string.cerco_libri); ArrayList<Book> results = getSearchResults(url); Utils.broadcastStatus(mContext, BROADCAST_STATE_BIBLIO_SEARCH, FragmentBiblioDoSearch.BROADCAST_STATUS, results); } catch (Exception e) { Log.e(Utils.TAG, "Biblio search service crashed", e); Utils.broadcastStatus(mContext, BROADCAST_STATE_BIBLIO_SEARCH, FragmentBiblioDoSearch.BROADCAST_STATUS, null); } isRunning = false; stopForeground(true); stopSelf(); return; } public static String prepareCookies(String url) throws IOException { CookieHandler.setDefault(mCookieManager); Response res = Jsoup.connect(url).method(Method.GET).timeout(30000).execute(); Document document = res.parse(); String toSearch = Pattern.quote("lio-aleph.unisa.it/F/") + "([A-Z0-9]+" + Pattern.quote("-") + "[0-9]+)" + Pattern.quote("?func=fin"); Pattern pattern = Pattern.compile(toSearch); Matcher matcher = pattern.matcher(document.toString()); if (!matcher.find() || matcher.groupCount() < 1) { return null; } return matcher.group(1); } private ArrayList<Book> getSearchResults(String url) throws IOException { String urlToGet = url; if (reqParam == null || reqParam.isEmpty()) { reqParam = prepareCookies(url); } if (reqParam == null) { return null; } urlToGet = url.replace(".unisa.it/F/", ".unisa.it/F/" + reqParam); Log.d(Utils.TAG, "urlToGet: " + urlToGet); Response res = Jsoup.connect(urlToGet).method(Method.GET).timeout(30000).execute(); Document document = res.parse(); boolean isValidResults = document.getElementsMatchingOwnText("Numero di record nel set superato").isEmpty(); if (!isValidResults) { return null; } ArrayList<Book> resultList = new ArrayList<Book>(); Element table = document.getElementsByTag("table").last(); Elements rows = table.getAllElements().first().getElementsByTag("tr"); for (int i = 1; i < rows.size(); i++) { Element cRow = rows.get(i); Elements cols = cRow.getElementsByTag("td"); if (cols.size() < 7) { return null; } String resultNumber = cols.get(0).text().trim(); String detailsUrl = cols.get(0).getElementsByTag("a").first().attr("href").trim(); String author = cols.get(2).text().trim(); String format = cols.get(3).text().trim(); String title = cols.get(4).text().trim(); String year = cols.get(5).text().trim(); String position = cols.get(6).getAllElements().first().text().trim(); Book cBook = new Book(resultNumber, author, format, title, year, detailsUrl, position); resultList.add(cBook); } return resultList; } }