/*
 * BiblioSearchScraper.java example
 *
 * Explorer
 * UnisaConnect-master
 * - src
 *   - it
 *     - fdev
 */
package it.fdev.scraper;

import it.fdev.unisaconnect.FragmentBiblioDoSearch;
import it.fdev.unisaconnect.R;
import it.fdev.unisaconnect.data.Book;
import it.fdev.utils.Utils;

import java.io.IOException;
import java.net.CookieHandler;
import java.net.CookieManager;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import android.app.IntentService;
import android.content.Context;
import android.content.Intent;
import android.util.Log;

/**
 * Background service that downloads a library-catalogue search page, parses
 * the last table of the page into {@link Book} entries and broadcasts the
 * outcome under {@link #BROADCAST_STATE_BIBLIO_SEARCH}.
 *
 * <p>The search URL is read from the {@link FragmentBiblioDoSearch#ARG_URI}
 * extra of the starting intent. A {@code null} result list is broadcast when
 * the search fails or the page cannot be parsed.</p>
 */
public class BiblioSearchScraper extends IntentService {

	public final static String BROADCAST_STATE_BIBLIO_SEARCH = "it.fdev.biblio.status_search";

	/**
	 * True while a search is being processed. Written from the service's
	 * worker thread; volatile so that readers on other threads see the
	 * latest value.
	 */
	public static volatile boolean isRunning = false;

	/** Timeout, in milliseconds, applied to every HTTP request made here. */
	private static final int TIMEOUT_MS = 30000;

	/**
	 * Matches the token that follows ".../F/" in the links of the catalogue
	 * page; group 1 is the token itself. Compiled once instead of per call.
	 */
	private static final Pattern REQ_PARAM_PATTERN = Pattern.compile(
			Pattern.quote("lio-aleph.unisa.it/F/") + "([A-Z0-9]+" + Pattern.quote("-") + "[0-9]+)" + Pattern.quote("?func=fin"));

	private Context mContext;
	private static CookieManager mCookieManager = new CookieManager();
	// Token extracted by prepareCookies(), cached across searches.
	// NOTE(review): it is never invalidated; presumably it can expire
	// server-side - confirm whether a refresh is needed.
	private static String reqParam;

	public BiblioSearchScraper() {
		super("it.fdev.biblio.search_scraper_service");
	}

	/**
	 * Runs one search and broadcasts its result.
	 *
	 * <p>If the intent carries no URL the service stops without broadcasting.
	 * Any exception raised while searching is logged and reported to the
	 * listeners as a {@code null} result.</p>
	 *
	 * @param intent the starting intent, expected to carry the search URL in
	 *               the {@link FragmentBiblioDoSearch#ARG_URI} extra
	 */
	@Override
	protected void onHandleIntent(Intent intent) {
		// Resolve the context before anything can throw, so the failure
		// broadcast in the catch block never receives a null context.
		mContext = getApplicationContext();
		try {
			String url = (intent == null) ? null : intent.getStringExtra(FragmentBiblioDoSearch.ARG_URI);
			if (url == null) {
				return;
			}

			isRunning = true;
			Utils.sendLoadingMessage(mContext, R.string.cerco_libri);

			ArrayList<Book> results = getSearchResults(url);
			Utils.broadcastStatus(mContext, BROADCAST_STATE_BIBLIO_SEARCH, FragmentBiblioDoSearch.BROADCAST_STATUS, results);
		} catch (Exception e) {
			Log.e(Utils.TAG, "Biblio search service crashed", e);
			Utils.broadcastStatus(mContext, BROADCAST_STATE_BIBLIO_SEARCH, FragmentBiblioDoSearch.BROADCAST_STATUS, null);
		} finally {
			// Always reset the flag and stop, even if the failure broadcast
			// itself throws.
			isRunning = false;
			stopForeground(true);
			stopSelf();
		}
	}

	/**
	 * Installs the shared cookie manager as the default cookie handler,
	 * downloads {@code url} and extracts the token that follows ".../F/" in
	 * the page's links.
	 *
	 * @param url the page to download
	 * @return the extracted token, or {@code null} if the page does not
	 *         contain one
	 * @throws IOException if the page cannot be downloaded or parsed
	 */
	public static String prepareCookies(String url) throws IOException {
		CookieHandler.setDefault(mCookieManager);
		Response res = Jsoup.connect(url).method(Method.GET).timeout(TIMEOUT_MS).execute();
		Document document = res.parse();

		Matcher matcher = REQ_PARAM_PATTERN.matcher(document.toString());
		// The pattern has exactly one capturing group, so a successful
		// find() guarantees that group(1) exists.
		return matcher.find() ? matcher.group(1) : null;
	}

	/**
	 * Downloads the search result page for {@code url} and converts every
	 * row (except the first) of the last table of the page into a
	 * {@link Book}.
	 *
	 * @param url the search URL; the cached token is inserted after
	 *            ".unisa.it/F/" before the request is made
	 * @return the parsed books, or {@code null} if no token could be
	 *         obtained, the page reports "Numero di record nel set superato",
	 *         or the page does not have the expected table layout
	 * @throws IOException if a page cannot be downloaded or parsed
	 */
	private ArrayList<Book> getSearchResults(String url) throws IOException {
		if (reqParam == null || reqParam.isEmpty()) {
			reqParam = prepareCookies(url);
		}
		if (reqParam == null) {
			return null;
		}
		String urlToGet = url.replace(".unisa.it/F/", ".unisa.it/F/" + reqParam);

		Log.d(Utils.TAG, "urlToGet: " + urlToGet);

		Response res = Jsoup.connect(urlToGet).method(Method.GET).timeout(TIMEOUT_MS).execute();
		Document document = res.parse();

		// The page shows this message instead of usable results.
		boolean isValidResults = document.getElementsMatchingOwnText("Numero di record nel set superato").isEmpty();
		if (!isValidResults) {
			return null;
		}

		// last() returns null when the page has no table at all.
		Element table = document.getElementsByTag("table").last();
		if (table == null) {
			return null;
		}

		ArrayList<Book> resultList = new ArrayList<Book>();
		Elements rows = table.getElementsByTag("tr");
		// Row 0 is skipped (presumably the header row).
		for (int i = 1; i < rows.size(); i++) {
			Elements cols = rows.get(i).getElementsByTag("td");
			if (cols.size() < 7) {
				return null;
			}
			// The first cell must contain the link to the details page;
			// treat a missing link like any other unexpected layout.
			Element detailsLink = cols.get(0).getElementsByTag("a").first();
			if (detailsLink == null) {
				return null;
			}
			String resultNumber = cols.get(0).text().trim();
			String detailsUrl = detailsLink.attr("href").trim();
			String author = cols.get(2).text().trim();
			String format = cols.get(3).text().trim();
			String title = cols.get(4).text().trim();
			String year = cols.get(5).text().trim();
			String position = cols.get(6).text().trim();
			resultList.add(new Book(resultNumber, author, format, title, year, detailsUrl, position));
		}
		return resultList;
	}

}