// GoogleBooksDeMetadataDownloader.java example
//
// Explorer
// jeboorker-master
package org.rr.jeborker.metadata.download;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;

import org.apache.commons.io.Charsets;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.rr.commons.collection.TransformValueSet;
import org.rr.commons.log.LoggerFactory;
import org.rr.commons.utils.StringUtil;

/**
 * {@link MetadataDownloader} implementation that loads metadata from the German Google Books
 * search (<code>google.de</code>, book search mode <code>tbm=bks</code>).
 * <p>
 * A search runs in two stages: first the search result pages are fetched and scanned for links
 * pointing to <code>books.google.*</code>, then every linked page is downloaded and wrapped into a
 * {@link GoogleBooksDeDownloadMetadataEntry}.
 */
public class GoogleBooksDeMetadataDownloader implements MetadataDownloader {

	private static final String MAIN_URL = "http://www.google.de";

	private static final String QUERY_URL_PART = MAIN_URL + "/search?tbm=bks&q=";

	private static final int ENTRIES_TO_FETCH = 50;

	/** Step width used for the <code>start</code> paging parameter of the search url. */
	private static final int RESULTS_PER_PAGE = 10;

	private static final int PAGES_TO_LOAD = ENTRIES_TO_FETCH / RESULTS_PER_PAGE;

	private static final String PAGE_CHARSET = Charsets.ISO_8859_1.name();

	/**
	 * Searches Google Books for the given phrase.
	 *
	 * @param phrase The search phrase. May be <code>null</code> or blank, in which case no request is made.
	 * @return The entries having a non empty title. An empty list if the phrase is blank or if
	 *         loading has failed (the failure is logged). Never <code>null</code>.
	 */
	@Override
	public List<MetadataDownloadEntry> search(String phrase) {
		// URLEncoder.encode(null, ..) would throw a NullPointerException; a blank phrase can not match anything useful.
		if (phrase == null || phrase.trim().isEmpty()) {
			return new ArrayList<MetadataDownloadEntry>(0);
		}

		try {
			List<URL> searchUrl = getSearchPageUrls(phrase);
			List<byte[]> pageHtmlContent = MetadataDownloadUtils.loadPages(searchUrl, PAGES_TO_LOAD);
			List<Document> htmlDocs = getDocuments(pageHtmlContent);
			Set<String> allLinks = getSearchResultLinks(htmlDocs);
			List<byte[]> metadataHtmlContent = loadLinkContent(allLinks);
			return getMetadataDownloadEntries(metadataHtmlContent);
		} catch (IOException e) {
			LoggerFactory.getLogger(this).log(Level.WARNING, "Failed to fetch metadata for search '" + phrase + "'", e);
		}
		// Hand back an empty result instead of null so callers can iterate without a null check.
		return new ArrayList<MetadataDownloadEntry>(0);
	}

	/**
	 * Parses each downloaded book page and creates a metadata entry for it.
	 *
	 * @param metadataHtmlContent The raw html bytes of the book pages. <code>null</code> elements are skipped.
	 * @return The entries for all pages which provide a non empty title.
	 * @throws IOException If jsoup fails to read one of the pages.
	 */
	private List<MetadataDownloadEntry> getMetadataDownloadEntries(List<byte[]> metadataHtmlContent) throws IOException {
		List<MetadataDownloadEntry> result = new ArrayList<>(metadataHtmlContent.size());
		for (byte[] html : metadataHtmlContent) {
			if (html == null) {
				continue;
			}
			Document htmlDoc = Jsoup.parse(new ByteArrayInputStream(html), PAGE_CHARSET, MAIN_URL);
			GoogleBooksDeDownloadMetadataEntry entry = new GoogleBooksDeDownloadMetadataEntry(htmlDoc);
			if (StringUtil.isNotEmpty(entry.getTitle())) {
				result.add(entry);
			}
		}
		return result;
	}

	/**
	 * Downloads the pages behind the given links.
	 *
	 * @param allLinks The links to be loaded.
	 * @return The raw content for the links as delivered by {@link MetadataDownloadUtils}.
	 * @throws IOException If loading fails.
	 */
	private List<byte[]> loadLinkContent(Set<String> allLinks) throws IOException {
		// NOTE(review): the meaning of the second loadPages argument (10) is not visible here - confirm against MetadataDownloadUtils.
		return MetadataDownloadUtils.loadPages(new TransformValueSet<String, URL>(allLinks) {

			@Override
			public URL transform(String link) {
				try {
					return new URL(link);
				} catch (MalformedURLException e) {
					LoggerFactory.getLogger(this).log(Level.SEVERE, "Failed to create url for " + link, e);
				}
				// A link which is not a valid url is transformed to null.
				return null;
			}
		}, 10);
	}

	/**
	 * Collects the book links of all given search result pages.
	 *
	 * @param htmlDocs The parsed search result pages.
	 * @return The links in the order they were found, without duplicates.
	 */
	private Set<String> getSearchResultLinks(List<Document> htmlDocs) {
		Set<String> allLinks = new LinkedHashSet<>();
		for (Document document : htmlDocs) {
			allLinks.addAll(getSearchResultLinks(document));
		}
		return allLinks;
	}

	/**
	 * Extracts the book links from a single search result page. A link is taken from each
	 * <code>h3</code> element whose first child element is an anchor pointing to
	 * <code>books.google.*</code>. Links containing <code>printsec=</code> are ignored.
	 *
	 * @param doc The parsed search result page.
	 * @return The found links with <code>https://</code> replaced by <code>http://</code>.
	 */
	private List<String> getSearchResultLinks(Document doc) {
		List<String> links = new ArrayList<>(ENTRIES_TO_FETCH);
		for (Element headline : doc.getElementsByTag("h3")) {
			// A headline without any child element would make child(0) fail with an IndexOutOfBoundsException.
			Elements children = headline.children();
			if (children.isEmpty()) {
				continue;
			}

			Element link = children.get(0);
			if (!"a".equals(link.tagName())) {
				continue;
			}

			String href = link.attr("href");
			if (href != null && href.contains("books.google.") && !href.contains("printsec=")) {
				// Plain literal replacement, no regular expression needed.
				links.add(href.replace("https://", "http://"));
			}
		}
		return links;
	}

	/**
	 * Parses the given raw html pages.
	 *
	 * @param content The raw html bytes. <code>null</code> elements are skipped.
	 * @return One {@link Document} per non <code>null</code> element.
	 * @throws IOException If jsoup fails to read one of the pages.
	 */
	private List<Document> getDocuments(List<byte[]> content) throws IOException {
		List<Document> documents = new ArrayList<>(content.size());
		for (byte[] bs : content) {
			// The result of the same loader is null checked in getMetadataDownloadEntries, so presumably
			// a page can be null here too; new ByteArrayInputStream(null) would throw a NullPointerException.
			if (bs != null) {
				documents.add(Jsoup.parse(new ByteArrayInputStream(bs), PAGE_CHARSET, MAIN_URL));
			}
		}
		return documents;
	}

	/**
	 * Creates the urls for the search result pages to be loaded.
	 *
	 * @param searchTerm The search phrase, gets url encoded.
	 * @return {@link #PAGES_TO_LOAD} urls, each with its own <code>start</code> offset.
	 * @throws UnsupportedEncodingException If the encoding used for the search phrase is not supported.
	 * @throws MalformedURLException If the assembled url is invalid.
	 */
	private List<URL> getSearchPageUrls(String searchTerm) throws UnsupportedEncodingException, MalformedURLException {
		String encodedSearchPhrase = URLEncoder.encode(searchTerm, StringUtil.UTF_8);
		List<URL> urls = new ArrayList<>(PAGES_TO_LOAD);
		for (int page = 0; page < PAGES_TO_LOAD; page++) {
			String position = "&start=" + (page * RESULTS_PER_PAGE);
			urls.add(new URL(QUERY_URL_PART + encodedSearchPhrase + position));
		}
		return urls;
	}

}