package org.rr.jeborker.metadata.download;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import org.apache.commons.io.Charsets;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.rr.commons.collection.TransformValueSet;
import org.rr.commons.log.LoggerFactory;
import org.rr.commons.utils.StringUtil;
/**
* {@link MetadataDownloader} implementation that loads metadata from the Google book search at www.google.de.
*/
public class GoogleBooksDeMetadataDownloader implements MetadataDownloader {

	/** Base address of the queried site; also used as base URI when parsing the fetched HTML. */
	private static final String MAIN_URL = "http://www.google.de";

	/** Search url prefix. The url encoded search phrase and the <code>start</code> offset get appended. */
	private static final String QUERY_URL_PART = MAIN_URL + "/search?tbm=bks&q=";

	/** Total number of search result entries that should be requested. */
	private static final int ENTRIES_TO_FETCH = 50;

	/** Step width of the <code>start</code> offset between two consecutive search result pages. */
	private static final int ENTRIES_PER_PAGE = 10;

	/** Number of search result pages needed to cover {@link #ENTRIES_TO_FETCH} entries. */
	private static final int PAGES_TO_LOAD = ENTRIES_TO_FETCH / ENTRIES_PER_PAGE;

	/** Name of the charset used to decode the downloaded pages. */
	private static final String PAGE_CHARSET = Charsets.ISO_8859_1.name();

	/**
	 * Searches for the given phrase, follows the book links found on the result pages and creates one
	 * {@link MetadataDownloadEntry} for each loaded book page that provides a title.
	 *
	 * @param phrase The search phrase. May be <code>null</code>, which yields an empty result.
	 * @return The found entries. Never <code>null</code>; the list is empty if nothing was found or if
	 *         loading failed with an {@link IOException} (the failure gets logged).
	 */
	@Override
	public List<MetadataDownloadEntry> search(String phrase) {
		if (phrase == null) {
			// nothing to search for; URLEncoder would fail with a NullPointerException otherwise.
			return new ArrayList<>();
		}

		try {
			List<URL> searchUrl = getSearchPageUrls(phrase);
			List<byte[]> pageHtmlContent = MetadataDownloadUtils.loadPages(searchUrl, PAGES_TO_LOAD);
			List<Document> htmlDocs = getDocuments(pageHtmlContent);
			Set<String> allLinks = getSearchResultLinks(htmlDocs);
			List<byte[]> metadataHtmlContent = loadLinkContent(allLinks);
			return getMetadataDownloadEntries(metadataHtmlContent);
		} catch (IOException e) {
			LoggerFactory.getLogger(this).log(Level.WARNING, "Failed to fetch metadata for search '" + phrase + "'", e);
			// hand out an empty, mutable list instead of null so callers can use the result unchecked.
			return new ArrayList<>();
		}
	}

	/**
	 * Parses each loaded book page and wraps it into a {@link GoogleBooksDeDownloadMetadataEntry}.
	 * <code>null</code> pages and entries without a title are left out.
	 *
	 * @param metadataHtmlContent The raw html bytes of the book pages. Single elements may be <code>null</code>.
	 * @return The entries which provide a non empty title.
	 * @throws IOException if parsing a page fails.
	 */
	private List<MetadataDownloadEntry> getMetadataDownloadEntries(List<byte[]> metadataHtmlContent) throws IOException {
		List<MetadataDownloadEntry> result = new ArrayList<>(metadataHtmlContent.size());
		for (byte[] html : metadataHtmlContent) {
			if (html == null) {
				continue;
			}
			Document htmlDoc = Jsoup.parse(new ByteArrayInputStream(html), PAGE_CHARSET, MAIN_URL);
			GoogleBooksDeDownloadMetadataEntry entry = new GoogleBooksDeDownloadMetadataEntry(htmlDoc);
			if (StringUtil.isNotEmpty(entry.getTitle())) {
				result.add(entry);
			}
		}
		return result;
	}

	/**
	 * Loads the content behind the given links.
	 *
	 * @param allLinks The links to load. A link that is no valid url gets logged and is handed to the
	 *        loader as <code>null</code>.
	 * @return The loaded content as delivered by {@link MetadataDownloadUtils#loadPages}.
	 * @throws IOException if loading fails.
	 */
	private List<byte[]> loadLinkContent(Set<String> allLinks) throws IOException {
		// NOTE(review): the meaning of the second loadPages argument (10) is not visible here - confirm.
		return MetadataDownloadUtils.loadPages(new TransformValueSet<String, URL>(allLinks) {

			@Override
			public URL transform(String link) {
				try {
					return new URL(link);
				} catch (MalformedURLException e) {
					LoggerFactory.getLogger(this).log(Level.SEVERE, "Failed to create url for " + link, e);
				}
				return null;
			}
		}, 10);
	}

	/**
	 * Collects the book links of all given search result pages.
	 *
	 * @param htmlDocs The parsed search result pages.
	 * @return The links in the order they were found, without duplicates.
	 */
	private Set<String> getSearchResultLinks(List<Document> htmlDocs) {
		Set<String> allLinks = new LinkedHashSet<>();
		for (Document document : htmlDocs) {
			allLinks.addAll(getSearchResultLinks(document));
		}
		return allLinks;
	}

	/**
	 * Extracts the book links from a single search result page. A link is taken if it is the first child
	 * element of a <code>h3</code> element, its <code>href</code> contains <code>books.google.</code> and
	 * does not contain <code>printsec=</code>. The <code>https://</code> scheme gets replaced by
	 * <code>http://</code>.
	 *
	 * @param doc The parsed search result page.
	 * @return The extracted links. Never <code>null</code>.
	 */
	private List<String> getSearchResultLinks(Document doc) {
		List<String> links = new ArrayList<>(ENTRIES_TO_FETCH);
		for (Element headline : doc.getElementsByTag("h3")) {
			Elements children = headline.children();
			if (children.isEmpty()) {
				// a headline without any child element can not carry a link. Asking for child(0)
				// would throw an IndexOutOfBoundsException and abort the whole search.
				continue;
			}

			Element link = children.first();
			if (!"a".equals(link.tagName())) {
				continue;
			}

			String href = link.attr("href");
			if (href != null && href.contains("books.google.") && !href.contains("printsec=")) {
				// plain text replacement, no regular expression needed.
				links.add(href.replace("https://", "http://"));
			}
		}
		return links;
	}

	/**
	 * Parses the given raw html pages. Pages that are <code>null</code> are skipped, the same way
	 * {@link #getMetadataDownloadEntries(List)} treats them.
	 *
	 * @param content The raw html bytes of the pages. Single elements may be <code>null</code>.
	 * @return One {@link Document} per non <code>null</code> page.
	 * @throws IOException if parsing a page fails.
	 */
	private List<Document> getDocuments(List<byte[]> content) throws IOException {
		List<Document> documents = new ArrayList<>(content.size());
		for (byte[] bs : content) {
			if (bs != null) {
				documents.add(Jsoup.parse(new ByteArrayInputStream(bs), PAGE_CHARSET, MAIN_URL));
			}
		}
		return documents;
	}

	/**
	 * Creates the urls of the search result pages for the given search term. One url per page is created,
	 * each with a <code>start</code> offset that is {@link #ENTRIES_PER_PAGE} larger than the previous one.
	 *
	 * @param searchTerm The search term. Gets url encoded with UTF-8.
	 * @return {@link #PAGES_TO_LOAD} search page urls.
	 * @throws UnsupportedEncodingException if the UTF-8 encoding is not supported.
	 * @throws MalformedURLException if no valid url could be created.
	 */
	private List<URL> getSearchPageUrls(String searchTerm) throws UnsupportedEncodingException, MalformedURLException {
		String encodedSearchPhrase = URLEncoder.encode(searchTerm, StringUtil.UTF_8);
		List<URL> urls = new ArrayList<>(PAGES_TO_LOAD);
		for (int page = 0; page < PAGES_TO_LOAD; page++) {
			String position = "&start=" + (page * ENTRIES_PER_PAGE);
			urls.add(new URL(QUERY_URL_PART + encodedSearchPhrase + position));
		}
		return urls;
	}
}