GoogleScholar.java example

Explorer
jabref-master
- src
package org.jabref.logic.importer.fetcher;

import java.io.IOException;
import java.io.StringReader;
import java.net.HttpCookie;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jabref.logic.help.HelpFile;
import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.FulltextFetcher;
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.ParserResult;
import org.jabref.logic.importer.SearchBasedFetcher;
import org.jabref.logic.importer.fileformat.BibtexParser;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.net.URLDownload;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.FieldName;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.client.utils.URIBuilder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * FulltextFetcher implementation that attempts to find a PDF URL at GoogleScholar.
 */
public class GoogleScholar implements FulltextFetcher, SearchBasedFetcher {
    private static final Log LOGGER = LogFactory.getLog(GoogleScholar.class);

    private static final Pattern LINK_TO_BIB_PATTERN = Pattern.compile("(https:\\/\\/scholar.googleusercontent.com\\/scholar.bib[^\"]*)");

    private static final String BASIC_SEARCH_URL = "https://scholar.google.com/scholar?";
    private static final String SEARCH_IN_TITLE_URL = "https://scholar.google.com//scholar?";

    private static final int NUM_RESULTS = 10;

    private final ImportFormatPreferences importFormatPreferences;

    public GoogleScholar(ImportFormatPreferences importFormatPreferences) {
        Objects.requireNonNull(importFormatPreferences);

        this.importFormatPreferences = importFormatPreferences;
    }

    @Override
    public Optional<URL> findFullText(BibEntry entry) throws IOException, FetcherException {
        Objects.requireNonNull(entry);
        Optional<URL> pdfLink = Optional.empty();

        // Search in title
        if (!entry.hasField(FieldName.TITLE)) {
            return pdfLink;
        }

        try {
            URIBuilder uriBuilder = new URIBuilder(SEARCH_IN_TITLE_URL);
            uriBuilder.addParameter("as_q", "");
            uriBuilder.addParameter("as_epq", entry.getField(FieldName.TITLE).orElse(null));
            uriBuilder.addParameter("as_occt", "title");

            Document doc = Jsoup.connect(uriBuilder.toString()).userAgent(URLDownload.USER_AGENT).get();
            // Check results for PDF link
            // TODO: link always on first result or none?
            for (int i = 0; i < NUM_RESULTS; i++) {
                Elements link = doc.select(String.format("#gs_ggsW%s a", i));

                if (link.first() != null) {
                    String s = link.first().attr("href");
                    // link present?
                    if (!"".equals(s)) {
                        // TODO: check title inside pdf + length?
                        // TODO: report error function needed?! query -> result
                        LOGGER.info("Fulltext PDF found @ Google: " + s);
                        pdfLink = Optional.of(new URL(s));
                        break;
                    }
                }
            }
        } catch (URISyntaxException e) {
            throw new FetcherException("Building URI failed.", e);
        }

        return pdfLink;
    }

    @Override
    public String getName() {
        return "Google Scholar";
    }

    @Override
    public HelpFile getHelpPage() {
        return HelpFile.FETCHER_GOOGLE_SCHOLAR;
    }

    @Override
    public List<BibEntry> performSearch(String query) throws FetcherException {
        try {
            obtainAndModifyCookie();
            List<BibEntry> foundEntries = new ArrayList<>(10);

            URIBuilder uriBuilder = new URIBuilder(BASIC_SEARCH_URL);
            uriBuilder.addParameter("hl", "en");
            uriBuilder.addParameter("btnG", "Search");
            uriBuilder.addParameter("q", query);

            addHitsFromQuery(foundEntries, uriBuilder.toString());

            if (foundEntries.size() == 10) {
                uriBuilder.addParameter("start", "10");
                addHitsFromQuery(foundEntries, uriBuilder.toString());
            }

            return foundEntries;
        } catch (URISyntaxException e) {
            throw new FetcherException("Error while fetching from " + getName(), e);
        } catch (IOException e) {
            // if there are too much requests from the same IP adress google is answering with a 503 and redirecting to a captcha challenge
            // The caught IOException looks for example like this:
            // java.io.IOException: Server returned HTTP response code: 503 for URL: https://ipv4.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3Dbpmn&hl=en&q=CGMSBI0NBDkYuqy9wAUiGQDxp4NLQCWbIEY1HjpH5zFJhv4ANPGdWj0
            if (e.getMessage().contains("Server returned HTTP response code: 503 for URL")) {
                throw new FetcherException("Fetching from Google Scholar failed.",
                        Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), e);
            } else {
                throw new FetcherException("Error while fetching from " + getName(), e);
            }
        }
    }

    private void addHitsFromQuery(List<BibEntry> entryList, String queryURL) throws IOException, FetcherException {
        String content = new URLDownload(queryURL).asString();

        Matcher matcher = LINK_TO_BIB_PATTERN.matcher(content);
        while (matcher.find()) {
            String citationsPageURL = matcher.group().replace("&", "&");
            BibEntry newEntry = downloadEntry(citationsPageURL);
            entryList.add(newEntry);
        }
    }

    private BibEntry downloadEntry(String link) throws IOException, FetcherException {
        String downloadedContent = new URLDownload(link).asString();
        BibtexParser parser = new BibtexParser(importFormatPreferences);
        ParserResult result = parser.parse(new StringReader(downloadedContent));
        if ((result == null) || (result.getDatabase() == null)) {
            throw new FetcherException("Parsing entries from Google Scholar bib file failed.");
        } else {
            Collection<BibEntry> entries = result.getDatabase().getEntries();
            if (entries.size() != 1) {
                LOGGER.debug(entries.size() + " entries found! (" + link + ")");
                throw new FetcherException("Parsing entries from Google Scholar bib file failed.");
            } else {
                BibEntry entry = entries.iterator().next();
                return entry;
            }
        }
    }

    private void obtainAndModifyCookie() throws FetcherException {
        try {
            URLDownload downloader = new URLDownload("https://scholar.google.com");
            List<HttpCookie> cookies = downloader.getCookieFromUrl();
            for (HttpCookie cookie : cookies) {
                // append "CF=4" which represents "Citation format bibtex"
                cookie.setValue(cookie.getValue() + ":CF=4");
            }
        } catch (IOException e) {
            throw new FetcherException("Cookie configuration for Google Scholar failed.", e);
        }
    }
}