DoiResolution.java example

Explorer
jabref-master
- src
package org.jabref.logic.importer.fetcher;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.Optional;

import org.jabref.logic.importer.FulltextFetcher;
import org.jabref.logic.net.URLDownload;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.FieldName;
import org.jabref.model.entry.identifier.DOI;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * FulltextFetcher implementation that follows the DOI resolution redirects and scans for a full-text PDF URL.
 */
public class DoiResolution implements FulltextFetcher {
    private static final Log LOGGER = LogFactory.getLog(DoiResolution.class);

    @Override
    public Optional<URL> findFullText(BibEntry entry) throws IOException {
        Objects.requireNonNull(entry);
        Optional<URL> pdfLink = Optional.empty();

        Optional<DOI> doi = entry.getField(FieldName.DOI).flatMap(DOI::parse);

        if (doi.isPresent()) {
            String sciLink = doi.get().getURIAsASCIIString();

            // follow all redirects and scan for a single pdf link
            if (!sciLink.isEmpty()) {
                try {
                    Connection connection = Jsoup.connect(sciLink);
                    // pretend to be a browser (agent & referrer)
                    connection.userAgent(URLDownload.USER_AGENT);
                    connection.referrer("http://www.google.com");
                    connection.followRedirects(true);
                    connection.ignoreHttpErrors(true);
                    // some publishers are quite slow (default is 3s)
                    connection.timeout(5000);

                    Document html = connection.get();
                    // scan for PDF
                    Elements elements = html.body().select("a[href]");
                    List<Optional<URL>> links = new ArrayList<>();

                    for (Element element : elements) {
                        String href = element.attr("abs:href").toLowerCase(Locale.ENGLISH);
                        String hrefText = element.text().toLowerCase(Locale.ENGLISH);
                        // Only check if pdf is included in the link or inside the text
                        // ACM uses tokens without PDF inside the link
                        // See https://github.com/lehner/LocalCopy for more scrape ideas
                        if ((href.contains("pdf") || hrefText.contains("pdf")) && new URLDownload(href).isPdf()) {
                            links.add(Optional.of(new URL(href)));
                        }
                    }
                    // return if only one link was found (high accuracy)
                    if (links.size() == 1) {
                        LOGGER.info("Fulltext PDF found @ " + sciLink);
                        pdfLink = links.get(0);
                    }
                } catch (IOException e) {
                    LOGGER.warn("DoiResolution fetcher failed: ", e);
                }
            }
        }
        return pdfLink;
    }
}