package org.jabref.logic.importer.fetcher; import java.io.IOException; import java.net.URL; import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.Objects; import java.util.Optional; import org.jabref.logic.importer.FulltextFetcher; import org.jabref.logic.net.URLDownload; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.FieldName; import org.jabref.model.entry.identifier.DOI; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * FulltextFetcher implementation that follows the DOI resolution redirects and scans for a full-text PDF URL. */ public class DoiResolution implements FulltextFetcher { private static final Log LOGGER = LogFactory.getLog(DoiResolution.class); @Override public Optional<URL> findFullText(BibEntry entry) throws IOException { Objects.requireNonNull(entry); Optional<URL> pdfLink = Optional.empty(); Optional<DOI> doi = entry.getField(FieldName.DOI).flatMap(DOI::parse); if (doi.isPresent()) { String sciLink = doi.get().getURIAsASCIIString(); // follow all redirects and scan for a single pdf link if (!sciLink.isEmpty()) { try { Connection connection = Jsoup.connect(sciLink); // pretend to be a browser (agent & referrer) connection.userAgent(URLDownload.USER_AGENT); connection.referrer("http://www.google.com"); connection.followRedirects(true); connection.ignoreHttpErrors(true); // some publishers are quite slow (default is 3s) connection.timeout(5000); Document html = connection.get(); // scan for PDF Elements elements = html.body().select("a[href]"); List<Optional<URL>> links = new ArrayList<>(); for (Element element : elements) { String href = element.attr("abs:href").toLowerCase(Locale.ENGLISH); String hrefText = element.text().toLowerCase(Locale.ENGLISH); // Only check if pdf is included in the link or inside the text // ACM uses tokens without PDF inside the link // See https://github.com/lehner/LocalCopy for more scrape ideas if ((href.contains("pdf") || hrefText.contains("pdf")) && new URLDownload(href).isPdf()) { links.add(Optional.of(new URL(href))); } } // return if only one link was found (high accuracy) if (links.size() == 1) { LOGGER.info("Fulltext PDF found @ " + sciLink); pdfLink = links.get(0); } } catch (IOException e) { LOGGER.warn("DoiResolution fetcher failed: ", e); } } } return pdfLink; } }