package org.jabref.logic.importer.fetcher;

import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.jabref.logic.help.HelpFile;
import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.FulltextFetcher;
import org.jabref.logic.importer.IdBasedFetcher;
import org.jabref.logic.importer.IdFetcher;
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.SearchBasedFetcher;
import org.jabref.logic.importer.util.OAI2Handler;
import org.jabref.logic.util.io.XMLUtil;
import org.jabref.logic.util.strings.StringSimilarity;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.BibtexEntryTypes;
import org.jabref.model.entry.FieldName;
import org.jabref.model.entry.LinkedFile;
import org.jabref.model.entry.identifier.ArXivIdentifier;
import org.jabref.model.entry.identifier.DOI;
import org.jabref.model.strings.StringUtil;
import org.jabref.model.util.OptionalUtil;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.client.utils.URIBuilder;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;

/**
 * Fetcher for the arXiv.
 *
 * <p>Similar implementations:
 * <ul>
 *   <li><a href="https://github.com/nathangrigg/arxiv2bib">arxiv2bib</a> which is
 *       <a href="https://arxiv2bibtex.org/">live</a></li>
 *   <li><a href="https://gitlab.c3sl.ufpr.br/portalmec/dspace-portalmec/blob/aa209d15082a9870f9daac42c78a35490ce77b52/dspace-api/src/main/java/org/dspace/submit/lookup/ArXivService.java">dspace-portalmec</a></li>
 * </ul>
 *
 * @see <a href="http://arxiv.org/help/api/index">ArXiv API</a> for an overview of the API
 * @see <a href="http://arxiv.org/help/api/user-manual#_calling_the_api">ArXiv API User's Manual</a> for a detailed
 * description on how to use the API
 */
public class ArXiv implements FulltextFetcher, SearchBasedFetcher, IdBasedFetcher, IdFetcher<ArXivIdentifier> {

    private static final Log LOGGER = LogFactory.getLog(ArXiv.class);

    private static final String API_URL = "http://export.arxiv.org/api/query";

    /** Parser feature that rejects any document containing a DOCTYPE declaration. */
    private static final String DISALLOW_DOCTYPE_FEATURE = "http://apache.org/xml/features/disallow-doctype-decl";

    /** Largest {@code max_results} value accepted by {@link #callApi}. */
    private static final int MAX_RESULTS_LIMIT = 2000;

    private final ImportFormatPreferences importFormatPreferences;

    public ArXiv(ImportFormatPreferences importFormatPreferences) {
        this.importFormatPreferences = importFormatPreferences;
    }

    /**
     * Looks up the given entry on the arXiv and returns the URL of its PDF, if any.
     *
     * <p>A failing API request is logged and reported as "nothing found" instead of being propagated.
     *
     * @param entry the entry to find a full text for; must not be {@code null}
     * @return the URL of the first PDF link found, or an empty Optional
     */
    @Override
    public Optional<URL> findFullText(BibEntry entry) throws IOException {
        Objects.requireNonNull(entry);

        try {
            Optional<URL> pdfUrl = searchForEntries(entry).stream()
                    .map(ArXivEntry::getPdfUrl)
                    .filter(Optional::isPresent)
                    .map(Optional::get)
                    .findFirst();

            if (pdfUrl.isPresent()) {
                LOGGER.info("Fulltext PDF found @ arXiv.");
            }
            return pdfUrl;
        } catch (FetcherException e) {
            LOGGER.warn("arXiv API request failed", e);
        }

        return Optional.empty();
    }

    /**
     * Runs the search query and returns the first hit, if there is one.
     */
    private Optional<ArXivEntry> searchForEntry(String searchQuery) throws FetcherException {
        return queryApi(searchQuery, Collections.emptyList(), 0, 1).stream().findFirst();
    }

    /**
     * Fetches the article with the given arXiv identifier.
     *
     * @return the article, or an empty Optional if {@code id} cannot be parsed as an arXiv identifier or the
     * API returned no entry
     */
    private Optional<ArXivEntry> searchForEntryById(String id) throws FetcherException {
        Optional<ArXivIdentifier> identifier = ArXivIdentifier.parse(id);
        if (!identifier.isPresent()) {
            return Optional.empty();
        }

        return queryApi("", Collections.singletonList(identifier.get()), 0, 1).stream().findFirst();
    }

    /**
     * Tries to find the arXiv article corresponding to the given entry.
     *
     * <p>The eprint field is tried first. If it is blank, or the lookup by id fails, the arXiv is searched by
     * DOI or, when there is no parsable DOI, by author and title. A hit from that search is only accepted if
     * its title is similar to the title of the entry.
     *
     * @return a list with at most one element
     */
    private List<ArXivEntry> searchForEntries(BibEntry entry) throws FetcherException {
        // 1. Eprint
        Optional<String> identifier = entry.getField(FieldName.EPRINT);
        if (StringUtil.isNotBlank(identifier)) {
            try {
                // Get pdf of entry with the specified id
                return OptionalUtil.toList(searchForEntryById(identifier.get()));
            } catch (FetcherException e) {
                // Best effort: fall through to the DOI / author / title search below
                LOGGER.warn("arXiv eprint API request failed", e);
            }
        }

        // 2. DOI and other fields
        String query;

        Optional<String> doi = entry.getField(FieldName.DOI).flatMap(DOI::parse).map(DOI::getNormalized);
        if (doi.isPresent()) {
            // Search for an entry in the ArXiv which is linked to the doi
            query = "doi:" + doi.get();
        } else {
            Optional<String> authorQuery = entry.getField(FieldName.AUTHOR).map(author -> "au:" + author);
            Optional<String> titleQuery = entry.getField(FieldName.TITLE).map(title -> "ti:" + title);
            query = String.join("+AND+", OptionalUtil.toList(authorQuery, titleQuery));
        }

        if (query.isEmpty()) {
            // Neither DOI nor author nor title is available: there is nothing to search for, and an
            // unfiltered request could only produce an unrelated result.
            return Collections.emptyList();
        }

        Optional<ArXivEntry> arxivEntry = searchForEntry(query);

        if (arxivEntry.isPresent()) {
            // Check if entry is a match
            StringSimilarity match = new StringSimilarity();
            String arxivTitle = arxivEntry.get().title.orElse("");
            String entryTitle = entry.getField(FieldName.TITLE).orElse("");

            if (match.isSimilar(arxivTitle, entryTitle)) {
                return OptionalUtil.toList(arxivEntry);
            }
        }

        return Collections.emptyList();
    }

    /**
     * Runs a free-text search and returns up to ten results.
     */
    private List<ArXivEntry> searchForEntries(String searchQuery) throws FetcherException {
        return queryApi(searchQuery, Collections.emptyList(), 0, 10);
    }

    /**
     * Calls the API and converts every {@code entry} element of the response into an {@link ArXivEntry}.
     */
    private List<ArXivEntry> queryApi(String searchQuery, List<ArXivIdentifier> ids, int start, int maxResults)
            throws FetcherException {
        Document result = callApi(searchQuery, ids, start, maxResults);
        List<Node> entries = XMLUtil.asList(result.getElementsByTagName("entry"));

        return entries.stream().map(ArXivEntry::new).collect(Collectors.toList());
    }

    /**
     * Queries the API.
     *
     * If only {@code searchQuery} is given, then the API will return results for each article that matches the query.
     * If only {@code ids} is given, then the API will return results for each article in the list.
     * If both {@code searchQuery} and {@code ids} are given, then the API will return each article in
     * {@code ids} that matches {@code searchQuery}. This allows the API to act as a results filter.
     *
     * @param searchQuery the search query used to find articles;
     *                    <a href="http://arxiv.org/help/api/user-manual#query_details">details</a>
     * @param ids         a list of arXiv identifiers
     * @param start       the index of the first returned result (zero-based)
     * @param maxResults  the maximal number of results (at most 2000)
     * @return the response from the API as a XML document (Atom 1.0)
     * @throws FetcherException         if there was a problem while building the URL or the API was not accessible
     * @throws IllegalArgumentException if {@code maxResults} is larger than 2000
     */
    private Document callApi(String searchQuery, List<ArXivIdentifier> ids, int start, int maxResults)
            throws FetcherException {
        if (maxResults > MAX_RESULTS_LIMIT) {
            throw new IllegalArgumentException("The arXiv API limits the number of maximal results to be 2000");
        }

        try {
            URIBuilder uriBuilder = new URIBuilder(API_URL);
            // The arXiv API has problems with accents, so we remove them (e.g. Fréchet -> Frechet)
            if (StringUtil.isNotBlank(searchQuery)) {
                uriBuilder.addParameter("search_query", StringUtil.stripAccents(searchQuery));
            }
            if (!ids.isEmpty()) {
                uriBuilder.addParameter("id_list",
                        ids.stream().map(ArXivIdentifier::getNormalized).collect(Collectors.joining(",")));
            }
            uriBuilder.addParameter("start", String.valueOf(start));
            uriBuilder.addParameter("max_results", String.valueOf(maxResults));
            URL url = uriBuilder.build().toURL();

            DocumentBuilder builder = createDocumentBuilder();

            HttpURLConnection connection = (HttpURLConnection) url.openConnection();
            if (connection.getResponseCode() == HttpURLConnection.HTTP_BAD_REQUEST) {
                // Bad request error from server, try to get more information
                try (InputStream errorStream = connection.getErrorStream()) {
                    if (errorStream == null) {
                        // The server sent no body, so there are no details to extract
                        throw new FetcherException("arXiv API request failed");
                    }
                    throw getException(builder.parse(errorStream));
                }
            }

            try (InputStream response = connection.getInputStream()) {
                return builder.parse(response);
            }
        } catch (SAXException | ParserConfigurationException | IOException | URISyntaxException exception) {
            throw new FetcherException("arXiv API request failed", exception);
        }
    }

    /**
     * Creates the XML parser used for API responses.
     *
     * <p>The response arrives over the network, so DOCTYPE declarations are rejected; this prevents XML
     * external entity (XXE) processing.
     */
    private static DocumentBuilder createDocumentBuilder() throws ParserConfigurationException {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setFeature(DISALLOW_DOCTYPE_FEATURE, true);
        return factory.newDocumentBuilder();
    }

    /**
     * Builds the exception to throw for an error document returned by the API.
     *
     * @param error the parsed body of the error response
     * @return an exception carrying the error summary from the document if it has the expected error format,
     * otherwise an exception with a generic message
     */
    private FetcherException getException(Document error) {
        List<Node> entries = XMLUtil.asList(error.getElementsByTagName("entry"));

        // Check if the API returned an error
        // In case of an error, only one entry will be returned with the error information. For example:
        // http://export.arxiv.org/api/query?id_list=0307015
        // <entry>
        //      <id>http://arxiv.org/api/errors#incorrect_id_format_for_0307015</id>
        //      <title>Error</title>
        //      <summary>incorrect id format for 0307015</summary>
        // </entry>
        if (entries.size() == 1) {
            Node node = entries.get(0);
            boolean isError = XMLUtil.getNodeContent(node, "id")
                    .map(idContent -> idContent.startsWith("http://arxiv.org/api/errors"))
                    .orElse(false);
            if (isError) {
                String errorMessage = XMLUtil.getNodeContent(node, "summary").orElse("Unknown error");
                return new FetcherException(errorMessage);
            }
        }
        return new FetcherException("arXiv API request failed");
    }

    @Override
    public String getName() {
        return "ArXiv";
    }

    @Override
    public HelpFile getHelpPage() {
        return HelpFile.FETCHER_OAI2_ARXIV;
    }

    @Override
    public List<BibEntry> performSearch(String query) throws FetcherException {
        return searchForEntries(query).stream()
                .map(arXivEntry -> arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator()))
                .collect(Collectors.toList());
    }

    @Override
    public Optional<BibEntry> performSearchById(String identifier) throws FetcherException {
        return searchForEntryById(identifier)
                .map(arXivEntry -> arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator()));
    }

    @Override
    public Optional<ArXivIdentifier> findIdentifier(BibEntry entry) throws FetcherException {
        return searchForEntries(entry).stream()
                .map(ArXivEntry::getId)
                .filter(Optional::isPresent)
                .map(Optional::get)
                .findFirst();
    }

    @Override
    public String getIdentifierName() {
        return "ArXiv";
    }

    /**
     * The data of a single {@code entry} element of an API response.
     */
    private static class ArXivEntry {

        private final Optional<String> title;
        private final Optional<String> urlAbstractPage;
        private final Optional<String> publishedDate;
        private final Optional<String> abstractText;
        private final List<String> authorNames;
        private final List<String> categories;
        private final Optional<URL> pdfUrl;
        private final Optional<String> doi;
        private final Optional<String> journalReferenceText;
        private final Optional<String> primaryCategory;

        /**
         * Reads all supported fields from the given {@code entry} node.
         *
         * @param item an {@code entry} element of the Atom feed returned by the API
         */
        public ArXivEntry(Node item) {
            // see http://arxiv.org/help/api/user-manual#_details_of_atom_results_returned

            // Title of the article
            // The result from the arXiv contains hard line breaks, try to remove them
            title = XMLUtil.getNodeContent(item, "title").map(OAI2Handler::correctLineBreaks);

            // The url leading to the abstract page
            urlAbstractPage = XMLUtil.getNodeContent(item, "id");

            // Date on which the first version was published
            publishedDate = XMLUtil.getNodeContent(item, "published");

            // Abstract of the article
            abstractText = XMLUtil.getNodeContent(item, "summary")
                    .map(OAI2Handler::correctLineBreaks)
                    .map(String::trim);

            // Authors of the article
            authorNames = new ArrayList<>();
            for (Node authorNode : XMLUtil.getNodesByName(item, "author")) {
                XMLUtil.getNodeContent(authorNode, "name").map(String::trim).ifPresent(authorNames::add);
            }

            // Categories (arXiv, ACM, or MSC classification)
            categories = new ArrayList<>();
            for (Node categoryNode : XMLUtil.getNodesByName(item, "category")) {
                XMLUtil.getAttributeContent(categoryNode, "term").ifPresent(categories::add);
            }

            // Links: the one titled "pdf" points to the full text (the last such link wins)
            Optional<URL> pdfUrlParsed = Optional.empty();
            for (Node linkNode : XMLUtil.getNodesByName(item, "link")) {
                Optional<String> linkTitle = XMLUtil.getAttributeContent(linkNode, "title");
                if (linkTitle.equals(Optional.of("pdf"))) {
                    pdfUrlParsed = XMLUtil.getAttributeContent(linkNode, "href").flatMap(ArXivEntry::parseUrl);
                }
            }
            pdfUrl = pdfUrlParsed;

            // Associated DOI
            doi = XMLUtil.getNodeContent(item, "arxiv:doi");

            // Journal reference (as provided by the author)
            journalReferenceText = XMLUtil.getNodeContent(item, "arxiv:journal_ref");

            // Primary category
            // Ex: <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="math-ph" scheme="http://arxiv.org/schemas/atom"/>
            primaryCategory = XMLUtil.getNode(item, "arxiv:primary_category")
                    .flatMap(node -> XMLUtil.getAttributeContent(node, "term"));
        }

        /**
         * Converts the link into a URL; a malformed link is logged and treated as absent.
         */
        private static Optional<URL> parseUrl(String link) {
            try {
                return Optional.of(new URL(link));
            } catch (MalformedURLException e) {
                LOGGER.debug("Ignoring malformed PDF link returned by arXiv: " + link, e);
                return Optional.empty();
            }
        }

        /**
         * Returns the url of the linked pdf
         */
        public Optional<URL> getPdfUrl() {
            return pdfUrl;
        }

        /**
         * Returns the arXiv identifier
         */
        public Optional<String> getIdString() {
            // remove leading http://arxiv.org/abs/ from abstract url to get arXiv ID
            String prefix = "http://arxiv.org/abs/";
            return urlAbstractPage.map(abstractUrl ->
                    abstractUrl.startsWith(prefix) ? abstractUrl.substring(prefix.length()) : abstractUrl);
        }

        /**
         * Returns the arXiv identifier in parsed form, if {@link #getIdString()} yields a parsable identifier
         */
        public Optional<ArXivIdentifier> getId() {
            return getIdString().flatMap(ArXivIdentifier::parse);
        }

        /**
         * Returns the date when the first version was put on the arXiv
         */
        public Optional<String> getDate() {
            // Publication string also contains time, e.g. 2014-05-09T14:49:43Z
            // Anything shorter than the ten characters of the date part is dropped
            return publishedDate
                    .filter(date -> date.length() >= 10)
                    .map(date -> date.substring(0, 10));
        }

        /**
         * Converts this arXiv entry into an article {@link BibEntry}.
         *
         * @param keywordDelimiter the delimiter used when adding the arXiv categories as keywords
         */
        public BibEntry toBibEntry(Character keywordDelimiter) {
            BibEntry bibEntry = new BibEntry();
            bibEntry.setType(BibtexEntryTypes.ARTICLE);
            bibEntry.setField(FieldName.EPRINTTYPE, "arXiv");
            bibEntry.setField(FieldName.AUTHOR, String.join(" and ", authorNames));
            bibEntry.addKeywords(categories, keywordDelimiter);
            getIdString().ifPresent(id -> bibEntry.setField(FieldName.EPRINT, id));
            title.ifPresent(titleContent -> bibEntry.setField(FieldName.TITLE, titleContent));
            doi.ifPresent(doiContent -> bibEntry.setField(FieldName.DOI, doiContent));
            abstractText.ifPresent(abstractContent -> bibEntry.setField(FieldName.ABSTRACT, abstractContent));
            getDate().ifPresent(date -> bibEntry.setField(FieldName.DATE, date));
            primaryCategory.ifPresent(category -> bibEntry.setField(FieldName.EPRINTCLASS, category));
            journalReferenceText.ifPresent(journal -> bibEntry.setField(FieldName.JOURNALTITLE, journal));
            getPdfUrl().ifPresent(url -> bibEntry
                    .setFiles(Collections.singletonList(new LinkedFile("online", url, "PDF"))));
            return bibEntry;
        }
    }
}