package org.jabref.logic.importer.fetcher;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.jabref.logic.help.HelpFile;
import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.FulltextFetcher;
import org.jabref.logic.importer.IdBasedFetcher;
import org.jabref.logic.importer.IdFetcher;
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.SearchBasedFetcher;
import org.jabref.logic.importer.util.OAI2Handler;
import org.jabref.logic.util.io.XMLUtil;
import org.jabref.logic.util.strings.StringSimilarity;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.BibtexEntryTypes;
import org.jabref.model.entry.FieldName;
import org.jabref.model.entry.LinkedFile;
import org.jabref.model.entry.identifier.ArXivIdentifier;
import org.jabref.model.entry.identifier.DOI;
import org.jabref.model.strings.StringUtil;
import org.jabref.model.util.OptionalUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.client.utils.URIBuilder;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;
/**
* Fetcher for the arXiv.
*
* @see <a href="http://arxiv.org/help/api/index">ArXiv API</a> for an overview of the API
* @see <a href="http://arxiv.org/help/api/user-manual#_calling_the_api">ArXiv API User's Manual</a> for a detailed
* description on how to use the API
*
* Similar implementations:
* <a href="https://github.com/nathangrigg/arxiv2bib">arxiv2bib</a> which is <a href="https://arxiv2bibtex.org/">live</a>
* <a href="https://gitlab.c3sl.ufpr.br/portalmec/dspace-portalmec/blob/aa209d15082a9870f9daac42c78a35490ce77b52/dspace-api/src/main/java/org/dspace/submit/lookup/ArXivService.java">dspace-portalmec</a>
*/
public class ArXiv implements FulltextFetcher, SearchBasedFetcher, IdBasedFetcher, IdFetcher<ArXivIdentifier> {
private static final Log LOGGER = LogFactory.getLog(ArXiv.class);
private static final String API_URL = "http://export.arxiv.org/api/query";
private final ImportFormatPreferences importFormatPreferences;
/**
* Creates an arXiv fetcher.
*
* @param importFormatPreferences preferences whose keyword separator is used when search results are
* converted into {@link BibEntry} instances
*/
public ArXiv(ImportFormatPreferences importFormatPreferences) {
this.importFormatPreferences = importFormatPreferences;
}
/**
 * Tries to locate a PDF on the arXiv for the given entry.
 *
 * A failing API request is logged as a warning and reported as "nothing found" instead of being propagated.
 *
 * @param entry the entry to find a full text for; must not be {@code null}
 * @return the PDF link of the first matching arXiv record that has one, otherwise an empty Optional
 */
@Override
public Optional<URL> findFullText(BibEntry entry) throws IOException {
    Objects.requireNonNull(entry);

    List<ArXivEntry> candidates;
    try {
        candidates = searchForEntries(entry);
    } catch (FetcherException e) {
        LOGGER.warn("arXiv API request failed", e);
        return Optional.empty();
    }

    // Take the first candidate that actually carries a PDF link
    for (ArXivEntry candidate : candidates) {
        Optional<URL> link = candidate.getPdfUrl();
        if (link.isPresent()) {
            LOGGER.info("Fulltext PDF found @ arXiv.");
            return link;
        }
    }
    return Optional.empty();
}
/**
 * Runs a free-text query against the API, asking for at most one result.
 *
 * @param searchQuery the query in arXiv search syntax
 * @return the single record returned by the API, or an empty Optional if the result count is not exactly one
 * @throws FetcherException if the API request fails
 */
private Optional<ArXivEntry> searchForEntry(String searchQuery) throws FetcherException {
    List<ArXivEntry> hits = queryApi(searchQuery, Collections.emptyList(), 0, 1);
    if (hits.size() != 1) {
        return Optional.empty();
    }
    return Optional.of(hits.get(0));
}
/**
 * Fetches the record belonging to an arXiv identifier.
 *
 * @param id the identifier as text; if it cannot be parsed as an arXiv identifier no request is made
 * @return the first record returned by the API, or an empty Optional if the identifier is invalid or nothing
 * was returned
 * @throws FetcherException if the API request fails
 */
private Optional<ArXivEntry> searchForEntryById(String id) throws FetcherException {
    Optional<ArXivIdentifier> parsedId = ArXivIdentifier.parse(id);
    if (parsedId.isPresent()) {
        List<ArXivEntry> hits = queryApi("", Collections.singletonList(parsedId.get()), 0, 1);
        if (!hits.isEmpty()) {
            return Optional.of(hits.get(0));
        }
    }
    return Optional.empty();
}
/**
 * Finds the arXiv record that corresponds to the given entry.
 *
 * The lookup is attempted in this order:
 * <ol>
 * <li>by the eprint field, if it is not blank; a failing request is logged and the next step is tried</li>
 * <li>by DOI, if the DOI field can be parsed</li>
 * <li>by author and/or title</li>
 * </ol>
 * A record found via step 2 or 3 is only accepted if its title is similar to the title of the entry.
 * If the entry provides none of these fields, no request is sent.
 *
 * @param entry the entry to look up
 * @return a list with the matching record, or an empty list if there is no (sufficiently similar) match
 * @throws FetcherException if the DOI/author/title request fails
 */
private List<ArXivEntry> searchForEntries(BibEntry entry) throws FetcherException {
    // 1. Eprint
    Optional<String> identifier = entry.getField(FieldName.EPRINT);
    if (StringUtil.isNotBlank(identifier)) {
        try {
            // Get pdf of entry with the specified id
            return OptionalUtil.toList(searchForEntryById(identifier.get()));
        } catch (FetcherException e) {
            // Best effort: fall through to the DOI / author / title lookup
            LOGGER.warn("arXiv eprint API request failed", e);
        }
    }

    // 2. DOI and other fields
    String query;
    Optional<String> doi = entry.getField(FieldName.DOI).flatMap(DOI::parse).map(DOI::getNormalized);
    if (doi.isPresent()) {
        // Search for an entry in the ArXiv which is linked to the doi
        query = "doi:" + doi.get();
    } else {
        Optional<String> authorQuery = entry.getField(FieldName.AUTHOR).map(author -> "au:" + author);
        Optional<String> titleQuery = entry.getField(FieldName.TITLE).map(title -> "ti:" + title);
        query = String.join("+AND+", OptionalUtil.toList(authorQuery, titleQuery));
    }

    // Nothing to search for: a request without search_query and id_list cannot identify the entry
    if (!StringUtil.isNotBlank(query)) {
        return Collections.emptyList();
    }

    Optional<ArXivEntry> arxivEntry = searchForEntry(query);
    if (arxivEntry.isPresent()) {
        // Check if entry is a match
        StringSimilarity match = new StringSimilarity();
        String arxivTitle = arxivEntry.get().title.orElse("");
        String entryTitle = entry.getField(FieldName.TITLE).orElse("");
        if (match.isSimilar(arxivTitle, entryTitle)) {
            return OptionalUtil.toList(arxivEntry);
        }
    }
    return Collections.emptyList();
}
/**
 * Runs a free-text query against the API and returns up to ten records, starting at the first result.
 *
 * @param searchQuery the query in arXiv search syntax
 * @return the records returned by the API
 * @throws FetcherException if the API request fails
 */
private List<ArXivEntry> searchForEntries(String searchQuery) throws FetcherException {
    List<ArXivIdentifier> noIdRestriction = Collections.emptyList();
    return queryApi(searchQuery, noIdRestriction, 0, 10);
}
/**
 * Calls the API and converts every {@code entry} element of the response into an {@link ArXivEntry}.
 *
 * @param searchQuery the search query, may be blank
 * @param ids         arXiv identifiers to restrict the request to, may be empty
 * @param start       zero-based index of the first result
 * @param maxResults  maximal number of results
 * @return the parsed records in the order they appear in the response
 * @throws FetcherException if the API request fails
 */
private List<ArXivEntry> queryApi(String searchQuery, List<ArXivIdentifier> ids, int start, int maxResults)
        throws FetcherException {
    Document response = callApi(searchQuery, ids, start, maxResults);
    List<ArXivEntry> parsed = new ArrayList<>();
    for (Node entryNode : XMLUtil.asList(response.getElementsByTagName("entry"))) {
        parsed.add(new ArXivEntry(entryNode));
    }
    return parsed;
}
/**
 * Queries the API.
 *
 * If only {@code searchQuery} is given, then the API will return results for each article that matches the query.
 * If only {@code ids} is given, then the API will return results for each article in the list.
 * If both {@code searchQuery} and {@code ids} are given, then the API will return each article in
 * {@code ids} that matches {@code searchQuery}. This allows the API to act as a results filter.
 *
 * The response is parsed with external entity resolution switched off, and the response stream is always
 * closed, also when parsing fails.
 *
 * @param searchQuery the search query used to find articles;
 *                    <a href="http://arxiv.org/help/api/user-manual#query_details">details</a>
 * @param ids         a list of arXiv identifiers
 * @param start       the index of the first returned result (zero-based)
 * @param maxResults  the number of maximal results (must not exceed 2000)
 * @return the response from the API as a XML document (Atom 1.0)
 * @throws FetcherException         if there was a problem while building the URL, the API was not accessible,
 *                                  the response could not be parsed, or the API answered with "400 Bad Request"
 * @throws IllegalArgumentException if {@code maxResults} is larger than 2000
 */
private Document callApi(String searchQuery, List<ArXivIdentifier> ids, int start, int maxResults) throws FetcherException {
    if (maxResults > 2000) {
        throw new IllegalArgumentException("The arXiv API limits the number of maximal results to be 2000");
    }

    try {
        URIBuilder uriBuilder = new URIBuilder(API_URL);
        // The arXiv API has problems with accents, so we remove them (i.e. Fréchet -> Frechet)
        if (StringUtil.isNotBlank(searchQuery)) {
            uriBuilder.addParameter("search_query", StringUtil.stripAccents(searchQuery));
        }
        if (!ids.isEmpty()) {
            uriBuilder.addParameter("id_list",
                    ids.stream().map(ArXivIdentifier::getNormalized).collect(Collectors.joining(",")));
        }
        uriBuilder.addParameter("start", String.valueOf(start));
        uriBuilder.addParameter("max_results", String.valueOf(maxResults));
        URL url = uriBuilder.build().toURL();

        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        // The response travels over plain HTTP: never resolve external entities from it (XXE)
        factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
        factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        DocumentBuilder builder = factory.newDocumentBuilder();

        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        if (connection.getResponseCode() == 400) {
            // Bad request error from server, try to get more information
            try (InputStream errorStream = connection.getErrorStream()) {
                if (errorStream == null) {
                    // The server sent no error body, so there are no details to extract
                    throw new FetcherException("arXiv API request failed");
                }
                throw getException(builder.parse(errorStream));
            }
        }
        try (InputStream responseStream = connection.getInputStream()) {
            return builder.parse(responseStream);
        }
    } catch (SAXException | ParserConfigurationException | IOException | URISyntaxException exception) {
        throw new FetcherException("arXiv API request failed", exception);
    }
}
/**
 * Turns the body of an error response into an exception.
 *
 * In case of an error, the API returns exactly one entry carrying the error information. For example,
 * http://export.arxiv.org/api/query?id_list=0307015 yields
 * <pre>
 * &lt;entry&gt;
 *   &lt;id&gt;http://arxiv.org/api/errors#incorrect_id_format_for_0307015&lt;/id&gt;
 *   &lt;title&gt;Error&lt;/title&gt;
 *   &lt;summary&gt;incorrect id format for 0307015&lt;/summary&gt;
 * &lt;/entry&gt;
 * </pre>
 *
 * @param error the parsed error response
 * @return an exception carrying the summary of the error entry, or a generic failure message if the document
 * does not have the expected shape
 */
private FetcherException getException(Document error) {
    List<Node> entries = XMLUtil.asList(error.getElementsByTagName("entry"));
    if (entries.size() != 1) {
        return new FetcherException("arXiv API request failed");
    }

    Node errorNode = entries.get(0);
    boolean describesError = XMLUtil.getNodeContent(errorNode, "id")
            .filter(idContent -> idContent.startsWith("http://arxiv.org/api/errors"))
            .isPresent();
    if (!describesError) {
        return new FetcherException("arXiv API request failed");
    }

    return new FetcherException(XMLUtil.getNodeContent(errorNode, "summary").orElse("Unknown error"));
}
/**
* Returns the name of this fetcher.
*
* @return the constant string {@code "ArXiv"}
*/
@Override
public String getName() {
return "ArXiv";
}
/**
* Returns the help page associated with this fetcher.
*
* @return {@link HelpFile#FETCHER_OAI2_ARXIV}
*/
@Override
public HelpFile getHelpPage() {
return HelpFile.FETCHER_OAI2_ARXIV;
}
/**
 * Searches the arXiv with a free-text query and converts the results into entries.
 *
 * @param query the query in arXiv search syntax
 * @return one {@link BibEntry} per record returned by the API
 * @throws FetcherException if the API request fails
 */
@Override
public List<BibEntry> performSearch(String query) throws FetcherException {
    List<BibEntry> converted = new ArrayList<>();
    for (ArXivEntry hit : searchForEntries(query)) {
        converted.add(hit.toBibEntry(importFormatPreferences.getKeywordSeparator()));
    }
    return converted;
}
/**
 * Fetches the record with the given arXiv identifier and converts it into an entry.
 *
 * @param identifier the arXiv identifier as text
 * @return the converted record, or an empty Optional if nothing was found for the identifier
 * @throws FetcherException if the API request fails
 */
@Override
public Optional<BibEntry> performSearchById(String identifier) throws FetcherException {
    Optional<ArXivEntry> hit = searchForEntryById(identifier);
    if (!hit.isPresent()) {
        return Optional.empty();
    }
    return Optional.of(hit.get().toBibEntry(importFormatPreferences.getKeywordSeparator()));
}
/**
 * Determines the arXiv identifier of the given entry by looking the entry up on the arXiv.
 *
 * @param entry the entry to find the identifier for
 * @return the identifier of the first matching record that has a parsable one, otherwise an empty Optional
 * @throws FetcherException if the API request fails
 */
@Override
public Optional<ArXivIdentifier> findIdentifier(BibEntry entry) throws FetcherException {
    for (ArXivEntry candidate : searchForEntries(entry)) {
        Optional<ArXivIdentifier> candidateId = candidate.getId();
        if (candidateId.isPresent()) {
            return candidateId;
        }
    }
    return Optional.empty();
}
/**
* Returns the name of the identifier this fetcher looks up.
*
* @return the constant string {@code "ArXiv"}
*/
@Override
public String getIdentifierName() {
return "ArXiv";
}
private static class ArXivEntry {
private final Optional<String> title;
private final Optional<String> urlAbstractPage;
private final Optional<String> publishedDate;
private final Optional<String> abstractText;
private final List<String> authorNames;
private final List<String> categories;
private final Optional<URL> pdfUrl;
private final Optional<String> doi;
private final Optional<String> journalReferenceText;
private final Optional<String> primaryCategory;
/**
 * Reads one {@code entry} element of the Atom response.
 *
 * @param item the {@code entry} node
 * @see <a href="http://arxiv.org/help/api/user-manual#_details_of_atom_results_returned">Details of Atom results</a>
 */
public ArXivEntry(Node item) {
    // Title and abstract come with hard line breaks from the arXiv, try to remove them
    title = XMLUtil.getNodeContent(item, "title").map(OAI2Handler::correctLineBreaks);
    abstractText = XMLUtil.getNodeContent(item, "summary")
            .map(OAI2Handler::correctLineBreaks)
            .map(String::trim);

    // The id element holds the url leading to the abstract page
    urlAbstractPage = XMLUtil.getNodeContent(item, "id");

    // Date on which the first version was published
    publishedDate = XMLUtil.getNodeContent(item, "published");

    // Authors of the article
    authorNames = new ArrayList<>();
    for (Node authorNode : XMLUtil.getNodesByName(item, "author")) {
        XMLUtil.getNodeContent(authorNode, "name").map(String::trim).ifPresent(authorNames::add);
    }

    // Categories (arXiv, ACM, or MSC classification)
    categories = new ArrayList<>();
    for (Node categoryNode : XMLUtil.getNodesByName(item, "category")) {
        XMLUtil.getAttributeContent(categoryNode, "term").ifPresent(categories::add);
    }

    // Links: the last link titled "pdf" decides, even if its href is missing or malformed
    Optional<URL> lastPdfLink = Optional.empty();
    for (Node linkNode : XMLUtil.getNodesByName(item, "link")) {
        boolean isPdfLink = XMLUtil.getAttributeContent(linkNode, "title").filter("pdf"::equals).isPresent();
        if (isPdfLink) {
            lastPdfLink = XMLUtil.getAttributeContent(linkNode, "href").map(ArXivEntry::toUrlOrNull);
        }
    }
    pdfUrl = lastPdfLink;

    // Associated DOI
    doi = XMLUtil.getNodeContent(item, "arxiv:doi");

    // Journal reference (as provided by the author)
    journalReferenceText = XMLUtil.getNodeContent(item, "arxiv:journal_ref");

    // Primary category
    // Ex: <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="math-ph" scheme="http://arxiv.org/schemas/atom"/>
    primaryCategory = XMLUtil.getNode(item, "arxiv:primary_category")
            .flatMap(categoryNode -> XMLUtil.getAttributeContent(categoryNode, "term"));
}

/**
 * Parses a link; returns {@code null} for a malformed one so that {@link Optional#map} yields an empty Optional.
 */
private static URL toUrlOrNull(String url) {
    try {
        return new URL(url);
    } catch (MalformedURLException e) {
        return null;
    }
}
/**
* Returns the url of the linked pdf.
*
* @return the pdf link stored for this record when it was constructed, or an empty Optional if there is none
*/
public Optional<URL> getPdfUrl() {
return pdfUrl;
}
/**
 * Returns the arXiv identifier as text.
 *
 * The identifier is derived from the url of the abstract page by cutting off a leading
 * {@code http://arxiv.org/abs/}; a url that does not start with this prefix is returned unchanged.
 *
 * @return the identifier text, or an empty Optional if the record has no abstract page url
 */
public Optional<String> getIdString() {
    final String abstractPagePrefix = "http://arxiv.org/abs/";
    return urlAbstractPage.map(abstractUrl -> abstractUrl.startsWith(abstractPagePrefix)
            ? abstractUrl.substring(abstractPagePrefix.length())
            : abstractUrl);
}
/**
 * Returns the arXiv identifier in parsed form.
 *
 * @return the result of parsing {@link #getIdString()}, or an empty Optional if there is no identifier text
 * or it cannot be parsed
 */
public Optional<ArXivIdentifier> getId() {
    Optional<String> rawId = getIdString();
    return rawId.flatMap(ArXivIdentifier::parse);
}
/**
 * Returns the date when the first version was put on the arXiv.
 *
 * The publication string also contains the time (e.g. 2014-05-09T14:49:43Z), so only its first ten
 * characters are returned.
 *
 * @return the leading ten characters of the publication string, or an empty Optional if the string is missing
 * or shorter than ten characters
 */
public Optional<String> getDate() {
    return publishedDate
            .filter(timestamp -> timestamp.length() >= 10)
            .map(timestamp -> timestamp.substring(0, 10));
}
/**
 * Converts this record into an article entry.
 *
 * The eprint type is always set to "arXiv" and the authors are joined with " and ". All other fields, the
 * keywords and the linked pdf are only filled in if the record provides the corresponding information.
 *
 * @param keywordDelimiter the delimiter used when adding the categories as keywords
 * @return a new entry describing this record
 */
public BibEntry toBibEntry(Character keywordDelimiter) {
    BibEntry result = new BibEntry();
    result.setType(BibtexEntryTypes.ARTICLE);
    result.setField(FieldName.EPRINTTYPE, "arXiv");

    String joinedAuthors = String.join(" and ", authorNames);
    result.setField(FieldName.AUTHOR, joinedAuthors);
    result.addKeywords(categories, keywordDelimiter);

    Optional<String> eprintId = getIdString();
    if (eprintId.isPresent()) {
        result.setField(FieldName.EPRINT, eprintId.get());
    }
    if (title.isPresent()) {
        result.setField(FieldName.TITLE, title.get());
    }
    if (doi.isPresent()) {
        result.setField(FieldName.DOI, doi.get());
    }
    if (abstractText.isPresent()) {
        result.setField(FieldName.ABSTRACT, abstractText.get());
    }
    Optional<String> firstVersionDate = getDate();
    if (firstVersionDate.isPresent()) {
        result.setField(FieldName.DATE, firstVersionDate.get());
    }
    if (primaryCategory.isPresent()) {
        result.setField(FieldName.EPRINTCLASS, primaryCategory.get());
    }
    if (journalReferenceText.isPresent()) {
        result.setField(FieldName.JOURNALTITLE, journalReferenceText.get());
    }
    Optional<URL> pdfLink = getPdfUrl();
    if (pdfLink.isPresent()) {
        result.setFiles(Collections.singletonList(new LinkedFile("online", pdfLink.get(), "PDF")));
    }
    return result;
}
}
}