package org.jabref.gui.importer.fetcher;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.Locale;
import java.util.Optional;
import javax.swing.JOptionPane;
import javax.swing.JPanel;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.jabref.gui.importer.ImportInspectionDialog;
import org.jabref.logic.help.HelpFile;
import org.jabref.logic.importer.ImportInspector;
import org.jabref.logic.importer.OutputPrinter;
import org.jabref.logic.importer.util.OAI2Handler;
import org.jabref.logic.l10n.Localization;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.FieldName;
import org.jabref.model.entry.Month;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
*
* This class can be used to access any archive offering an OAI2 interface. By
* default it will access ArXiv.org
*
* @see <a href="http://arxiv.org/help/oa/index">arXiv Open Archives Initiative (OAI) interface documentation</a>
*
* @author Ulrich Stärk
* @author Christian Kopf
*/
public class OAI2Fetcher implements EntryFetcher {
private static final Log LOGGER = LogFactory.getLog(OAI2Fetcher.class);
private static final String OAI2_ARXIV_PREFIXIDENTIFIER = "oai%3AarXiv.org%3A";
private static final String OAI2_ARXIV_HOST = "export.arxiv.org";
private static final String OAI2_ARXIV_SCRIPT = "oai2";
private static final String OAI2_ARXIV_METADATAPREFIX = "arXiv";
private static final String OAI2_ARXIV_ARCHIVENAME = "ArXiv.org";
private static final String OAI2_IDENTIFIER_FIELD = "oai2identifier";
private SAXParser saxParser;
private final String oai2Host;
private final String oai2Script;
private final String oai2MetaDataPrefix;
private final String oai2PrefixIdentifier;
private final String oai2ArchiveName;
private boolean shouldContinue = true;
private long waitTime = -1;
private Date lastCall;
/**
*
*
* @param oai2Host
* the host to query without leading http:// and without trailing /
* @param oai2Script
* the relative location of the oai2 interface without leading
* and trailing /
* @param oai2Metadataprefix
* the urlencoded metadataprefix
* @param oai2Prefixidentifier
* the urlencoded prefix identifier
* @param waitTimeMs
* Time to wait in milliseconds between query-requests.
*/
public OAI2Fetcher(String oai2Host, String oai2Script, String oai2Metadataprefix, String oai2Prefixidentifier,
String oai2ArchiveName, long waitTimeMs) {
this.oai2Host = oai2Host;
this.oai2Script = oai2Script;
this.oai2MetaDataPrefix = oai2Metadataprefix;
this.oai2PrefixIdentifier = oai2Prefixidentifier;
this.oai2ArchiveName = oai2ArchiveName;
this.waitTime = waitTimeMs;
try {
SAXParserFactory parserFactory = SAXParserFactory.newInstance();
saxParser = parserFactory.newSAXParser();
} catch (ParserConfigurationException | SAXException e) {
LOGGER.error("Error creating SAXParser for OAI2Fetcher", e);
}
}
/**
* Default Constructor. The archive queried will be ArXiv.org
*
*/
public OAI2Fetcher() {
this(OAI2Fetcher.OAI2_ARXIV_HOST, OAI2Fetcher.OAI2_ARXIV_SCRIPT, OAI2Fetcher.OAI2_ARXIV_METADATAPREFIX,
OAI2Fetcher.OAI2_ARXIV_PREFIXIDENTIFIER, OAI2Fetcher.OAI2_ARXIV_ARCHIVENAME, 20000L);
}
/**
* Construct the query URL
*
* @param key
* The key of the OAI2 entry that the url should point to.
*
* @return a String denoting the query URL
*/
public String constructUrl(String key) {
String identifier;
try {
identifier = URLEncoder.encode(key, StandardCharsets.UTF_8.name());
} catch (UnsupportedEncodingException e) {
return "";
}
return "http://" + oai2Host + "/" + oai2Script + "?" + "verb=GetRecord" + "&identifier=" + oai2PrefixIdentifier
+ identifier + "&metadataPrefix=" + oai2MetaDataPrefix;
}
/**
* some archives - like ArXiv.org - might expect of you to wait some time
*/
private boolean shouldWait() {
return waitTime > 0;
}
/**
* Strip subcategories from ArXiv key.
*
* @param key The key to fix.
* @return Fixed key.
*/
public static String fixKey(String key) {
String resultingKey = key;
if (resultingKey.toLowerCase(Locale.ENGLISH).startsWith("arxiv:")) {
resultingKey = resultingKey.substring(6);
}
int dot = resultingKey.indexOf('.');
int slash = resultingKey.indexOf('/');
if ((dot > -1) && (dot < slash)) {
resultingKey = resultingKey.substring(0, dot) + resultingKey.substring(slash, resultingKey.length());
}
return resultingKey;
}
/**
* Import an entry from an OAI2 archive. The BibEntry provided has to
* have the field OAI2_IDENTIFIER_FIELD set to the search string.
*
* @param key
* The OAI2 key to fetch from ArXiv.
* @return The imported BibEntry or null if none.
*/
protected BibEntry importOai2Entry(String key) throws IOException, SAXException {
/**
* Fix for problem reported in mailing-list:
* https://sourceforge.net/forum/message.php?msg_id=4087158
*/
String fixedKey = OAI2Fetcher.fixKey(key);
String url = constructUrl(fixedKey);
URL oai2Url = new URL(url);
HttpURLConnection oai2Connection = (HttpURLConnection) oai2Url.openConnection();
oai2Connection.setRequestProperty("User-Agent", "JabRef");
/* create an empty BibEntry and set the oai2identifier field */
BibEntry entry = new BibEntry("article");
entry.setField(OAI2Fetcher.OAI2_IDENTIFIER_FIELD, fixedKey);
DefaultHandler handlerBase = new OAI2Handler(entry);
try (InputStream inputStream = oai2Connection.getInputStream()) {
/* parse the result */
saxParser.parse(inputStream, handlerBase);
/* Correct line breaks and spacing */
for (String name : entry.getFieldNames()) {
entry.getField(name)
.ifPresent(content -> entry.setField(name, OAI2Handler.correctLineBreaks(content)));
}
if (fixedKey.matches("\\d\\d\\d\\d\\..*")) {
entry.setField(FieldName.YEAR, "20" + fixedKey.substring(0, 2));
int monthNumber = Integer.parseInt(fixedKey.substring(2, 4));
Optional<Month> month = Month.getMonthByNumber(monthNumber);
month.ifPresent(entry::setMonth);
}
}
return entry;
}
@Override
public HelpFile getHelpPage() {
return HelpFile.FETCHER_OAI2_ARXIV;
}
@Override
public JPanel getOptionsPanel() {
// we have no additional options
return null;
}
@Override
public String getTitle() {
return "ArXiv.org";
}
@Override
public boolean processQuery(String query, ImportInspector dialog, OutputPrinter status) {
try {
shouldContinue = true;
/* multiple keys can be delimited by ; or space */
String[] keys = query.replace(" ", ";").split(";");
for (int i = 0; i < keys.length; i++) {
String key = keys[i];
/*
* some archives - like arxive.org - might expect of you to wait
* some time
*/
if (shouldWait() && (lastCall != null)) {
long elapsed = new Date().getTime() - lastCall.getTime();
while (elapsed < waitTime) {
status.setStatus(
Localization.lang("Waiting for ArXiv...") + ((waitTime - elapsed) / 1000) + " s");
Thread.sleep(1000);
elapsed = new Date().getTime() - lastCall.getTime();
}
}
status.setStatus(Localization.lang("Processing %0", key));
/* the cancel button has been hit */
if (!shouldContinue) {
break;
}
/* query the archive and load the results into the BibEntry */
BibEntry be = null;
try {
be = importOai2Entry(key);
} catch (SAXException e) {
String url = constructUrl(OAI2Fetcher.fixKey(key));
LOGGER.error("Error while fetching from " + getTitle(), e);
((ImportInspectionDialog)dialog).showMessage(Localization.lang("Error while fetching from %0", getTitle()) + "\n" +
Localization.lang("A SAX exception occurred while parsing '%0':", url),
Localization.lang("Search %0", getTitle()), JOptionPane.ERROR_MESSAGE);
}
if (shouldWait()) {
lastCall = new Date();
}
/* add the entry to the inspection dialog */
if (be != null) {
dialog.addEntry(be);
}
/* update the dialogs progress bar */
dialog.setProgress(i + 1, keys.length);
}
return true;
} catch (IOException | InterruptedException e) {
LOGGER.error("Error while fetching from " + getTitle(), e);
((ImportInspectionDialog)dialog).showErrorMessage(this.getTitle(), e.getLocalizedMessage());
}
return false;
}
@Override
public void stopFetching() {
shouldContinue = false;
}
}