package org.jabref.gui.importer.fetcher;
import java.awt.BorderLayout;
import java.io.IOException;
import java.net.CookieHandler;
import java.net.CookieManager;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.JCheckBox;
import javax.swing.JOptionPane;
import javax.swing.JPanel;
import org.jabref.Globals;
import org.jabref.gui.importer.ImportInspectionDialog;
import org.jabref.logic.formatter.bibtexfields.HtmlToLatexFormatter;
import org.jabref.logic.formatter.bibtexfields.UnitsToLatexFormatter;
import org.jabref.logic.formatter.casechanger.ProtectTermsFormatter;
import org.jabref.logic.help.HelpFile;
import org.jabref.logic.importer.ImportInspector;
import org.jabref.logic.importer.OutputPrinter;
import org.jabref.logic.importer.ParseException;
import org.jabref.logic.importer.fileformat.BibtexParser;
import org.jabref.logic.journals.JournalAbbreviationLoader;
import org.jabref.logic.journals.JournalAbbreviationPreferences;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.net.URLDownload;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.FieldName;
import org.jabref.preferences.JabRefPreferences;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
/**
 * Entry fetcher that queries the IEEEXplore REST search interface, downloads the BibTeX export for the
 * returned article numbers and normalizes the resulting entries before handing them to the import dialog.
 * <p>
 * The fetch runs synchronously inside {@link #processQuery}; {@link #stopFetching()} may be called while it
 * is running to stop adding further entries.
 */
public class IEEEXploreFetcher implements EntryFetcher {

    private static final Log LOGGER = LogFactory.getLog(IEEEXploreFetcher.class);
    private static final String URL_SEARCH = "http://ieeexplore.ieee.org/rest/search?reload=true";
    private static final String URL_BIBTEX_START = "http://ieeexplore.ieee.org/xpl/downloadCitations?reload=true&recordIds=";
    private static final String URL_BIBTEX_END = "&download-format=download-bibtex&x=0&y=0";
    private static final String DIALOG_TITLE = Localization.lang("Search %0", "IEEEXplore");
    /** Maximum number of records requested per search (also used as the page size of the search request). */
    private static final int MAX_FETCH = 100;

    private static final Pattern PUBLICATION_PATTERN = Pattern.compile("(.*), \\d*\\.*\\s?(.*)");
    private static final Pattern PROCEEDINGS_PATTERN = Pattern.compile("(.*?)\\.?\\s?Proceedings\\s?(.*)");
    private static final Pattern MONTH_PATTERN = Pattern.compile("(\\d*+)\\s*([a-z]*+)-*(\\d*+)\\s*([a-z]*+)");
    /** Matches a name part ending in a number, e.g. a trailing {@code '05} or {@code 2005)}. */
    private static final Pattern ABBREVIATION_PATTERN = Pattern.compile(".*[^,] '?\\d+\\)?");

    private static final Pattern PREPROCESSING_PATTERN = Pattern.compile("(?<!&)(#[x]*[0]*\\p{XDigit}+;)");
    private static final Pattern SUB_DETECTION_1 = Pattern.compile("/sub ([^/]+)/");
    private static final Pattern SUB_DETECTION_2 = Pattern.compile("\\(sub\\)([^(]+)\\(/sub\\)");
    private static final String SUB_TEXT_RESULT = "\\\\textsubscript\\{$1\\}";
    private static final Pattern SUPER_DETECTION_1 = Pattern.compile("/sup ([^/]+)/");
    private static final Pattern SUPER_DETECTION_2 = Pattern.compile("\\(sup\\)([^(]+)\\(/sup\\)");
    private static final String SUPER_TEXT_RESULT = "\\\\textsuperscript\\{$1\\}";

    private final ProtectTermsFormatter protectTermsFormatter = new ProtectTermsFormatter();
    private final UnitsToLatexFormatter unitsToLatexFormatter = new UnitsToLatexFormatter();
    private final HtmlToLatexFormatter htmlToLatexFormatter = new HtmlToLatexFormatter();
    private final JCheckBox absCheckBox = new JCheckBox(Localization.lang("Include abstracts"), false);
    private final JournalAbbreviationLoader abbreviationLoader;

    // Written by stopFetching() while processQuery() is looping, i.e. from a different thread;
    // volatile makes the cancellation visible to the fetching loop.
    private volatile boolean shouldContinue;

    /**
     * Creates the fetcher.
     * <p>
     * Note: this installs a new {@link CookieManager} as the JVM-wide default {@link CookieHandler}.
     *
     * @param abbreviationLoader loader used to look up journal abbreviations; must not be {@code null}
     * @throws NullPointerException if {@code abbreviationLoader} is {@code null}
     */
    public IEEEXploreFetcher(JournalAbbreviationLoader abbreviationLoader) {
        this.abbreviationLoader = Objects.requireNonNull(abbreviationLoader);
        CookieHandler.setDefault(new CookieManager());
    }

    /**
     * @return a panel containing the "Include abstracts" check box
     */
    @Override
    public JPanel getOptionsPanel() {
        JPanel pan = new JPanel(new BorderLayout());
        pan.add(absCheckBox, BorderLayout.NORTH);
        return pan;
    }

    /**
     * Runs the search, downloads the BibTeX data for the hits and adds the cleaned-up entries to the dialog.
     *
     * @param query  the search string entered by the user
     * @param dialog receiver of the fetched entries and of progress updates
     * @param status used to show informational and error messages
     * @return {@code true} if the BibTeX data was fetched and parsed; {@code false} if the search was
     *         empty/invalid, returned no usable records, or an error occurred
     */
    @Override
    public boolean processQuery(String query, ImportInspector dialog, OutputPrinter status) {
        // IEEE API seems to use .QT. as a marker for the quotes for exact phrase searching
        String terms = query.replace("\"", ".QT.");

        shouldContinue = true;

        int pageNumber = 1;
        String postData = makeSearchPostRequestPayload(pageNumber, terms);

        try {
            // open the search URL
            URLDownload dl = new URLDownload(IEEEXploreFetcher.URL_SEARCH);

            // add request header
            dl.addHeader("Accept", "application/json");
            dl.addHeader("Content-Type", "application/json");
            dl.addHeader("Referer", "http://ieeexplore.ieee.org/search/searchresult.jsp");

            // set post data
            dl.setPostData(postData);

            // retrieve the search results
            String page = dl.asString();

            // the page can be blank if the search did not work (the exact conditions that lead to this are
            // unknown, so it is reported as an invalid search for now)
            if (page.isEmpty()) {
                status.showMessage(Localization.lang("You have entered an invalid search '%0'.", query),
                        DIALOG_TITLE, JOptionPane.INFORMATION_MESSAGE);
                return false;
            }

            // parse the JSON data returned by the query
            // TODO: a faster way would be to parse the JSON tokens one at a time just to extract the article number, but this seems to be fast enough...
            JSONObject searchResultsJson = new JSONObject(page);
            int hits = searchResultsJson.getInt("totalRecords");

            // if no search results were found
            if (hits == 0) {
                status.showMessage(Localization.lang("No entries found for the search string '%0'", query),
                        DIALOG_TITLE, JOptionPane.INFORMATION_MESSAGE);
                return false;
            }

            // if max hits were exceeded, display the warning
            if (hits > IEEEXploreFetcher.MAX_FETCH) {
                status.showMessage(
                        Localization.lang("%0 entries found. To reduce server load, only %1 will be downloaded.",
                                String.valueOf(hits), String.valueOf(IEEEXploreFetcher.MAX_FETCH)),
                        DIALOG_TITLE, JOptionPane.INFORMATION_MESSAGE);
            }

            // without any article number there is nothing to download; do not send a malformed request
            List<String> articleNumbers = extractArticleNumbers(searchResultsJson);
            if (articleNumbers.isEmpty()) {
                status.showMessage(Localization.lang("No entries found for the search string '%0'", query),
                        DIALOG_TITLE, JOptionPane.INFORMATION_MESSAGE);
                return false;
            }

            // fetch the raw Bibtex results from IEEEXplore
            String bibtexPage = new URLDownload(createBibtexQueryURL(articleNumbers))
                    .asString(Globals.prefs.getDefaultEncoding());

            // preprocess the result (eg. convert HTML escaped characters to latex and do other formatting not performed by BibtexParser)
            bibtexPage = preprocessBibtexResultsPage(bibtexPage);

            // parse the page into Bibtex entries
            Collection<BibEntry> parsedBibtexCollection = new BibtexParser(Globals.prefs.getImportFormatPreferences())
                    .parseEntries(bibtexPage);
            int nEntries = parsedBibtexCollection.size();
            int parsed = 0;
            Iterator<BibEntry> parsedBibtexCollectionIterator = parsedBibtexCollection.iterator();
            while (parsedBibtexCollectionIterator.hasNext() && shouldContinue) {
                dialog.addEntry(cleanup(parsedBibtexCollectionIterator.next()));
                // count the entry before reporting so that the progress reaches nEntries on the last one
                parsed++;
                dialog.setProgress(parsed, nEntries);
            }

            return true;
        } catch (ParseException | IOException | JSONException e) {
            LOGGER.error("Error while fetching from " + getTitle(), e);
            // only ImportInspectionDialog offers showErrorMessage; fall back to the generic printer otherwise
            if (dialog instanceof ImportInspectionDialog) {
                ((ImportInspectionDialog) dialog).showErrorMessage(this.getTitle(), e.getLocalizedMessage());
            } else {
                status.showMessage(e.getLocalizedMessage(), DIALOG_TITLE, JOptionPane.ERROR_MESSAGE);
            }
        }

        return false;
    }

    @Override
    public String getTitle() {
        return "IEEEXplore";
    }

    @Override
    public HelpFile getHelpPage() {
        return HelpFile.FETCHER_IEEEXPLORE;
    }

    /**
     * This method is called by the dialog when the user has canceled the import.
     */
    @Override
    public void stopFetching() {
        shouldContinue = false;
    }

    /**
     * Builds the JSON body of the search request.
     *
     * @param startIndex value sent as {@code pageNumber}
     * @param terms      the query text; it is JSON-quoted here
     * @return the JSON payload as a string
     */
    private String makeSearchPostRequestPayload(int startIndex, String terms) {
        return "{\"queryText\":" + JSONObject.quote(terms) + ",\"refinements\":[],\"pageNumber\":\"" + startIndex
                + "\",\"searchWithin\":[],\"newsearch\":\"true\",\"searchField\":\"Search_All\",\"rowsPerPage\":\""
                + MAX_FETCH + "\"}";
    }

    /**
     * Collects the {@code articleNumber} of every record in the search result that has one.
     *
     * @param searchResultsJson the parsed search response; must contain a {@code records} array
     * @return the article numbers in the order of the records, possibly empty
     */
    private List<String> extractArticleNumbers(JSONObject searchResultsJson) {
        JSONArray recordsJsonArray = searchResultsJson.getJSONArray("records");
        List<String> articleNumbers = new ArrayList<>();
        for (int n = 0; n < recordsJsonArray.length(); n++) {
            JSONObject record = recordsJsonArray.getJSONObject(n);
            if (!record.isNull("articleNumber")) {
                articleNumbers.add(record.getString("articleNumber"));
            }
        }
        return articleNumbers;
    }

    /**
     * Builds the URL used to download the raw BibTeX for the given article numbers.
     *
     * @param articleNumbers the article numbers, sent as a comma-separated list
     * @return the download URL, including the abstract setting taken from the options check box
     */
    private String createBibtexQueryURL(List<String> articleNumbers) {
        StringBuilder bibtexQueryURL = new StringBuilder(URL_BIBTEX_START);
        bibtexQueryURL.append(String.join(",", articleNumbers));

        // add the abstract setting
        if (absCheckBox.isSelected()) {
            bibtexQueryURL.append("&citations-format=citation-abstract");
        } else {
            bibtexQueryURL.append("&citations-format=citation-only");
        }

        // append the remaining URL
        bibtexQueryURL.append(URL_BIBTEX_END);

        return bibtexQueryURL.toString();
    }

    /**
     * Prepares the downloaded BibTeX text for parsing: restores missing ampersands of HTML character
     * references, escapes unescaped percent signs and runs the HTML-to-LaTeX formatter.
     *
     * @param bibtexPage the raw text returned by the BibTeX download
     * @return the preprocessed text
     */
    private String preprocessBibtexResultsPage(String bibtexPage) {
        // for some reason, the escaped HTML characters in the titles are in the format "#xNNNN" (they are missing the ampersand)
        // add the ampersands back in before passing to the HTML formatter so they can be properly converted
        // TODO: Maybe edit the HTMLconverter to also recognize escaped characters even when the & is missing?
        String result = PREPROCESSING_PATTERN.matcher(bibtexPage).replaceAll("&$1");

        // Also, percent signs are not escaped by the IEEEXplore Bibtex output nor, it would appear, the subsequent processing in JabRef
        // TODO: Maybe find a better spot for this if it applies more universally
        result = result.replaceAll("(?<!\\\\)%", "\\\\%");

        // Format the bibtexResults using the HTML formatter (clears up numerical and text escaped characters and remaining HTML tags)
        result = htmlToLatexFormatter.format(result);

        return result;
    }

    /**
     * Normalizes the fields of a fetched entry in place (title, author, month, pages, journal/booktitle,
     * abstract, url and keywords).
     *
     * @param entry the entry to clean up; may be {@code null}
     * @return the same entry instance, or {@code null} if {@code entry} was {@code null}
     */
    private BibEntry cleanup(BibEntry entry) {
        if (entry == null) {
            return null;
        }

        // The order matters: the publication cleanup may clear month/pages set by the earlier steps.
        cleanupTitle(entry);
        cleanupAuthor(entry);
        cleanupMonth(entry);
        cleanupPages(entry);
        cleanupPublication(entry);

        // clean up abstract
        entry.getField(FieldName.ABSTRACT)
                .ifPresent(dirtyAbstr -> entry.setField(FieldName.ABSTRACT, convertIeeeMarkup(dirtyAbstr)));

        // Clean up url
        entry.getField(FieldName.URL)
                .ifPresent(url -> entry.setField(FieldName.URL, "http://ieeexplore.ieee.org" + url.replace("tp=&", "")));

        // Replace ; as keyword separator
        entry.getField(FieldName.KEYWORDS).ifPresent(keys -> entry.setField(FieldName.KEYWORDS,
                keys.replace(";", Globals.prefs.get(JabRefPreferences.KEYWORD_SEPARATOR))));
        return entry;
    }

    /**
     * Converts the "/spl .../", "/sub .../", "/sup .../", "(sub)...(/sub)" and "(sup)...(/sup)" markup
     * found in titles and abstracts into LaTeX.
     */
    private static String convertIeeeMarkup(String text) {
        // Try to sort out most of the /spl / conversions
        // Deal with this specific nested type first
        String result = text.replaceAll("/sub /spl infin//", "\\$_\\\\infty\\$");
        result = result.replaceAll("/sup /spl infin//", "\\$\\^\\\\infty\\$");

        // Replace general expressions
        result = result.replaceAll("/[sS]pl ([^/]+)/", "\\$\\\\$1\\$");

        // Deal with subscripts and superscripts
        result = SUPER_DETECTION_1.matcher(result).replaceAll(SUPER_TEXT_RESULT);
        result = SUB_DETECTION_1.matcher(result).replaceAll(SUB_TEXT_RESULT);
        result = SUPER_DETECTION_2.matcher(result).replaceAll(SUPER_TEXT_RESULT);
        result = SUB_DETECTION_2.matcher(result).replaceAll(SUB_TEXT_RESULT);

        // Replace \infin with \infty
        return result.replace("\\infin", "\\infty");
    }

    /** Cleans up the title: image alt-texts, IEEE markup and the optional unit/case formatters. */
    private void cleanupTitle(BibEntry entry) {
        entry.getField(FieldName.TITLE).ifPresent(dirtyTitle -> {
            // Use the alt-text and replace image links
            String title = dirtyTitle.replaceAll("[ ]?img src=[^ ]+ alt=\"([^\"]+)\">[ ]?", "\\$$1\\$");

            title = convertIeeeMarkup(title);

            // Unit formatting
            if (Globals.prefs.getBoolean(JabRefPreferences.USE_UNIT_FORMATTER_ON_SEARCH)) {
                title = unitsToLatexFormatter.format(title);
            }

            // Automatic case keeping
            if (Globals.prefs.getBoolean(JabRefPreferences.USE_CASE_KEEPER_ON_SEARCH)) {
                title = protectTermsFormatter.format(title);
            }

            // Write back
            entry.setField(FieldName.TITLE, title);
        });
    }

    /** Cleans up the author list: whitespace, position of suffixes such as "Jr." and separators. */
    private void cleanupAuthor(BibEntry entry) {
        entry.getField(FieldName.AUTHOR).ifPresent(dirtyAuthor -> {
            String author = dirtyAuthor.replaceAll("\\s+", " ");

            // reorder the "Jr." "Sr." etc to the correct ordering
            String[] authorSplit = author.split("(^\\s*|\\s*$|\\s+and\\s+)");
            List<String> authorResult = new ArrayList<>();
            for (String authorSplitPart : authorSplit) {
                authorResult.add(authorSplitPart.replaceAll("(.+?),(.+?),(.+)", "$1,$3,$2"));
            }
            author = String.join(" and ", authorResult);

            // adding a space after every dot creates double spaces, which are collapsed again
            author = author.replace(".", ". ").replace("  ", " ").replace(". -", ".-").replace("; ", " and ")
                    .replace(" ,", ",").replace("  ", " ");
            // strip trailing blanks, commas and semicolons
            author = author.replaceAll("[ ,;]+$", "");
            entry.setField(FieldName.AUTHOR, author);
        });
    }

    /**
     * Wraps the first (up to) three letters of a month word in {@code #...#}.
     * Returns an empty string for an empty word, so that a missing month never causes an exception.
     */
    private static String monthMacro(String monthWord) {
        if (monthWord.isEmpty()) {
            return "";
        }
        String abbreviation = monthWord.length() > 3 ? monthWord.substring(0, 3) : monthWord;
        return "#" + abbreviation + "#";
    }

    /** Rewrites a non-empty month field into the "#mon#"-style form derived from {@link #MONTH_PATTERN}. */
    private void cleanupMonth(BibEntry entry) {
        entry.getField(FieldName.MONTH).filter(month -> !month.isEmpty()).ifPresent(dirtyMonth -> {
            String month = dirtyMonth.replace(".", "");
            month = month.toLowerCase(Locale.ROOT);

            Matcher mm = MONTH_PATTERN.matcher(month);
            StringBuilder date = new StringBuilder(month);
            if (mm.find()) {
                // All four groups always participate in a match (each may be empty, never null).
                String firstNumber = mm.group(1);
                String firstWord = mm.group(2);
                String secondNumber = mm.group(3);
                String secondWord = mm.group(4);

                if (secondNumber.isEmpty()) {
                    if (firstWord.isEmpty()) {
                        date = new StringBuilder().append(firstNumber).append(',');
                    } else {
                        date = new StringBuilder(monthMacro(firstWord));
                        if (!firstNumber.isEmpty()) {
                            date.append(' ').append(firstNumber).append(',');
                        }
                    }
                } else if (firstWord.isEmpty()) {
                    if (secondWord.isEmpty()) {
                        date.append(',');
                    } else {
                        date = new StringBuilder(monthMacro(secondWord))
                                .append(firstNumber).append("--").append(secondNumber).append(',');
                    }
                } else {
                    date = new StringBuilder(monthMacro(firstWord))
                            .append(firstNumber).append("--").append(monthMacro(secondWord))
                            .append(secondNumber).append(',');
                }
            }
            entry.setField(FieldName.MONTH, date.toString());
        });
    }

    /** Turns "a-b" page ranges into "a--b" and collapses "a-a" into the single page "a". */
    private void cleanupPages(BibEntry entry) {
        entry.getField(FieldName.PAGES).ifPresent(pages -> {
            String[] pageNumbers = pages.split("-");
            if (pageNumbers.length == 2) {
                if (pageNumbers[0].equals(pageNumbers[1])) { // single page
                    entry.setField(FieldName.PAGES, pageNumbers[0]);
                } else {
                    entry.setField(FieldName.PAGES, pages.replace("-", "--"));
                }
            }
        });
    }

    /** Marks an entry as not yet published: sets the year text and clears month, pages and number. */
    private static void markAsToBePublished(BibEntry entry) {
        entry.setField(FieldName.YEAR, "to be published");
        entry.clearField(FieldName.MONTH);
        entry.clearField(FieldName.PAGES);
        entry.clearField(FieldName.NUMBER);
    }

    /**
     * Cleans up the publication name: the journal field of articles (whose booktitle is cleared) and the
     * booktitle field of inproceedings entries. Other entry types are left untouched.
     */
    private void cleanupPublication(BibEntry entry) {
        String type = entry.getType();
        String sourceField = "";
        if ("article".equals(type)) {
            sourceField = FieldName.JOURNAL;
            entry.clearField(FieldName.BOOKTITLE);
        } else if ("inproceedings".equals(type)) {
            sourceField = FieldName.BOOKTITLE;
        }
        if (!entry.hasField(sourceField)) {
            return;
        }

        String fullName = entry.getField(sourceField).get();
        if ("article".equals(type)) {
            int ind = fullName.indexOf(": Accepted for future publication");
            if (ind > 0) {
                fullName = fullName.substring(0, ind);
                markAsToBePublished(entry);
            }
            String[] parts = fullName.split("[\\[\\]]"); //[see also...], [legacy...]
            fullName = parts[0];
            if (parts.length == 3) {
                fullName += parts[2];
            }
            entry.getField(FieldName.NOTE).filter(note -> "Early Access".equals(note))
                    .ifPresent(note -> markAsToBePublished(entry));
        } else {
            fullName = fullName.replace("Conference Proceedings", "Proceedings")
                    .replace("Proceedings of", "Proceedings").replace("Proceedings.", "Proceedings");
            fullName = fullName.replace("International", "Int.");
            fullName = fullName.replace("Symposium", "Symp.");
            fullName = fullName.replace("Conference", "Conf.");
            // removing " on" leaves double spaces behind, which are collapsed again
            fullName = fullName.replace(" on", " ").replace("  ", " ");
        }

        Matcher m1 = PUBLICATION_PATTERN.matcher(fullName);
        if (m1.find()) {
            String prefix = m1.group(2).trim();
            String postfix = m1.group(1).trim();
            String abrv = "";
            String[] parts = prefix.split("\\. ", 2);
            if (parts.length == 2) {
                if (ABBREVIATION_PATTERN.matcher(parts[0]).matches()) {
                    prefix = parts[1];
                    abrv = parts[0];
                } else {
                    prefix = parts[0];
                    abrv = parts[1];
                }
            }
            if (ABBREVIATION_PATTERN.matcher(prefix).matches()) {
                fullName = postfix + " " + prefix;
            } else {
                fullName = prefix + " " + postfix + " " + abrv;
                fullName = fullName.trim();
            }
        }

        if ("article".equals(type)) {
            fullName = fullName.replace(" - ", "-"); //IEE Proceedings-
            fullName = fullName.trim();
            JournalAbbreviationPreferences journalAbbreviationPreferences = Globals.prefs.getJournalAbbreviationPreferences();
            if (journalAbbreviationPreferences.useIEEEAbbreviations()) {
                fullName = abbreviationLoader
                        .getRepository(journalAbbreviationPreferences)
                        .getMedlineAbbreviation(fullName)
                        .orElse(fullName);
            }
        }

        if ("inproceedings".equals(type)) {
            Matcher m2 = PROCEEDINGS_PATTERN.matcher(fullName);
            if (m2.find()) {
                String prefix = m2.group(2);
                String postfix = m2.group(1).replaceAll("\\.$", "");
                if (ABBREVIATION_PATTERN.matcher(prefix).matches()) {
                    fullName = postfix.trim() + " " + prefix.trim();
                } else {
                    String abrv = "";
                    String[] parts = postfix.split("\\. ", 2);
                    if (parts.length == 2) {
                        if (ABBREVIATION_PATTERN.matcher(parts[0]).matches()) {
                            postfix = parts[1];
                            abrv = parts[0];
                        } else {
                            postfix = parts[0];
                            abrv = parts[1];
                        }
                    }
                    fullName = prefix.trim() + " " + postfix.trim() + " " + abrv;
                }
            }

            fullName = fullName.trim();
            fullName = fullName.replaceAll("^[tT]he ", "").replaceAll("^\\d{4} ", "").replaceAll("[,.]$", "");

            Optional<String> year = entry.getField(FieldName.YEAR);
            if (year.isPresent()) {
                // the year is field content, not a regex: quote it so that metacharacters cannot break the pattern
                fullName = fullName.replaceAll(", " + Pattern.quote(year.get()) + "\\.?", "");
            }

            if (!fullName.contains("Abstract") && !fullName.contains("Summaries")
                    && !fullName.contains("Conference Record")) {
                fullName = "Proc. " + fullName;
            }
        }
        entry.setField(sourceField, fullName);
    }
}