package org.jabref.gui.importer.fetcher;

import java.awt.Dimension;
import java.awt.GridLayout;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import javax.swing.ButtonGroup;
import javax.swing.JCheckBox;
import javax.swing.JLabel;
import javax.swing.JOptionPane;
import javax.swing.JPanel;
import javax.swing.JRadioButton;

import org.jabref.Globals;
import org.jabref.gui.importer.FetcherPreviewDialog;
import org.jabref.logic.formatter.bibtexfields.HtmlToLatexFormatter;
import org.jabref.logic.formatter.bibtexfields.UnitsToLatexFormatter;
import org.jabref.logic.formatter.casechanger.ProtectTermsFormatter;
import org.jabref.logic.help.HelpFile;
import org.jabref.logic.importer.ImportInspector;
import org.jabref.logic.importer.OutputPrinter;
import org.jabref.logic.importer.ParseException;
import org.jabref.logic.importer.fileformat.BibtexParser;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.net.URLDownload;
import org.jabref.logic.protectedterms.ProtectedTermsLoader;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.FieldName;
import org.jabref.preferences.JabRefPreferences;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Fetcher that queries the ACM Portal search page, shows a preview of the first result page and
 * downloads the BibTeX record (and optionally the abstract) of every entry the user selected.
 * <p>
 * The result page is scraped with string markers and regular expressions; the per-search state
 * (query terms, scan position, user options) is kept in instance fields and reset by
 * {@link #processQueryGetPreview(String, FetcherPreviewDialog, OutputPrinter)}.
 */
public class ACMPortalFetcher implements PreviewEntryFetcher {

    private static final Log LOGGER = LogFactory.getLog(ACMPortalFetcher.class);

    private static final String START_URL = "http://portal.acm.org/";
    private static final String SEARCH_URL_PART = "results.cfm?query=";
    private static final String SEARCH_URL_PART_II = "&dl=";
    private static final String END_URL = "&coll=Portal&short=0";//&start=";

    private static final String BIBTEX_URL = "exportformats.cfm?id=";
    private static final String BIBTEX_URL_END = "&expformat=bibtex";
    private static final String ABSTRACT_URL = "tab_abstract.cfm?id=";

    private static final String NEXT_ENTRY_PATTERN = "<div class=\"numbering\">";
    private static final String AUTHOR_MARKER = "<div class=\"authors\">";
    private static final String SOURCE_MARKER = "<div class=\"source\">";
    private static final String END_ENTRY_PATTERN = "<br clear=\"all\" />";

    private static final String RESULTS_FOUND_PATTERN = "<div id=\"resfound\">";
    private static final String PAGE_RANGE_PATTERN = "<div class=\"pagerange\">";

    private static final String START_BIBTEX_ENTRY = "@";
    private static final String END_BIBTEX_ENTRY_HTML = "</pre>";

    // Fetch only one page. Otherwise, the user will get blocked by ACM. 100 has been the old setting.
    // See Bug 3532752 - https://sourceforge.net/tracker/index.php?func=detail&aid=3532752&group_id=92314&atid=600306
    private static final int PER_PAGE = 20;

    // Pause (in milliseconds) between two requests; see the Thread.sleep calls below
    private static final int WAIT_TIME = 200;

    private static final Pattern HITS_PATTERN = Pattern.compile("<strong>(\\d+,*\\d*)</strong> results found");
    private static final Pattern MAX_HITS_PATTERN = Pattern
            .compile("Result \\d+,*\\d* – \\d+,*\\d* of (\\d+,*\\d*)");

    private static final Pattern FULL_CITATION_PATTERN = Pattern.compile("<a href=\"(citation.cfm.*)\" target.*");

    private static final Pattern ID_PATTERN = Pattern.compile("citation.cfm\\?id=(\\d+)&.*");

    // Patterns used to extract information for the preview:
    private static final Pattern TITLE_PATTERN = Pattern.compile("<a href=.*?\">([^<]*)</a>");
    private static final Pattern ABSTRACT_PATTERN = Pattern.compile("<div .*?>(.*?)</div>");
    private static final Pattern SOURCE_PATTERN = Pattern.compile("<span style=\"padding-left:10px\">([^<]*)</span>");

    private final HtmlToLatexFormatter htmlToLatexFormatter = new HtmlToLatexFormatter();
    private final ProtectTermsFormatter protectTermsFormatter = new ProtectTermsFormatter(
            new ProtectedTermsLoader(Globals.prefs.getProtectedTermsPreferences()));
    private final UnitsToLatexFormatter unitsToLatexFormatter = new UnitsToLatexFormatter();
    private String terms;

    private final JRadioButton acmButton = new JRadioButton(Localization.lang("The ACM Digital Library"));
    private final JRadioButton guideButton = new JRadioButton(Localization.lang("The Guide to Computing Literature"));
    private final JCheckBox absCheckBox = new JCheckBox(Localization.lang("Include abstracts"), false);

    private boolean shouldContinue;

    // user settings
    private boolean fetchAbstract;
    // true: search "ACM", false: search "GUIDE" (see makeUrl)
    private boolean acmOrGuide;

    // Position in the result page from which the next entry is searched
    private int piv;

    /**
     * Builds the options panel: the "include abstracts" check box followed by the two mutually
     * exclusive library radio buttons. The guide button is selected by default.
     */
    @Override
    public JPanel getOptionsPanel() {
        JPanel pan = new JPanel();
        pan.setLayout(new GridLayout(0, 1));

        guideButton.setSelected(true);

        ButtonGroup group = new ButtonGroup();
        group.add(acmButton);
        group.add(guideButton);

        pan.add(absCheckBox);
        pan.add(acmButton);
        pan.add(guideButton);

        return pan;
    }

    /**
     * Runs the search and fills the preview dialog with one label per result of the first page.
     *
     * @param query   the search terms
     * @param preview dialog that receives the previews (or the error message on failure)
     * @param status  used to show informational messages
     * @return {@code true} if previews were added, {@code false} if nothing was found or the
     *         page could not be fetched or parsed
     */
    @Override
    public boolean processQueryGetPreview(String query, FetcherPreviewDialog preview, OutputPrinter status) {
        this.terms = query;
        piv = 0;
        shouldContinue = true;
        acmOrGuide = acmButton.isSelected();
        fetchAbstract = absCheckBox.isSelected();
        String address = makeUrl();
        Map<String, JLabel> previews = new LinkedHashMap<>();

        try {
            URLDownload dl = new URLDownload(address);

            String page = dl.asString(Globals.prefs.getDefaultEncoding());

            int hits = getNumberOfHits(page, RESULTS_FOUND_PATTERN, ACMPortalFetcher.HITS_PATTERN);

            // Drop everything up to and including the "results found" marker
            int index = page.indexOf(RESULTS_FOUND_PATTERN);
            if (index >= 0) {
                page = page.substring(index + RESULTS_FOUND_PATTERN.length());
            }

            if (hits == 0) {
                status.showMessage(Localization.lang("No entries found for the search string '%0'", terms),
                        Localization.lang("Search %0", getTitle()), JOptionPane.INFORMATION_MESSAGE);
                return false;
            } else if (hits > PER_PAGE) {
                status.showMessage(
                        Localization.lang("%0 entries found. To reduce server load, only %1 will be downloaded.",
                                String.valueOf(hits), String.valueOf(PER_PAGE)),
                        Localization.lang("Search %0", getTitle()), JOptionPane.INFORMATION_MESSAGE);
            }

            hits = getNumberOfHits(page, PAGE_RANGE_PATTERN, ACMPortalFetcher.MAX_HITS_PATTERN);
            parse(page, Math.min(hits, PER_PAGE), previews);
            for (Map.Entry<String, JLabel> entry : previews.entrySet()) {
                preview.addEntry(entry.getKey(), entry.getValue());
            }

            return true;

        } catch (IOException e) {
            LOGGER.error("Error while fetching from " + getTitle(), e);
            preview.showErrorMessage(this.getTitle(), e.getLocalizedMessage());
            return false;
        }
    }

    /**
     * Downloads every selected entry, cleans up its title and abstract and hands it to the
     * inspector. Stops early once {@link #stopFetching()} was called or the current thread was
     * interrupted.
     *
     * @param selection maps entry ids (as added to the preview) to whether they were selected
     * @param inspector receives the downloaded entries
     */
    @Override
    public void getEntries(Map<String, Boolean> selection, ImportInspector inspector) {
        for (Map.Entry<String, Boolean> selentry : selection.entrySet()) {
            // downloadEntryBibTeX restores the interrupt flag, so an interruption ends the loop here
            if (!shouldContinue || Thread.currentThread().isInterrupted()) {
                break;
            }
            if (selentry.getValue()) {
                downloadEntryBibTeX(selentry.getKey(), fetchAbstract).ifPresent(entry -> {
                    // Convert from HTML and optionally add curly brackets around key words to keep the case
                    entry.getField(FieldName.TITLE).ifPresent(title -> {
                        title = title.replace("\\&", "&").replace("\\#", "#");
                        title = convertHTMLChars(title);

                        // Unit formatting
                        if (Globals.prefs.getBoolean(JabRefPreferences.USE_UNIT_FORMATTER_ON_SEARCH)) {
                            title = unitsToLatexFormatter.format(title);
                        }

                        // Case keeping
                        if (Globals.prefs.getBoolean(JabRefPreferences.USE_CASE_KEEPER_ON_SEARCH)) {
                            title = protectTermsFormatter.format(title);
                        }
                        entry.setField(FieldName.TITLE, title);
                    });

                    entry.getField(FieldName.ABSTRACT)
                            .ifPresent(abstr -> entry.setField(FieldName.ABSTRACT, convertHTMLChars(abstr)));
                    inspector.addEntry(entry);
                });
            }
        }
    }

    @Override
    public int getWarningLimit() {
        return 10;
    }

    @Override
    public int getPreferredPreviewHeight() {
        return 75;
    }

    /**
     * Not supported by this fetcher; always returns {@code false}.
     */
    @Override
    public boolean processQuery(String query, ImportInspector dialog, OutputPrinter status) {
        return false;
    }

    /**
     * Assembles the search URL from the current terms and the selected library.
     * Only spaces in the terms are escaped (as {@code %20}).
     */
    private String makeUrl() {
        StringBuilder sb = new StringBuilder(ACMPortalFetcher.START_URL).append(ACMPortalFetcher.SEARCH_URL_PART)
                .append(terms.replace(" ", "%20")).append(ACMPortalFetcher.SEARCH_URL_PART_II);

        if (acmOrGuide) {
            sb.append("ACM");
        } else {
            sb.append("GUIDE");
        }
        sb.append(ACMPortalFetcher.END_URL);
        return sb.toString();
    }

    /**
     * Extracts at most {@code hits} entries from the result page.
     *
     * @param text    the result page
     * @param hits    maximum number of entries to extract
     * @param entries receives one preview label per extracted entry, keyed by entry id
     */
    private void parse(String text, int hits, Map<String, JLabel> entries) {
        int entryNumber = 1;
        // Check the limit first so that no entry beyond the limit is extracted
        while ((entryNumber <= hits) && getNextEntryURL(text, entryNumber, entries)) {
            entryNumber++;
        }
    }

    /**
     * Extracts the numeric entry id from a citation link.
     *
     * @param fullCitation the link target, e.g. {@code citation.cfm?id=123&...}
     * @return the id, or {@code null} if the link does not contain one
     */
    private static String getEntryBibTeXURL(String fullCitation) {
        // Get ID
        Matcher idMatcher = ACMPortalFetcher.ID_PATTERN.matcher(fullCitation);
        if (idMatcher.find()) {
            return idMatcher.group(1);
        }
        LOGGER.info("Did not find ID in: " + fullCitation);
        return null;
    }

    /**
     * Locates the next entry at or after the current scan position, advances the scan position
     * past it and stores a preview label for it.
     *
     * @param allText     the result page
     * @param entryNumber running number of the entry, used for logging only
     * @param entries     receives the preview label, keyed by entry id
     * @return {@code true} if an entry was located (even if it had to be skipped because its
     *         link carries no id), {@code false} if there is no further entry or the entry
     *         contains no citation link
     */
    private boolean getNextEntryURL(String allText, int entryNumber, Map<String, JLabel> entries) {
        int index = allText.indexOf(NEXT_ENTRY_PATTERN, piv);
        if (index < 0) {
            return false;
        }

        // Without an end marker the rest of the page is treated as the entry
        int endIndex = allText.indexOf(END_ENTRY_PATTERN, index);
        boolean terminated = endIndex >= 0;
        String text = terminated ? allText.substring(index, endIndex) : allText.substring(index);
        piv = terminated ? endIndex : allText.length();

        Matcher fullCitation = ACMPortalFetcher.FULL_CITATION_PATTERN.matcher(text);
        if (!fullCitation.find()) {
            LOGGER.warn("Citation unmatched " + Integer.toString(entryNumber));
            return false;
        }

        String link = getEntryBibTeXURL(fullCitation.group(1));
        if (link == null) {
            // No id means nothing can be downloaded for this entry; skip it but keep scanning
            return true;
        }

        // An unterminated entry only gets its id as preview
        String item = terminated ? buildPreviewHtml(text) : link;

        JLabel preview = new JLabel("<html>" + item + "</html>");
        preview.setPreferredSize(new Dimension(750, 100));

        entries.put(link, preview);
        return true;
    }

    /**
     * Builds the preview markup (authors, title, source - each in its own paragraph and only if
     * found) from the HTML of a single result entry.
     */
    private static String buildPreviewHtml(String text) {
        StringBuilder sb = new StringBuilder();

        // Find authors:
        int authStart = text.indexOf(AUTHOR_MARKER);
        if (authStart >= 0) {
            int authEnd = text.indexOf("</div>", authStart + AUTHOR_MARKER.length());
            if (authEnd >= 0) {
                sb.append("<p>").append(text.substring(authStart, authEnd)).append("</p>");
            }
        }

        // Find title:
        Matcher titM = ACMPortalFetcher.TITLE_PATTERN.matcher(text);
        if (titM.find()) {
            sb.append("<p>").append(titM.group(1)).append("</p>");
        }

        // Find source:
        int sourceStart = text.indexOf(SOURCE_MARKER);
        if (sourceStart >= 0) {
            int sourceEnd = text.indexOf("</div>", sourceStart + SOURCE_MARKER.length());
            if (sourceEnd >= 0) {
                String sourceText = text.substring(sourceStart, sourceEnd);
                Matcher source = ACMPortalFetcher.SOURCE_PATTERN.matcher(sourceText);
                if (source.find()) {
                    sb.append("<p>").append(source.group(1)).append("</p>");
                }
            }
        }

        return sb.toString();
    }

    /**
     * Downloads the BibTeX record of one entry and, on request, its abstract.
     * Sleeps {@link #WAIT_TIME} milliseconds after each successful request.
     *
     * @param id               the entry id
     * @param downloadAbstract whether the abstract page is fetched as well
     * @return the first parsed entry, or empty if the download or parsing failed, the response
     *         contained no BibTeX record, or the thread was interrupted
     */
    private static Optional<BibEntry> downloadEntryBibTeX(String id, boolean downloadAbstract) {
        try {
            URL url = new URL(
                    ACMPortalFetcher.START_URL + ACMPortalFetcher.BIBTEX_URL + id + ACMPortalFetcher.BIBTEX_URL_END);
            URLConnection connection = url.openConnection();

            // set user-agent to avoid being blocked as a crawler
            connection.addRequestProperty("User-Agent", URLDownload.USER_AGENT);
            Collection<BibEntry> items = null;

            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String htmlCode = in.lines().filter(s -> !s.isEmpty()).collect(Collectors.joining());

                // The record runs from the first '@' to the following closing </pre>
                int bibtexStart = htmlCode.indexOf(START_BIBTEX_ENTRY);
                int bibtexEnd = (bibtexStart >= 0) ? htmlCode.indexOf(END_BIBTEX_ENTRY_HTML, bibtexStart) : -1;
                if (bibtexEnd < 0) {
                    LOGGER.info("No BibTeX record found at: " + ACMPortalFetcher.BIBTEX_URL + id
                            + ACMPortalFetcher.BIBTEX_URL_END);
                    return Optional.empty();
                }

                String bibtexString = htmlCode.substring(bibtexStart, bibtexEnd);
                items = new BibtexParser(Globals.prefs.getImportFormatPreferences()).parseEntries(bibtexString);
            } catch (IOException | ParseException e) {
                LOGGER.info("Download of BibTeX information from ACM Portal failed.", e);
            }
            if ((items == null) || items.isEmpty()) {
                return Optional.empty();
            }
            BibEntry entry = items.iterator().next();
            Thread.sleep(ACMPortalFetcher.WAIT_TIME);//wait between requests or you will be blocked by ACM

            // get abstract
            if (downloadAbstract) {
                URLDownload dl = new URLDownload(ACMPortalFetcher.START_URL + ACMPortalFetcher.ABSTRACT_URL + id);
                String page = dl.asString(Globals.prefs.getDefaultEncoding());

                Matcher absM = ACMPortalFetcher.ABSTRACT_PATTERN.matcher(page);
                if (absM.find()) {
                    entry.setField(FieldName.ABSTRACT, absM.group(1).trim());
                }
                Thread.sleep(ACMPortalFetcher.WAIT_TIME);//wait between requests or you will be blocked by ACM
            }

            return Optional.of(entry);
        } catch (NoSuchElementException e) {
            LOGGER.info(
                    "Bad BibTeX record read at: " + ACMPortalFetcher.BIBTEX_URL + id + ACMPortalFetcher.BIBTEX_URL_END,
                    e);
        } catch (MalformedURLException e) {
            LOGGER.info("Malformed URL.", e);
        } catch (IOException e) {
            LOGGER.info("Cannot connect.", e);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so that the caller can notice the interruption and stop
            Thread.currentThread().interrupt();
        }
        return Optional.empty();
    }

    /**
     * This method must convert HTML style char sequences to normal characters.
     * @param text The text to handle.
     * @return The converted text.
     */
    private String convertHTMLChars(String text) {
        return htmlToLatexFormatter.format(text);
    }

    /**
     * Find out how many hits were found.
     *
     * @param page    the page to search
     * @param marker  text that precedes the number; only the 100 characters starting at the
     *                marker are examined
     * @param pattern pattern whose first group captures the number (commas are removed)
     * @return the number of hits
     * @throws IOException if the marker or the pattern is not found, or the captured text is
     *                     not a valid integer
     */
    private static int getNumberOfHits(String page, String marker, Pattern pattern) throws IOException {
        int ind = page.indexOf(marker);
        if (ind >= 0) {
            String substring = page.substring(ind, Math.min(ind + 100, page.length()));
            Matcher m = pattern.matcher(substring);
            if (m.find()) {
                try {
                    String number = m.group(1);
                    number = number.replace(",", ""); // Remove , as in 1,234
                    return Integer.parseInt(number);
                } catch (NumberFormatException ex) {
                    throw new IOException("Cannot parse number of hits", ex);
                }
            }
            LOGGER.info("Unmatched! " + substring);
        }
        throw new IOException("Cannot parse number of hits");
    }

    @Override
    public String getTitle() {
        return "ACM Portal";
    }

    @Override
    public HelpFile getHelpPage() {
        return HelpFile.FETCHER_ACM;
    }

    // This method is called by the dialog when the user has canceled or
    // signaled a stop. It is expected that any long-running fetch operations
    // will stop after this method is called.
    @Override
    public void stopFetching() {
        shouldContinue = false;
    }
}