/*  Copyright (C) 2003-2011 Aaron Chen
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package net.sf.jabref.imports;

import java.awt.*;
import java.io.*;
import java.net.ConnectException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.swing.*;

import net.sf.jabref.BibtexEntry;
import net.sf.jabref.GUIGlobals;
import net.sf.jabref.Globals;
import net.sf.jabref.OutputPrinter;
import net.sf.jabref.gui.FetcherPreviewDialog;

/**
 * Fetcher that queries the ACM Portal search page, builds a short HTML preview
 * for every hit on the first result page, and afterwards downloads the BibTeX
 * records (and optionally the abstracts) of the entries the user selected.
 */
public class ACMPortalFetcher implements PreviewEntryFetcher {

    private ImportInspector dialog = null;
    private OutputPrinter status;
    private final HTMLConverter htmlConverter = new HTMLConverter();
    final CaseKeeper caseKeeper = new CaseKeeper();
    private String terms;

    // Building blocks of the search, export and abstract URLs.
    private static final String startUrl = "http://portal.acm.org/";
    private static final String searchUrlPart = "results.cfm?query=";
    private static final String searchUrlPartII = "&dl=";
    private static final String endUrl = "&coll=Portal&short=0";
    private static final String bibtexUrl = "exportformats.cfm?id=";
    private static final String bibtexUrlEnd = "&expformat=bibtex";
    private static final String abstractUrl = "tab_abstract.cfm?id=";

    // Widgets shown in the options panel.
    private final JRadioButton acmButton = new JRadioButton(Globals.lang("The ACM Digital Library"));
    private final JRadioButton guideButton = new JRadioButton(Globals.lang("The Guide to Computing Literature"));
    private final JCheckBox absCheckBox = new JCheckBox(Globals.lang("Include abstracts"), false);

    private static final int perPage = 20;
    // Only one page. Otherwise, the user will get blocked by ACM. 100 has been the old setting.
    // See Bug 3532752 - https://sourceforge.net/tracker/index.php?func=detail&aid=3532752&group_id=92314&atid=600306
    private static final int MAX_FETCH = perPage;
    // Pause in milliseconds between two consecutive requests.
    private static final int WAIT_TIME = 200;

    private int hits = 0, unparseable = 0, parsed = 0;
    private boolean shouldContinue = false;

    // User settings, captured from the options panel when a query starts.
    private boolean fetchAbstract = false;
    // true selects the "ACM" library in the search URL, false selects "GUIDE".
    private boolean acmOrGuide = false;

    // Search position inside the result page; advanced by getNextEntryURL().
    private int piv = 0;

    private static final Pattern hitsPattern = Pattern.compile(".*Found <b>(\\d+,*\\d*)</b>.*");
    private static final Pattern maxHitsPattern = Pattern.compile(".*Results \\d+ - \\d+ of (\\d+,*\\d*).*");

    private static final Pattern fullCitationPattern =
            Pattern.compile("<A HREF=\"(citation.cfm.*)\" class.*");

    private static final Pattern idPattern =
            Pattern.compile("citation.cfm\\?id=\\d*\\.?(\\d+)&.*");

    // Patterns used to extract information for the preview:
    private static final Pattern titlePattern = Pattern.compile("<A HREF=.*?\">([^<]*)</A>");
    private static final Pattern monthYearPattern = Pattern.compile("([A-Za-z]+ [0-9]{4})");
    private static final Pattern absPattern = Pattern.compile("<div .*?>(.*?)</div>");

    private FetcherPreviewDialog preview;

    /**
     * Builds the options panel: the "include abstracts" check box followed by
     * the two library radio buttons, with the Guide button preselected.
     *
     * @return a newly created panel containing the option widgets
     */
    public JPanel getOptionsPanel() {
        JPanel pan = new JPanel();
        pan.setLayout(new GridLayout(0, 1));

        guideButton.setSelected(true);

        ButtonGroup group = new ButtonGroup();
        group.add(acmButton);
        group.add(guideButton);

        pan.add(absCheckBox);
        pan.add(acmButton);
        pan.add(guideButton);
        return pan;
    }

    /**
     * Runs the search for {@code query}, parses the first result page and hands
     * one preview label per hit to the preview dialog.
     *
     * @param query   the search terms
     * @param preview dialog that receives the preview entries
     * @param status  printer used to report "no hits" and error messages
     * @return {@code true} if the result page was fetched and parsed,
     *         {@code false} if nothing was found or an error occurred
     */
    public boolean processQueryGetPreview(String query, FetcherPreviewDialog preview, OutputPrinter status) {
        this.preview = preview;
        this.status = status;
        this.terms = query;
        piv = 0;
        shouldContinue = true;
        parsed = 0;
        unparseable = 0;
        acmOrGuide = acmButton.isSelected();
        fetchAbstract = absCheckBox.isSelected();

        int firstEntry = 1;
        String address = makeUrl(firstEntry);
        LinkedHashMap<String, JLabel> previews = new LinkedHashMap<String, JLabel>();

        try {
            URL url = new URL(address);
            String page = getResults(url);

            hits = getNumberOfHits(page, "Found", hitsPattern);

            // Cut the page down so that it starts at the second "Found" marker, if there is one.
            int index = page.indexOf("Found");
            if (index >= 0) {
                page = page.substring(index + 5);
                index = page.indexOf("Found");
                if (index >= 0) {
                    page = page.substring(index);
                }
            }

            if (hits == 0) {
                status.showMessage(Globals.lang("No entries found for the search string '%0'",
                        terms),
                        Globals.lang("Search ACM Portal"), JOptionPane.INFORMATION_MESSAGE);
                return false;
            }

            hits = getNumberOfHits(page, "Results", maxHitsPattern);

            // Only a single result page is ever downloaded, so it is parsed exactly once
            // (instead of once per reported hit, which rescanned the same text needlessly).
            if (hits > 0) {
                parse(page, 0, firstEntry, previews);
            }

            for (Map.Entry<String, JLabel> item : previews.entrySet()) {
                preview.addEntry(item.getKey(), item.getValue());
            }

            return true;

        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (ConnectException e) {
            status.showMessage(Globals.lang("Connection to ACM Portal failed"),
                    Globals.lang("Search ACM Portal"), JOptionPane.ERROR_MESSAGE);
        } catch (IOException e) {
            status.showMessage(Globals.lang(e.getMessage()),
                    Globals.lang("Search ACM Portal"), JOptionPane.ERROR_MESSAGE);
            e.printStackTrace();
        }
        return false;
    }

    /**
     * Downloads every entry whose selection flag is {@code true} and adds it to
     * the inspector. Stops early once fetching has been cancelled or the
     * current thread has been interrupted.
     *
     * @param selection maps entry IDs (as produced for the preview) to their selection state
     * @param inspector receives the downloaded entries
     */
    public void getEntries(Map<String, Boolean> selection, ImportInspector inspector) {
        for (Map.Entry<String, Boolean> choice : selection.entrySet()) {
            if (!shouldContinue || Thread.currentThread().isInterrupted()) {
                break;
            }
            // Boolean.TRUE.equals also copes with a null flag instead of failing on unboxing.
            if (!Boolean.TRUE.equals(choice.getValue())) {
                continue;
            }
            try {
                BibtexEntry entry = downloadEntryBibTeX(choice.getKey(), fetchAbstract);
                if (entry == null) {
                    continue;
                }

                // Convert from HTML and optionally add curly brackets around key words to keep the case
                String title = (String) entry.getField("title");
                if (title != null) {
                    title = title.replaceAll("\\\\&", "&").replaceAll("\\\\#", "#");
                    title = convertHTMLChars(title);
                    if (Globals.prefs.getBoolean("useCaseKeeperOnSearch")) {
                        title = caseKeeper.format(title);
                    }
                    entry.setField("title", title);
                }

                String abstr = (String) entry.getField("abstract");
                if (abstr != null) {
                    abstr = convertHTMLChars(abstr);
                    entry.setField("abstract", abstr);
                }

                inspector.addEntry(entry);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * @return the constant 10
     */
    public int getWarningLimit() {
        return 10;
    }

    /**
     * @return the constant 75
     */
    public int getPreferredPreviewHeight() {
        return 75;
    }

    /**
     * Not implemented by this fetcher; the arguments are ignored.
     *
     * @return always {@code false}
     */
    public boolean processQuery(String query, ImportInspector dialog, OutputPrinter status) {
        return false;
    }

    /**
     * Assembles the search URL for the current terms and library choice.
     *
     * @param startIndex value of the {@code start} request parameter
     * @return the complete search URL
     */
    private String makeUrl(int startIndex) {
        StringBuilder sb = new StringBuilder(startUrl).append(searchUrlPart);
        sb.append(terms.replaceAll(" ", "%20"));
        sb.append("&start=").append(String.valueOf(startIndex));
        sb.append(searchUrlPartII);
        sb.append(acmOrGuide ? "ACM" : "GUIDE");
        sb.append(endUrl);
        return sb.toString();
    }

    /**
     * Collects previews for consecutively numbered entries, beginning with
     * {@code firstEntryNumber}, until one of them cannot be found.
     *
     * @param text             the result page
     * @param startIndex       offset in {@code text} at which to start searching
     * @param firstEntryNumber number of the first entry to look for
     * @param entries          receives the previews, keyed by entry ID
     */
    private void parse(String text, int startIndex, int firstEntryNumber, Map<String, JLabel> entries) {
        piv = startIndex;
        int entryNumber = firstEntryNumber;
        while (getNextEntryURL(text, piv, entryNumber, entries)) {
            entryNumber++;
        }
    }

    /**
     * Extracts the numeric entry ID from a citation link.
     *
     * @param fullCitation the citation link taken from the result page
     * @param abs          unused
     * @return the ID, or {@code null} if the link does not contain one
     */
    private String getEntryBibTeXURL(String fullCitation, boolean abs) {
        Matcher idMatcher = idPattern.matcher(fullCitation);
        if (!idMatcher.find()) {
            System.out.println("Did not find ID in: " + fullCitation);
            return null;
        }
        return idMatcher.group(1);
    }

    /**
     * Locates entry number {@code entryNumber} in the result page and stores a
     * preview label for it under its entry ID.
     *
     * @param allText     the result page
     * @param startIndex  offset at which to start searching
     * @param entryNumber number of the entry to look for
     * @param entries     receives the preview, keyed by entry ID
     * @return {@code true} if the entry marker and a citation link were found
     *         (parsing may continue with the next number), {@code false} otherwise
     */
    private boolean getNextEntryURL(String allText, int startIndex, int entryNumber,
            Map<String, JLabel> entries) {
        String toFind = "<strong>" + entryNumber + "</strong><br>";
        int index = allText.indexOf(toFind, startIndex);
        if (index < 0) {
            return false;
        }

        piv = index + 1;
        String text = allText.substring(index);

        Matcher fullCitation = fullCitationPattern.matcher(text);
        if (!fullCitation.find()) {
            System.out.printf("Citation Unmatched %d\n", entryNumber);
            // print(), not printf(): the page text is data and may contain '%' characters.
            System.out.print(text);
            return false;
        }

        String link = getEntryBibTeXURL(fullCitation.group(1), fetchAbstract);
        if (link == null) {
            // Without an ID the record cannot be downloaded later, so it gets no preview;
            // returning true lets the caller carry on with the next entry.
            return true;
        }

        String part;
        int endOfRecord = text.indexOf("<div class=\"abstract2\">");
        if (endOfRecord > 0) {
            StringBuilder sb = new StringBuilder();
            part = text.substring(0, endOfRecord);

            // Find authors:
            String authMarker = "<div class=\"authors\">";
            int authStart = text.indexOf(authMarker);
            if (authStart >= 0) {
                int authEnd = text.indexOf("</div>", authStart + authMarker.length());
                if (authEnd >= 0) {
                    sb.append("<p>").append(text.substring(authStart, authEnd)).append("</p>");
                }
            }

            // Find title:
            Matcher titM = titlePattern.matcher(part);
            if (titM.find()) {
                sb.append("<p>").append(titM.group(1)).append("</p>");
            }

            // Find month and year:
            Matcher mY = monthYearPattern.matcher(part);
            if (mY.find()) {
                sb.append("<p>").append(mY.group(1)).append("</p>");
            }

            part = sb.toString();
        } else {
            // No abstract marker found: fall back to showing the bare ID.
            part = link;
        }

        JLabel label = new JLabel("<html>" + part + "</html>");
        label.setPreferredSize(new Dimension(750, 100));
        entries.put(link, label);
        return true;
    }

    /**
     * Downloads and parses the BibTeX record of one entry and, if requested,
     * its abstract. Sleeps {@code WAIT_TIME} milliseconds after each request.
     *
     * @param ID  the entry ID
     * @param abs whether the abstract should be fetched as well
     * @return the first parsed entry, or {@code null} if there is none, if the
     *         download failed, or if the thread was interrupted
     * @throws IOException declared for callers; I/O failures inside this method
     *                     are printed and reported as {@code null}
     */
    private BibtexEntry downloadEntryBibTeX(String ID, boolean abs) throws IOException {
        try {
            URL url = new URL(startUrl + bibtexUrl + ID + bibtexUrlEnd);
            BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
            ParserResult result;
            try {
                result = BibtexParser.parse(in);
            } finally {
                // Release the connection even when parsing fails.
                in.close();
            }

            Collection<BibtexEntry> item = result.getDatabase().getEntries();
            if (item.isEmpty()) {
                return null;
            }
            BibtexEntry entry = item.iterator().next();
            Thread.sleep(WAIT_TIME); // wait between requests or you will be blocked by ACM

            // get abstract
            if (abs) {
                url = new URL(startUrl + abstractUrl + ID);
                String page = getResults(url);
                Matcher absM = absPattern.matcher(page);
                if (absM.find()) {
                    entry.setField("abstract", absM.group(1).trim());
                }
                Thread.sleep(WAIT_TIME); // wait between requests or you will be blocked by ACM
            }

            return entry;
        } catch (NoSuchElementException e) {
            System.out.println("Bad Bibtex record read at: " + bibtexUrl + ID + bibtexUrlEnd);
            e.printStackTrace();
            return null;
        } catch (IOException e) {
            // Also covers MalformedURLException and ConnectException.
            e.printStackTrace();
            return null;
        } catch (InterruptedException e) {
            e.printStackTrace();
            // Restore the interrupt flag so that callers can see the interruption.
            Thread.currentThread().interrupt();
            return null;
        }
    }

    /**
     * This method must convert HTML style char sequences to normal characters.
     * @param text The text to handle.
     * @return The converted text.
     */
    private String convertHTMLChars(String text) {
        return htmlConverter.format(text);
    }

    /**
     * Find out how many hits were found.
     *
     * @param page    the result page
     * @param marker  text that precedes the number; at most 42 characters
     *                starting at the marker are examined
     * @param pattern pattern whose first group captures the number, possibly
     *                containing commas
     * @return the number of hits
     * @throws IOException if the marker is missing or the number cannot be parsed
     */
    private int getNumberOfHits(String page, String marker, Pattern pattern) throws IOException {
        int ind = page.indexOf(marker);
        if (ind < 0) {
            throw new IOException(Globals.lang("Could not parse number of hits"));
        }

        String substring = page.substring(ind, Math.min(ind + 42, page.length()));
        Matcher m = pattern.matcher(substring);
        if (!m.find()) {
            System.out.println("Unmatched!");
            System.out.println(substring);
            throw new IOException(Globals.lang("Could not parse number of hits"));
        }

        try {
            // get rid of ,
            return Integer.parseInt(m.group(1).replaceAll(",", ""));
        } catch (NumberFormatException ex) {
            throw new IOException(Globals.lang("Could not parse number of hits"));
        }
    }

    /**
     * Download the URL and return contents as a String. Every byte is mapped to
     * the character with the same unsigned value, and the stream is closed
     * afterwards.
     *
     * @param source the URL to read
     * @return the downloaded content
     * @throws IOException if opening or reading the stream fails
     */
    public String getResults(URL source) throws IOException {
        return readAll(source.openStream());
    }

    /**
     * Read results from a file instead of an URL. Just for faster debugging.
     *
     * @param f the file to read
     * @return the file content, converted like in {@link #getResults(URL)}
     * @throws IOException if opening or reading the file fails
     */
    public String getResultsFromFile(File f) throws IOException {
        return readAll(new BufferedInputStream(new FileInputStream(f)));
    }

    /**
     * Reads the stream to its end, turning each byte into one character, and
     * always closes the stream.
     */
    private static String readAll(InputStream in) throws IOException {
        try {
            StringBuilder sb = new StringBuilder();
            byte[] buffer = new byte[256];
            int bytesRead;
            while ((bytesRead = in.read(buffer)) != -1) {
                for (int i = 0; i < bytesRead; i++) {
                    // Mask before widening: a plain (char) cast sign-extends bytes >= 0x80
                    // to U+FF80..U+FFFF. The mask yields ISO-8859-1 decoding.
                    // NOTE(review): confirm which charset the portal actually serves.
                    sb.append((char) (buffer[i] & 0xFF));
                }
            }
            return sb.toString();
        } finally {
            in.close();
        }
    }

    /**
     * @return the fixed title {@code "ACM Portal"}
     */
    public String getTitle() {
        return "ACM Portal";
    }

    /**
     * @return the URL registered in {@code GUIGlobals} for the icon named "www"
     */
    public URL getIcon() {
        return GUIGlobals.getIconUrl("www");
    }

    /**
     * @return the help page name {@code "ACMPortalHelp.html"}
     */
    public String getHelpPage() {
        return "ACMPortalHelp.html";
    }

    /**
     * @return the fixed key name {@code "ACM Portal"}
     */
    public String getKeyName() {
        return "ACM Portal";
    }

    // This method is called by the dialog when the user has cancelled the import.
    public void cancelled() {
        shouldContinue = false;
    }

    // This method is called by the dialog when the user has selected the
    // wanted entries, and clicked Ok. The callback object can update status
    // line etc.
    public void done(int entriesImported) {
    }

    // This method is called by the dialog when the user has cancelled or
    // signalled a stop. It is expected that any long-running fetch operations
    // will stop after this method is called.
    public void stopFetching() {
        shouldContinue = false;
    }

    /**
     * Debugging aid: writes {@code content} to the file {@code filename}.
     * It is not called during normal operation.
     */
    private void save(String filename, String content) throws IOException {
        BufferedWriter out = new BufferedWriter(new FileWriter(filename));
        try {
            out.write(content);
        } finally {
            out.close();
        }
    }
}