/* Aaron Chen * 08-28-2007 * ACM Portal support */ package net.sf.jabref.imports; import java.awt.GridLayout; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.ConnectException; import java.net.MalformedURLException; import java.net.URL; import java.util.Collection; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.swing.ButtonGroup; import javax.swing.JCheckBox; import javax.swing.JOptionPane; import javax.swing.JPanel; import javax.swing.JRadioButton; import net.sf.jabref.BibtexEntry; import net.sf.jabref.GUIGlobals; import net.sf.jabref.Globals; import net.sf.jabref.OutputPrinter; public class ACMPortalFetcher implements EntryFetcher { ImportInspector dialog = null; OutputPrinter status; final HTMLConverter htmlConverter = new HTMLConverter(); private String terms; String startUrl = "http://portal.acm.org/"; String searchUrlPart = "results.cfm?query="; String searchUrlPartII = "&dl="; String endUrl = "&coll=Portal&short=0";//&start="; private JRadioButton acmButton = new JRadioButton(Globals.lang("The ACM Digital Library")); private JRadioButton guideButton = new JRadioButton(Globals.lang("The Guide to Computing Literature")); private JCheckBox absCheckBox = new JCheckBox(Globals.lang("Include abstracts"), false); private static final int MAX_FETCH = 20; // 20 when short=0 private int perPage = MAX_FETCH, hits = 0, unparseable = 0, parsed = 0; private boolean shouldContinue = false; private boolean fetchAbstract = false; private boolean acmOrGuide = false; Pattern hitsPattern = Pattern.compile(".*Found <b>(\\d+,*\\d*)</b> of.*"); Pattern maxHitsPattern = Pattern.compile(".*Results \\d+ - \\d+ of (\\d+,*\\d*).*"); Pattern bibPattern = Pattern.compile(".*(popBibTex.cfm.*)','BibTex'.*"); Pattern absPattern = Pattern.compile(".*ABSTRACT</A></span>\\s+<p class=\"abstract\">\\s+(.*)"); Pattern fullCitationPattern = Pattern.compile("<A HREF=\"(citation.cfm.*)\" class.*"); public JPanel getOptionsPanel() { JPanel pan = new JPanel(); pan.setLayout(new GridLayout(0,1)); guideButton.setSelected(true); ButtonGroup group = new ButtonGroup(); group.add(acmButton); group.add(guideButton); pan.add(absCheckBox); pan.add(acmButton); pan.add(guideButton); return pan; } public boolean processQuery(String query, ImportInspector dialog, OutputPrinter status) { this.dialog = dialog; this.status = status; this.terms = query; piv = 0; shouldContinue = true; parsed = 0; unparseable = 0; acmOrGuide = acmButton.isSelected(); String address = makeUrl(0); try { URL url = new URL(address); //dialog.setVisible(true); String page = getResults(url); //System.out.println(address); hits = getNumberOfHits(page, "Found", hitsPattern); int index = page.indexOf("Found"); if (index >= 0) { page = page.substring(index + 5); index = page.indexOf("Found"); if (index >= 0) page = page.substring(index); } //System.out.println(page); //System.out.printf("Hit %d\n", hits); if (hits == 0) { status.showMessage(Globals.lang("No entries found for the search string '%0'", terms), Globals.lang("Search ACM Portal"), JOptionPane.INFORMATION_MESSAGE); return false; } int maxHits = getNumberOfHits(page, "Results", maxHitsPattern); //System.out.printf("maxHit %d\n", maxHits); //String page = getResultsFromFile(new File("/home/alver/div/temp50.txt")); //List entries = new ArrayList(); //System.out.println("Number of hits: "+hits); //System.out.println("Maximum returned: "+maxHits); if (hits > maxHits) hits = maxHits; if (hits > MAX_FETCH) { status.showMessage(Globals.lang("%0 entries found. To reduce server load, " +"only %1 will be downloaded. It will be very slow, in order to make ACM happy.", new String[] {String.valueOf(hits), String.valueOf(MAX_FETCH)}), Globals.lang("Search ACM Portal"), JOptionPane.INFORMATION_MESSAGE); hits = MAX_FETCH; } fetchAbstract = absCheckBox.isSelected(); //parse(dialog, page, 0, 51); //dialog.setProgress(perPage/2, hits); parse(dialog, page, 0, 1); //System.out.println(page); int firstEntry = perPage; while (shouldContinue && (firstEntry < hits)) { //System.out.println("Fetching from: "+firstEntry); address = makeUrl(firstEntry); //System.out.println(address); page = getResults(new URL(address)); //dialog.setProgress(firstEntry+perPage/2, hits); if (!shouldContinue) break; parse(dialog, page, 0, 1+firstEntry); firstEntry += perPage; } return true; } catch (MalformedURLException e) { e.printStackTrace(); } catch (ConnectException e) { status.showMessage(Globals.lang("Connection to ACM Portal failed"), Globals.lang("Search ACM Portal"), JOptionPane.ERROR_MESSAGE); } catch (IOException e) { status.showMessage(Globals.lang(e.getMessage()), Globals.lang("Search ACM Portal"), JOptionPane.ERROR_MESSAGE); e.printStackTrace(); } return false; } private String makeUrl(int startIndex) { StringBuffer sb = new StringBuffer(startUrl).append(searchUrlPart); sb.append(terms.replaceAll(" ", "%20")); sb.append(searchUrlPartII); if (acmOrGuide) sb.append("ACM"); else sb.append("GUIDE"); sb.append(endUrl); return sb.toString(); } int piv = 0; private void parse(ImportInspector dialog, String text, int startIndex, int firstEntryNumber) { piv = startIndex; int entryNumber = firstEntryNumber; BibtexEntry entry; while (((entry = parseNextEntry(text, piv, entryNumber)) != null) && (shouldContinue)) { if (entry.getField("title") != null) { dialog.addEntry(entry); dialog.setProgress(parsed + unparseable, hits); parsed++; } entryNumber++; try { Thread.sleep(10000);//wait between requests or you will be blocked by ACM } catch (InterruptedException e) { System.err.println(e.getStackTrace()); } } } private BibtexEntry parseEntryBibTeX(String fullCitation, boolean abs) throws IOException { URL url; try { url = new URL(startUrl + fullCitation); String page = getResults(url); Thread.sleep(10000);//wait between requests or you will be blocked by ACM Matcher bibtexAddr = bibPattern.matcher(page); if (bibtexAddr.find()) { URL bibtexUrl = new URL(startUrl + bibtexAddr.group(1)); BufferedReader in = new BufferedReader(new InputStreamReader(bibtexUrl.openStream())); ParserResult result = BibtexParser.parse(in); in.close(); Collection<BibtexEntry> item = result.getDatabase().getEntries(); BibtexEntry entry = item.iterator().next(); if (abs == true) { Matcher absMatch = absPattern.matcher(page); if (absMatch.find()) { String absBlock = absMatch.group(1); entry.setField("abstract", convertHTMLChars(absBlock).trim()); } else { System.out.println("No abstract matched."); //System.out.println(page); } } Thread.sleep(10000);//wait between requests or you will be blocked by ACM return entry; } else return null; } catch (MalformedURLException e) { e.printStackTrace(); return null; } catch (ConnectException e) { e.printStackTrace(); return null; } catch (IOException e) { e.printStackTrace(); return null; } catch (InterruptedException e) { e.printStackTrace(); return null; } } private BibtexEntry parseNextEntry(String allText, int startIndex, int entryNumber) { String toFind = new StringBuffer().append("<strong>") .append(entryNumber).append("</strong>").toString(); int index = allText.indexOf(toFind, startIndex); int endIndex = allText.indexOf("</table>", index+1); //if (endIndex < 0) endIndex = allText.length(); BibtexEntry entry = null; if (index >= 0) { piv = index+1; String text = allText.substring(index, endIndex); // Always try RIS import first Matcher fullCitation = fullCitationPattern.matcher(text); if (fullCitation.find()) { try { Thread.sleep(10000);//wait between requests or you will be blocked by ACM entry = parseEntryBibTeX(fullCitation.group(1), fetchAbstract); } catch (Exception e) { e.printStackTrace(); } } else { System.out.printf("Citation Unmatched %d\n", entryNumber); System.out.printf(text); } if (entry != null) { // fetch successful return entry; } } //System.out.println(allText); //System.out.println(toFind); //System.out.println("Parse Failed"); return null; } /** * This method must convert HTML style char sequences to normal characters. * @param text The text to handle. * @return The converted text. */ private String convertHTMLChars(String text) { return htmlConverter.format(text); } /** * Find out how many hits were found. * @param page */ private int getNumberOfHits(String page, String marker, Pattern pattern) throws IOException { int ind = page.indexOf(marker); if (ind < 0) { System.out.println(page); throw new IOException(Globals.lang("Could not parse number of hits")); } String substring = page.substring(ind, Math.min(ind + 42, page.length())); Matcher m = pattern.matcher(substring); if (!m.find()) { System.out.println("Unmatched!"); System.out.println(substring); } else { try { // get rid of , String number = m.group(1); //NumberFormat nf = NumberFormat.getInstance(); //return nf.parse(number).intValue(); number = number.replaceAll(",", ""); //System.out.println(number); return Integer.parseInt(number); } catch (NumberFormatException ex) { throw new IOException(Globals.lang("Could not parse number of hits")); } catch (IllegalStateException e) { throw new IOException(Globals.lang("Could not parse number of hits")); } } throw new IOException(Globals.lang("Could not parse number of hits")); } /** * Download the URL and return contents as a String. * @param source * @return * @throws IOException */ public String getResults(URL source) throws IOException { InputStream in = source.openStream(); StringBuffer sb = new StringBuffer(); byte[] buffer = new byte[256]; while(true) { int bytesRead = in.read(buffer); if(bytesRead == -1) break; for (int i=0; i<bytesRead; i++) sb.append((char)buffer[i]); } return sb.toString(); } /** * Read results from a file instead of an URL. Just for faster debugging. * @param f * @return * @throws IOException */ public String getResultsFromFile(File f) throws IOException { InputStream in = new BufferedInputStream(new FileInputStream(f)); StringBuffer sb = new StringBuffer(); byte[] buffer = new byte[256]; while(true) { int bytesRead = in.read(buffer); if(bytesRead == -1) break; for (int i=0; i<bytesRead; i++) sb.append((char)buffer[i]); } return sb.toString(); } public String getTitle() { return Globals.menuTitle("Search ACM Portal"); } public URL getIcon() { return GUIGlobals.getIconUrl("www"); } public String getHelpPage() { return "ACMPortalHelp.html"; } public String getKeyName() { return "Search ACM Portal"; } // This method is called by the dialog when the user has cancelled the import. public void cancelled() { shouldContinue = false; } // This method is called by the dialog when the user has selected the //wanted entries, and clicked Ok. The callback object can update status //line etc. public void done(int entriesImported) { //System.out.println("Number of entries parsed: "+parsed); //System.out.println("Parsing failed for "+unparseable+" entries"); } // This method is called by the dialog when the user has cancelled or //signalled a stop. It is expected that any long-running fetch operations //will stop after this method is called. public void stopFetching() { shouldContinue = false; } }