package net.sf.jabref.imports; import java.awt.BorderLayout; import java.io.BufferedReader; import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.PrintWriter; import java.net.ConnectException; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; import java.net.URLEncoder; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.swing.ButtonGroup; import javax.swing.JCheckBox; import javax.swing.JOptionPane; import javax.swing.JPanel; import javax.swing.JRadioButton; import net.sf.jabref.BibtexDatabase; import net.sf.jabref.BibtexEntry; import net.sf.jabref.BibtexEntryType; import net.sf.jabref.GUIGlobals; import net.sf.jabref.Globals; import net.sf.jabref.OutputPrinter; import net.sf.jabref.Util; public class IEEEXploreFetcher implements EntryFetcher { ImportInspector dialog = null; OutputPrinter status; final HTMLConverter htmlConverter = new HTMLConverter(); private JCheckBox absCheckBox = new JCheckBox(Globals.lang("Include abstracts"), false); private JRadioButton htmlButton = new JRadioButton(Globals.lang("HTML parser")); private JRadioButton bibButton = new JRadioButton(Globals.lang("BibTeX importer")); private static final int MAX_FETCH = 100; private int perPage = MAX_FETCH, hits = 0, unparseable = 0, parsed = 0; private int piv = 0; private boolean shouldContinue = false; private boolean includeAbstract = false; private boolean importBibtex = false; private String terms; private final String startUrl = "http://ieeexplore.ieee.org/search/freesearchresult.jsp?queryText="; private final String endUrl = "&rowsPerPage=" + Integer.toString(perPage) + "&pageNumber="; private String searchUrl; private final String importUrl = "http://ieeexplore.ieee.org/xpls/downloadCitations"; private final Pattern hitsPattern = Pattern.compile("([0-9,]+) results"); private final Pattern idPattern = Pattern.compile("<input name=\"\" type=\"checkbox\" value=\"\"\\s*" + "id=\"([0-9]+)\"/>"); private final Pattern typePattern = Pattern.compile("<span class=\"type\">\\s*(.+)"); private HashMap<String, String> fieldPatterns = new HashMap<String, String>(); private final Pattern absPattern = Pattern.compile("<p>\\s*(.+)"); Pattern stdEntryPattern = Pattern.compile(".*<strong>(.+)</strong><br>" + "\\s+(.+)"); Pattern publicationPattern = Pattern.compile("(.*), \\d*\\.*\\s?(.*)"); Pattern proceedingPattern = Pattern.compile("(.*?)\\.?\\s?Proceedings\\s?(.*)"); Pattern abstractLinkPattern = Pattern.compile( "<a href=\"(.+)\" class=\"bodyCopySpaced\">Abstract</a>"); String abrvPattern = ".*[^,] '?\\d+\\)?"; Pattern ieeeArticleNumberPattern = Pattern.compile("<a href=\".*arnumber=(\\d+).*\">"); public IEEEXploreFetcher() { super(); fieldPatterns.put("title", "<a\\s*href=[^<]+>\\s*(.+)\\s*</a>"); fieldPatterns.put("author", "<p>\\s+(.+)"); fieldPatterns.put("volume", "Volume:\\s*(\\d+)"); fieldPatterns.put("number", "Issue:\\s*(\\d+)"); //fieldPatterns.put("part", "Part (\\d+), (.+)"); fieldPatterns.put("year", "Publication Year:\\s*(\\d{4})"); fieldPatterns.put("pages", "Page\\(s\\):\\s*(\\d+)\\s*-\\s*(\\d*)"); fieldPatterns.put("doi", "Digital Object Identifier:\\s*<a href=.*>(.+)</a>"); } public JPanel getOptionsPanel() { JPanel pan = new JPanel(); pan.setLayout(new BorderLayout()); htmlButton.setSelected(true); htmlButton.setEnabled(false); bibButton.setEnabled(false); ButtonGroup group = new ButtonGroup(); group.add(htmlButton); group.add(bibButton); pan.add(absCheckBox, BorderLayout.NORTH); pan.add(htmlButton, BorderLayout.CENTER); pan.add(bibButton, BorderLayout.EAST); return pan; } public boolean processQuery(String query, ImportInspector dialog, OutputPrinter status) { this.dialog = dialog; this.status = status; terms = query; piv = 0; shouldContinue = true; parsed = 0; unparseable = 0; int pageNumber = 1; searchUrl = makeUrl(pageNumber);//start at page 1 try { URL url = new URL(searchUrl); String page = getResults(url); if (page.indexOf("You have entered an invalid search") >= 0) { status.showMessage(Globals.lang("You have entered an invalid search '%0'.", terms), Globals.lang("Search IEEEXplore"), JOptionPane.INFORMATION_MESSAGE); return false; } if (page.indexOf("Bad request") >= 0) { status.showMessage(Globals.lang("Bad Request '%0'.", terms), Globals.lang("Search IEEEXplore"), JOptionPane.INFORMATION_MESSAGE); return false; } if (page.indexOf("No results were found.") >= 0) { status.showMessage(Globals.lang("No entries found for the search string '%0'", terms), Globals.lang("Search IEEEXplore"), JOptionPane.INFORMATION_MESSAGE); return false; } hits = getNumberOfHits(page, "display-status", hitsPattern); includeAbstract = absCheckBox.isSelected(); importBibtex = bibButton.isSelected(); if (hits > MAX_FETCH) { status.showMessage(Globals.lang("%0 entries found. To reduce server load, " +"only %1 will be downloaded.", new String[] {String.valueOf(hits), String.valueOf(MAX_FETCH)}), Globals.lang("Search IEEEXplore"), JOptionPane.INFORMATION_MESSAGE); hits = MAX_FETCH; } parse(dialog, page, 0, 1); int firstEntry = perPage; while (shouldContinue && firstEntry < hits) { pageNumber++; searchUrl = makeUrl(pageNumber); page = getResults(new URL(searchUrl)); if (!shouldContinue) break; parse(dialog, page, 0, firstEntry + 1); firstEntry += perPage; } return true; } catch (MalformedURLException e) { e.printStackTrace(); } catch (ConnectException e) { status.showMessage(Globals.lang("Connection to IEEEXplore failed"), Globals.lang("Search IEEEXplore"), JOptionPane.ERROR_MESSAGE); } catch (IOException e) { status.showMessage(Globals.lang(e.getMessage()), Globals.lang("Search IEEEXplore"), JOptionPane.ERROR_MESSAGE); e.printStackTrace(); } return false; } public String getTitle() { return Globals.menuTitle("Search IEEEXplore"); } public URL getIcon() { return GUIGlobals.getIconUrl("www"); } public String getHelpPage() { return "IEEEXploreHelp.html"; } public String getKeyName() { return "Search IEEEXplore"; } /** * This method is called by the dialog when the user has cancelled the import. */ public void stopFetching() { shouldContinue = false; } private String makeUrl(int startIndex) { StringBuffer sb = new StringBuffer(startUrl); sb.append(terms.replaceAll(" ", "+")); sb.append(endUrl); sb.append(String.valueOf(startIndex)); return sb.toString(); } private void parse(ImportInspector dialog, String text, int startIndex, int firstEntryNumber) { piv = startIndex; int entryNumber = firstEntryNumber; if (importBibtex) { //TODO: Login ArrayList<String> idSelected = new ArrayList<String>(); String id; while ((id = parseNextEntryId(text, piv)) != null && shouldContinue) { idSelected.add(id); entryNumber++; } try { BibtexDatabase dbase = parseBibtexDatabase(idSelected, includeAbstract); Collection<BibtexEntry> items = dbase.getEntries(); Iterator<BibtexEntry> iter = items.iterator(); while (iter.hasNext()) { BibtexEntry entry = iter.next(); dialog.addEntry(cleanup(entry)); dialog.setProgress(parsed + unparseable, hits); parsed++; } } catch (IOException e) { e.printStackTrace(); } //for } else { BibtexEntry entry; while (((entry = parseNextEntry(text, piv)) != null) && shouldContinue) { if (entry.getField("title") != null) { dialog.addEntry(entry); dialog.setProgress(parsed + unparseable, hits); parsed++; } entryNumber++; } } } private BibtexDatabase parseBibtexDatabase(List<String> id, boolean abs) throws IOException { if (id.isEmpty()) return null; URL url; URLConnection conn; try { url = new URL(importUrl); conn = url.openConnection(); } catch (MalformedURLException e) { e.printStackTrace(); return null; } conn.setDoInput(true); conn.setDoOutput(true); conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded"); conn.setRequestProperty("Referer", searchUrl); PrintWriter out = new PrintWriter( conn.getOutputStream()); String recordIds = ""; Iterator<String> iter = id.iterator(); while (iter.hasNext()) { recordIds += iter.next() + " "; } recordIds = recordIds.trim(); String citation = abs ? "citation-abstract" : "citation-only"; String content = "recordIds=" + recordIds.replaceAll(" ", "%20") + "&fromPageName=&citations-format=" + citation + "&download-format=download-bibtex"; System.out.println(content); out.write(content); out.flush(); out.close(); BufferedReader bufr = new BufferedReader(new InputStreamReader(conn.getInputStream())); StringBuffer sb = new StringBuffer(); char[] buffer = new char[256]; while(true) { int bytesRead = bufr.read(buffer); if(bytesRead == -1) break; for (int i=0; i<bytesRead; i++) sb.append((char)buffer[i]); } System.out.println(sb.toString()); ParserResult results = new BibtexParser(bufr).parse(); bufr.close(); return results.getDatabase(); } private BibtexEntry cleanup(BibtexEntry entry) { if (entry == null) return null; // clean up author String author = (String)entry.getField("author"); if (author != null) { author = author.replaceAll("\\.", ". "); author = author.replaceAll(" ", " "); author = author.replaceAll("\\. -", ".-"); author = author.replaceAll("; ", " and "); author = author.replaceAll("[,;]$", ""); entry.setField("author", author); } // clean up month String month = (String)entry.getField("month"); if ((month != null) && (month.length() > 0)) { month = month.replaceAll("\\.", ""); month = month.toLowerCase(); Pattern monthPattern = Pattern.compile("(\\d*+)\\s*([a-z]*+)-*(\\d*+)\\s*([a-z]*+)"); Matcher mm = monthPattern.matcher(month); String date = month; if (mm.find()) { if (mm.group(3).length() == 0) { if (mm.group(2).length() > 0) { date = "#" + mm.group(2).substring(0, 3) + "#"; if (mm.group(1).length() > 0) { date += " " + mm.group(1) + ","; } } else { date = mm.group(1) + ","; } } else if (mm.group(2).length() == 0) { if (mm.group(4).length() > 0) { date = "#" + mm.group(4).substring(0, 3) + "# " + mm.group(1) + "--" + mm.group(3) + ","; } else date += ","; } else { date = "#" + mm.group(2).substring(0, 3) + "# " + mm.group(1) + "--#" + mm.group(4).substring(0, 3) + "# " + mm.group(3) + ","; } } //date = date.trim(); //if (!date.isEmpty()) { entry.setField("month", date); //} } // clean up pages String field = "pages"; String pages = entry.getField(field); if (pages != null) { String [] pageNumbers = pages.split("-"); if (pageNumbers.length == 2) { if (pageNumbers[0].equals(pageNumbers[1])) {// single page entry.setField(field, pageNumbers[0]); } else { entry.setField(field, pages.replaceAll("-", "--")); } } } // clean up publication field BibtexEntryType type = entry.getType(); String sourceField = ""; if (type.getName() == "Article") { sourceField = "journal"; entry.clearField("booktitle"); } else if (type.getName() == "Inproceedings"){ sourceField = "booktitle"; } String fullName = entry.getField(sourceField); if (fullName != null) { if (type.getName() == "Article") { int ind = fullName.indexOf(": Accepted for future publication"); if (ind > 0) { fullName = fullName.substring(0, ind); entry.setField("year", "to be published"); entry.clearField("month"); entry.clearField("pages"); } String[] parts = fullName.split("[\\[\\]]"); //[see also...], [legacy...] fullName = parts[0]; if (parts.length == 3) { fullName += parts[2]; } } else { fullName = fullName.replace("Conference Proceedings", "Proceedings"). replace("Proceedings of", "Proceedings").replace("Proceedings.", "Proceedings"); fullName = fullName.replaceAll("International", "Int."); fullName = fullName.replaceAll("Symposium", "Symp."); fullName = fullName.replaceAll("Conference", "Conf."); fullName = fullName.replaceAll(" on", " ").replace(" ", " "); } Matcher m1 = publicationPattern.matcher(fullName); if (m1.find()) { String prefix = m1.group(2).trim(); String postfix = m1.group(1).trim(); String abrv = ""; String[] parts = prefix.split("\\. ", 2); if (parts.length == 2) { if (parts[0].matches(abrvPattern)) { prefix = parts[1]; abrv = parts[0]; } else { prefix = parts[0]; abrv = parts[1]; } } if (prefix.matches(abrvPattern) == false) { fullName = prefix + " " + postfix + " " + abrv; fullName = fullName.trim(); } else { fullName = postfix + " " + prefix; } } if (type.getName() == "Article") { fullName = fullName.replace("- ", "-"); //IEE Proceedings- fullName = fullName.trim(); if (Globals.prefs.getBoolean("useIEEEAbrv")) { String id = Globals.journalAbbrev.getAbbreviatedName(fullName, false); if (id != null) fullName = id; } } if (type.getName() == "Inproceedings") { Matcher m2 = proceedingPattern.matcher(fullName); if (m2.find()) { String prefix = m2.group(2); String postfix = m2.group(1).replaceAll("\\.$", ""); if (prefix.matches(abrvPattern) == false) { String abrv = ""; String[] parts = postfix.split("\\. ", 2); if (parts.length == 2) { if (parts[0].matches(abrvPattern)) { postfix = parts[1]; abrv = parts[0]; } else { postfix = parts[0]; abrv = parts[1]; } } fullName = prefix.trim() + " " + postfix.trim() + " " + abrv; } else { fullName = postfix.trim() + " " + prefix.trim(); } } fullName = fullName.trim(); fullName = fullName.replaceAll("^[tT]he ", "").replaceAll("^\\d{4} ", "").replaceAll("[,.]$", ""); String year = entry.getField("year"); fullName = fullName.replaceAll(", " + year + "\\.?", ""); if (fullName.contains("Abstract") == false && fullName.contains("Summaries") == false && fullName.contains("Conference Record") == false) fullName = "Proc. " + fullName; } entry.setField(sourceField, fullName); } return entry; } private String parseNextEntryId(String allText, int startIndex) { int index = allText.indexOf("<div class=\"select", startIndex); int endIndex = allText.indexOf("</div>", index); if (index >= 0 && endIndex > 0) { String text = allText.substring(index, endIndex); endIndex += 6; piv = endIndex; //parse id Matcher idMatcher = idPattern.matcher(text); //add id into a vector if (idMatcher.find()) { return idMatcher.group(1); } } return null; } private BibtexEntry parseNextEntry(String allText, int startIndex) { BibtexEntry entry = null; int index = allText.indexOf("<div class=\"detail", piv); int endIndex = allText.indexOf("</div>", index); if (index >= 0 && endIndex > 0) { endIndex += 6; piv = endIndex; String text = allText.substring(index, endIndex); BibtexEntryType type = null; String sourceField = null; String typeName = ""; Matcher typeMatcher = typePattern.matcher(text); if (typeMatcher.find()) { typeName = typeMatcher.group(1); if (typeName.equalsIgnoreCase("IEEE Journals") || typeName.equalsIgnoreCase("IEEE Early Access") || typeName.equalsIgnoreCase("IET Journals") || typeName.equalsIgnoreCase("AIP Journals") || typeName.equalsIgnoreCase("AVS Journals") || typeName.equalsIgnoreCase("IBM Journals")) { type = BibtexEntryType.getType("article"); sourceField = "journal"; } else if (typeName.equalsIgnoreCase("IEEE Conferences") || typeName.equalsIgnoreCase("IET Conferences")) { type = BibtexEntryType.getType("inproceedings"); sourceField = "booktitle"; } else if (typeName.equalsIgnoreCase("IEEE Standards")) { type = BibtexEntryType.getType("standard"); sourceField = "number"; } else if (typeName.equalsIgnoreCase("IEEE Educational Courses")) { type = BibtexEntryType.getType("Electronic"); sourceField = "note"; } else if (typeName.equalsIgnoreCase("IEEE Book Chapter")) { type = BibtexEntryType.getType("inCollection"); sourceField = "booktitle"; } } if (type == null) { type = BibtexEntryType.getType("misc"); sourceField = "note"; System.err.println("Type detection failed. Use MISC instead."); unparseable++; System.err.println(text); } entry = new BibtexEntry(Util.createNeutralId(), type); if (typeName.equalsIgnoreCase("IEEE Standards")) { entry.setField("organization", "IEEE"); } if (typeName.equalsIgnoreCase("IEEE Book Chapter")) { entry.setField("publisher", "IEEE"); } if (typeName.equalsIgnoreCase("IEEE Early Access")) { entry.setField("note", "Early Access"); } Set<String> fields = fieldPatterns.keySet(); for (String field: fields) { Matcher fieldMatcher = Pattern.compile(fieldPatterns.get(field)).matcher(text); if (fieldMatcher.find()) { entry.setField(field, htmlConverter.format(fieldMatcher.group(1))); if (field.equals("title") && fieldMatcher.find()) { String sec_title = htmlConverter.format(fieldMatcher.group(1)); if (entry.getType() == BibtexEntryType.getStandardType("standard")) { sec_title = sec_title.replaceAll("IEEE Std ", ""); } entry.setField(sourceField, sec_title); } if (field.equals("pages") && fieldMatcher.groupCount() == 2) { entry.setField(field, fieldMatcher.group(1) + "-" + fieldMatcher.group(2)); } } } if (entry.getType() == BibtexEntryType.getStandardType("inproceedings") && entry.getField("author").equals("")) { entry.setType(BibtexEntryType.getStandardType("proceedings")); } if (includeAbstract) { index = allText.indexOf("<div class=\"abstract RevealContent", piv); if (index >= 0) { endIndex = allText.indexOf("</div>", index) + 6; piv = endIndex; text = allText.substring(index, endIndex); Matcher absMatcher = absPattern.matcher(text); if (absMatcher.find()) { entry.setField("abstract", absMatcher.group(1)); } } } } if (entry == null) { return null; } else { return cleanup(entry); } } /** * Find out how many hits were found. * @param page */ private int getNumberOfHits(String page, String marker, Pattern pattern) throws IOException { int ind = page.indexOf(marker); if (ind < 0) { System.out.println(page); throw new IOException(Globals.lang("Could not parse number of hits")); } String substring = page.substring(ind, page.length()); Matcher m = pattern.matcher(substring); if (m.find()) return Integer.parseInt(m.group(1)); else throw new IOException(Globals.lang("Could not parse number of hits")); } /** * Download the URL and return contents as a String. * @param source * @return * @throws IOException */ public String getResults(URL source) throws IOException { InputStream in = source.openStream(); StringBuffer sb = new StringBuffer(); byte[] buffer = new byte[256]; while(true) { int bytesRead = in.read(buffer); if(bytesRead == -1) break; for (int i=0; i<bytesRead; i++) sb.append((char)buffer[i]); } return sb.toString(); } /** * Read results from a file instead of an URL. Just for faster debugging. * @param f * @return * @throws IOException */ public String getResultsFromFile(File f) throws IOException { InputStream in = new BufferedInputStream(new FileInputStream(f)); StringBuffer sb = new StringBuffer(); byte[] buffer = new byte[256]; while(true) { int bytesRead = in.read(buffer); if(bytesRead == -1) break; for (int i=0; i<bytesRead; i++) sb.append((char)buffer[i]); } return sb.toString(); } }