/* Copyright (C) 2003-2011 JabRef contributors.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package net.sf.jabref.imports;
import net.sf.jabref.*;
import net.sf.jabref.gui.FetcherPreviewDialog;
import net.sf.jabref.net.URLDownload;
import net.sf.jabref.util.NameListNormalizer;
import javax.swing.*;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Entry fetcher that queries Google Scholar, shows a preview of the hits and
 * downloads the BibTeX record of every hit the user selects.
 *
 * The fetcher works by scraping the HTML result page: it looks for the
 * "Import into BibTeX" links, uses the HTML preceding each link as preview
 * text and remembers the link Google Scholar put on the entry's title so it
 * can be stored in the "url" field of the downloaded entry.
 */
public class GoogleScholarFetcher implements PreviewEntryFetcher {

    /** True once {@link #runConfig()} has completed for this fetcher instance. */
    private boolean hasRunConfig = false;
    private boolean clearKeys = true; // Should we clear the keys so new ones can be generated?

    /** Value sent as the "num" preference in {@link #runConfig()}. */
    protected static int MAX_ENTRIES_TO_LOAD = 50;

    final static String QUERY_MARKER = "___QUERY___";
    final static String URL_START = "http://scholar.google.com";
    final static String URL_SETTING = "http://scholar.google.com/scholar_settings";
    final static String URL_SETPREFS = "http://scholar.google.com/scholar_setprefs";
    final static String SEARCH_URL = URL_START+"/scholar?q="+QUERY_MARKER
            +"&hl=en&btnG=Search";

    final static Pattern BIBTEX_LINK_PATTERN = Pattern.compile("<a href=\"([^\"]*)\">[A-Za-z ]*BibTeX");
    final static Pattern TITLE_START_PATTERN = Pattern.compile("<div class=\"gs_ri\">");
    final static Pattern LINK_PATTERN = Pattern.compile("<h3 class=\"gs_rt\"><a href=\"([^\"]*)\">");
    final static Pattern TITLE_END_PATTERN = Pattern.compile("<div class=\"gs_fl\">");

    /** Matches unquoted or quoted {@code <input type=.. name=.. value=..>} form items. */
    static Pattern inputPattern = Pattern.compile("<input type=([^ ]+) name=([^ ]+) value=([^> ]+)");

    /** Maps a BibTeX download link to the link found on the corresponding entry's title. */
    protected HashMap<String,String> entryLinks = new HashMap<String, String>();

    /** Set by {@link #stopFetching()}; checked between downloads and result pages. */
    protected boolean stopFetching = false;

    /**
     * @return the constant 10
     */
    public int getWarningLimit() {
        return 10;
    }

    /**
     * @return the constant 100
     */
    public int getPreferredPreviewHeight() {
        return 100;
    }

    /**
     * Not implemented for this fetcher; use
     * {@link #processQueryGetPreview(String, FetcherPreviewDialog, OutputPrinter)} instead.
     *
     * @return always {@code false}
     */
    public boolean processQuery(String query, ImportInspector inspector, OutputPrinter status) {
        return false;
    }

    /**
     * Runs the query and adds one preview entry per hit to the given dialog.
     * The preference setup in {@link #runConfig()} is done once, on first use.
     *
     * @param query   the search term
     * @param preview dialog receiving a (link, label) pair for every hit
     * @param status  receives an error message if the query fails
     * @return {@code true} on success, {@code false} if an I/O error occurred
     */
    public boolean processQueryGetPreview(String query, FetcherPreviewDialog preview, OutputPrinter status) {
        entryLinks.clear();
        stopFetching = false;
        try {
            if (!hasRunConfig) {
                runConfig();
                hasRunConfig = true;
            }
            Map<String, JLabel> citations = getCitations(query);
            for (Map.Entry<String, JLabel> citation : citations.entrySet()) {
                preview.addEntry(citation.getKey(), citation.getValue());
            }
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            status.showMessage(Globals.lang("Error fetching from Google Scholar"));
            return false;
        }
    }

    /**
     * Downloads the entries whose links are marked as selected and hands them
     * to the inspector. Links mapped to {@code false} or {@code null} are skipped,
     * as are downloads that did not yield exactly one entry.
     *
     * @param selection maps each BibTeX link to whether the user selected it
     * @param inspector receives progress updates and the downloaded entries
     */
    public void getEntries(Map<String, Boolean> selection, ImportInspector inspector) {
        int toDownload = 0;
        for (Boolean isSelected : selection.values()) {
            // Boolean.TRUE.equals() also tolerates null values, which would
            // otherwise fail on auto-unboxing.
            if (Boolean.TRUE.equals(isSelected)) {
                toDownload++;
            }
        }
        if (toDownload == 0) {
            return;
        }

        int downloaded = 0;
        for (Map.Entry<String, Boolean> choice : selection.entrySet()) {
            if (stopFetching) {
                break;
            }
            inspector.setProgress(downloaded, toDownload);
            if (!Boolean.TRUE.equals(choice.getValue())) {
                continue;
            }
            downloaded++;
            try {
                BibtexEntry entry = downloadEntry(choice.getKey());
                // downloadEntry() returns null when no usable entry was found;
                // never pass that on to the inspector.
                if (entry != null) {
                    inspector.addEntry(entry);
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    public String getTitle() {
        return "Google Scholar";
    }

    public String getKeyName() {
        return "Google Scholar";
    }

    public URL getIcon() {
        return GUIGlobals.getIconUrl("www");
    }

    public String getHelpPage() {
        return "GoogleScholarHelp.html";
    }

    /**
     * @return always {@code null}; this fetcher provides no options panel
     */
    public JPanel getOptionsPanel() {
        return null;
    }

    /**
     * Requests that running downloads stop at the next check of the flag.
     */
    public void stopFetching() {
        stopFetching = true;
    }

    /**
     * Debugging aid: writes the given content to a file.
     *
     * @param filename path of the file to (over)write
     * @param content  text to write
     * @throws IOException if the file cannot be opened or written
     */
    private void save(String filename, String content) throws IOException {
        BufferedWriter out = new BufferedWriter(new FileWriter(filename));
        try {
            out.write(content);
        } finally {
            // Close even when write() fails so the file handle is not leaked.
            out.close();
        }
    }

    /**
     * Sets the Google Scholar preferences this fetcher relies on. The settings
     * page is downloaded, its form items are read, the items "scis", "scisf"
     * and "num" are overridden, and the result is submitted to the
     * set-preferences URL.
     *
     * @throws IOException if one of the downloads fails
     */
    protected void runConfig() throws IOException {
        try {
            // Visit the start page first (presumably to establish a session
            // before requesting the settings page - TODO confirm).
            new URLDownload(new URL(URL_START)).download();

            URLDownload settings = new URLDownload(new URL(URL_SETTING));
            settings.download();
            String settingsPage = settings.getStringContent();

            // Get the form items and their values from the page:
            HashMap<String,String> formItems = getFormElements(settingsPage);
            // Override the important ones:
            formItems.put("scis", "yes");
            formItems.put("scisf", "4");
            formItems.put("num", String.valueOf(MAX_ENTRIES_TO_LOAD));

            // Build the query string; names and values are URL-encoded so that
            // characters such as '&' or '=' in a scraped value cannot corrupt it.
            StringBuilder ub = new StringBuilder(URL_SETPREFS).append('?');
            for (Map.Entry<String, String> item : formItems.entrySet()) {
                ub.append(URLEncoder.encode(item.getKey(), "UTF-8"))
                        .append('=')
                        .append(URLEncoder.encode(item.getValue(), "UTF-8"))
                        .append('&');
            }
            ub.append("submit=");

            // Download the URL to set preferences:
            new URLDownload(new URL(ub.toString())).download();
        } catch (UnsupportedEncodingException ex) {
            // Kept from the original: encoding problems are reported but do not
            // abort the fetch.
            ex.printStackTrace();
        }
    }

    /**
     * Collects the hits for a query.
     *
     * @param query
     *            The search term to query Google Scholar for.
     * @return a map from BibTeX download link to the preview label of the hit,
     *         in the order the hits were found
     * @throws java.io.IOException if a result page cannot be downloaded
     */
    protected Map<String, JLabel> getCitations(String query) throws IOException {
        LinkedHashMap<String, JLabel> res = new LinkedHashMap<String, JLabel>();
        String urlQuery;
        try {
            urlQuery = SEARCH_URL.replace(QUERY_MARKER, URLEncoder.encode(query, "UTF-8"));
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }

        // Follow "next page" links for as long as getCitationsFromUrl() reports
        // one, limited to one additional page.
        int count = 1;
        String nextPage;
        while (((nextPage = getCitationsFromUrl(urlQuery, res)) != null)
                && (count < 2)) {
            urlQuery = nextPage;
            count++;
            if (stopFetching) {
                break;
            }
        }
        return res;
    }

    /**
     * Downloads one result page and records every hit found on it.
     *
     * For each BibTeX link on the page, the HTML between the previous link and
     * this one is searched for the title region; that region (or the whole
     * part, or the bare link as last resort) becomes the preview text. The
     * link found on the entry's title, if any, is stored in {@link #entryLinks}.
     *
     * @param urlQuery URL of the result page
     * @param ids      receives a (BibTeX link, preview label) pair for every hit
     * @return the URL of the next result page; currently always {@code null}
     *         because next-page detection is not implemented
     * @throws IOException if the page cannot be downloaded
     */
    protected String getCitationsFromUrl(String urlQuery, Map<String, JLabel> ids) throws IOException {
        URLDownload ud = new URLDownload(new URL(urlQuery));
        ud.download();
        String cont = ud.getStringContent();

        Matcher m = BIBTEX_LINK_PATTERN.matcher(cont);
        int lastRegionStart = 0;
        while (m.find()) {
            // The href is taken from HTML source, where '&' is written as the
            // character reference "&amp;"; decode it so the link is usable as a URL.
            String link = m.group(1).replace("&amp;", "&");

            String part = cont.substring(lastRegionStart, m.start());
            Matcher titleS = TITLE_START_PATTERN.matcher(part);
            Matcher titleE = TITLE_END_PATTERN.matcher(part);
            boolean fS = titleS.find();
            boolean fE = titleE.find();

            String pText;
            if (fS && fE) {
                if (titleS.end() < titleE.start()) {
                    pText = part.substring(titleS.end(), titleE.start());
                } else {
                    pText = part;
                }
            } else {
                pText = link;
            }
            pText = pText.replaceAll("\\[PDF\\]", "");

            ids.put(link, new JLabel("<html>"+pText+"</html>"));

            // See if we can extract the link Google Scholar puts on the entry's title.
            // That will be set as "url" for the entry if downloaded:
            Matcher linkMatcher = LINK_PATTERN.matcher(pText);
            if (linkMatcher.find()) {
                entryLinks.put(link, linkMatcher.group(1));
            }

            lastRegionStart = m.end();
        }

        // Next-page detection is disabled, so there is never a following page.
        return null;
    }

    /**
     * Downloads and parses the BibTeX record behind a link.
     *
     * @param link BibTeX link relative to {@link #URL_START}
     * @return the parsed entry, or {@code null} if the URL is malformed, parsing
     *         failed, or the download did not contain exactly one entry
     * @throws IOException if the download fails
     */
    protected BibtexEntry downloadEntry(String link) throws IOException {
        try {
            URLDownload ud = new URLDownload(new URL(URL_START+link));
            ud.download();
            BibtexParser bp = new BibtexParser(new StringReader(ud.getStringContent()));
            ParserResult pr = bp.parse();

            if ((pr == null) || (pr.getDatabase() == null)) {
                System.out.println("Parser failed! ("+link+")");
                return null;
            }

            Collection<BibtexEntry> entries = pr.getDatabase().getEntries();
            if (entries.size() == 0) {
                System.out.println("No entry found! ("+link+")");
                return null;
            }
            if (entries.size() != 1) {
                System.out.println(entries.size()+" entries found! ("+link+")");
                return null;
            }

            BibtexEntry entry = entries.iterator().next();
            if (clearKeys) {
                entry.setField(BibtexFields.KEY_FIELD, null);
            }

            // If the entry's url field is not set, and we have stored an url for this
            // entry, set it:
            if (entry.getField("url") == null) {
                String storedUrl = entryLinks.get(link);
                if (storedUrl != null) {
                    entry.setField("url", storedUrl);
                }
            }

            // Clean up some remaining HTML code from Elsevier(?) papers
            // Search for: Poincare algebra
            // to see an example
            String title = (String) entry.getField("title");
            if (title != null) {
                String newtitle = title.replaceAll("<.?i>([^<]*)</i>","$1");
                if (!newtitle.equals(title)) {
                    entry.setField("title", newtitle);
                }
            }
            return entry;
        } catch (MalformedURLException ex) {
            ex.printStackTrace();
            return null;
        }
    }

    /**
     * Extracts the name/value pairs of the {@code <input>} elements on a page.
     * Surrounding double quotes are removed from names and values; a later
     * element with the same name overwrites an earlier one.
     *
     * @param page HTML source to scan
     * @return map from input name to input value; empty if nothing matched
     */
    public static HashMap<String,String> getFormElements(String page) {
        HashMap<String,String> items = new HashMap<String, String>();
        Matcher m = inputPattern.matcher(page);
        while (m.find()) {
            items.put(stripQuotes(m.group(2)), stripQuotes(m.group(3)));
        }
        return items;
    }

    /**
     * Removes one pair of enclosing double quotes, if present. An empty quoted
     * string ({@code ""}) yields the empty string.
     */
    private static String stripQuotes(String text) {
        int last = text.length() - 1;
        if ((last >= 1) && (text.charAt(0) == '"') && (text.charAt(last) == '"')) {
            return text.substring(1, last);
        }
        return text;
    }
}