/* Copyright (C) 2003-2011 JabRef contributors.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package net.sf.jabref.imports;
import net.sf.jabref.*;
import net.sf.jabref.net.URLDownload;
import net.sf.jabref.util.NameListNormalizer;
import javax.swing.*;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Entry fetcher that runs a search against CiteSeerX and turns each result
 * page it finds into a {@link BibtexEntry}.
 *
 * <p>The work is done by scraping HTML: the search result page is scanned for
 * links to document detail pages, and each detail page is scanned for
 * {@code <meta>} tags (title, authors, year) and an abstract paragraph.
 */
public class CiteSeerXFetcher implements EntryFetcher {

    /**
     * Upper bound on the number of search result pages requested per query.
     * NOTE: {@link #getCitationsFromUrl(String, List)} currently never reports
     * a next page, so in practice only the first page is loaded.
     */
    protected static int MAX_PAGES_TO_LOAD = 8;

    /** Placeholder that is substituted in URL and pattern templates. */
    final static String QUERY_MARKER = "___QUERY___";
    final static String URL_START = "http://citeseer.ist.psu.edu";
    final static String SEARCH_URL = URL_START+"/search?q="+QUERY_MARKER
            +"&submit=Search&sort=rlv&t=doc";

    /**
     * Matches a link to a document detail page; group 1 is the href value.
     * The quantifier is reluctant so that the capture stops at the first
     * closing {@code ">} instead of running to the last one on the line.
     */
    final static Pattern CITE_LINK_PATTERN = Pattern.compile("<a class=\"remove doc_details\" href=\"(.*?)\">");

    /**
     * Set by {@link #stopFetching()} and polled by the fetch loops.
     * Declared volatile because the stop request presumably arrives from a
     * different thread than the one running the query (caller not visible
     * here - confirm); without it the polling loop may never see the update.
     */
    protected volatile boolean stopFetching = false;

    /**
     * Runs the query and hands every entry that could be parsed to the inspector.
     *
     * @param query the search term
     * @param inspector receives each successfully parsed entry
     * @param status not used by this implementation
     * @return {@code true} if the run finished (or was stopped) without an
     *         I/O error, {@code false} if an {@link IOException} occurred
     */
    public boolean processQuery(String query, ImportInspector inspector, OutputPrinter status) {
        stopFetching = false;
        try {
            List<String> citations = getCitations(query);
            for (String citation : citations) {
                if (stopFetching) {
                    break;
                }
                BibtexEntry entry = getSingleCitation(citation);
                // Pages without a recognizable title yield null and are skipped.
                if (entry != null) {
                    inspector.addEntry(entry);
                }
            }
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    public String getTitle() {
        return "CiteSeerX";
    }

    public String getKeyName() {
        return "CiteSeerX";
    }

    public URL getIcon() {
        return GUIGlobals.getIconUrl("www");
    }

    public String getHelpPage() {
        return "CiteSeerHelp.html";
    }

    /**
     * @return always {@code null}; this fetcher provides no options panel
     */
    public JPanel getOptionsPanel() {
        return null;
    }

    /**
     * Requests that a running query stops; the fetch loops check the flag
     * between downloads.
     */
    public void stopFetching() {
        stopFetching = true;
    }

    /**
     * Collects the URLs of the document detail pages found for a search term.
     *
     * @param query
     *            The search term to query CiteSeerX for.
     * @return a list of absolute URLs of document detail pages
     * @throws java.io.IOException if downloading a result page fails
     */
    protected List<String> getCitations(String query) throws IOException {
        String encodedQuery;
        try {
            // Only the encoding step is guarded: UTF-8 is always available, so
            // a failure here is a programming error. Exceptions raised while
            // downloading must stay IOExceptions so that callers can handle them.
            encodedQuery = URLEncoder.encode(query, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }

        List<String> ids = new ArrayList<String>();
        String urlQuery = SEARCH_URL.replace(QUERY_MARKER, encodedQuery);
        int count = 1;
        String nextPage;
        // Follow "next page" URLs until none is reported, the page limit is
        // reached, or a stop has been requested.
        while (((nextPage = getCitationsFromUrl(urlQuery, ids)) != null)
                && (count < MAX_PAGES_TO_LOAD)) {
            urlQuery = nextPage;
            count++;
            if (stopFetching) {
                break;
            }
        }
        return ids;
    }

    /**
     * Downloads one search result page and appends the absolute URL of every
     * document link found on it to {@code ids}.
     *
     * @param urlQuery URL of the result page to download
     * @param ids list that the found document URLs are appended to
     * @return the URL of the next result page, or {@code null} if there is
     *         none; this implementation does not look for a next page and
     *         always returns {@code null}
     * @throws IOException if the URL is malformed or the download fails
     */
    protected String getCitationsFromUrl(String urlQuery, List<String> ids) throws IOException {
        URL url = new URL(urlQuery);
        URLDownload ud = new URLDownload(url);
        ud.download();
        String cont = ud.getStringContent();
        Matcher m = CITE_LINK_PATTERN.matcher(cont);
        while (m.find()) {
            // The captured href is appended to the site root to form a full URL.
            ids.add(URL_START+m.group(1));
        }
        return null;
    }

    /**
     * Template for the {@code <meta>} tag patterns; group 1 is the content
     * attribute. Reluctant so that the capture ends at the first closing
     * {@code " />} even when several tags share one line.
     */
    final static String basePattern = "<meta name=\""+QUERY_MARKER+"\" content=\"(.*?)\" />";
    final static Pattern titlePattern = Pattern.compile(basePattern.replace(QUERY_MARKER, "citation_title"));
    final static Pattern authorPattern = Pattern.compile(basePattern.replace(QUERY_MARKER, "citation_authors"));
    final static Pattern yearPattern = Pattern.compile(basePattern.replace(QUERY_MARKER, "citation_year"));
    /** Matches the abstract paragraph; only matches when it sits on a single line. */
    final static Pattern abstractPattern = Pattern.compile("<h3>Abstract</h3>\\s*<p>(.*?)</p>");

    /**
     * Downloads a document detail page and extracts title, authors, year and
     * abstract from it.
     *
     * @param urlString absolute URL of the detail page
     * @return the parsed entry, or {@code null} if the page contains no
     *         {@code citation_title} meta tag
     * @throws IOException if the URL is malformed or the download fails
     */
    protected BibtexEntry getSingleCitation(String urlString) throws IOException {
        URL url = new URL(urlString);
        URLDownload ud = new URLDownload(url);
        ud.setEncoding("UTF8");
        ud.download();
        String cont = ud.getStringContent();

        // The title decides whether the page is treated as an entry at all.
        Matcher m = titlePattern.matcher(cont);
        if (!m.find()) {
            return null;
        }
        BibtexEntry entry = new BibtexEntry(Util.createNeutralId());
        entry.setField("title", m.group(1));

        // Authors, year and abstract are optional; each is set only if found.
        m = authorPattern.matcher(cont);
        if (m.find()) {
            String authors = m.group(1);
            entry.setField("author", NameListNormalizer.normalizeAuthorList(authors));
        }

        m = yearPattern.matcher(cont);
        if (m.find()) {
            entry.setField("year", m.group(1));
        }

        m = abstractPattern.matcher(cont);
        if (m.find()) {
            entry.setField("abstract", m.group(1));
        }

        return entry;
    }
}