package org.jabref.gui.importer.fetcher; import java.io.IOException; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.swing.JPanel; import org.jabref.Globals; import org.jabref.gui.importer.ImportInspectionDialog; import org.jabref.logic.formatter.bibtexfields.NormalizeNamesFormatter; import org.jabref.logic.help.HelpFile; import org.jabref.logic.importer.ImportInspector; import org.jabref.logic.importer.OutputPrinter; import org.jabref.logic.net.URLDownload; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.FieldName; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; public class CiteSeerXFetcher implements EntryFetcher { private static final int MAX_PAGES_TO_LOAD = 8; private static final String QUERY_MARKER = "___QUERY___"; private static final String URL_START = "http://citeseer.ist.psu.edu"; private static final String SEARCH_URL = CiteSeerXFetcher.URL_START + "/search?q=" + CiteSeerXFetcher.QUERY_MARKER + "&submit=Search&sort=rlv&t=doc"; private static final Pattern CITE_LINK_PATTERN = Pattern.compile("<a class=\"remove doc_details\" href=\"(.*)\">"); private static final String BASE_PATTERN = "<meta name=\"" + CiteSeerXFetcher.QUERY_MARKER + "\" content=\"(.*)\" />"; private static final Pattern TITLE_PATTERN = Pattern .compile(CiteSeerXFetcher.BASE_PATTERN.replace(CiteSeerXFetcher.QUERY_MARKER, "citation_title")); private static final Pattern AUTHOR_PATTERN = Pattern .compile(CiteSeerXFetcher.BASE_PATTERN.replace(CiteSeerXFetcher.QUERY_MARKER, "citation_authors")); private static final Pattern YEAR_PATTERN = Pattern .compile(CiteSeerXFetcher.BASE_PATTERN.replace(CiteSeerXFetcher.QUERY_MARKER, "citation_year")); private static final Pattern ABSTRACT_PATTERN = Pattern.compile("<h3>Abstract</h3>\\s*<p>(.*)</p>"); private static final Log LOGGER = LogFactory.getLog(CiteSeerXFetcher.class); private boolean stopFetching; @Override public boolean processQuery(String query, ImportInspector inspector, OutputPrinter status) { stopFetching = false; try { List<String> citations = getCitations(query); for (String citation : citations) { if (stopFetching) { break; } BibEntry entry = getSingleCitation(citation); if (entry != null) { inspector.addEntry(entry); } } return true; } catch (IOException e) { LOGGER.error("Error while fetching from " + getTitle(), e); ((ImportInspectionDialog)inspector).showErrorMessage(this.getTitle(), e.getLocalizedMessage()); return false; } } @Override public String getTitle() { return "CiteSeerX"; } @Override public HelpFile getHelpPage() { return HelpFile.FETCHER_CITESEERX; } @Override public JPanel getOptionsPanel() { return null; } @Override public void stopFetching() { stopFetching = true; } /** * * @param query * The search term to query JStor for. * @return a list of IDs * @throws java.io.IOException */ private List<String> getCitations(String query) throws IOException { String urlQuery; List<String> ids = new ArrayList<>(); urlQuery = CiteSeerXFetcher.SEARCH_URL.replace(CiteSeerXFetcher.QUERY_MARKER, URLEncoder.encode(query, StandardCharsets.UTF_8.name())); int count = 1; String nextPage; while (((nextPage = getCitationsFromUrl(urlQuery, ids)) != null) && (count < CiteSeerXFetcher.MAX_PAGES_TO_LOAD)) { urlQuery = nextPage; count++; if (stopFetching) { break; } } return ids; } private static String getCitationsFromUrl(String urlQuery, List<String> ids) throws IOException { String cont = new URLDownload(urlQuery).asString(Globals.prefs.getDefaultEncoding()); Matcher m = CiteSeerXFetcher.CITE_LINK_PATTERN.matcher(cont); while (m.find()) { ids.add(CiteSeerXFetcher.URL_START + m.group(1)); } return null; } private static BibEntry getSingleCitation(String urlString) throws IOException { String cont = new URLDownload(urlString).asString(); // Find title, and create entry if we do. Otherwise assume we did not get an entry: Matcher m = CiteSeerXFetcher.TITLE_PATTERN.matcher(cont); if (m.find()) { BibEntry entry = new BibEntry(); entry.setField(FieldName.TITLE, m.group(1)); // Find authors: m = CiteSeerXFetcher.AUTHOR_PATTERN.matcher(cont); if (m.find()) { String authors = m.group(1); entry.setField(FieldName.AUTHOR, new NormalizeNamesFormatter().format(authors)); } // Find year: m = CiteSeerXFetcher.YEAR_PATTERN.matcher(cont); if (m.find()) { entry.setField(FieldName.YEAR, m.group(1)); } // Find abstract: m = CiteSeerXFetcher.ABSTRACT_PATTERN.matcher(cont); if (m.find()) { entry.setField(FieldName.ABSTRACT, m.group(1)); } return entry; } else { return null; } } }