package org.jabref.logic.importer.fetcher;
import java.io.IOException;
import java.io.StringReader;
import java.net.HttpCookie;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jabref.logic.help.HelpFile;
import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.FulltextFetcher;
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.ParserResult;
import org.jabref.logic.importer.SearchBasedFetcher;
import org.jabref.logic.importer.fileformat.BibtexParser;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.net.URLDownload;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.FieldName;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.client.utils.URIBuilder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
 * Fetcher for Google Scholar.
 * <p>
 * As a {@link FulltextFetcher} it runs a title search and returns the first PDF link found on the result page.
 * As a {@link SearchBasedFetcher} it runs a free-text query and downloads the BibTeX record linked from each hit.
 */
public class GoogleScholar implements FulltextFetcher, SearchBasedFetcher {

    private static final Log LOGGER = LogFactory.getLog(GoogleScholar.class);

    // Matches the per-hit "scholar.bib" download link up to the closing quote of the href attribute.
    // The dots are escaped so that only the literal host name matches.
    private static final Pattern LINK_TO_BIB_PATTERN = Pattern.compile("(https://scholar\\.googleusercontent\\.com/scholar\\.bib[^\"]*)");

    private static final String BASIC_SEARCH_URL = "https://scholar.google.com/scholar?";
    private static final String SEARCH_IN_TITLE_URL = "https://scholar.google.com/scholar?";

    // Number of hits expected on one result page
    private static final int NUM_RESULTS = 10;

    // Fragment of the IOException message seen when the server answers with HTTP 503
    private static final String HTTP_503_MESSAGE = "Server returned HTTP response code: 503 for URL";

    private final ImportFormatPreferences importFormatPreferences;

    /**
     * @param importFormatPreferences preferences handed to the {@link BibtexParser} that parses downloaded records;
     *                                must not be null
     */
    public GoogleScholar(ImportFormatPreferences importFormatPreferences) {
        Objects.requireNonNull(importFormatPreferences);
        this.importFormatPreferences = importFormatPreferences;
    }

    /**
     * Searches Google Scholar for the exact title of the entry and returns the first PDF link of the result page.
     *
     * @param entry the entry to find a full text for; must not be null
     * @return the URL of the first link found, or an empty Optional if the entry has no title or no link was found
     * @throws IOException      if the result page cannot be loaded or the found link is not a valid URL
     * @throws FetcherException if the search URI cannot be built
     */
    @Override
    public Optional<URL> findFullText(BibEntry entry) throws IOException, FetcherException {
        Objects.requireNonNull(entry);

        // Without a title there is nothing to search for
        Optional<String> title = entry.getField(FieldName.TITLE);
        if (!title.isPresent()) {
            return Optional.empty();
        }

        try {
            URIBuilder uriBuilder = new URIBuilder(SEARCH_IN_TITLE_URL);
            uriBuilder.addParameter("as_q", "");
            uriBuilder.addParameter("as_epq", title.get());
            uriBuilder.addParameter("as_occt", "title");

            Document doc = Jsoup.connect(uriBuilder.toString()).userAgent(URLDownload.USER_AGENT).get();

            // Check results for PDF link
            // TODO: link always on first result or none?
            for (int i = 0; i < NUM_RESULTS; i++) {
                Elements link = doc.select(String.format("#gs_ggsW%s a", i));
                if (link.first() == null) {
                    continue;
                }

                String href = link.first().attr("href");
                // link present?
                if (!href.isEmpty()) {
                    // TODO: check title inside pdf + length?
                    // TODO: report error function needed?! query -> result
                    LOGGER.info("Fulltext PDF found @ Google: " + href);
                    return Optional.of(new URL(href));
                }
            }
        } catch (URISyntaxException e) {
            throw new FetcherException("Building URI failed.", e);
        }

        return Optional.empty();
    }

    @Override
    public String getName() {
        return "Google Scholar";
    }

    @Override
    public HelpFile getHelpPage() {
        return HelpFile.FETCHER_GOOGLE_SCHOLAR;
    }

    /**
     * Runs a free-text query and collects the BibTeX entries of the hits.
     * A second result page is requested only if the first page yielded exactly {@code NUM_RESULTS} entries.
     *
     * @param query the search terms
     * @return the entries found, possibly empty
     * @throws FetcherException if building the URI, downloading, or parsing fails; an HTTP 503 answer is reported
     *                          with a localized hint about the traffic limitation
     */
    @Override
    public List<BibEntry> performSearch(String query) throws FetcherException {
        try {
            obtainAndModifyCookie();

            List<BibEntry> foundEntries = new ArrayList<>(2 * NUM_RESULTS);

            URIBuilder uriBuilder = new URIBuilder(BASIC_SEARCH_URL);
            uriBuilder.addParameter("hl", "en");
            uriBuilder.addParameter("btnG", "Search");
            uriBuilder.addParameter("q", query);

            addHitsFromQuery(foundEntries, uriBuilder.toString());

            // A full first page suggests that there are more hits: fetch the second page as well
            if (foundEntries.size() == NUM_RESULTS) {
                uriBuilder.addParameter("start", String.valueOf(NUM_RESULTS));
                addHitsFromQuery(foundEntries, uriBuilder.toString());
            }

            return foundEntries;
        } catch (URISyntaxException e) {
            throw new FetcherException("Error while fetching from " + getName(), e);
        } catch (IOException e) {
            // If there are too many requests from the same IP address, Google answers with a 503 and redirects to a captcha challenge.
            // The caught IOException looks for example like this:
            // java.io.IOException: Server returned HTTP response code: 503 for URL: https://ipv4.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3Dbpmn&hl=en&q=CGMSBI0NBDkYuqy9wAUiGQDxp4NLQCWbIEY1HjpH5zFJhv4ANPGdWj0
            String message = e.getMessage(); // may be null for exceptions created without a message
            if ((message != null) && message.contains(HTTP_503_MESSAGE)) {
                throw new FetcherException("Fetching from Google Scholar failed.",
                        Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), e);
            } else {
                throw new FetcherException("Error while fetching from " + getName(), e);
            }
        }
    }

    /**
     * Downloads the result page at the given URL and appends the BibTeX entry of every hit to the list.
     *
     * @param entryList the list the found entries are added to
     * @param queryURL  the URL of the result page
     * @throws IOException      if a download fails
     * @throws FetcherException if a downloaded record cannot be parsed into exactly one entry
     */
    private void addHitsFromQuery(List<BibEntry> entryList, String queryURL) throws IOException, FetcherException {
        String content = new URLDownload(queryURL).asString();

        Matcher matcher = LINK_TO_BIB_PATTERN.matcher(content);
        while (matcher.find()) {
            // The link is cut out of raw HTML, where ampersands inside attribute values are escaped
            String citationsPageURL = matcher.group().replace("&amp;", "&");
            entryList.add(downloadEntry(citationsPageURL));
        }
    }

    /**
     * Downloads and parses a single BibTeX record.
     *
     * @param link the URL of the record
     * @return the single entry contained in the downloaded record
     * @throws IOException      if the download fails
     * @throws FetcherException if the parser yields no database or the record does not contain exactly one entry
     */
    private BibEntry downloadEntry(String link) throws IOException, FetcherException {
        String downloadedContent = new URLDownload(link).asString();
        BibtexParser parser = new BibtexParser(importFormatPreferences);
        ParserResult result = parser.parse(new StringReader(downloadedContent));

        if ((result == null) || (result.getDatabase() == null)) {
            throw new FetcherException("Parsing entries from Google Scholar bib file failed.");
        }

        Collection<BibEntry> entries = result.getDatabase().getEntries();
        if (entries.size() != 1) {
            LOGGER.debug(entries.size() + " entries found! (" + link + ")");
            throw new FetcherException("Parsing entries from Google Scholar bib file failed.");
        }

        return entries.iterator().next();
    }

    /**
     * Requests the Google Scholar start page and appends the BibTeX citation format marker to every cookie returned.
     * NOTE(review): presumably the returned cookies are the live objects of a shared cookie store, so that the
     * changed values are sent with later requests - confirm against URLDownload#getCookieFromUrl.
     *
     * @throws FetcherException if the start page cannot be requested
     */
    private void obtainAndModifyCookie() throws FetcherException {
        try {
            URLDownload downloader = new URLDownload("https://scholar.google.com");
            List<HttpCookie> cookies = downloader.getCookieFromUrl();
            for (HttpCookie cookie : cookies) {
                // append "CF=4" which represents "Citation format bibtex"
                cookie.setValue(cookie.getValue() + ":CF=4");
            }
        } catch (IOException e) {
            throw new FetcherException("Cookie configuration for Google Scholar failed.", e);
        }
    }
}