package org.docear.metadata.extractors;
import java.awt.image.BufferedImage;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import javax.imageio.ImageIO;
import org.docear.metadata.data.MetaData;
import org.docear.metadata.data.ScholarMetaData;
import org.docear.metadata.data.ScholarMetaData.ScholarSource;
import org.docear.metadata.events.CaptchaEvent;
import org.docear.metadata.events.FetchedResultsEvent;
import org.docear.metadata.events.MetaDataListener;
import org.jsoup.Connection.Response;
import org.jsoup.HttpStatusException;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class GoogleScholarExtractor extends HtmlDataExtractor {
protected static String BaseURL = "http://scholar.google.com";
private String language = "en";
private String cookieFileName = "GoogleScholarCookie.xml";
private Boolean triedNewCookie = false;
public enum ScholarConfigKeys implements ExtractorConfigKey{
LANGUAGE;
}
public GoogleScholarExtractor(){};
public GoogleScholarExtractor(Map<ExtractorConfigKey, Object> config) throws MalformedConfigException {
super(config);
this.readConfig(config);
}
public GoogleScholarExtractor(Map<ExtractorConfigKey, Object> config, MetaDataListener listener) throws MalformedConfigException {
super(config, listener);
this.readConfig(config);
}
public Collection<MetaData> search(final String query) {
ArrayList<MetaData> result = new ArrayList<MetaData>();
try{
Map<String, String> cookies = getCookies(cookieFileName);
Response response = getConnection(BaseURL + "/scholar")
.data("q", query, "hl" , this.language)
.cookies(cookies)
.execute();
if(this.debuglogging){
logger.info("1. Response URL: " + response.url().toString());
logger.info("1. Response headers: " + response.headers().toString());
logger.info("1. Response body: " + response.body().toString());
logger.info("1. Response cookies: " + response.cookies().toString());
}
Document doc = response.parse();
//File input = new File("C:\\Users\\Anwender\\Desktop\\Neues Textdokument (2).html");
//Document doc = Jsoup.parse(input, "UTF-8", BaseURL);
Elements captchaElements = doc.select("noscript > iframe");
if(!captchaElements.isEmpty()){
String captchaUrl = captchaElements.first().attr("abs:src");
String token = handleReCaptchaRequest(captchaUrl);
if(token != null && !token.isEmpty()){
HashMap<String, String> formData = new HashMap<String, String>();
for(Element inputElement : doc.select("input")){
Attributes inputAttributes = inputElement.attributes();
if(inputAttributes.hasKey("value")){
formData.put(inputAttributes.get("name"), inputAttributes.get("value"));
}
}
for(Element inputElement : doc.select("textarea")){
Attributes inputAttributes = inputElement.attributes();
formData.put(inputAttributes.get("name"), token);
}
Response captchaResponse = getConnection(BaseURL + "/scholar")
.data("q", query, "hl" , this.language)
.data(formData)
.cookies(cookies)
.execute();
if(this.debuglogging){
logger.info("5. Response URL: " + captchaResponse.url().toString());
logger.info("5. Response headers: " + captchaResponse.headers().toString());
logger.info("5. Response body: " + captchaResponse.body().toString());
logger.info("5. Response cookies: " + captchaResponse.cookies().toString());
}
doc = captchaResponse.parse();
}
}
Iterator<Element> bibtexLinks = doc.select("a.gs_nta").iterator();
if(!bibtexLinks.hasNext()){
System.out.println();
}
for(int i = 0; i < maxResults; i++){
if(bibtexLinks.hasNext()){
Element bibtexLink = bibtexLinks.next();
try{
logger.info(bibtexLink.attr("href"));
URL url = new URL(new URL(BaseURL), bibtexLink.attr("href"));
response = getConnection(url.toString())
.cookies(cookies)
.execute();
String bibtex = response.body();
result.add(new ScholarMetaData(i, bibtex, query));
} catch (IOException e) {
System.out.println(e.getMessage());
logger.info(e.getMessage(), e);
}
}
}
}catch(HttpStatusException e){
logger.info(e.getMessage(), e);
if(e.getStatusCode() == 503){
if(handleCaptchaRequest(e)) return search(query);
}
else if(e.getStatusCode() == 403 && !triedNewCookie){
if(requestNewCookie(cookieFileName) != null) return search(query);
}
} catch (IOException e) {
System.out.println(e.getMessage());
logger.info(e.getMessage(), e);
}
FetchedResultsEvent event = new FetchedResultsEvent(result);
for(MetaDataListener listener : this.getListeners()){
listener.onFinishedRequest(event);
}
if(triedNewCookie){
triedNewCookie = false;
}
return result;
}
private String handleReCaptchaRequest(String captchaUrl){
try{
Response response = getConnection(captchaUrl).ignoreHttpErrors(true).execute();
if(this.debuglogging){
logger.info("2. Response URL: " + response.url().toString());
logger.info("2. Response headers: " + response.headers().toString());
logger.info("2. Response body: " + response.body().toString());
logger.info("2. Response cookies: " + response.cookies().toString());
}
Document doc = response.parse();
Elements imageElements = doc.select("center > img");
if(!imageElements.isEmpty()){
String imageUrl = imageElements.first().attr("abs:src");
Response imgResponse = getConnection(imageUrl).execute();
if(this.debuglogging){
logger.info("3. Response URL: " + imgResponse.url().toString());
logger.info("3. Response headers: " + imgResponse.headers().toString());
logger.info("3. Response body: " + imgResponse.body().toString());
logger.info("3. Response cookies: " + imgResponse.cookies().toString());
}
BufferedImage img = ImageIO.read(new ByteArrayInputStream(imgResponse.bodyAsBytes()));
String captcha = sendCaptchaEvent(img);
if(captcha != null && !captcha.isEmpty()){
HashMap<String, String> formData = new HashMap<String, String>();
for(Element inputElement : doc.select("input")){
Attributes inputAttributes = inputElement.attributes();
if(inputAttributes.hasKey("value")){
formData.put(inputAttributes.get("name"), inputAttributes.get("value"));
}
else{
formData.put(inputAttributes.get("name"), captcha);
}
}
Response captchaResponse = getConnection(captchaUrl)
.data(formData)
.ignoreHttpErrors(true)
.followRedirects(false)
.execute();
Document captchaDoc = captchaResponse.parse();
System.out.println();
Elements tokenElements = captchaDoc.select("textarea");
if(!tokenElements.isEmpty()){
if(this.debuglogging){
logger.info("4. Response URL: " + captchaResponse.url().toString());
logger.info("4. Response headers: " + captchaResponse.headers().toString());
logger.info("4. Response body: " + captchaResponse.body().toString());
logger.info("4. Response cookies: " + captchaResponse.cookies().toString());
}
String token = tokenElements.first().text();
return token;
}
else{
return handleReCaptchaRequest(captchaUrl);
}
}
}
}catch(IOException ex){
logger.info(ex.getMessage(), ex);
}
return null;
}
private boolean handleCaptchaRequest(HttpStatusException e) {
try{
Response response = getConnection(e.getUrl()).ignoreHttpErrors(true).execute();
final Document doc = response.parse();
Iterator<Element> imgElements = doc.select("img").iterator();
if(imgElements.hasNext()){
Element imgElement = imgElements.next();
if(imgElement.hasAttr("src")){
String imgURL = imgElement.attr("src");
Response imgResponse = getConnection(BaseURL + imgURL).execute();
final Map<String, String> imgCookie = imgResponse.cookies();
BufferedImage img = ImageIO.read(new ByteArrayInputStream(imgResponse.bodyAsBytes()));
String captcha = sendCaptchaEvent(img);
if(captcha != null && !captcha.isEmpty()){
Iterator<Element> formElements = doc.select("form").iterator();
if(formElements.hasNext()){
Element formElement = formElements.next();
String formURL = "";
if(formElement.hasAttr("action")){
formURL = formElement.attr("action");
}
HashMap<String, String> formData = new HashMap<String, String>();
Elements inputElements = formElement.select("input");
for(Element inputElement : inputElements){
if(!inputElement.attr("name").equals("captcha")){
formData.put(inputElement.attr("name"), inputElement.attr("value"));
}
else{
formData.put(inputElement.attr("name"), captcha);
}
}
URI uri = new URI(e.getUrl());
Response captchaResponse = getConnection(uri.getScheme() + "://" + uri.getHost() + "/sorry/" + formURL)
.data(formData)
.ignoreHttpErrors(true)
.referrer(e.getUrl())
.cookies(imgCookie)
.followRedirects(false)
.execute();
if(captchaResponse.statusCode() == 302 && captchaResponse.hasHeader("Location")){
Map<String, String> cookies = getCookies(cookieFileName);
Response abuseResponse = getConnection(captchaResponse.header("Location"))
.ignoreHttpErrors(true)
.referrer(e.getUrl())
.cookies(cookies)
.followRedirects(false)
.execute();
Map<String, String> abuseCookies = abuseResponse.cookies();
cookies.putAll(abuseCookies);
saveCookies(cookies, cookieFileName);
System.out.println("Redirect Captcha");
}
else{
Map<String, String> cookies = requestNewCookie(cookieFileName);
saveCookies(cookies, cookieFileName);
System.out.println("Normal Captcha");
}
return true;
}
}
}
}
}catch(IOException ex){
logger.info(e.getMessage(), e);
} catch (URISyntaxException e1) {
logger.info(e.getMessage(), e);
}
return false;
}
private String sendCaptchaEvent(BufferedImage img) throws IOException {
String captcha = null;
if(getListeners().size() <= 0){
ImageIO.write(img, "jpg", new File(getPath("captcha.jpg")));
System.out.println("Enter Captcha here : ");
BufferedReader bufferRead = new BufferedReader(new InputStreamReader(System.in));
captcha = bufferRead.readLine();
}
else{
CaptchaEvent event = new CaptchaEvent(ScholarSource.GOOGLESCHOLAR, img);
for(MetaDataListener listener : this.getListeners()){
listener.onCaptchaRequested(event);
}
if(!event.isCanceled() && event.getSolvedCaptcha() != null && !event.getSolvedCaptcha().isEmpty()){
captcha = event.getSolvedCaptcha();
}
}
return captcha;
}
private Map<String, String> getCookies(String fileName) throws IOException {
Map<String, String> cookies = readCookies(fileName);
if(cookies == null){
cookies = requestNewCookie(fileName);
}
else{
String gsp = cookies.get("GSP");
if(!gsp.endsWith(":CF=4")){
cookies.put("GSP", gsp + ":CF=4"); // :CF=4 enables the export to BibTex Link in the result list
}
}
return cookies;
}
private Map<String, String> requestNewCookie(String fileName) {
Map<String, String> cookies = null;
try{
Response response = getConnection(BaseURL).ignoreHttpErrors(true).execute();
cookies = response.cookies();
String gsp = cookies.get("GSP");
cookies.put("GSP", gsp + ":CF=4"); // :CF=4 enables the export to BibTex Link in the result list
saveCookies(cookies, fileName);
}catch(IOException e){
logger.info(e.getMessage(), e);
}
return cookies;
}
@Override
protected void readConfig(Map<ExtractorConfigKey, Object> config) throws MalformedConfigException{
super.readConfig(config);
try{
for(ExtractorConfigKey key : config.keySet()){
if(key instanceof ScholarConfigKeys){
ScholarConfigKeys scholarConfigKey = (ScholarConfigKeys)key;
switch(scholarConfigKey){
case LANGUAGE:
this.language = (String) config.get(ScholarConfigKeys.LANGUAGE);
break;
default:
break;
}
}
}
}catch(ClassCastException e){
logger.error("Could not cast config parameter.", e);
throw new MalformedConfigException();
}
}
public Collection<MetaData> call() throws Exception {
return search(searchValue);
}
@Override
public void setConfig(Map<ExtractorConfigKey, Object> config) throws MalformedConfigException {
super.setConfig(config);
this.readConfig(config);
}
}