package focusedCrawler.seedfinder;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import focusedCrawler.crawler.crawlercommons.fetcher.BaseFetchException;
import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult;
import focusedCrawler.crawler.crawlercommons.fetcher.http.SimpleHttpFetcher;
import focusedCrawler.util.TimeDelay;
import focusedCrawler.util.parser.BackLinkNeighborhood;
/**
 * {@link SearchEngineApi} implementation that scrapes result links from
 * Google web search pages.
 *
 * <p>NOTE(review): relies on Google's HTML structure ({@code div#search},
 * {@code .r} headers) — brittle by nature; selectors may need updating if
 * Google changes its markup.
 */
public class GoogleSearch implements SearchEngineApi {

    private final SimpleHttpFetcher fetcher;
    // 21 -> max number allowed by google... decreases after
    private final int docsPerPage = 10;
    // Minimum delay between consecutive queries, to avoid being throttled/blocked
    private final TimeDelay timer = new TimeDelay(5000);

    public GoogleSearch(SimpleHttpFetcher fetcher) {
        this.fetcher = fetcher;
    }

    /**
     * Submits {@code query} to Google search and extracts the organic result
     * links from the returned HTML page.
     *
     * @param query the search query (raw, un-encoded text)
     * @param page  zero-based result page; translated into the {@code start}
     *              offset parameter ({@code page * docsPerPage})
     * @return the scraped result links (URL + anchor text); empty if none found
     * @throws IOException if the page could not be downloaded or parsed
     */
    public List<BackLinkNeighborhood> submitQuery(String query, int page) throws IOException {
        timer.waitMinimumDelayIfNecesary();
        // URL-encode the query so spaces and special characters do not
        // produce a malformed request URL (previous code interpolated it raw).
        String encodedQuery = URLEncoder.encode(query, "UTF-8");
        String queryUrl = "https://www.google.com/search?q=" + encodedQuery
                + "&num=" + docsPerPage + "&start=" + page * docsPerPage;
        System.out.println("URL:" + queryUrl);
        try {
            FetchedResult result = fetcher.get(queryUrl);
            Document doc;
            // try-with-resources guarantees the stream is closed even if
            // parsing fails (the original leaked it on the exception path).
            try (InputStream is = new ByteArrayInputStream(result.getContent())) {
                // Base URI must be the page URL (not the query text) so that
                // relative hrefs in the document resolve correctly.
                doc = Jsoup.parse(is, "UTF-8", queryUrl);
            }
            Elements searchItems = doc.select("div#search");
            Elements linkHeaders = searchItems.select(".r");
            Elements linksUrl = linkHeaders.select("a[href]");
            List<BackLinkNeighborhood> links = new ArrayList<>();
            for (Element link : linksUrl) {
                String title = link.text();
                String url = link.attr("href");
                links.add(new BackLinkNeighborhood(url, title));
            }
            System.out.println(getClass().getSimpleName() + " hits: " + links.size());
            return links;
        } catch (IOException | BaseFetchException e) {
            throw new IOException("Failed to download backlinks from Google.", e);
        }
    }
}