package org.Webgatherer.ExperimentalLabs.Scraper.Google;
import com.google.inject.Inject;
import org.Webgatherer.CoreEngine.lib.WebDriverFactory;
import org.Webgatherer.ExperimentalLabs.HtmlProcessing.HtmlParser;
import org.openqa.selenium.WebDriver;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * Loads Google search result pages through a {@link WebDriver} and collects the
 * outbound links found on them.
 *
 * <p>Usage: call {@link #configure(String, int)} first, then {@link #extractUrls()}.
 * Collected links accumulate in the instance across calls to {@code extractUrls()}.
 *
 * <p>NOTE(review): nothing in this class synchronizes access to its state, so it is
 * presumably meant to be used from a single thread - confirm against callers.
 *
 * @author Rick Dane
 */
public class GoogleExtractUrls {

    private final HtmlParser htmlParser;
    private final WebDriver driver;

    private String searchPhrase;
    /** Number of result pages requested by the caller; stored by {@link #configure(String, int)}. */
    private int numPages;
    /** Upper bound for the {@code num} query parameter: {@code numPages * numResultsPerPage}. */
    private int googleNum;

    /** First value sent as the {@code num} query parameter. */
    private int basePageNum = 10;
    /** Amount the {@code num} query parameter grows by on every request. */
    private int incrementNum = 10;
    /** Pause between two requests, in milliseconds. */
    private int delay = 1000;
    private int numResultsPerPage = 10;

    private String negativeMatchLinkPrefix = "google.com";
    private String negativeMatchLinkContains = "google";
    private String positiveMatchLinkPrefix = "http";

    /** Accepted links, in the order they were first seen. */
    private List<String> outputLinks = new ArrayList<String>();
    /** Every link already placed in {@link #outputLinks}; gives a constant-time duplicate check. */
    private Set<String> seenLinks = new HashSet<String>();

    /**
     * @param htmlParser       parser used to pull links out of each page's source
     * @param webDriverFactory factory that supplies the browser driver used for all requests
     */
    @Inject
    public GoogleExtractUrls(HtmlParser htmlParser, WebDriverFactory webDriverFactory) {
        this.htmlParser = htmlParser;
        driver = webDriverFactory.createNewWebDriver();
    }

    /**
     * Sets the query and how many result pages' worth of links to request.
     *
     * @param searchPhrase query text; it is inserted into the search URL verbatim, so it
     *                     must already be in URL-ready form (e.g. words joined with
     *                     {@code +})
     * @param numPages     number of result pages; the largest {@code num} value requested
     *                     is {@code numPages * 10}
     */
    public void configure(String searchPhrase, int numPages) {
        this.searchPhrase = searchPhrase;
        this.numPages = numPages;
        googleNum = numPages * numResultsPerPage;
    }

    /**
     * Requests the search page repeatedly with a growing {@code num} parameter (10, 20,
     * ... up to the configured limit), extracts the links of every response and adds the
     * ones that pass the filter to the result list. The thread sleeps for the configured
     * delay after every request.
     *
     * <p>If the thread is interrupted while sleeping, the interrupt flag is restored and
     * paging stops; the links collected so far are returned.
     *
     * @return the accumulated list of accepted links (the instance's own list, not a copy)
     */
    public List<String> extractUrls() {
        for (int resultCount = basePageNum; resultCount <= googleNum; resultCount += incrementNum) {
            driver.get(buildSearchUrl(resultCount));
            Map<String, String> links = htmlParser.extractLinks(negativeMatchLinkPrefix, driver.getPageSource());
            if (links != null) {
                addToOutputList(links);
            }
            try {
                Thread.sleep(delay);
            } catch (InterruptedException e) {
                // Keep the interruption visible to the caller instead of swallowing it,
                // and stop issuing further requests.
                Thread.currentThread().interrupt();
                break;
            }
        }
        return outputLinks;
    }

    /**
     * Builds the search URL for one request.
     *
     * <p>The fragment part repeats the configured search phrase. It previously carried a
     * fixed, unrelated query, which conflicted with the phrase in the query string.
     * NOTE(review): {@code tbs=qdr:m} presumably limits results to the past month - confirm.
     */
    private String buildSearchUrl(int resultCount) {
        return "https://www.google.com/search?q=" + searchPhrase
                + "&hl=en&num=" + resultCount
                + "&lr=&ft=i&cr=&safe=images&tbs=qdr:m#q=" + searchPhrase
                + "&hl=en&lr=&tbs=qdr:m&prmd=imvns";
    }

    /**
     * Adds every map value that passes {@link #isWantedLink(String)} and has not been
     * seen before to the output list.
     */
    private void addToOutputList(Map<String, String> links) {
        for (String url : links.values()) {
            // Set.add returns false for a value that is already present, so it serves as
            // the duplicate check and the bookkeeping in one step.
            if (isWantedLink(url) && seenLinks.add(url)) {
                outputLinks.add(url);
            }
        }
    }

    /**
     * Returns true for a non-null link that starts with "http", does not start with
     * "google.com" and does not contain "google" anywhere.
     */
    private boolean isWantedLink(String url) {
        return url != null
                && url.startsWith(positiveMatchLinkPrefix)
                && !url.startsWith(negativeMatchLinkPrefix)
                && !url.contains(negativeMatchLinkContains);
    }
}