package org.Webgatherer.ExperimentalLabs.Scraper.Indeed;
import com.google.inject.Inject;
import org.Webgatherer.CoreEngine.Core.ThreadCommunication.ThreadCommunicationBase;
import org.Webgatherer.CoreEngine.Core.Threadable.WebGather.ThreadRetrievePage;
import org.Webgatherer.CoreEngine.lib.WebDriverFactory;
import org.Webgatherer.ExperimentalLabs.HtmlProcessing.HtmlParser;
import org.Webgatherer.WorkflowExample.Workflows.Base.DataInterpetor.EmailExtractor;
import org.Webgatherer.WorkflowExample.Workflows.Base.DataInterpetor.TextExtraction;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import java.util.LinkedList;
import java.util.Set;
/**
 * Page-retrieval thread specialised for Indeed listings.
 *
 * <p>{@link #getPage()} loads the entry's base URL, clicks the anchor whose {@code href} equals
 * the entry's custom parameter and then visits every open browser window. For each window whose
 * URL does not contain {@code "www.indeed"}, the e-mail addresses extracted from the page source
 * and a small url/title record are written into the entry, and the entry is added to the output
 * data holder.
 *
 * @author Rick Dane
 */
public class ThreadRetrievePageIndeed extends ThreadRetrievePage {

    /** Terminates each field inside the custom return value. */
    private static final String FIELD_DELIMITER = "#";

    /** Separates a field name from its value inside the custom return value. */
    private static final String KEY_VALUE_DELIMITER = "~";

    /** Windows whose current URL contains this marker are skipped. */
    private static final String INDEED_HOST_MARKER = "www.indeed";

    protected EmailExtractor emailExtractor;

    /**
     * Creates the thread with its injected collaborators.
     *
     * @param webDriverFactory passed through to the superclass
     * @param textExtraction   passed through to the superclass
     * @param htmlParser       passed through to the superclass
     * @param emailExtractor   used to pull e-mail addresses out of retrieved page sources
     */
    @Inject
    public ThreadRetrievePageIndeed(WebDriverFactory webDriverFactory, TextExtraction textExtraction,
                                    HtmlParser htmlParser, EmailExtractor emailExtractor) {
        super(webDriverFactory, textExtraction, htmlParser);
        this.emailExtractor = emailExtractor;
    }

    /**
     * Opens the base URL, clicks the link identified by the entry's custom parameter and records
     * every open window that is not on {@code www.indeed}.
     */
    @Override
    protected void getPage() {
        driver.get(entry[ThreadCommunicationBase.PageQueueEntries.BASE_URL.ordinal()]);

        // String.valueOf keeps the original concatenation semantics (a null value renders as "null").
        String targetHref = String.valueOf(entry[ThreadCommunicationBase.PageQueueEntries.CUSTOM_PARAM.ordinal()]);
        // The href is quoted as an XPath literal so that values containing quote characters
        // still produce a valid selector.
        WebElement element = driver.findElement(By.xpath("//a[@href=" + toXPathLiteral(targetHref) + "]"));
        System.out.println("attempt click");
        element.click();

        Set<String> windowHandles = driver.getWindowHandles();
        for (String windowHandle : windowHandles) {
            driver.switchTo().window(windowHandle);
            String page = driver.getPageSource();
            String curUrl = driver.getCurrentUrl();
            if (curUrl.contains(INDEED_HOST_MARKER)) {
                continue;
            }
            recordExternalPage(page, curUrl);
        }
    }

    /**
     * Stores the e-mail addresses and the url/title record of the current window in the entry and
     * adds the entry to the output data holder.
     *
     * @param page   page source of the window the driver is currently switched to
     * @param curUrl URL of that window
     */
    private void recordExternalPage(String page, String curUrl) {
        LinkedList<String> emailAddresses = emailExtractor.extractEmailAddressesList(page);
        StringBuilder emails = new StringBuilder();
        for (String email : emailAddresses) {
            // Every address, including the last one, is followed by a comma.
            emails.append(email).append(',');
        }
        entry[ThreadCommunicationBase.PageQueueEntries.EMAIL_ADDRESSES.ordinal()] = emails.toString();

        //TODO: rethink this, we are doing this and then undoing it later, figure out why its here at all
        StringBuilder record = new StringBuilder();
        record.append("url").append(KEY_VALUE_DELIMITER).append(curUrl).append(FIELD_DELIMITER);
        record.append("title").append(KEY_VALUE_DELIMITER).append(driver.getTitle()).append(FIELD_DELIMITER);
        entry[ThreadCommunicationBase.PageQueueEntries.CUSTOM_RET_VALUE.ordinal()] = record.toString();

        threadCommunication.addToOutputDataHolder(entry);
        System.out.println("thread added to data output");
    }

    /**
     * Renders {@code value} as an XPath 1.0 string literal.
     *
     * <p>XPath 1.0 has no escape sequence inside string literals, so the value is wrapped in
     * whichever quote character it does not contain; when it contains both, it is split on
     * apostrophes and reassembled with {@code concat(...)}.
     *
     * @param value text to quote; must not be {@code null}
     * @return an XPath expression that evaluates to exactly {@code value}
     */
    private static String toXPathLiteral(String value) {
        if (value.indexOf('\'') < 0) {
            return "'" + value + "'";
        }
        if (value.indexOf('"') < 0) {
            return "\"" + value + "\"";
        }
        StringBuilder literal = new StringBuilder("concat(");
        // Limit -1 keeps trailing empty parts so a value ending in an apostrophe is preserved.
        String[] parts = value.split("'", -1);
        for (int i = 0; i < parts.length; i++) {
            if (i > 0) {
                literal.append(",\"'\",");
            }
            literal.append('\'').append(parts[i]).append('\'');
        }
        return literal.append(')').toString();
    }

    /**
     * @return always {@code true}
     */
    @Override
    protected boolean actionIfUrlValid() {
        return true;
    }
}