package org.Webgatherer.WorkflowExample.Workflows.Implementations.WebGatherer;
import com.google.inject.Injector;
import org.Webgatherer.CoreEngine.Core.ThreadCommunication.FinalOutputContainer;
import org.Webgatherer.CoreEngine.Core.ThreadCommunication.ThreadCommunication;
import org.Webgatherer.CoreEngine.Core.ThreadCommunication.ThreadCommunicationBase;
import org.Webgatherer.CoreEngine.Core.Threadable.WebGather.WebGather;
import org.Webgatherer.CoreEngine.lib.WebDriverFactory;
import org.Webgatherer.WorkflowExample.Workflows.Base.Common.WorkflowBase;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.support.ui.Wait;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Workflow that takes one entry from the page queue, loads the configured
 * number of Google search result pages for that entry's search string, and
 * pushes every anchor {@code href} that passes {@link #checkIfMatch(String)}
 * to the thread-communication output holder.
 *
 * @author Rick Dane
 */
public class Workflow_WebSearch1 extends WorkflowBase {

    /**
     * Step added to the {@code start} URL parameter for each successive page
     * (page 1 starts at 0, page 2 at 10, ...).
     */
    private static final int PAGE_START_STEP = 10;

    /** A link containing any of these substrings is rejected. */
    private static final String[] NEGATIVE_MATCHES = {"google", "youtube"};

    /** A link must contain every one of these substrings to be accepted. */
    private static final String[] POSITIVE_MATCHES = {"http"};

    public Workflow_WebSearch1(Injector injector) {
        super(injector);
    }

    /**
     * Runs one search job taken from the page queue.
     *
     * <p>Reads the {@code "webGather"} and {@code "threadCommunication"} values
     * from {@code workflowParams}, pulls a single entry from the page queue and
     * returns immediately if that entry is {@code null}. Otherwise it loads
     * pages 1..N of the search (N is the entry's {@code NUM_PAGES_TOSCRAPE}
     * field), collects the matching links, and finally marks the web-gatherer
     * thread as finished when the page queue reports empty.
     *
     * @param workflowParams parameter map; must contain a {@code WebGather}
     *                       under {@code "webGather"} and a
     *                       {@code ThreadCommunication} under
     *                       {@code "threadCommunication"}
     * @throws NumberFormatException if the entry's page-count field is not a
     *                               parseable integer
     */
    public void runWorkflow(Map<String, Object> workflowParams) {
        WebGather webGather = (WebGather) workflowParams.get("webGather");
        ThreadCommunication threadCommunication = (ThreadCommunication) workflowParams.get("threadCommunication");

        String[] curEntry = threadCommunication.getFromPageQueue();
        if (curEntry == null) {
            return;
        }

        String searchString = curEntry[ThreadCommunicationBase.PageQueueEntries.CUSTOM_PARAM.ordinal()];
        String key = curEntry[ThreadCommunicationBase.PageQueueEntries.KEY.ordinal()];
        String customLabel = curEntry[ThreadCommunicationBase.PageQueueEntries.CUSTOM_LABEL.ordinal()];
        int numberOfPagesToScrape = Integer.parseInt(curEntry[ThreadCommunicationBase.PageQueueEntries.NUM_PAGES_TOSCRAPE.ordinal()]);

        for (int page = 1; page <= numberOfPagesToScrape; page++) {
            int start = (page - 1) * PAGE_START_STEP;
            // NOTE(review): searchString is inserted as-is; confirm whether callers
            // pre-encode it before considering URL-encoding here.
            String url = "https://www.google.com/search?gcx=c&sourceid=chrome&ie=UTF-8&q=google+places#q=" + searchString + "&hl=en&tbm=plcs&prmd=imvns&start=" + start;

            //have to get new instance each time or there is a problem with WebElements, not sure why and can hopefully correct this at some point
            WebDriver driver = webGather.getNewWebDriver();
            try {
                driver.get(url);
                List<WebElement> links = driver.findElements(By.tagName("a"));
                for (WebElement link : links) {
                    String linkStr = readHref(link);
                    if (linkStr != null && checkIfMatch(linkStr)) {
                        // Output entry: key first, link second, custom label last;
                        // the four middle slots are left null.
                        String[] entry = {key, linkStr, null, null, null, null, customLabel};
                        threadCommunication.addToOutputDataHolder(entry);
                    }
                }
            } finally {
                // Always release this page's driver, even if loading or scanning failed.
                driver.close();
            }
        }

        if (threadCommunication.isPageQueueEmpty()) {
            threadCommunication.setIsWebGathererThreadFinished(true);
        }
    }

    /**
     * Reads the {@code href} attribute of an element on a best-effort basis.
     *
     * @param link the element to read
     * @return the attribute value, or {@code null} if it is absent or reading it failed
     */
    private String readHref(WebElement link) {
        try {
            return link.getAttribute("href");
        } catch (RuntimeException ignored) {
            // Deliberately skipped: one unreadable element must not abort the whole
            // page scan (presumably elements can go stale while iterating).
            return null;
        }
    }

    /**
     * Decides whether a link should be kept.
     *
     * @param linkStr the link text to test; must not be {@code null}
     * @return {@code false} if the link contains any negative-match substring or
     *         lacks any positive-match substring, {@code true} otherwise
     */
    private boolean checkIfMatch(String linkStr) {
        for (String curMatch : NEGATIVE_MATCHES) {
            if (linkStr.contains(curMatch)) {
                return false;
            }
        }
        for (String curMatch : POSITIVE_MATCHES) {
            if (!linkStr.contains(curMatch)) {
                return false;
            }
        }
        return true;
    }
}