package org.Webgatherer.ExperimentalLabs.Scraper.Object;
import com.google.inject.Inject;
import org.Webgatherer.CoreEngine.lib.WebDriverFactory;
import org.Webgatherer.ExperimentalLabs.HtmlProcessing.HtmlParser;
import org.Webgatherer.Persistence.InputOutput.PersistenceImpl_WriteToFile;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import java.util.ArrayList;
import java.util.List;
/**
* @author Rick Dane
*/
public class ScraperBaseJavascript extends ScraperBaseDepr {
private int delay = 100;
private int pageNum = 1;
private int maxPages = 67;
private String fileOutput;
@Inject
public ScraperBaseJavascript(WebDriverFactory webDriverFactory, HtmlParser htmlParser) {
super(webDriverFactory, htmlParser);
}
protected void configure(String fileOutput) {
this.fileOutput = fileOutput;
}
protected void sleep() {
try {
Thread.sleep(delay);
} catch (InterruptedException e) {
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
}
}
protected String prepareBaseDomainUrl(String url) {
int index = url.indexOf("/");
if (index != -1) {
url = url.substring(0, index);
}
return url;
}
protected String pullCompanyUrl(String[] origUrl, String companyName) {
List<String> returnUrls = new ArrayList<String>();
driver.get(origUrl[0]);
List<WebElement> links = driver.findElements(By.tagName("a"));
for (WebElement curElement : links) {
String matchUrl = curElement.getAttribute("href");
if (matchUrl != null) {
String checkVar = null;
if (origUrl[1].length() >= 5) {
checkVar = origUrl[1].substring(0, 4);
}
if (checkVar != null && !matchUrl.contains(companyName) && matchUrl.toLowerCase().contains(checkVar)) {
return matchUrl;
}
}
}
return null;
}
public void getLinksFromOnclick(String searchUrl, String baseUrl, String key) {
String url = searchUrl + pageNum;
driver.get(url);
List<WebElement> links;
List<String[]> initialUrls = new ArrayList<String[]>();
List<String> urls = new ArrayList<String>();
links = driver.findElements(By.tagName("a"));
for (WebElement link : links) {
getLinkFromOnclickElementInner_One(link, baseUrl, initialUrls);
}
for (String[] curEntry : initialUrls) {
String pulledUrl = pullCompanyUrl(curEntry, key);
if (pulledUrl != null) {
urls.add(pulledUrl);
System.out.println(pulledUrl);
PersistenceImpl_WriteToFile.appendToFile(fileOutput, pulledUrl + "\n");
}
}
}
//THE METHODS BELOW ARE ESSENTIALLY "WORKFLOW" METHODS AS, AT LEAST FOR NOW, THEY HAVE TO BE CUSTOMIZED BASED ON WHAT IS BEING SCRAPED
private void getLinkFromOnclickElementInner_One(WebElement link, String baseUrl, List<String[]> initialUrls) {
String onclick = null;
try {
onclick = link.getAttribute("onclick");
if (onclick.startsWith("snap_to_marker")) {
String title = link.getAttribute("title");
sleep();
link.click();
String[] tmpArray = {convertToUrl_One(baseUrl, title).toLowerCase(), title.toLowerCase()};
initialUrls.add(tmpArray);
}
} catch (Exception e) {
}
}
protected String convertToUrl_One(String inputStr, String baseUrl) {
String[] split = inputStr.split("-");
String retString = "";
if (split != null && split.length > 0) {
retString = split[0];
} else {
retString = inputStr;
}
retString = retString.trim().replace(" ", "-").toLowerCase();
return baseUrl + retString;
}
}