package org.Webgatherer.ExperimentalLabs.Scraper.Indeed; import com.google.inject.Inject; import org.Webgatherer.CoreEngine.Core.ThreadCommunication.ThreadCommunication; import org.Webgatherer.ExperimentalLabs.Scraper.Core.PageRetrieverThreadManagerScraper; import org.Webgatherer.ExperimentalLabs.Scraper.Core.ScraperBase; import org.Webgatherer.WorkflowExample.Workflows.Implementations.WebGatherer.EnumUrlRetrieveOptions; import java.util.Queue; /** * @author Rick Dane */ public class ScraperIndeed extends ScraperBase { protected String urlPrefix = "http://www.indeed.com/jobs?q="; protected String urlSecond = "%2C+ca&start="; protected String javaScriptLinkIdentifier = "/rc/clk?jk="; @Inject public ScraperIndeed(PageRetrieverThreadManagerScraper pageRetrieverThreadManager) { super(pageRetrieverThreadManager); } protected void customRunActions(int i, ThreadCommunication threadCommunication, String searchString) { int pgNum = i * numberResultsPerPage; Queue queue = pageRetrieverThreadManager.getInitialJavascriptLinksAddToPageQueue(urlPrefix + searchString + urlSecond + pgNum, javaScriptLinkIdentifier, URL_IDENTIFIER); threadCommunication.setPageQueue(queue); while (!threadCommunication.isPageQueueEmpty()) { try { Thread.sleep(miniSleepDuration); } catch (InterruptedException e) { } pageRetrieverThreadManager.run(EnumUrlRetrieveOptions.HTMLPAGE.ordinal()); } System.out.println("iterate main loop"); i++; } protected String parseUrl(String inputUrl) { String url = null; String[] split1 = inputUrl.split("\\~"); if (split1.length >= 2) { String[] split2 = split1[1].split("\\#"); if (split1.length >= 1) { url = split2[0]; } } return url; } }