package org.Webgatherer.ExperimentalLabs.Scraper.Generic; import com.google.inject.Inject; import org.Webgatherer.CoreEngine.Core.ThreadCommunication.ThreadCommunication; import org.Webgatherer.CoreEngine.Core.ThreadCommunication.ThreadCommunicationBase; import org.Webgatherer.ExperimentalLabs.HtmlProcessing.HtmlParser; import org.Webgatherer.ExperimentalLabs.Scraper.Core.PageRetrieverThreadManagerScraper; import org.Webgatherer.ExperimentalLabs.Scraper.Core.ScraperBase; import java.util.Map; import java.util.Queue; import java.util.concurrent.ConcurrentLinkedQueue; /** * @author Rick Dane */ public class ScraperGeneric extends ScraperBase { //protected String urlPrefix = "http://sfbay.craigslist.org/search/?areaID=1&catAbb=jjj&query="; protected String urlPrefix; protected String urlPostfix; protected String urlPatternWildcard = "#"; protected HtmlParser htmlParser; //protected String baseDomainName = "http://craigslist.org"; protected String baseDomainName; protected int pageIncrementAmnt; @Inject public ScraperGeneric(PageRetrieverThreadManagerScraper pageRetrieverThreadManager, HtmlParser htmlParser) { super(pageRetrieverThreadManager); this.htmlParser = htmlParser; } @Override public void configure(String urlPrefix, String urlPostfix, String baseDomainName, int pageIncrementAmnt) { this.baseDomainName = baseDomainName; this.urlPostfix = urlPostfix; this.urlPrefix = urlPrefix; this.pageIncrementAmnt = pageIncrementAmnt; } private String prepareUrlString(String searchStr, int pgNum) { StringBuilder strBld = new StringBuilder(); strBld.append(urlPrefix); strBld.append(searchStr); strBld.append(urlPostfix); String retStr = strBld.toString().replace(urlPatternWildcard, String.valueOf(pgNum)); return retStr; } /** * This is for pages where the links are right on the page, no cleverness is required to get them, such as using JavaScript, so we * are cutting out steps that would happen in another scraper, such as the indeed scraper, for example * * @param i * @param threadCommunication * @param searchString */ @Override protected void customRunActions(int i, ThreadCommunication threadCommunication, String searchString) { int pgNum = i * pageIncrementAmnt; String urlPrepared = prepareUrlString(searchString, pgNum); driver.get(urlPrepared); String pageSource = driver.getPageSource(); Queue<String[]> queue = new ConcurrentLinkedQueue<String[]>(); Map<String, String> links = htmlParser.extractLinks(baseDomainName, pageSource); for (Map.Entry<String, String> curEntry : links.entrySet()) { String[] outputEntry = new String[PageRetrieverThreadManagerScraper.sizeOfStringArrayEnum]; outputEntry[ThreadCommunicationBase.PageQueueEntries.CUSTOM_RET_VALUE.ordinal()] = curEntry.getValue(); threadCommunication.addToOutputDataHolder(outputEntry); } i++; } protected String parseUrl(String inputUrl) { return inputUrl; } }