package org.Webgatherer.ExperimentalLabs.Scraper.Generic; import com.google.inject.Inject; import org.Webgatherer.CoreEngine.Core.ThreadCommunication.ThreadCommunicationBase; import org.Webgatherer.CoreEngine.Core.Threadable.WebGather.ThreadRetrievePage; import org.Webgatherer.CoreEngine.lib.WebDriverFactory; import org.Webgatherer.ExperimentalLabs.HtmlProcessing.HtmlParser; import org.Webgatherer.WorkflowExample.Workflows.Base.DataInterpetor.EmailExtractor; import org.Webgatherer.WorkflowExample.Workflows.Base.DataInterpetor.TextExtraction; import java.util.Map; /** * @author Rick Dane */ public class ThreadRetrievePageGeneric extends ThreadRetrievePage { protected EmailExtractor emailExtractor; private static String delimeter = "#"; private static String delimeter2nd = "~"; @Inject public ThreadRetrievePageGeneric(WebDriverFactory webDriverFactory, TextExtraction textExtraction, HtmlParser htmlParser, EmailExtractor emailExtractor) { super(webDriverFactory, textExtraction, htmlParser); this.emailExtractor = emailExtractor; } /** * In this case the method is misnamed, as its really just extracting links from the original page * TODO: need to rework this so it makes more sense when used in this context */ @Override protected void getPage() { String scrapedPage = entry[ThreadCommunicationBase.PageQueueEntries.SCRAPED_PAGE.ordinal()]; String baseUrl = entry[ThreadCommunicationBase.PageQueueEntries.BASE_URL.ordinal()]; Map<String, String> links = htmlParser.extractLinks(baseUrl, scrapedPage); for (Map.Entry<String, String> curEntry : links.entrySet()) { entry[ThreadCommunicationBase.PageQueueEntries.CUSTOM_RET_VALUE.ordinal()] = curEntry.getValue(); } threadCommunication.addToOutputDataHolder(entry); System.out.println("thread added to data output"); } @Override protected boolean actionIfUrlValid () { return true; } }