package org.Webgatherer.WorkflowExample.Workflows.Implementations.DataInterpetor;
import com.google.inject.Injector;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.Map;
import org.Webgatherer.ExperimentalLabs.HtmlProcessing.HtmlParserImpl;
import org.Webgatherer.WorkflowExample.Workflows.Base.DataInterpetor.TextExtraction;
import org.Webgatherer.WorkflowExample.Workflows.Base.DataInterpetor.Workflow_DataInterpretorBase;
import org.htmlcleaner.HtmlCleaner;
/**
* @author Rick Dane
*/
public final class Workflow_DataInterpretor_1 extends Workflow_DataInterpretorBase {
private TextExtraction textExtraction;
protected int count = 1;
public Workflow_DataInterpretor_1(Injector injector) {
super(injector);
htmlParser = new HtmlParserImpl(htmlCleaner);
textExtraction = injector.getInstance(TextExtraction.class);
htmlCleanerProvider = injector.getProvider(HtmlCleaner.class);
}
@Override
public void runWorkflow(Map<String, Object> workflowParams) {
System.out.print(count + ", ");
count++;
runWorkflowSetup(workflowParams);
if (curScrapedPage != null) {
String[] checkFor1 = {"career", "job", "employment", "work"};
checkForMatchesToSendBackLink(checkFor1, "careers");
String[] checkFor3 = {"about", "info"};
checkForMatchesToSendBackLink(checkFor3, "aboutus");
String[] checkFor4 = {"site map"};
checkForMatchesToSendBackLink(checkFor4, "sitemap");
if (curCategory != null && (curCategory.equals("aboutus") || curCategory.equals("sitemap"))) {
checkForMatchesToSendBackLink(checkFor1, "careers");
}
if (curCategory == null) {
//add negative matches from the initial page, to be used to determine unique links on specific pages later
textExtraction.extractAllLinksFromSameSite(this, curScrapedPage, "careers", curPageBaseDomainUrl, TextExtraction.LinkMatchType.NEGATIVE_MATCH);
}
}
dataHolder = trie.get(curEntryKey);
if (curCategory != null && curCategory.equals("aboutus")) {
addPageToDataHolder("aboutus", curPageBaseUrl);
}
if (curCategory != null && curCategory.equals("careers")) {
textExtraction.extractAllLinksFromSameSite(this, curScrapedPage, "careers", curPageBaseDomainUrl, TextExtraction.LinkMatchType.POSITIVE_MATCH);
String[] mustContainAtLeastOne = {"developer", "engineer", "programmer"};
String[] mustContainAllEntries = {"java", "software"};
boolean isMatch = determineIfPageContains(mustContainAtLeastOne, mustContainAllEntries, curWebPageText);
if (isMatch) {
addPageToDataHolder("careers", curPageBaseUrl);
}
}
//move any finished containers to the finished queue
if (dataHolder != null && !dataHolder.isFinishedContainerQueueEmpty()) {
addToFinalOutputContainer();
}
}
protected void checkForMatchesToSendBackLink(String[] matches, String label) {
LinkedList<String> tokenstoCheckFor = new LinkedList<String>();
for (String curMatch : matches) {
tokenstoCheckFor.add(curMatch);
}
textExtraction.extractLinksForSendbackThatMatchKeys(this, tokenstoCheckFor, curScrapedPage, label, curPageBaseDomainUrl);
}
@Override
public void destroyCleanly() {
while (!threadCommunication.isPageQueueEmpty()) {
try {
Thread.sleep(2000);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
dataHolder.destroyRetrieveFinalData();
addToFinalOutputContainer();
}
}