package org.Webgatherer.WorkflowExample.Workflows.Base.DataInterpetor;
import com.google.inject.Injector;
import org.Webgatherer.CoreEngine.Core.ThreadCommunication.FinalOutputContainer;
import org.Webgatherer.CoreEngine.Core.ThreadCommunication.ThreadCommunication;
import org.Webgatherer.CoreEngine.Core.ThreadCommunication.ThreadCommunicationBase;
import org.Webgatherer.ExperimentalLabs.HtmlProcessing.HtmlParser;
import org.Webgatherer.WorkflowExample.DataHolders.ContainerBase;
import org.Webgatherer.WorkflowExample.DataHolders.DataHolder;
import org.Webgatherer.WorkflowExample.DataHolders.DataHolderImpl;
import org.Webgatherer.WorkflowExample.Status.StatusIndicator;
import org.Webgatherer.WorkflowExample.Workflows.Base.Common.WorkflowBase;
import org.ardverk.collection.PatriciaTrie;
import org.ardverk.collection.StringKeyAnalyzer;
import org.ardverk.collection.Trie;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
* @author Rick Dane
*/
public abstract class Workflow_DataInterpretorBase extends WorkflowBase {

    /** Prefix-keyed store of per-entry data holders, keyed by the page-queue entry key. */
    protected Trie<String, DataHolder> trie = new PatriciaTrie<String, DataHolder>(StringKeyAnalyzer.INSTANCE);
    protected List<String> negativeMatchUrlList = new ArrayList<String>();
    /** Holder for the entry currently being processed; null until addPageToDataHolder first runs. */
    protected DataHolder dataHolder;
    // Per-iteration state populated by runWorkflowSetup() from the current page-queue entry.
    protected String curEntryKey;
    protected String curPageBaseUrl;
    protected String curCategory;
    protected String curScrapedPage;
    protected String curPageBaseDomainUrl;
    protected FinalOutputContainer finalOutputContainer;
    protected ThreadCommunication threadCommunication;
    protected List<String> trackSentBackLinks = new ArrayList<String>();
    protected HtmlParser htmlParser;
    /** Plain text extracted from curScrapedPage by the HTML parser. */
    protected String curWebPageText;
    // Container sizing read from properties on each setup call.
    protected int containerDefaultMaxEntries;
    protected int containerDefaultMaxAttempts;

    public Workflow_DataInterpretorBase(Injector injector) {
        super(injector);
    }

    /**
     * This is meant to be called each time the runWorfklow() method is called, it combines functionality that different workflows
     * will need to avoid having to insert boilerplate into each custom runWorkflow() method.
     * <p>
     * Pulls the next entry off the shared page queue and populates the cur* fields from it.
     *
     * @param workflowParams shared workflow parameters; must contain "threadCommunication" and "finalOutputContainer"
     */
    protected void runWorkflowSetup(Map<String, Object> workflowParams) {
        containerDefaultMaxEntries = Integer.parseInt(properties.getProperty("workflow_DataInterpretorBase_containerDefaultMaxEntries"));
        containerDefaultMaxAttempts = Integer.parseInt(properties.getProperty("workflow_DataInterpretorBase_containerDefaultMaxAttempts"));
        //TODO refactor this as it doesn't need to be called with each workflow iteration
        setUp(workflowParams);
        String[] curEntry = threadCommunication.getFromPageQueue();
        curEntryKey = curEntry[ThreadCommunicationBase.PageQueueEntries.KEY.ordinal()];
        curScrapedPage = curEntry[ThreadCommunicationBase.PageQueueEntries.SCRAPED_PAGE.ordinal()];
        curPageBaseUrl = curEntry[ThreadCommunicationBase.PageQueueEntries.BASE_URL.ordinal()];
        curCategory = curEntry[ThreadCommunicationBase.PageQueueEntries.CATEGORY.ordinal()];
        curWebPageText = htmlParser.getText(curScrapedPage);
        curPageBaseDomainUrl = prepareBaseDomainUrl(curEntryKey);
    }

    /**
     * The url being passed in should not have http:// or anything prefixing it already so we just need to check for a slash and remove
     * it and anything trailing it.
     *
     * @param url a bare domain-plus-path string, e.g. "example.com/page"
     * @return "http://" plus the domain portion only, e.g. "http://example.com"
     */
    private String prepareBaseDomainUrl(String url) {
        int index = url.indexOf("/");
        if (index != -1) {
            url = url.substring(0, index);
        }
        return "http://" + url;
    }

    /**
     * Extracts thread-communication and final-output collaborators from the workflow parameter map.
     *
     * @param workflowParams map expected to hold "threadCommunication" and "finalOutputContainer" entries
     */
    protected void setUp(Map<String, Object> workflowParams) {
        threadCommunication = (ThreadCommunication) workflowParams.get("threadCommunication");
        finalOutputContainer = (FinalOutputContainer) workflowParams.get("finalOutputContainer");
    }

    /**
     * Adds a parsed page to the container named {@code label} inside the DataHolder for the current
     * entry key, lazily creating both the holder and the container as needed.
     *
     * @param label      container identifier within the holder
     * @param parsedHtml content to store
     */
    protected void addPageToDataHolder(String label, String parsedHtml) {
        // NOTE(review): this early-exit consults the dataHolder left over from the PREVIOUS call,
        // before re-fetching by curEntryKey below — if the previous holder belonged to a different
        // key this may skip valid adds; verify this is intentional.
        if (dataHolder != null && dataHolder.checkIfContainerAvailable(label) != StatusIndicator.AVAILABLE) {
            return;
        }
        dataHolder = trie.get(curEntryKey);
        if (dataHolder == null) {
            dataHolder = new DataHolderImpl();
            dataHolder.createContainer(label, containerDefaultMaxEntries, containerDefaultMaxAttempts);
            trie.put(curEntryKey, dataHolder);
        }
        // The holder fetched from the trie may predate this label; create the container on demand.
        StatusIndicator status = dataHolder.checkIfContainerAvailable(label);
        if (status == StatusIndicator.DOESNOTEXIST) {
            dataHolder.createContainer(label, containerDefaultMaxEntries, containerDefaultMaxAttempts);
        }
        dataHolder.addEntryToContainer(label, parsedHtml);
    }

    /**
     * Flushes any finished containers to the final output before shutdown.
     * Safe to call even if no page was ever added (dataHolder still null).
     */
    @Override
    public void destroyCleanly() {
        if (dataHolder == null) {
            return; // nothing was ever collected; avoid NPE on shutdown
        }
        dataHolder.destroyRetrieveFinalData();
        addToFinalOutputContainer();
    }

    /**
     * Checks whether {@code searchInText} contains ALL entries of {@code mustContainAllEntries}
     * and AT LEAST ONE entry of {@code mustContainAtLeastOne} (case-insensitive).
     * An empty "at least one" array imposes no constraint.
     *
     * @param mustContainAtLeastOne terms of which at least one must appear (empty = no constraint)
     * @param mustContainAllEntries terms that must all appear
     * @param searchInText          text to search within
     * @return true if both constraints are satisfied
     */
    protected boolean determineIfPageContains(String[] mustContainAtLeastOne, String[] mustContainAllEntries, String searchInText) {
        searchInText = searchInText.toLowerCase();
        for (String curEntry : mustContainAllEntries) {
            if (!searchInText.contains(curEntry.toLowerCase())) {
                return false;
            }
        }
        for (String curEntry : mustContainAtLeastOne) {
            if (searchInText.contains(curEntry.toLowerCase())) {
                return true;
            }
        }
        // BUGFIX: previously returned true unconditionally here, so a non-empty
        // "at least one" list that matched nothing was wrongly accepted.
        return mustContainAtLeastOne.length == 0;
    }

    /**
     * Drains the current holder's finished-container queue into the final output container,
     * namespacing each container id with the current entry key.
     */
    protected void addToFinalOutputContainer() {
        while (!dataHolder.isFinishedContainerQueueEmpty()) {
            ContainerBase cb = dataHolder.pullFromFinishedContainerQueue();
            finalOutputContainer.addToFinalOutputContainer(curEntryKey + "." + cb.getIdentifier(), cb);
        }
    }
}