package org.Webgatherer.WorkflowExample.Workflows.Base.DataInterpetor;

import org.Webgatherer.WorkflowExample.Workflows.Base.DataInterpetor.Workflow_DataInterpretorBase;

import java.util.HashSet;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.Vector;

/**
 * Helpers that pull links out of parsed HTML and route them either to the
 * workflow's sendback holder (positive matches) or to its negative-match list.
 *
 * @author Rick Dane
 */
public class TextExtraction {

    /** URL suffixes (lower case) that mark a link as something other than a web page. */
    private static final String[] IGNORED_SUFFIXES = {
            ".pdf", ".txt", ".zip", ".js", ".javascript", ".css",
            ".doc", ".jpg", ".gif", ".png", ".bmp", ".xls"
    };

    /** Substrings whose presence anywhere in a URL marks it as invalid. */
    private static final String[] NEGATIVE_SUBSTRINGS = {"@"};

    private final Set<String> ignoreSuffixes = new HashSet<String>();
    private final Set<String> negativeContains = new HashSet<String>();

    public TextExtraction() {
        prepareNegativeMatchLists();
    }

    /** Fills the suffix and substring sets consulted by {@link #isUrlValid(String)}. */
    private void prepareNegativeMatchLists() {
        for (String suffix : IGNORED_SUFFIXES) {
            ignoreSuffixes.add(suffix);
        }
        for (String fragment : NEGATIVE_SUBSTRINGS) {
            negativeContains.add(fragment);
        }
    }

    /**
     * Extracts links from a page whose label contains at least one of the given tokens
     * (case-insensitive) and whose URL contains the current domain name, then hands each
     * one to the sendback holder under the specified internal label.
     *
     * @param instance      workflow instance supplying the HTML parser and the tracking collections
     * @param tokens        tokens to look for in each link label; {@code null} is treated as empty
     * @param parsedHtml    HTML handed to the parser's link extraction
     * @param internalLabel label attached to every link that is sent back
     * @param curDomainName only URLs containing this name (case-insensitive) are considered
     */
    public void extractLinksForSendbackThatMatchKeys(Workflow_DataInterpretorBase instance, LinkedList<String> tokens, String parsedHtml, String internalLabel, String curDomainName) {
        Map<String, String> links = instance.htmlParser.extractLinks(instance.curPageBaseDomainUrl, parsedHtml);
        if (links == null || links.isEmpty()) {
            return;
        }

        // Lower-case the domain once instead of on every iteration.
        String domainLower = lower(curDomainName);
        for (Map.Entry<String, String> entry : links.entrySet()) {
            String url = entry.getValue();
            if (!isSameSite(url, domainLower)) {
                // it's not a link from the original site so we don't add it,
                // TODO: may want to make this an optional parameter at some point
                continue;
            }
            if (!labelMatchesAnyToken(entry.getKey(), tokens)) {
                continue;
            }
            ifNotUsedAdd(instance, url, internalLabel, LinkMatchType.POSITIVE_MATCH);
        }
    }

    public enum LinkMatchType {
        POSITIVE_MATCH, NEGATIVE_MATCH;
    }

    /**
     * Routes every link on the page whose URL contains the current domain name according
     * to {@code linkMatchType}: positive matches go to the sendback holder, negative
     * matches are recorded in the negative-match list.
     *
     * @param instance      workflow instance supplying the HTML parser and the tracking collections
     * @param parsedHtml    HTML handed to the parser's link extraction
     * @param internalLabel label attached to every link that is sent back
     * @param curDomainName only URLs containing this name (case-insensitive) are considered
     * @param linkMatchType how the qualifying links are to be recorded
     */
    public void extractAllLinksFromSameSite(Workflow_DataInterpretorBase instance, String parsedHtml, String internalLabel, String curDomainName, LinkMatchType linkMatchType) {
        Map<String, String> links = instance.htmlParser.extractLinks(instance.curPageBaseDomainUrl, parsedHtml);
        if (links == null || links.isEmpty()) {
            return;
        }

        String domainLower = lower(curDomainName);
        for (Map.Entry<String, String> entry : links.entrySet()) {
            String url = entry.getValue();
            if (!isSameSite(url, domainLower)) {
                // it's not a link from the original site so we don't add it,
                // TODO: may want to make this an optional parameter at some point
                continue;
            }
            ifNotUsedAdd(instance, url, internalLabel, linkMatchType);
        }
    }

    /** Locale-independent lower-casing so matching does not vary with the JVM's default locale. */
    private static String lower(String text) {
        return text.toLowerCase(Locale.ROOT);
    }

    /** Returns true when {@code url} is non-null and contains the already lower-cased domain name. */
    private static boolean isSameSite(String url, String domainLower) {
        return url != null && lower(url).contains(domainLower);
    }

    /** Returns true when {@code label} contains any non-null token, ignoring case. */
    private static boolean labelMatchesAnyToken(String label, LinkedList<String> tokens) {
        if (label == null || tokens == null) {
            return false;
        }
        String labelLower = lower(label);
        for (String token : tokens) {
            // NOTE(review): the original condition tested the label twice
            // (label.contains(token) || label.contains(token)); the second operand may have
            // been meant to test the URL. Only the label is checked here, as before.
            if (token != null && labelLower.contains(lower(token))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Records {@code url} according to the match type, skipping URLs already recorded.
     * <p>
     * Positive match: a URL failing {@link #isUrlValid(String)} goes to the negative-match
     * list; otherwise, if it is in neither tracking collection, the array
     * {@code {curEntryKey, url, internalLabel, null}} is added to the sendback holder and
     * the URL is remembered in {@code trackSentBackLinks}.
     * <p>
     * Negative match: the URL is added to the negative-match list unless it is already in
     * either tracking collection.
     */
    private void ifNotUsedAdd(Workflow_DataInterpretorBase instance, String url, String internalLabel, LinkMatchType linkMatchType) {
        switch (linkMatchType) {
            case POSITIVE_MATCH:
                if (!isUrlValid(url)) {
                    // Guarded so repeated sightings of the same bad URL are not re-added.
                    if (!instance.negativeMatchUrlList.contains(url)) {
                        instance.negativeMatchUrlList.add(url);
                    }
                    return;
                }
                if (!instance.trackSentBackLinks.contains(url) && !instance.negativeMatchUrlList.contains(url)) {
                    String[] strHolder = {instance.curEntryKey, url, internalLabel, null};
                    instance.threadCommunication.addToSendbackDataHolder(strHolder);
                    instance.trackSentBackLinks.add(url);
                }
                break;
            case NEGATIVE_MATCH:
                if (!instance.negativeMatchUrlList.contains(url) && !instance.trackSentBackLinks.contains(url)) {
                    instance.negativeMatchUrlList.add(url);
                }
                break;
            default:
                break;
        }
    }

    /**
     * Not implemented yet: currently this only runs the parser's link extraction against
     * {@code instance.curPageBaseUrl} and discards the result. {@code internalLabel} is unused.
     */
    public void extractEmailAddressesToDataHolder(Workflow_DataInterpretorBase instance, String parsedHtml, String internalLabel) {
        // TODO: extract e-mail addresses and store them. The call is kept in case
        // extractLinks has side effects — confirm against the parser before removing it.
        instance.htmlParser.extractLinks(instance.curPageBaseUrl, parsedHtml);
    }

    /**
     * Runs a url through checks to determine if it appears to be a valid url for a web page.
     * A url is rejected when it is {@code null}, ends (case-insensitively) with one of the
     * ignored file suffixes, or contains one of the negative substrings (currently "@").
     *
     * @param url the url to check
     * @return {@code true} if none of the rejection rules apply
     */
    public boolean isUrlValid(String url) {
        if (url == null) {
            return false;
        }
        // Suffixes are stored lower case; fold the url so ".PDF" is rejected like ".pdf".
        String urlLower = lower(url);
        for (String curIgnoreCheck : ignoreSuffixes) {
            if (urlLower.endsWith(curIgnoreCheck)) {
                return false;
            }
        }
        for (String negativeMatch : negativeContains) {
            if (url.contains(negativeMatch)) {
                return false;
            }
        }
        return true;
    }
}