package de.juwimm.cms.util;
import java.util.List;
/**
* @author <a href="florin.zalum@juwimm.com">Florin Zalum</a>
* Used by the @see WebCrawlerService to see if a link is valid. Has 2 implementation: Filter and Protocol
* @version $Id$
*/
public abstract class AbstractCrawlUrlStrategy {
private List<String> positives;
private List<String> negatives;
public AbstractCrawlUrlStrategy() {
}
public AbstractCrawlUrlStrategy(List<String> poz, List<String> neg) {
this.positives = poz;
this.negatives = neg;
}
public List<String> getNegatives() {
return negatives;
}
public void setNegatives(List<String> negatives) {
this.negatives = negatives;
}
public List<String> getPositives() {
return positives;
}
public void setPositives(List<String> positives) {
this.positives = positives;
}
public abstract boolean match(String url, String pattern);
/**
* Cases
* 1. Positive list with data - Negative list with data
* 2. Positive list with data - Negative list empty
* 3. Positive list empty - Negative list with data
* 4. Positive list empty - Negative list empty
*
* 1 - url must match positive strings and not match any of the negative strings
* 2 - url must match positive strings
* 3 - url must not match any string from negative strings
* 4 - url has no constrains at all
*
* for any changes write test in @see CrawlUrlStrategyTest
* @param url
* @return
*/
public boolean isUrlValid(String url) {
boolean emptyPositives = positives == null || positives.size() == 0;
boolean emptyNegatives = negatives == null || negatives.size() == 0;
if (emptyPositives && emptyNegatives) {
//no restrictions
return true;
}
if (!emptyPositives) {
return contains(url, positives) && !contains(url, negatives);
} else {
return !contains(url, negatives);
}
}
private boolean contains(String url, List<String> patterns) {
for (String pattern : patterns) {
if (match(url, pattern)) {
return true;
}
}
return false;
}
/**
* Filter strings can be part of an url
* @author fzalum
*
*/
public static class FilterCrawlUrlStrategy extends AbstractCrawlUrlStrategy {
public FilterCrawlUrlStrategy() {
}
public FilterCrawlUrlStrategy(List<String> poz, List<String> neg) {
super(poz, neg);
}
@Override
public boolean match(String url, String pattern) {
return url.contains(pattern);
}
}
/**
* Protocol strings can be only prefixes of an url
* @author fzalum
*
*/
public static class ProtocolCrawlUrlStrategy extends AbstractCrawlUrlStrategy {
public ProtocolCrawlUrlStrategy() {
}
public ProtocolCrawlUrlStrategy(List<String> poz, List<String> neg) {
super(poz, neg);
}
@Override
public boolean match(String url, String pattern) {
return url.startsWith(pattern);
}
}
}