package focusedCrawler.util; import static org.hamcrest.CoreMatchers.*; import static org.junit.Assert.*; import java.util.Arrays; import java.util.List; import org.junit.Test; import focusedCrawler.util.LinkFilter.LinkBlackList; import focusedCrawler.util.LinkFilter.LinkWhiteList; public class LinkFilterTest { @Test public void matchShoudReturnTrueIfStringMatchUrlPatterns() { // given List<String> urlRegexPatterns = Arrays.asList( ".*/thread/.*", ".*/archive/index.php/t.*", "www\\.mydomain\\.com.*", "www\\.somedomain\\.com/forum/.*" ); RegexMatcher matcher = new RegexMatcher(urlRegexPatterns); List<String> urlsThatMatch = Arrays.asList( "http://some.domain.com/thread/something", "http://www.someforum.net/forum/archive/index.php/t-285330.html", "www.mydomain.com/asdf", "www.somedomain.com/forum/asdf.html" ); List<String> urlsThatDoesntMatch = Arrays.asList( "http://some.domain.com/something", "http://www.otherforum.net/calgunforum/t-285330.html", "www.testdomain.com/asdf", "www.somedomain.com/somthingelse/asdf.html" ); for (String url : urlsThatMatch) { // when boolean matched = matcher.matches(url); // then assertThat(url, matched, is(true)); } for (String url : urlsThatDoesntMatch) { // when boolean matched = matcher.matches(url); // then assertThat(url, matched, is(false)); } } @Test public void testComposedLinkFilter() { // given List<String> whitelistRegexes = Arrays.asList( "http[s]?://.*\\.?mydomain\\.com.*" // allow only links from mydomain.com ); List<String> blacklistRegexes = Arrays.asList( ".*/new_reply\\.php.*", // disallow links with path "/new_reply.php" ".*/new_user\\.php.*" // disallow links with path "/new_user.php" ); LinkFilter linkfilter = new LinkFilter(new LinkWhiteList(whitelistRegexes), new LinkBlackList(blacklistRegexes)); List<String> urlsThatMatch = Arrays.asList( "http://mydomain.com/show_thread.php?t=123", "http://www.mydomain.com/1234_some-url#qwer", "http://www.mydomain.com/yeah-yeah-yeah.1234.html", "http://www.mydomain.com/qwer.1234.html" ); List<String> urlsThatDoesntMatch = Arrays.asList( "http://www.mydomain.com/new_reply.php?t=123&u=456", "http://www.mydomain.com/new_user.php?t=123&u=456", "http://www.otherdomain.com/calgunforum/t-285330.html", "http://www.someotherdomain.com/forum/t-285330.html" ); // when for (String url : urlsThatMatch) { // when boolean matched = linkfilter.accept(url); // then assertThat(url, matched, is(true)); } for (String url : urlsThatDoesntMatch) { // when boolean matched = linkfilter.accept(url); // then assertThat(url, matched, is(false)); } } @Test public void shouldAcceptAllUrlsIfBlackListAndWhiteListAreEmpty() { // given List<String> whitelistRegexes = Arrays.asList(); List<String> blacklistRegexes = Arrays.asList(); LinkFilter linkfilter = new LinkFilter(new LinkWhiteList(whitelistRegexes), new LinkBlackList(blacklistRegexes)); List<String> urlsThatMatch = Arrays.asList( "http://mydomain.com/show_thread.php?t=123", "http://www.mydomain.com/1234_some-url#qwer", "http://www.mydomain.com/yeah-yeah-yeah.1234.html", "http://www.mydomain.com/qwer.1234.html" ); // when for (String url : urlsThatMatch) { // when boolean matched = linkfilter.accept(url); // then assertThat(url, matched, is(true)); } } }