package focusedCrawler.target.classifier;
import static org.hamcrest.CoreMatchers.is;
import static org.junit.Assert.assertThat;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.junit.Test;
import focusedCrawler.target.model.Page;
import focusedCrawler.util.LinkFilter;
import focusedCrawler.util.LinkFilter.LinkBlackList;
import focusedCrawler.util.LinkFilter.LinkWhiteList;
/**
 * Tests for {@link UrlRegexTargetClassifier}: pages are classified purely from
 * their URL, either against a plain list of regexes or against a
 * {@link LinkFilter} combining a white list and a black list.
 */
public class UrlRegexTargetClassifierTest {

    /**
     * A page whose URL matches at least one of the configured regexes is
     * expected to be relevant with score 1.0; a page whose URL matches none of
     * them is expected to be irrelevant with score 0.0.
     */
    @Test
    public void shouldClassifyPagesBasedOnListOfUrlRegexes() throws Exception {
        // given
        List<String> regexes = Arrays.asList(
            ".*/thread/.*",
            ".*/archive/index.php/t.*",
            "https?://www\\.mydomain\\.com.*",
            "https?://www\\.somedomain\\.com/forum/.*"
        );
        UrlRegexTargetClassifier classifier = new UrlRegexTargetClassifier(regexes);

        List<String> acceptedUrls = Arrays.asList(
            "http://some.domain.com/thread/something",
            "http://www.someforum.net/forum/archive/index.php/t-1234.html",
            "http://www.mydomain.com/asdf",
            "http://www.somedomain.com/forum/asdf.html"
        );
        List<String> rejectedUrls = Arrays.asList(
            "http://some.domain.com/something",
            "http://www.otherforum.net/someforum/t-285330.html",
            "http://www.testdomain.com/asdf",
            "http://www.somedomain.com/somthingelse/asdf.html"
        );

        // Build every page up front so a malformed URL fails before any assertion runs.
        List<Page> acceptedPages = asPages(acceptedUrls);
        List<Page> rejectedPages = asPages(rejectedUrls);

        // when / then
        assertClassification(classifier, acceptedPages, true, 1d);
        assertClassification(classifier, rejectedPages, false, 0d);
    }

    /**
     * With a {@link LinkFilter}, a page is expected to be relevant (score 1.0)
     * when its URL matches the white list and no black-list entry, and
     * irrelevant (score 0.0) when it hits the black list or misses the white list.
     */
    @Test
    public void shouldClassifyPagesBasedOnTheUrlWhiteListAndBlackLists() throws Exception {
        // given
        List<String> allowed = Arrays.asList(
            "http[s]?://.*\\.?mydomain\\.com.*" // only links from mydomain.com are allowed
        );
        List<String> denied = Arrays.asList(
            ".*/new_reply\\.php.*", // reject links whose path contains "/new_reply.php"
            ".*/new_user\\.php.*"   // reject links whose path contains "/new_user.php"
        );
        LinkWhiteList whiteList = new LinkWhiteList(allowed);
        LinkBlackList blackList = new LinkBlackList(denied);
        LinkFilter filter = new LinkFilter(whiteList, blackList);
        UrlRegexTargetClassifier classifier = new UrlRegexTargetClassifier(filter);

        List<String> acceptedUrls = Arrays.asList(
            "http://mydomain.com/show_thread.php?t=123",
            "http://www.mydomain.com/1234_some-url#qwer",
            "http://www.mydomain.com/yeah-yeah-yeah.1234.html",
            "http://www.mydomain.com/qwer.1234.html"
        );
        List<String> rejectedUrls = Arrays.asList(
            "http://www.mydomain.com/new_reply.php?t=123&u=456",
            "http://www.mydomain.com/new_user.php?t=123&u=456",
            "http://www.otherdomain.com/calgunforum/t-285330.html",
            "http://www.someotherdomain.com/forum/t-285330.html"
        );

        // Build every page up front so a malformed URL fails before any assertion runs.
        List<Page> acceptedPages = asPages(acceptedUrls);
        List<Page> rejectedPages = asPages(rejectedUrls);

        // when / then
        assertClassification(classifier, acceptedPages, true, 1d);
        assertClassification(classifier, rejectedPages, false, 0d);
    }

    /**
     * Classifies each page in order and asserts both the relevance flag and the
     * relevance score, using the page's string form as the failure message.
     *
     * @param classifier       classifier under test
     * @param pages            pages to classify
     * @param expectedRelevant expected value of {@code isRelevant()}
     * @param expectedScore    expected value of {@code getRelevance()}
     */
    private void assertClassification(UrlRegexTargetClassifier classifier,
                                      List<Page> pages,
                                      boolean expectedRelevant,
                                      double expectedScore) throws Exception {
        for (Page candidate : pages) {
            TargetRelevance outcome = classifier.classify(candidate);
            assertThat(candidate.toString(), outcome.isRelevant(), is(expectedRelevant));
            assertThat(candidate.toString(), outcome.getRelevance(), is(expectedScore));
        }
    }

    /**
     * Wraps each URL string in a {@link Page}, preserving order. The second
     * constructor argument is always the empty string (presumably the page
     * content, which these URL-only tests do not need).
     *
     * @param urls URL strings to convert
     * @return one page per input URL
     * @throws MalformedURLException if any string is not a valid URL
     */
    private List<Page> asPages(List<String> urls) throws MalformedURLException {
        List<Page> result = new ArrayList<Page>(urls.size());
        for (String address : urls) {
            result.add(new Page(new URL(address), ""));
        }
        return result;
    }
}