package focusedCrawler.link;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.nullValue;
import static org.junit.Assert.assertThat;
import org.junit.Test;
import focusedCrawler.link.frontier.LinkRelevance;
public class DownloadSchedulerTest {
@Test
public void shouldSelectLinksBasedOnPoliteness() throws Exception {
LinkRelevance l1 = new LinkRelevance("http://ex1.com/1", 1);
LinkRelevance l2 = new LinkRelevance("http://ex2.com/2", 2);
LinkRelevance l3 = new LinkRelevance("http://ex1.com/3", 3);
LinkRelevance l4 = new LinkRelevance("http://ex2.com/4", 4);
LinkRelevance l5 = new LinkRelevance("http://ex3.com/5", 5);
int minimumAccessTime = 500;
int maxLinksInScheduler = 100;
DownloadScheduler scheduler = new DownloadScheduler(minimumAccessTime, maxLinksInScheduler);
// when add link l1
scheduler.addLink(l1);
// then should return it (+some other state checks)
assertThat(scheduler.hasLinksAvailable(), is(true));
assertThat(scheduler.numberOfLinks(), is(1));
assertThat(scheduler.nextLink(), is(l1));
assertThat(scheduler.numberOfLinks(), is(0));
assertThat(scheduler.nextLink(), is(nullValue()));
// and should remember domains from links recently chosen
assertThat(scheduler.numberOfNonExpiredDomains(), is(1));
assertThat(scheduler.numberOfEmptyDomains(), is(1));
// wait 1ms just to make sure that domains will have
// different access times (in case that test run too
// fast they will have the same access times)
Thread.sleep(1);
// same thing when add link l2...
scheduler.addLink(l2);
assertThat(scheduler.hasLinksAvailable(), is(true));
assertThat(scheduler.numberOfLinks(), is(1));
assertThat(scheduler.nextLink(), is(l2));
assertThat(scheduler.numberOfLinks(), is(0));
assertThat(scheduler.nextLink(), is(nullValue()));
// should remember domains from links recently chosen
assertThat(scheduler.numberOfNonExpiredDomains(), is(2));
assertThat(scheduler.numberOfEmptyDomains(), is(2));
Thread.sleep(1);
// when add 3 links from 3 different domains...
scheduler.addLink(l3);
scheduler.addLink(l4);
scheduler.addLink(l5);
assertThat(scheduler.numberOfNonExpiredDomains(), is(3));
assertThat(scheduler.numberOfEmptyDomains(), is(0));
// We assume that this test take less than 500ms to run.
// Links l3 and l4 have higher priority, but they should be skipped
// since other links from their domain has been chosen recently
assertThat(scheduler.hasLinksAvailable(), is(true));
assertThat(scheduler.nextLink(), is(l5));
// at this moment, there should have no links available
assertThat(scheduler.hasLinksAvailable(), is(false));
assertThat(scheduler.nextLink(), is(nullValue()));
// after waiting the minimumAccessTime interval, they links can be returned
Thread.sleep(minimumAccessTime+100);
assertThat(scheduler.nextLink(), is(l3));
assertThat(scheduler.nextLink(), is(l4));
// scheduler should also forget domains that don't have links chosen
// for longer then the minimumAccessTime
Thread.sleep(minimumAccessTime+10);
assertThat(scheduler.numberOfNonExpiredDomains(), is(0));
// adding link again just to test that after removing old domain
// everything is still working fine
scheduler.addLink(l1);
assertThat(scheduler.nextLink(), is(l1));
assertThat(scheduler.numberOfNonExpiredDomains(), is(1));
assertThat(scheduler.numberOfEmptyDomains(), is(1));
assertThat(scheduler.nextLink(), is(nullValue()));
}
@Test
public void addLinksShouldIgnoreLinkWhenMaxNumberOfLinksIsReached() throws Exception {
LinkRelevance l1 = new LinkRelevance("http://ex1.com/", 1);
LinkRelevance l2 = new LinkRelevance("http://ex2.com/", 2);
int minimumAccessTime = 100;
int maxLinksInScheduler = 1;
DownloadScheduler scheduler = new DownloadScheduler(minimumAccessTime, maxLinksInScheduler);
scheduler.addLink(l1);
assertThat(scheduler.numberOfLinks(), is(1));
scheduler.addLink(l2);
assertThat(scheduler.numberOfLinks(), is(1));
}
@Test
public void shouldReturnLinksFromSameTLDsUsingFIFOOrder() throws Exception {
LinkRelevance l1 = new LinkRelevance("http://ex1.com/1", 1);
LinkRelevance l2 = new LinkRelevance("http://ex1.com/2", 2);
LinkRelevance l3 = new LinkRelevance("http://ex1.com/3", 3);
LinkRelevance l4 = new LinkRelevance("http://ex1.com/4", 4);
int minimumAccessTime = 0;
int maxLinksInScheduler = 100;
DownloadScheduler scheduler = new DownloadScheduler(minimumAccessTime, maxLinksInScheduler);
scheduler.addLink(l4);
scheduler.addLink(l3);
scheduler.addLink(l2);
scheduler.addLink(l1);
assertThat(scheduler.nextLink().getRelevance(), is(4d));
assertThat(scheduler.nextLink().getRelevance(), is(3d));
assertThat(scheduler.nextLink().getRelevance(), is(2d));
assertThat(scheduler.nextLink().getRelevance(), is(1d));
}
@Test
public void shouldCheckIfLinkCanBeDownloadedAtCurrentTime() throws Exception {
LinkRelevance l1 = new LinkRelevance("http://ex1.com/1", 1);
LinkRelevance l2 = new LinkRelevance("http://ex1.com/2", 2);
LinkRelevance l3 = new LinkRelevance("http://ex2.com/3", 3);
int minimumAccessTime = 100;
int maxLinksInScheduler = 100;
DownloadScheduler scheduler = new DownloadScheduler(minimumAccessTime, maxLinksInScheduler);
scheduler.addLink(l1);
assertThat(scheduler.nextLink().getRelevance(), is(1d));
assertThat(scheduler.canDownloadNow(l3), is(true));
assertThat(scheduler.canDownloadNow(l2), is(false));
Thread.sleep(minimumAccessTime+10);
assertThat(scheduler.canDownloadNow(l2), is(true));
assertThat(scheduler.canDownloadNow(l3), is(true));
}
@Test
public void shouldBeAbleToClearListOfLinks() throws Exception {
LinkRelevance l1 = new LinkRelevance("http://ex1.com/1", 1);
LinkRelevance l2 = new LinkRelevance("http://ex2.com/2", 2);
LinkRelevance l3 = new LinkRelevance("http://ex3.com/3", 3);
LinkRelevance l4 = new LinkRelevance("http://ex4.com/4", 4);
int minimumAccessTime = 100;
int maxLinksInScheduler = 100;
DownloadScheduler scheduler = new DownloadScheduler(minimumAccessTime, maxLinksInScheduler);
scheduler.addLink(l1);
scheduler.addLink(l2);
scheduler.addLink(l4);
scheduler.addLink(l3);
assertThat(scheduler.numberOfLinks(), is(4));
assertThat(scheduler.hasLinksAvailable(), is(true));
// when
scheduler.nextLink();
scheduler.nextLink();
scheduler.clear();
// then
assertThat(scheduler.hasLinksAvailable(), is(false));
assertThat(scheduler.numberOfLinks(), is(0));
assertThat(scheduler.numberOfEmptyDomains(), is(4));
assertThat(scheduler.numberOfNonExpiredDomains(), is(2));
// make sure it remembers domains that were previously selected
scheduler.addLink(l1);
scheduler.addLink(l2);
scheduler.addLink(l4);
scheduler.addLink(l3);
assertThat(scheduler.numberOfAvailableDomains(), is(2));
}
}