package com.soulgalore.crawler;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.endsWith;
import static org.junit.Assert.assertTrue;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import org.junit.Test;
import com.soulgalore.crawler.core.CrawlerURL;
/**
 * Tests for the state of a {@link CrawlerURL} directly after construction: the
 * host, referer, url and uri accessors, how a fragment ({@code #...}) is
 * treated, and the syntax check.
 */
public class WhenAPageURLIsCreated {

  /** The host passed in as part of the url is returned by {@code getHost()}. */
  @Test
  public void hasAHost() {
    String theHost = "www.soulgalore.com";
    CrawlerURL asset = new CrawlerURL("http://" + theHost);
    assertThat(asset.getHost(), is(theHost));
  }

  /** The referer given to the two-argument constructor is returned unchanged. */
  @Test
  public void hasAReferer() {
    String url = "http://www.soulgalore.com/test/";
    String referer = "http://www.soulgalore.com/";
    CrawlerURL asset = new CrawlerURL(url, referer);
    assertThat(asset.getReferer(), is(referer));
  }

  /** The url given to the constructor is returned unchanged by {@code getUrl()}. */
  @Test
  public void hasAUrl() {
    String url = "http://www.soulgalore.com/page/";
    CrawlerURL asset = new CrawlerURL(url);
    assertThat(asset.getUrl(), is(url));
  }

  /**
   * {@code getUri()} equals a {@link URI} built from the same string.
   *
   * @throws URISyntaxException if the expected URI cannot be built
   */
  @Test
  public void hasAUri() throws URISyntaxException {
    String url = "http://www.soulgalore.com/page/";
    CrawlerURL asset = new CrawlerURL(url);
    assertThat(asset.getUri(), is(new URI(url)));
  }

  /** The string form of the uri must end at the path, without the fragment. */
  @Test
  public void theHashIsStrippedFromTheUri() {
    String url = "http://www.soulgalore.com/page/#special";
    CrawlerURL asset = new CrawlerURL(url);
    String uri = asset.getUri().toString();
    assertThat("Assert that the # part of the uri is removed", uri, endsWith("/"));
  }

  /** Two urls that differ only in their fragment must be equal. */
  @Test
  public void hashTagShouldBeignoredFromEquals() {
    CrawlerURL asset = new CrawlerURL("http://www.soulislove.com/");
    CrawlerURL asset2 = new CrawlerURL("http://www.soulislove.com/#respond");
    // is(...) still evaluates asset.equals(asset2), but on failure it also
    // describes both objects instead of only reporting "false".
    assertThat("The assets shouldn't depend on hashtag", asset, is(asset2));
  }

  /**
   * A string that is not a url is flagged by {@code isWrongSyntax()}, while
   * ordinary urls and urls containing characters such as {@code |}, {@code ;}
   * and empty query values are not.
   *
   * @throws MalformedURLException kept from the original test signature
   * @throws URISyntaxException kept from the original test signature
   * @throws UnsupportedEncodingException kept from the original test signature
   */
  @Test
  public void urlsWithWrongSyntaxShouldNotBeValid()
      throws MalformedURLException, URISyntaxException,
      UnsupportedEncodingException {

    // faulty url
    CrawlerURL faulty = new CrawlerURL("apa");
    assertThat(
        "The url has no wrong syntax, but it should:" + faulty.getUrl(),
        faulty.isWrongSyntax(), is(true));

    // a plain url followed by a couple of special urls seen in the wild
    String[] validUrls = {
      "http://www.soulgalore.com",
      "http://b.scorecardresearch.com/b?c1=2&c2=6035308&c3=&c4=&c5=&c6=&c15=&cv=1.3&cj=1",
      "http://fonts.googleapis.com/css?family=Droid+Sans|Vollkorn:bold|Merienda+One",
      "http://adserver.adtech.de/addyn|3.0|506|3067392|407017|-1|ADTECH;loc=100;key=;grp=96905;asfunc=1;cookie=info;size=1x1;misc=1326643451844"
    };
    for (String candidate : validUrls) {
      // Name the url in the reason so a failure points at the offending entry.
      assertThat(
          "The url has wrong syntax, but it should not: " + candidate,
          new CrawlerURL(candidate).isWrongSyntax(), is(false));
    }
  }
}