WhenAPageURLIsCreated.java example

Explorer

crawler-master
- src
  - main
    - java
      - com
        soulgalore
        crawler
        core
        Crawler.java
        CrawlerConfiguration.java
        CrawlerResult.java
        CrawlerURL.java
        HTMLPageResponse.java
        HTMLPageResponseCallable.java
        HTMLPageResponseFetcher.java
        PageURLParser.java
        assets
        AssetFetcher.java
        AssetResponse.java
        AssetResponseCallable.java
        AssetsParser.java
        AssetsVerificationResult.java
        AssetsVerifier.java
        impl
        DefaultAssetsParser.java
        DefaultAssetsVerifier.java
        HTTPClientAssetFetcher.java
        impl
        AhrefPageURLParser.java
        DefaultCrawler.java
        HTTPClientResponseFetcher.java
        guice
        AbstractPropertiesModule.java
        CrawlModule.java
        ExecutorServiceProvider.java
        HttpClientProvider.java
        run
        AbstractCrawl.java
        AbstractRunner.java
        CrawlAndVerifyAssets.java
        CrawlAndVerifyAssetsToCsv.java
        CrawlToCsv.java
        CrawlToFile.java
        CrawlToPlainTxtOnlyMatching.java
        CrawlToSystemOut.java
        util
        Auth.java
        AuthUtil.java
        HTTPSFaker.java
        HeaderUtil.java
        StatusCode.java
  - test
    - java
      - com
        soulgalore
        crawler
        WhenACrawlerResultIsCreated.java
        WhenAPageURLIsCreated.java
        WhenAStatusCodeIsChecked.java
        core
        impl
        WhenACrawlIsDone.java
        WhenAhrefsIsParsedFromResponse.java
        run
        AbstractRun.java
        WhenCrawlToPlainTxtRun.java
        test
        TestFileHelper.java
        util
        WhenAHeaderIsParsed.java
        WhenAnAuthObjectIsCreated.java

package com.soulgalore.crawler;

import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.endsWith;
import static org.junit.Assert.assertTrue;

import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;

import org.junit.Test;

import com.soulgalore.crawler.core.CrawlerURL;


/**
 * Tests for the values a {@link CrawlerURL} exposes once it has been created:
 * host, referer, url, uri, how a fragment ({@code #...}) is treated, and the
 * wrong-syntax flag.
 */
public class WhenAPageURLIsCreated {

	/**
	 * Expects {@code getHost()} to return the host part of the url the
	 * instance was created from.
	 */
	@Test
	public void hasAHost() {
		String theHost = "www.soulgalore.com";
		CrawlerURL pageUrl = new CrawlerURL("http://" + theHost);

		assertThat(pageUrl.getHost(), is(theHost));
	}

	/**
	 * Expects {@code getReferer()} to return the referer passed as the second
	 * constructor argument.
	 */
	@Test
	public void hasAReferer() {
		String url = "http://www.soulgalore.com/test/";
		String referer = "http://www.soulgalore.com/";
		CrawlerURL pageUrl = new CrawlerURL(url, referer);

		assertThat(pageUrl.getReferer(), is(referer));
	}

	/**
	 * Expects {@code getUrl()} to return the same string the instance was
	 * created from.
	 */
	@Test
	public void hasAUrl() {
		String url = "http://www.soulgalore.com/page/";
		CrawlerURL pageUrl = new CrawlerURL(url);

		assertThat(pageUrl.getUrl(), is(url));
	}

	/**
	 * Expects {@code getUri()} to be equal to a {@link URI} built from the
	 * same string.
	 *
	 * @throws URISyntaxException if the expected {@link URI} cannot be built
	 */
	@Test
	public void hasAUri() throws URISyntaxException {
		String url = "http://www.soulgalore.com/page/";
		CrawlerURL pageUrl = new CrawlerURL(url);

		assertThat(pageUrl.getUri(), is(new URI(url)));
	}

	/**
	 * Expects the uri of a url ending in {@code #special} to end with
	 * {@code /}, i.e. the fragment is not part of the uri.
	 */
	@Test
	public void theHashIsStrippedFromTheUri() {
		CrawlerURL pageUrl = new CrawlerURL("http://www.soulgalore.com/page/#special");

		String uriAsString = pageUrl.getUri().toString();

		assertThat("Assert that the # part of the uri is removed", uriAsString, endsWith("/"));
	}

	/**
	 * Expects two instances that differ only by a trailing fragment to be
	 * equal according to {@code equals}.
	 */
	@Test
	public void hashTagShouldBeignoredFromEquals() {
		CrawlerURL withoutFragment = new CrawlerURL("http://www.soulislove.com/");
		CrawlerURL withFragment = new CrawlerURL("http://www.soulislove.com/#respond");

		assertTrue("The assets shouldn't depend on hashtag",
				withoutFragment.equals(withFragment));
	}

	/**
	 * Expects {@code isWrongSyntax()} to be {@code false} for a plain http
	 * url, {@code true} for a string without scheme or host, and
	 * {@code false} for three urls containing empty query parameters,
	 * {@code |} and {@code ;} characters.
	 */
	@Test
	public void urlsWithWrongSyntaxShouldNotBeValid()
			throws MalformedURLException, URISyntaxException,
			UnsupportedEncodingException {

		CrawlerURL plainUrl = new CrawlerURL("http://www.soulgalore.com");
		assertThat(plainUrl.isWrongSyntax(), is(false));

		// a string that is not a url at all must be flagged
		CrawlerURL faultyUrl = new CrawlerURL("apa");
		assertThat(
				"The url has no wrong syntax, but it should:" + faultyUrl.getUrl(),
				faultyUrl.isWrongSyntax(), is(true));

		// urls with unusual characters that are still expected to be accepted

		// empty query parameter values
		CrawlerURL emptyParamsUrl = new CrawlerURL(
				"http://b.scorecardresearch.com/b?c1=2&c2=6035308&c3=&c4=&c5=&c6=&c15=&cv=1.3&cj=1");
		assertThat(emptyParamsUrl.isWrongSyntax(), is(false));

		// pipes and colons in the query string
		CrawlerURL pipesInQueryUrl = new CrawlerURL(
				"http://fonts.googleapis.com/css?family=Droid+Sans|Vollkorn:bold|Merienda+One");
		assertThat(pipesInQueryUrl.isWrongSyntax(), is(false));

		// pipes and semicolon separated parameters in the path
		CrawlerURL pipesInPathUrl = new CrawlerURL(
				"http://adserver.adtech.de/addyn|3.0|506|3067392|407017|-1|ADTECH;loc=100;key=;grp=96905;asfunc=1;cookie=info;size=1x1;misc=1326643451844");
		assertThat(pipesInPathUrl.isWrongSyntax(), is(false));
	}

}