package focusedCrawler.util.parser;
import static org.hamcrest.CoreMatchers.containsString;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.not;
import static org.hamcrest.CoreMatchers.notNullValue;
import static org.junit.Assert.assertThat;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import org.junit.Test;
public class PaginaURLTest {
@Test
public void htmlEncodedLinksShouldBeEscaped() throws Exception {
// given
StringBuilder testPage = new StringBuilder();
testPage.append("<!DOCTYPE html>");
testPage.append("<html>");
testPage.append("<body>");
testPage.append("<a href = \"http://ex.com/index.php?p1=asdf&p2=qwer\">Anchor text.</a>");
testPage.append("</body>");
testPage.append("</html>");
String testString = testPage.toString();
// when
PaginaURL pageParser = new PaginaURL(new URL("http://ex.com/index.html"),testString);
URL[] extractedLinks = pageParser.links();
LinkNeighborhood[] neighborhood = pageParser.getLinkNeighboor();
// then
assertThat(extractedLinks[0].toString(), is("http://ex.com/index.php?p1=asdf&p2=qwer"));
assertThat(neighborhood[0].getLink().toString(), is("http://ex.com/index.php?p1=asdf&p2=qwer"));
}
@Test
public void linksShouldNotContainFragments() throws Exception {
// given
String testString = createTestPage();
URL url = new URL("http://www.w3schools.com/html/tryit.asp?filename=tryhtml_basic_document");
// when
PaginaURL pageParser = new PaginaURL(url,testString);
URL[] extractedLinks = pageParser.links();
// then
for(URL extractedUrl : Arrays.asList(extractedLinks)) {
assertThat(extractedUrl.getFile().toString(), not(containsString("#")));
}
}
@Test
public void constructorsShouldWork() throws MalformedURLException {
// given
URL url = new URL("http://www.w3schools.com/html/tryit.asp?filename=tryhtml_basic_document");
String testPage = createTestPage();
// when
PaginaURL paginaURL = new PaginaURL(url, testPage);
// then
assertThat(paginaURL.getURL(), is(notNullValue()));
}
@Test
public void shouldNotExtractInvalidLinks() throws MalformedURLException {
// given
URL url = new URL("http://example.com/test.html");
StringBuilder testPage = new StringBuilder();
testPage.append("<!DOCTYPE html>");
testPage.append("<html>");
testPage.append("<body>");
testPage.append("<h1>My First Heading</h1>");
testPage.append("<a href = \"http://None/\">link 0</a>");
testPage.append("<a href = \"http://12324/\">link 1</a>");
testPage.append("<a href = \"/asdf.html\">link 2</a>");
testPage.append("</body>");
testPage.append("</html>");
// when
PaginaURL paginaURL = new PaginaURL(url, testPage.toString());
URL[] links = paginaURL.links();
LinkNeighborhood[] lns = paginaURL.getLinkNeighboor();
// then
assertThat(links.length, is(1));
assertThat(links[0].toString(), is("http://example.com/asdf.html"));
assertThat(lns.length, is(1));
assertThat(lns[0].getLink().toString(), is("http://example.com/asdf.html"));
}
@Test
public void shouldExtractAnchoTextAndTextAroundLink() throws MalformedURLException {
// given
URL url = new URL("http://www.w3schools.com/html/tryit.asp?filename=tryhtml_basic_document");
String testPage = createTestPage();
// when
PaginaURL paginaURL = new PaginaURL(url, testPage);
LinkNeighborhood[] neighborhoods = paginaURL.getLinkNeighboor();
// then
assertThat(neighborhoods.length, is(1));
assertThat(neighborhoods[0].getAroundString().trim(), is("my first heading"));
assertThat(neighborhoods[0].getAround()[0], is("my"));
assertThat(neighborhoods[0].getAround()[1], is("first"));
assertThat(neighborhoods[0].getAround()[2], is("heading"));
assertThat(neighborhoods[0].getAnchorString().trim(), is("my first paragraph"));
assertThat(neighborhoods[0].getAnchor()[0], is("my"));
assertThat(neighborhoods[0].getAnchor()[1], is("first"));
assertThat(neighborhoods[0].getAnchor()[2], is("paragraph"));
}
@Test
public void shouldNormalizeLinks() throws MalformedURLException {
// given
URL url = new URL("http://www.w3schools.com/html/tryit.asp?filename=tryhtml_basic_document");
String testPage = createTestPageUnormalizedLinks();
// when
PaginaURL paginaURL = new PaginaURL(url, testPage);
LinkNeighborhood[] neighborhoods = paginaURL.getLinkNeighboor();
URL[] links = paginaURL.links();
// then
assertThat(neighborhoods.length, is(3));
assertThat(links.length, is(3));
assertThat(neighborhoods[0].getLink().toString(), is("http://example.com/post.php?"));
assertThat(links[0].toString(), is("http://example.com/post.php?"));
assertThat(neighborhoods[1].getLink().toString(), is("http://example.com/post.php?a=1&b=2"));
assertThat(links[1].toString(), is("http://example.com/post.php?a=1&b=2"));
assertThat(neighborhoods[2].getLink().toString(), is("http://example.com/"));
assertThat(links[2].toString(), is("http://example.com/"));
}
private String createTestPage() {
StringBuilder testPage = new StringBuilder();
testPage.append("<!DOCTYPE html>");
testPage.append("<html>");
testPage.append("<body>");
testPage.append("<h1>My First Heading</h1>");
testPage.append("<a href = \"https://en.wikipedia.org/wiki/Mouse_(computing)#Mechanical_mice\">My first paragraph.</a>");
testPage.append("</body>");
testPage.append("</html>");
return testPage.toString();
}
private String createTestPageUnormalizedLinks() {
StringBuilder testPage = new StringBuilder();
testPage.append("<!DOCTYPE html>");
testPage.append("<html>");
testPage.append("<body>");
testPage.append("<h1>My First Heading</h1>");
testPage.append("<a href = \"http://Example.com:80/post.php?\">Link 1.</a>");
testPage.append("<a href = \"HTTP://EXAMPLE.com/post.php?b=2&a=1\">Link 2.</a>");
testPage.append("<a href = \"HTTP://EXAMPLE.com\">Link 3.</a>");
testPage.append("</body>");
testPage.append("</html>");
return testPage.toString();
}
}