package io.monokkel.core; import com.google.common.collect.ImmutableList; import com.google.common.collect.Maps; import io.monokkel.domain.PageData; import io.monokkel.exceptions.ParseException; import org.junit.Before; import org.junit.Test; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Set; import static com.google.common.collect.Lists.newArrayList; import static java.lang.System.currentTimeMillis; import static org.junit.Assert.*; import static org.junit.Assert.assertEquals; public class HtmlTransformerTest { private HtmlTransformer htmlTransformer; private ArrayList<String> urlRegularExpressions; private HashMap transformationMap; @Before public void before(){ transformationMap = Maps.newHashMap(); urlRegularExpressions = newArrayList("http://www.finn.no/finn/realestate/homes/object\\?finnkode=[0-9]+\\&searchclickthrough=true"); htmlTransformer = new HtmlTransformer(urlRegularExpressions, transformationMap); } @Test public void shouldParse_withTextHtml_expectShouldParseToReturnTrue() throws ParseException { final String expectedUrl = "http://www.finn.no/finn/realestate/homes/object?finnkode=48661278&searchclickthrough=true"; final String response = String.format("<html><body>" + "<a href=\"%s\" >House</a>" + "</body><html>",expectedUrl); final Boolean shouldParse = htmlTransformer.shouldParse(expectedUrl, response, ImmutableList.of("text/html")); assertTrue(shouldParse); } @Test public void shouldParse_withApplicationJson_expectShouldParseToReturnFalse() throws ParseException { final String expectedUrl = "http://www.finn.no/finn/realestate/homes/object?finnkode=48661278&searchclickthrough=true"; final String response = String.format("<html><body>" + "<a href=\"%s\" >House</a>" + "</body><html>",expectedUrl); final Boolean shouldParse = htmlTransformer.shouldParse(expectedUrl, response, ImmutableList.of("application/json")); assertFalse(shouldParse); } @Test public void run_withResponseWithOneUrl_expectOneUrlToBeRetrieved() throws ParseException { final String expectedUrl = "http://www.finn.no/finn/realestate/homes/object?finnkode=48661278&searchclickthrough=true"; final String response = String.format("<html><body>" + "<a href=\"%s\" >House</a>" + "</body><html>",expectedUrl); final PageData pageData = htmlTransformer.parse(expectedUrl, response, currentTimeMillis()); final String actualUrl = pageData.getUrl(); assertEquals(expectedUrl, actualUrl); } @Test public void parse_withContentTransformationXpathInNestedObjects_expectContent() throws ParseException { htmlTransformer = new HtmlTransformer(urlRegularExpressions,"id","[a-zA-Z0-9}\\s]+", transformationMap); transformationMap.put("output","//div[contains(@id,'unit')]/text()"); LinkedHashMap linkedHashMap = Maps.newLinkedHashMap(); linkedHashMap.put("output","//div[contains(@id,'unit')]/text()"); transformationMap.put("sub_object",linkedHashMap); final String content = "This is html content \n" + "<content /> \n" + "<div>\n" + " subnode \n" + "</div>"; final String response = String.format("<html><body><div id=\"unit das2of3\">%s</div></body></html>",content); final PageData pageData = htmlTransformer.parse(content, response, currentTimeMillis()); final HashMap output = (HashMap) pageData.getTransformed().get("sub_object"); assertEquals("This is html content ", output.get("output")); } @Test public void parse_withContentTransformationAndXpathAxes_expectContent() throws ParseException { htmlTransformer = new HtmlTransformer(urlRegularExpressions,"id","[a-zA-Z0-9}\\s]+", transformationMap); transformationMap.put("output","//div[contains(@id,'unit')]/following-sibling::div[1]/text()"); LinkedHashMap linkedHashMap = Maps.newLinkedHashMap(); linkedHashMap.put("output","//div[contains(@id,'unit')]/following-sibling::div[1]/text()"); transformationMap.put("sub_object",linkedHashMap); final String content = "This is html content \n" + "<content /> \n" + "<div>\n" + " subnode \n" + "</div>"; final String response = String.format("<html><body><div id=\"unit das2of3\">not correct</div><div>%s</div></body></html>",content); final PageData pageData = htmlTransformer.parse(content, response, currentTimeMillis()); final HashMap output = (HashMap) pageData.getTransformed().get("sub_object"); assertEquals("This is html content ", output.get("output")); } @Test public void parse_withContentTransformationXpath_expectContent() throws ParseException { htmlTransformer = new HtmlTransformer(urlRegularExpressions,"id","[a-zA-Z0-9}\\s]+", transformationMap); transformationMap.put("output","//div[contains(@id,'unit')]/text()"); final String content = "This is html content \n" + "<content /> \n" + "<div>\n" + " subnode \n" + "</div>"; final String response = String.format("<html><body><div id=\"unit das2of3\">%s</div></body></html>",content); final PageData pageData = htmlTransformer.parse(content, response, currentTimeMillis()); final String output = (String) pageData.getTransformed().get("output"); assertEquals("This is html content ", output); } @Test public void parse_withContent_expectHtmlBeingRemoved() throws ParseException { final String expectedContent = "This is html content subnode"; htmlTransformer = new HtmlTransformer(urlRegularExpressions,"id","[a-zA-Z0-9}\\s]+", transformationMap); final String content = "This is html content <content /> <div>subnode</div>"; final String response = String.format("<html><body><div id=\"unit das2of3\">%s</div></body></html>",content); final PageData pageData = htmlTransformer.parse(content, response, currentTimeMillis()); final String extractedContent = pageData.getExtractedContent(); assertEquals(expectedContent, extractedContent); } @Test public void run_withTwoSimilarUrlsToBeFound_expectOneUrlToBeExtracted() throws ParseException { final String expectedUrl = "http://www.finn.no/finn/realestate/homes/object?finnkode=48661278&searchclickthrough=true"; final String response = createResponse(expectedUrl, expectedUrl, "This is the title"); final PageData pageData = htmlTransformer.parse(expectedUrl, response, currentTimeMillis()); final Set<String> urlSet = pageData.getUrlSet(); assertEquals(1, urlSet.size()); } @Test public void run_withTwoDifferentUrls_expectTwoUrlsToBeExtracted() throws ParseException { final String expectedUrl1 = "http://www.finn.no/finn/realestate/homes/object?finnkode=48661278&searchclickthrough=true"; final String expectedUrl2 = "http://www.finn.no/finn/realestate/homes/result?sort=1&location=0%2F20061&PRICE_FROM=2000001&PRICE_TO=3000000&page=2&sort=1"; final String response = createResponse(expectedUrl1, expectedUrl2, "This is the title"); urlRegularExpressions = newArrayList("http://www.finn.no/finn/realestate/homes/object\\?finnkode=[0-9]+\\&searchclickthrough=true", "http://www.finn.no/finn/realestate/homes/result\\?sort=[0-9]+\\&location=[A-F%0-9]+\\&PRICE_FROM=[0-9]+\\&PRICE_TO=[0-9]+\\&page=[0-9]+\\&sort=[0-9]+"); htmlTransformer = new HtmlTransformer(urlRegularExpressions, transformationMap); final PageData pageData = htmlTransformer.parse("", response, currentTimeMillis()); final Set<String> urlSet = pageData.getUrlSet(); assertEquals(2, urlSet.size()); } @Test public void run_withTwoDifferentUrlsAndRelative_expectTwoUrlsToBeExtracted() throws ParseException { final String expectedUrl2 = "?sort=1&location=0%2F20061&PRICE_FROM=2000001&PRICE_TO=3000000&page=2&sort=1"; final String baseUri = "http://www.finn.no/finn/realestate/homes/result"; final String response = createResponse("http://www.finn.no/finn/realestate/homes/object?finnkode=48661278&searchclickthrough=true", expectedUrl2, "This is the title"); urlRegularExpressions = newArrayList("http://www.finn.no/finn/realestate/homes/object\\?finnkode=[0-9]+\\&searchclickthrough=true", "page=[0-9]+"); htmlTransformer = new HtmlTransformer(urlRegularExpressions, transformationMap); final PageData pageData = htmlTransformer.parse(baseUri, response, currentTimeMillis()); final Set<String> urlSet = pageData.getUrlSet(); final boolean containsFullUrl = urlSet.contains(String.format("%s%s", baseUri, expectedUrl2)); assertTrue(containsFullUrl); } @Test public void run_withTwoDifferentUrlsAndRelative_expectParsedOutputToContainValues() throws ParseException { final String expectedUrl = "http://www.finn.no/finn/realestate/homes/result"; final String expectedTitle = "This is the title"; final String expectedResponse = createResponse("http://www.finn.no/finn/realestate/homes/object?finnkode=48661278&searchclickthrough=true", "?sort=1&location=0%2F20061&PRICE_FROM=2000001&PRICE_TO=3000000&page=2&sort=1", expectedTitle); final long expectedTimestamp = currentTimeMillis(); urlRegularExpressions = newArrayList("http://www.finn.no/finn/realestate/homes/object\\?finnkode=[0-9]+\\&searchclickthrough=true", "http://www.finn.no/finn/realestate/homes/result\\?sort=[0-9]+\\&location=[A-F%0-9]+\\&PRICE_FROM=[0-9]+\\&PRICE_TO=[0-9]+\\&page=[0-9]+\\&sort=[0-9]+"); htmlTransformer = new HtmlTransformer(urlRegularExpressions, transformationMap); final PageData pageData = htmlTransformer.parse(expectedUrl, expectedResponse, expectedTimestamp); final String addedResponse = pageData.getResponse(); final String url = pageData.getUrl(); final Long timestamp = pageData.getTimestamp(); final String title = pageData.getTitle(); assertEquals(expectedResponse,addedResponse); assertEquals(expectedUrl,url); assertEquals(timestamp.longValue(),expectedTimestamp); assertEquals(title,expectedTitle); } private String createResponse(final String expectedUrl1, final String expectedUrl2, final String title) { return String.format("<html><head><title>%s</title></head><body>" + "<a href=\"%s\" >House A</a>" + "<a href=\"%s\" >House B</a>" + "</body><html>", title,expectedUrl1, expectedUrl2); } }