package io.monokkel.core;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Maps;
import io.monokkel.domain.PageData;
import io.monokkel.exceptions.ParseException;
import org.junit.Before;
import org.junit.Test;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Set;
import static com.google.common.collect.Lists.newArrayList;
import static java.lang.System.currentTimeMillis;
import static org.junit.Assert.*;
import static org.junit.Assert.assertEquals;
public class HtmlTransformerTest {
private HtmlTransformer htmlTransformer;
private ArrayList<String> urlRegularExpressions;
private HashMap transformationMap;
@Before
public void before(){
transformationMap = Maps.newHashMap();
urlRegularExpressions = newArrayList("http://www.finn.no/finn/realestate/homes/object\\?finnkode=[0-9]+\\&searchclickthrough=true");
htmlTransformer = new HtmlTransformer(urlRegularExpressions, transformationMap);
}
@Test
public void shouldParse_withTextHtml_expectShouldParseToReturnTrue() throws ParseException {
final String expectedUrl = "http://www.finn.no/finn/realestate/homes/object?finnkode=48661278&searchclickthrough=true";
final String response = String.format("<html><body>" +
"<a href=\"%s\" >House</a>" +
"</body><html>",expectedUrl);
final Boolean shouldParse = htmlTransformer.shouldParse(expectedUrl, response, ImmutableList.of("text/html"));
assertTrue(shouldParse);
}
@Test
public void shouldParse_withApplicationJson_expectShouldParseToReturnFalse() throws ParseException {
final String expectedUrl = "http://www.finn.no/finn/realestate/homes/object?finnkode=48661278&searchclickthrough=true";
final String response = String.format("<html><body>" +
"<a href=\"%s\" >House</a>" +
"</body><html>",expectedUrl);
final Boolean shouldParse = htmlTransformer.shouldParse(expectedUrl, response, ImmutableList.of("application/json"));
assertFalse(shouldParse);
}
@Test
public void run_withResponseWithOneUrl_expectOneUrlToBeRetrieved() throws ParseException {
final String expectedUrl = "http://www.finn.no/finn/realestate/homes/object?finnkode=48661278&searchclickthrough=true";
final String response = String.format("<html><body>" +
"<a href=\"%s\" >House</a>" +
"</body><html>",expectedUrl);
final PageData pageData = htmlTransformer.parse(expectedUrl, response, currentTimeMillis());
final String actualUrl = pageData.getUrl();
assertEquals(expectedUrl, actualUrl);
}
@Test
public void parse_withContentTransformationXpathInNestedObjects_expectContent() throws ParseException {
htmlTransformer = new HtmlTransformer(urlRegularExpressions,"id","[a-zA-Z0-9}\\s]+", transformationMap);
transformationMap.put("output","//div[contains(@id,'unit')]/text()");
LinkedHashMap linkedHashMap = Maps.newLinkedHashMap();
linkedHashMap.put("output","//div[contains(@id,'unit')]/text()");
transformationMap.put("sub_object",linkedHashMap);
final String content = "This is html content \n" +
"<content /> \n" +
"<div>\n" +
" subnode \n" +
"</div>";
final String response = String.format("<html><body><div id=\"unit das2of3\">%s</div></body></html>",content);
final PageData pageData = htmlTransformer.parse(content, response, currentTimeMillis());
final HashMap output = (HashMap) pageData.getTransformed().get("sub_object");
assertEquals("This is html content ", output.get("output"));
}
@Test
public void parse_withContentTransformationAndXpathAxes_expectContent() throws ParseException {
htmlTransformer = new HtmlTransformer(urlRegularExpressions,"id","[a-zA-Z0-9}\\s]+", transformationMap);
transformationMap.put("output","//div[contains(@id,'unit')]/following-sibling::div[1]/text()");
LinkedHashMap linkedHashMap = Maps.newLinkedHashMap();
linkedHashMap.put("output","//div[contains(@id,'unit')]/following-sibling::div[1]/text()");
transformationMap.put("sub_object",linkedHashMap);
final String content = "This is html content \n" +
"<content /> \n" +
"<div>\n" +
" subnode \n" +
"</div>";
final String response = String.format("<html><body><div id=\"unit das2of3\">not correct</div><div>%s</div></body></html>",content);
final PageData pageData = htmlTransformer.parse(content, response, currentTimeMillis());
final HashMap output = (HashMap) pageData.getTransformed().get("sub_object");
assertEquals("This is html content ", output.get("output"));
}
@Test
public void parse_withContentTransformationXpath_expectContent() throws ParseException {
htmlTransformer = new HtmlTransformer(urlRegularExpressions,"id","[a-zA-Z0-9}\\s]+", transformationMap);
transformationMap.put("output","//div[contains(@id,'unit')]/text()");
final String content = "This is html content \n" +
"<content /> \n" +
"<div>\n" +
" subnode \n" +
"</div>";
final String response = String.format("<html><body><div id=\"unit das2of3\">%s</div></body></html>",content);
final PageData pageData = htmlTransformer.parse(content, response, currentTimeMillis());
final String output = (String) pageData.getTransformed().get("output");
assertEquals("This is html content ", output);
}
@Test
public void parse_withContent_expectHtmlBeingRemoved() throws ParseException {
final String expectedContent = "This is html content subnode";
htmlTransformer = new HtmlTransformer(urlRegularExpressions,"id","[a-zA-Z0-9}\\s]+", transformationMap);
final String content = "This is html content <content /> <div>subnode</div>";
final String response = String.format("<html><body><div id=\"unit das2of3\">%s</div></body></html>",content);
final PageData pageData = htmlTransformer.parse(content, response, currentTimeMillis());
final String extractedContent = pageData.getExtractedContent();
assertEquals(expectedContent, extractedContent);
}
@Test
public void run_withTwoSimilarUrlsToBeFound_expectOneUrlToBeExtracted() throws ParseException {
final String expectedUrl = "http://www.finn.no/finn/realestate/homes/object?finnkode=48661278&searchclickthrough=true";
final String response = createResponse(expectedUrl, expectedUrl, "This is the title");
final PageData pageData = htmlTransformer.parse(expectedUrl, response, currentTimeMillis());
final Set<String> urlSet = pageData.getUrlSet();
assertEquals(1, urlSet.size());
}
@Test
public void run_withTwoDifferentUrls_expectTwoUrlsToBeExtracted() throws ParseException {
final String expectedUrl1 = "http://www.finn.no/finn/realestate/homes/object?finnkode=48661278&searchclickthrough=true";
final String expectedUrl2 = "http://www.finn.no/finn/realestate/homes/result?sort=1&location=0%2F20061&PRICE_FROM=2000001&PRICE_TO=3000000&page=2&sort=1";
final String response = createResponse(expectedUrl1, expectedUrl2, "This is the title");
urlRegularExpressions = newArrayList("http://www.finn.no/finn/realestate/homes/object\\?finnkode=[0-9]+\\&searchclickthrough=true", "http://www.finn.no/finn/realestate/homes/result\\?sort=[0-9]+\\&location=[A-F%0-9]+\\&PRICE_FROM=[0-9]+\\&PRICE_TO=[0-9]+\\&page=[0-9]+\\&sort=[0-9]+");
htmlTransformer = new HtmlTransformer(urlRegularExpressions, transformationMap);
final PageData pageData = htmlTransformer.parse("", response, currentTimeMillis());
final Set<String> urlSet = pageData.getUrlSet();
assertEquals(2, urlSet.size());
}
@Test
public void run_withTwoDifferentUrlsAndRelative_expectTwoUrlsToBeExtracted() throws ParseException {
final String expectedUrl2 = "?sort=1&location=0%2F20061&PRICE_FROM=2000001&PRICE_TO=3000000&page=2&sort=1";
final String baseUri = "http://www.finn.no/finn/realestate/homes/result";
final String response = createResponse("http://www.finn.no/finn/realestate/homes/object?finnkode=48661278&searchclickthrough=true", expectedUrl2, "This is the title");
urlRegularExpressions = newArrayList("http://www.finn.no/finn/realestate/homes/object\\?finnkode=[0-9]+\\&searchclickthrough=true", "page=[0-9]+");
htmlTransformer = new HtmlTransformer(urlRegularExpressions, transformationMap);
final PageData pageData = htmlTransformer.parse(baseUri, response, currentTimeMillis());
final Set<String> urlSet = pageData.getUrlSet();
final boolean containsFullUrl = urlSet.contains(String.format("%s%s", baseUri, expectedUrl2));
assertTrue(containsFullUrl);
}
@Test
public void run_withTwoDifferentUrlsAndRelative_expectParsedOutputToContainValues() throws ParseException {
final String expectedUrl = "http://www.finn.no/finn/realestate/homes/result";
final String expectedTitle = "This is the title";
final String expectedResponse = createResponse("http://www.finn.no/finn/realestate/homes/object?finnkode=48661278&searchclickthrough=true", "?sort=1&location=0%2F20061&PRICE_FROM=2000001&PRICE_TO=3000000&page=2&sort=1", expectedTitle);
final long expectedTimestamp = currentTimeMillis();
urlRegularExpressions = newArrayList("http://www.finn.no/finn/realestate/homes/object\\?finnkode=[0-9]+\\&searchclickthrough=true", "http://www.finn.no/finn/realestate/homes/result\\?sort=[0-9]+\\&location=[A-F%0-9]+\\&PRICE_FROM=[0-9]+\\&PRICE_TO=[0-9]+\\&page=[0-9]+\\&sort=[0-9]+");
htmlTransformer = new HtmlTransformer(urlRegularExpressions, transformationMap);
final PageData pageData = htmlTransformer.parse(expectedUrl, expectedResponse, expectedTimestamp);
final String addedResponse = pageData.getResponse();
final String url = pageData.getUrl();
final Long timestamp = pageData.getTimestamp();
final String title = pageData.getTitle();
assertEquals(expectedResponse,addedResponse);
assertEquals(expectedUrl,url);
assertEquals(timestamp.longValue(),expectedTimestamp);
assertEquals(title,expectedTitle);
}
private String createResponse(final String expectedUrl1, final String expectedUrl2, final String title) {
return String.format("<html><head><title>%s</title></head><body>" +
"<a href=\"%s\" >House A</a>" +
"<a href=\"%s\" >House B</a>" +
"</body><html>", title,expectedUrl1, expectedUrl2);
}
}