package focusedCrawler.target;
import static java.util.Arrays.asList;
import static org.hamcrest.CoreMatchers.either;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.notNullValue;
import static org.hamcrest.CoreMatchers.nullValue;
import static org.hamcrest.Matchers.endsWith;
import static org.junit.Assert.assertThat;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.zip.InflaterInputStream;
import org.apache.commons.compress.utils.IOUtils;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.dataformat.cbor.CBORFactory;
import focusedCrawler.target.classifier.TargetRelevance;
import focusedCrawler.target.model.Page;
import focusedCrawler.target.model.TargetModelCbor;
import focusedCrawler.target.model.TargetModelJson;
import focusedCrawler.target.repository.FileSystemTargetRepository;
import focusedCrawler.target.repository.FileSystemTargetRepository.DataFormat;
/**
 * Tests for {@link FileSystemTargetRepository}: storing pages as raw HTML, JSON or CBOR
 * files (optionally compressed and/or with hashed file names), reading them back, checking
 * existence and iterating over the stored pages and files.
 *
 * <p>Every test works on its own fresh folder created through {@link #tempFolder}.
 *
 * <p>NOTE(review): several test names contain the typo "shold"; they are kept unchanged so
 * that existing test identifiers stay stable.
 */
public class FileSystemTargetRepositoryTest {

    /** Sub-directory in which the tests expect the page of the shared {@link #url} to be stored. */
    private static final String HOST_DIRECTORY = "example.com";

    /** Percent-encoded form of {@link #url}; the expected file name when names are not hashed. */
    private static final String ENCODED_URL_FILENAME = "http%3A%2F%2Fexample.com";

    // a new temp folder is created for each test case
    @Rule
    public TemporaryFolder tempFolder = new TemporaryFolder();

    /** Page body shared by all tests; repetitive so that the compressed form is smaller. */
    static String html;

    /** URL shared by the tests that store a single page. */
    static String url;

    /** Response headers attached to the pages stored as JSON and CBOR. */
    static Map<String, List<String>> responseHeaders;

    /** Initializes the fixtures shared by all tests. */
    @BeforeClass
    public static void setUp() {
        url = "http://example.com";
        html = "<html><body>Hello World! Hello World! Hello World!</body></html>";
        responseHeaders = new HashMap<>();
        responseHeaders.put("content-type", asList("text/html"));
    }

    /** With {@code DataFormat.HTML} the stored file must contain exactly the page body. */
    @Test
    public void shouldStoreContentAsRawFile() throws IOException {
        // given
        String folder = tempFolder.newFolder().toString();
        Page target = new Page(new URL(url), html);
        FileSystemTargetRepository repository =
                new FileSystemTargetRepository(folder, DataFormat.HTML, false);

        // when
        repository.insert(target);

        // then
        Path path = storedPagePath(folder);
        assertThat(path.toFile().exists(), is(true));

        String content = new String(Files.readAllBytes(path));
        assertThat(content, is(html));
    }

    /**
     * With compression enabled the stored file must be smaller than the page body and must
     * inflate back to the original body.
     */
    @Test
    public void shouldStoreContentCompressed() throws IOException {
        // given
        boolean compressData = true;
        String folder = tempFolder.newFolder().toString();
        Page target = new Page(new URL(url), html);
        FileSystemTargetRepository repository =
                new FileSystemTargetRepository(Paths.get(folder), DataFormat.HTML, false, compressData);

        // when
        repository.insert(target);

        // then
        Path path = storedPagePath(folder);
        assertThat(path.toFile().exists(), is(true));

        byte[] fileBytes = Files.readAllBytes(path);
        assertThat(fileBytes, is(notNullValue()));
        assertThat(fileBytes.length < html.getBytes().length, is(true));

        // Close the stream so that the underlying Inflater is released deterministically.
        byte[] uncompressedBytes;
        try (InputStream inflated = new InflaterInputStream(new ByteArrayInputStream(fileBytes))) {
            uncompressedBytes = IOUtils.toByteArray(inflated);
        }
        String content = new String(uncompressedBytes);
        assertThat(content, is(html));
    }

    /** A page stored compressed as JSON must be readable again through {@code get(url)}. */
    @Test
    public void shouldStoreAndReadCompressedContent() throws IOException {
        // given
        boolean compressData = true;
        String folder = tempFolder.newFolder().toString();
        Page target = new Page(new URL(url), html);
        FileSystemTargetRepository repository =
                new FileSystemTargetRepository(Paths.get(folder), DataFormat.JSON, false, compressData);

        // when
        repository.insert(target);
        TargetModelJson jsonModel = repository.get(url);

        // then
        assertThat(jsonModel, is(notNullValue()));
        assertThat(jsonModel.getUrl(), is(url));
        assertThat(jsonModel.getContentAsString(), is(html));
    }

    /**
     * With {@code DataFormat.JSON} the stored file must deserialize into a
     * {@link TargetModelJson} carrying the URL, body and relevance of the inserted page.
     */
    @Test
    public void shouldStoreContentAsJSON() throws IOException {
        // given
        String folder = tempFolder.newFolder().toString();
        Page target = new Page(new URL(url), html, responseHeaders);
        target.setTargetRelevance(TargetRelevance.IRRELEVANT);
        FileSystemTargetRepository repository =
                new FileSystemTargetRepository(folder, DataFormat.JSON, false);

        // when
        repository.insert(target);

        // then
        Path path = storedPagePath(folder);
        assertThat(path.toFile().exists(), is(true));

        ObjectMapper mapper = new ObjectMapper();
        TargetModelJson value = mapper.readValue(path.toFile(), TargetModelJson.class);
        assertThat(value.getUrl(), is(url));
        assertThat(value.getContentAsString(), is(html));
        assertThat(value.getRelevance().isRelevant(), is(TargetRelevance.IRRELEVANT.isRelevant()));
        assertThat(value.getRelevance().getRelevance(), is(TargetRelevance.IRRELEVANT.getRelevance()));
    }

    /**
     * With {@code DataFormat.CBOR} the stored file must deserialize into a
     * {@link TargetModelCbor} carrying the URL and body of the inserted page.
     */
    @Test
    public void shouldStoreContentAsCBOR() throws IOException {
        // given
        String folder = tempFolder.newFolder().toString();
        Page target = new Page(new URL(url), html, responseHeaders);
        FileSystemTargetRepository repository =
                new FileSystemTargetRepository(folder, DataFormat.CBOR, false);

        // when
        repository.insert(target);

        // then
        Path path = storedPagePath(folder);
        assertThat(path.toFile().exists(), is(true));

        ObjectMapper mapper = new ObjectMapper(new CBORFactory());
        TargetModelCbor value = mapper.readValue(path.toFile(), TargetModelCbor.class);
        assertThat(value.url, is(url));
        assertThat(value.response.get("body").toString(), is(html));
    }

    /**
     * With file-name hashing enabled the page must be stored under the hard-coded hex digest
     * below instead of the encoded URL.
     */
    @Test
    public void shouldHashFilenameUsingSHA256Hash() throws IOException {
        // given
        boolean hashFilename = true;
        String folder = tempFolder.newFolder().toString();
        Page target = new Page(new URL(url), html);
        FileSystemTargetRepository repository =
                new FileSystemTargetRepository(folder, DataFormat.HTML, hashFilename);

        // when
        repository.insert(target);

        // then
        Path path = Paths.get(folder, HOST_DIRECTORY,
                "f0e6a6a97042a4f1f1c87f5f7d44315b2d852c2df5c7991cc66241bf7072d1c4");
        assertThat(path.toFile().exists(), is(true));

        String content = new String(Files.readAllBytes(path));
        assertThat(content, is(html));
    }

    /** {@code get(url)} must return the inserted page and {@code null} for an unknown URL. */
    @Test
    public void sholdGetPageThatWasInserted() throws IOException {
        // given
        boolean hashFilename = true;
        String folder = tempFolder.newFolder().toString();
        String url1 = "http://example1.com";
        String url2 = "http://example2.com";
        Page target1 = new Page(new URL(url1), html);
        target1.setTargetRelevance(TargetRelevance.IRRELEVANT);
        FileSystemTargetRepository repository =
                new FileSystemTargetRepository(folder, DataFormat.JSON, hashFilename);

        // when
        repository.insert(target1);
        TargetModelJson page1 = repository.get(url1);
        TargetModelJson page2 = repository.get(url2);

        // then
        assertThat(page1, is(notNullValue()));
        assertThat(page1.getUrl(), is(url1));
        assertThat(page1.getContentAsString(), is(html));
        assertThat(page1.getRelevance().isRelevant(), is(TargetRelevance.IRRELEVANT.isRelevant()));
        assertThat(page1.getRelevance().getRelevance(), is(TargetRelevance.IRRELEVANT.getRelevance()));

        assertThat(page2, is(nullValue()));
    }

    /**
     * The page iterator must yield every inserted page exactly once and afterwards stay
     * exhausted, returning {@code null} from {@code next()}.
     */
    @Test
    public void sholdIterateOverInsertedPages() throws IOException {
        // given
        boolean hashFilename = true;
        boolean compressData = true;
        String folder = tempFolder.newFolder().toString();
        String url1 = "http://a.com";
        String url2 = "http://b.com";
        Page target1 = new Page(new URL(url1), html);
        Page target2 = new Page(new URL(url2), html);
        FileSystemTargetRepository repository =
                new FileSystemTargetRepository(folder, DataFormat.JSON, hashFilename, compressData);

        // when
        repository.insert(target1);
        repository.insert(target2);
        Iterator<TargetModelJson> it = repository.iterator();

        // then
        TargetModelJson page;

        assertThat(it.hasNext(), is(true));
        page = it.next();
        assertThat(page, is(notNullValue()));
        assertThat(page.getContentAsString(), is(html));

        assertThat(it.hasNext(), is(true));
        page = it.next();
        assertThat(page, is(notNullValue()));
        assertThat(page.getContentAsString(), is(html));

        // checked twice on purpose: the iterator must remain exhausted on repeated calls
        assertExhausted(it);
        assertExhausted(it);
    }

    /** The page iterator of a repository without pages must be exhausted from the start. */
    @Test
    public void sholdIterateOverEmptyFolder() throws IOException {
        // given
        boolean hashFilename = true;
        String folder = tempFolder.newFolder().toString();
        FileSystemTargetRepository repository =
                new FileSystemTargetRepository(folder, DataFormat.JSON, hashFilename);

        // when
        Iterator<TargetModelJson> it = repository.iterator();

        // then
        assertExhausted(it);
    }

    /**
     * The file iterator must yield the path of every inserted page exactly once (in any
     * order) and afterwards stay exhausted, returning {@code null} from {@code next()}.
     */
    @Test
    public void sholdIterateOverFilePaths() throws IOException {
        // given
        boolean hashFilename = false;
        String folder = tempFolder.newFolder().toString();
        String url1 = "http://1.com";
        String url2 = "http://2.com";
        Page target1 = new Page(new URL(url1), html);
        Page target2 = new Page(new URL(url2), html);
        FileSystemTargetRepository repository =
                new FileSystemTargetRepository(folder, DataFormat.JSON, hashFilename);

        // when
        repository.insert(target1);
        repository.insert(target2);
        Iterator<Path> it = repository.filesIterator();

        // then
        Path pagePath;

        assertThat(it.hasNext(), is(true));
        pagePath = it.next();
        assertThat(pagePath, is(notNullValue()));
        assertThat(pagePath.toString(), either(endsWith("1.com")).or(endsWith("2.com")));

        assertThat(it.hasNext(), is(true));
        pagePath = it.next();
        assertThat(pagePath, is(notNullValue()));
        assertThat(pagePath.toString(), either(endsWith("1.com")).or(endsWith("2.com")));

        // checked twice on purpose: the iterator must remain exhausted on repeated calls
        assertExhausted(it);
        assertExhausted(it);
    }

    /** {@code exists(url)} must be true for an inserted URL and false for an unknown one. */
    @Test
    public void existsSholdReturnTrueOnlyWhenPageWasInserted() throws IOException {
        // given
        boolean hashFilename = true;
        String folder = tempFolder.newFolder().toString();
        String url1 = "http://example1.com";
        String url2 = "http://example2.com";
        Page target1 = new Page(new URL(url1), html);
        FileSystemTargetRepository repository =
                new FileSystemTargetRepository(folder, DataFormat.HTML, hashFilename);

        // when
        repository.insert(target1);
        boolean url1exists = repository.exists(url1);
        boolean url2exists = repository.exists(url2);

        // then
        assertThat(url1exists, is(true));
        assertThat(url2exists, is(false));
    }

    /**
     * Builds the path at which the tests expect the page of the shared {@link #url} to be
     * stored when file names are not hashed.
     *
     * @param folder root folder handed to the repository under test
     * @return {@code folder/example.com/http%3A%2F%2Fexample.com}
     */
    private static Path storedPagePath(String folder) {
        return Paths.get(folder, HOST_DIRECTORY, ENCODED_URL_FILENAME);
    }

    /**
     * Asserts that the iterator reports no further element and that {@code next()} returns
     * {@code null} instead of an element.
     *
     * @param it iterator obtained from the repository under test
     */
    private static <T> void assertExhausted(Iterator<T> it) {
        assertThat(it.hasNext(), is(false));
        assertThat(it.next(), is(nullValue()));
    }
}