package focusedCrawler.memex.cdr;
import static org.hamcrest.CoreMatchers.containsString;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.not;
import static org.hamcrest.CoreMatchers.notNullValue;
import static org.junit.Assert.assertThat;
import java.io.InputStream;
import java.util.Date;
import org.junit.Test;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.io.ByteStreams;
public class CDRDocumentBuilderTest {
@Test
public void testSerializeToJsonCDRv2() throws Exception {
String filename = "http%3A%2F%2Fwww.darpa.mil%2Fprogram%2Fmemex";
InputStream fileStream = CDRDocumentBuilderTest.class.getResourceAsStream(filename);
String content = new String(ByteStreams.toByteArray(fileStream));
String json = new CDR2Document.Builder()
.setUrl("http://www.darpa.mil/program/memex")
.setRawContent(content)
.setContentType("text/html")
.setCrawler("memex-crawler")
.setTeam("DARPA")
.setVersion("2.0")
.setTimestamp(new Date().getTime())
.buildAsJson();
JsonNode node = new ObjectMapper().readTree(json);
assertThat(node.get("url"), is(notNullValue()));
assertThat(node.get("url").asText(), is("http://www.darpa.mil/program/memex"));
assertThat(node.get("raw_content"), is(notNullValue()));
assertThat(node.get("raw_content").asText(), is(content));
assertThat(node.get("content_type"), is(notNullValue()));
assertThat(node.get("content_type").asText(), containsString("text/html"));
assertThat(node.get("crawler"), is(notNullValue()));
assertThat(node.get("crawler").asText(), is("memex-crawler"));
assertThat(node.get("team"), is(notNullValue()));
assertThat(node.get("team").asText(), is("DARPA"));
assertThat(node.get("timestamp").asLong(), is(not(0L)));
assertThat(node.get("extracted_text"), is(notNullValue()));
assertThat(node.get("extracted_metadata"), is(notNullValue()));
}
@Test
public void testSerializeToJsonCDRv3() throws Exception {
String filename = "http%3A%2F%2Fwww.darpa.mil%2Fprogram%2Fmemex";
InputStream fileStream = CDRDocumentBuilderTest.class.getResourceAsStream(filename);
String content = new String(ByteStreams.toByteArray(fileStream));
Date date = new Date();
String json = new CDR3Document.Builder()
.setUrl("http://www.darpa.mil/program/memex")
.setRawContent(content)
.setContentType("text/html")
.setCrawler("memex-crawler")
.setTeam("DARPA")
.setTimestampCrawl(date)
.setTimestampIndex(date)
.buildAsJson();
JsonNode node = new ObjectMapper().readTree(json);
assertThat(node.get("url"), is(notNullValue()));
assertThat(node.get("url").asText(), is("http://www.darpa.mil/program/memex"));
assertThat(node.get("raw_content"), is(notNullValue()));
assertThat(node.get("raw_content").asText(), is(content));
assertThat(node.get("content_type"), is(notNullValue()));
assertThat(node.get("content_type").asText(), containsString("text/html"));
assertThat(node.get("crawler"), is(notNullValue()));
assertThat(node.get("crawler").asText(), is("memex-crawler"));
assertThat(node.get("team"), is(notNullValue()));
assertThat(node.get("team").asText(), is("DARPA"));
assertThat(node.get("timestamp_index").asText(), is(notNullValue()));
assertThat(node.get("timestamp_crawl").asText(), is(notNullValue()));
assertThat(node.get("objects"), is(notNullValue()));
assertThat(node.get("objects").isArray(), is(true));
assertThat(node.get("version").asDouble(), is(3.0d));
}
}