package focusedCrawler.memex.cdr; import static org.hamcrest.CoreMatchers.containsString; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.CoreMatchers.notNullValue; import static org.junit.Assert.*; import java.io.InputStream; import org.apache.tika.mime.MediaType; import org.junit.Test; import focusedCrawler.memex.cdr.TikaExtractor; import focusedCrawler.memex.cdr.TikaExtractor.ParsedData; public class TikaExtractorTest { @Test public void testExtractMetadata() { // given String filename = "http%3A%2F%2Fwww.darpa.mil%2Fprogram%2Fmemex"; InputStream fileStream = CDRDocumentBuilderTest.class.getResourceAsStream(filename); TikaExtractor parser = new TikaExtractor(); // when ParsedData parsedData = parser.parse(fileStream); // then assertThat(parsedData.getMetadata().get("title"), is("Memex (Domain-Specific Search)")); assertThat(parsedData.getMetadata().get("Content-Type"), containsString(("text/html"))); assertThat(parsedData.getPlainText(), is(notNullValue())); assertThat(parsedData.getPlainText(), containsString(("Memex"))); } @Test public void testDetectMimeType() { // given String filename = "http%3A%2F%2Fwww.darpa.mil%2Fprogram%2Fmemex"; InputStream fileStream = CDRDocumentBuilderTest.class.getResourceAsStream(filename); TikaExtractor parser = new TikaExtractor(); // when MediaType type = parser.detect(fileStream, filename, null); // then assertThat(type.getBaseType(), is(MediaType.TEXT_HTML)); assertThat(type.getBaseType().toString(), is("text/html")); } }